14 (git 13) Analysis XI (Whole Genome Phylogenies)
This pipeline can be executed as follows:
cd $BASE_DIR/nf/13_analysis_phylo_whg
nextflow run analysis_phylo_whg.nf
14.1 Summary
The whole genome phylogenies can be reconstructed with the nextflow script analysis_phylo_whg.nf (located under $BASE_DIR/nf/13_analysis_phylo_whg/).
14.2 Details of analysis_phylo_whg.nf
This part of the analysis was actually managed manually and not via nextflow. We still report the analysis as a .nf script, as we believe this is a cleaner and more concise report of the conducted analysis.
14.2.1 Setup
The nextflow script starts by opening the phased genotype data and feeding it into two channel copies (fromFilePairs bundles the .vcf.gz file and its .tbi index into a single tuple).
#!/usr/bin/env nextflow
// git 13.1
// Open the SNP data set
Channel
.fromFilePairs("../../1_genotyping/4_phased/phased_mac2.vcf.{gz,gz.tbi}")
.into{ vcf_snps_ch; vcf_snps_ch2 }
We also open the allBP data set (genotypes of SNPs + invariant sites).
// git 13.2
// Open the allBP data set (will be expanded x 24 LGs)
Channel
.fromFilePairs("../../1_genotyping/3_gatk_filtered/filterd.allBP.non_ref.vcf.{gz,gz.tbi}")
.set{ vcf_allbp_ch }
Then, we initialize the different window sizes considered (1, 5, 10 and 50 kb).
// git 13.3
Channel
.from( [ 1, 5, 10, 50 ] )
.set{ window_size_ch }
Then we create one bed file per window size, containing the window positions.
// git 13.4
// Create the window bed files
process segment_windows {
label 'L_loc_slice_windows'
publishDir "../../2_analysis/window_stats/windows/", mode: 'copy'
input:
val( kb_size ) from window_size_ch
output:
set val( kb_size ), file( "windows_${kb_size}kb.bed.gz" ) into ( windows_ch, windows_ch2, windows_ch3 )
script:
"""
#!/usr/bin/env Rscript
library(hypogen)
x_bp <- ${kb_size} * 1000
window_non_overlap <- function(CHROM, LENGTH, windowsize = x_bp ){
tibble(CHROM = CHROM,
START = seq(from = 1, to = LENGTH, by = windowsize) - 1, # (produces overlap of one which is needed for bedtools)
END = lead(START, default = LENGTH) ) }
hypo_karyotype %>%
select(CHROM, LENGTH) %>%
pmap_dfr(window_non_overlap) %>%
write_tsv(file = "windows_${kb_size}kb.bed.gz")
"""
}
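To make the window bookkeeping explicit, here is a minimal R sketch of the same function applied to a toy chromosome (the toy name and length are made up; the real run iterates over hypo_karyotype):
library(dplyr)
library(tibble)
# Same logic as above: 0-based starts, with each window's END equal to the
# next window's START (the one-position overlap mentioned in the comment).
window_non_overlap <- function(CHROM, LENGTH, windowsize = 5000){
  tibble(CHROM = CHROM,
         START = seq(from = 1, to = LENGTH, by = windowsize) - 1,
         END = lead(START, default = LENGTH)) }
window_non_overlap("LG_toy", 12000)
# CHROM   START   END
# LG_toy      0  5000
# LG_toy   5000 10000
# LG_toy  10000 12000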
We drop the Serranid samples (the outgroup) from the genotypes; the --mac 1 filter removes sites that become invariant once the outgroup is excluded.
// git 13.5
process filter_hamlets {
label 'L_20g2h_outgroup_drop'
input:
set val( vcfId ), file( vcf ) from vcf_snps_ch2
output:
set val( "hamlets_only" ), file( "hamlets_only.vcf.gz*" ) into vcf_hamlets_ch
script:
"""
echo -e "20478tabhon\\n28393torpan\\ns_tort_3torpan" > outgroup.pop
vcftools --gzvcf ${vcf[0]} \
--remove outgroup.pop \
--mac 1 \
--recode \
--stdout | \
bgzip > hamlets_only.vcf.gz
tabix hamlets_only.vcf.gz
"""
}
Then, we compute the coverage (number of SNPs) within each window.
// git 13.6
// Coverage of SNPs vcf for SNPdensity, allBP for Ns
process compute_coverage {
label 'L_140g1h_coverage'
publishDir "../../2_analysis/window_stats/coverages/", mode: 'copy'
input:
set val( vcfId ), file( vcf ), val( kb_size ), file( window ) from vcf_snps_ch.concat( vcf_hamlets_ch ).map{ [it[0].minus(".vcf"), it[1]]}.combine( windows_ch )
output:
set val( kb_size ), file( "${vcfId}.${kb_size}kb_cov.tsv.gz" ) into coverage_ch
script:
"""
bedtools coverage \
-a ${window} \
-b ${vcf[0]} \
-counts | gzip > ${vcfId}.${kb_size}kb_cov.tsv.gz
"""
}
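For reference, bedtools coverage with -counts simply appends one column to the window bed: the number of -b records (here, SNPs) overlapping each window. A minimal R sketch of reading such a file (column names chosen to match how git 13.10 consumes it):
library(readr)
# three bed columns plus the per-window SNP count appended by bedtools
cov <- read_tsv("phased_mac2.5kb_cov.tsv.gz",
                col_names = c("CHROM", "START", "END", "COV_SNP"))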
To split the genotypes by linkage group, we initialize an LG channel…
// git 13.7
Channel
.from( ('01'..'09') + ('10'..'19') + ('20'..'24') )
.set{ lg_ch }
…and split the genotypes.
// git 13.8
process subset_allBP {
label 'L_140g10h_coverage'
publishDir "../../1_genotyping/3_gatk_filtered/non_ref_byLG/", mode: 'copy'
input:
set val( lg ) from lg_ch
output:
set val( lg ), file( "filterd.allBP.non_ref.LG${lg}.vcf.gz" ), file( "filterd.allBP.non_ref.LG${lg}.vcf.gz.tbi" ) into allbp_non_ref_ch
script:
"""
vcftools \
--gzvcf \$BASE_DIR/1_genotyping/3_gatk_filtered/filterd.allBP.non_ref.vcf.gz \
--chr LG${lg} \
--recode \
--stdout | bgzip > filterd.allBP.non_ref.LG${lg}.vcf.gz
tabix filterd.allBP.non_ref.LG${lg}.vcf.gz
"""
}
Then we also compute the coverage (in terms of callable sites, not just SNPs) within windows.
// git 13.9
process compute_coverage_allBP {
label 'L_140g1h_coverage'
publishDir "../../2_analysis/window_stats/coverages/", mode: 'copy'
input:
set val( lg ), file( vcf ), file( tbi ), val( kb_size ), file( window ) from allbp_non_ref_ch.combine( windows_ch2 )
output:
set val( kb_size ), file( "filterd.allBP.LG${lg}.${kb_size}kb_cov.tsv.gz" ) into coverage_allbp_ch
script:
"""
echo -e "CHROM\\tSTART\\tEND" > bed_LG${lg}.bed
zgrep "LG${lg}" ${window} >> bed_LG${lg}.bed
gzip bed_LG${lg}.bed
bedtools coverage \
-a bed_LG${lg}.bed.gz \
-b ${vcf} \
-counts | gzip > filterd.allBP.LG${lg}.${kb_size}kb_cov.tsv.gz
"""
}
Now we can merge the coverage information for each window.
// git 13.10
// Compile summary table
process compile_window_stats {
label 'L_20g2h_windows_stats'
publishDir "../../2_analysis/window_stats/window_stats/", mode: 'copy'
input:
set val( kb_size ), file( windows ) from coverage_ch.concat( coverage_allbp_ch ).groupTuple()
output:
set val( kb_size ), file( "window_stats.${kb_size}kb.tsv.gz" ) into window_out_ch
script:
"""
#!/usr/bin/env Rscript
library(tidyverse)
data_SNPs <- read_tsv("phased_mac2.${kb_size}kb_cov.tsv.gz",
col_names = c("CHROM", "START", "END", "COV_SNP"))
data_HYP <- read_tsv("hamlets_only.${kb_size}kb_cov.tsv.gz",
col_names = c("CHROM", "START", "END", "COV_HYP"))
all_bp_files <- dir(pattern = "filterd.allBP.LG*")
data_allBPs <- map_dfr(all_bp_files, .f = function(x){read_tsv(x, col_names = c("CHROM", "START", "END", "COV_ALL"))})
data <- data_SNPs %>%
left_join(data_HYP, by = c(CHROM = "CHROM", START = "START", END = "END")) %>%
left_join(data_allBPs, by = c(CHROM = "CHROM", START = "START", END = "END")) %>%
filter(COV_ALL > 0 ) %>%
mutate(SNP_density = round(COV_SNP/ COV_ALL, 2),
REL_COV = round(COV_ALL/ (END-START), 2))
write_tsv(x = data, file = "window_stats.${kb_size}kb.tsv.gz")
"""
}
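As a worked example of the two derived columns (numbers hypothetical): a 5 kb window with 4,000 callable sites, 120 of which are SNPs, gives
round(120 / 4000, 2)  # SNP_density = 0.03
round(4000 / 5000, 2) # REL_COV = 0.8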
From this point onward, the analysis was actually done manually (we still format it according to nextflow for clarity).
// ----------------------- DISCLAIMER ----------------------
// from here on, this pipeline was not actually run using
// nextflow, but managed manually
// ---------------------------------------------------------
Now, the subset of windows to be considered for the phylogeny was sampled.
// git 13.11
// Subset to 5000 random windows
process subset_windows {
label 'L_loc_subset_windows'
input:
set val( kb_size ), file( windows ) from window_out_ch.filter({ it[0] == 5 }) // kb_size is the first tuple element
output:
set val( kb_size ), file( "5000x_${kb_size}kb_v1.bed" ) into window_subset_ch
script:
"""
#!/usr/bin/env Rscript
library(hypogen) # attaches tidyverse automatically
### Draw windows with coverage (sequence contiguity) and SNP-based cutoffs
stats5 <- read_tsv("window_stats.${kb_size}kb.tsv.gz")
set.seed(64)
selected_windows <- stats5 %>%
filter(REL_COV >= 0.80 &
COV_HYP >= 50) %>%
sample_n(5000) %>%
arrange(CHROM, START)
write_tsv(selected_windows %>% select(CHROM, START, END), file = "5000x_${kb_size}kb_v1.bed")
"""
}
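Since sample_n() fails if fewer than 5,000 windows pass the cutoffs, a quick interactive sanity check (hypothetical, not part of the pipeline) can be run first:
library(tidyverse)
stats5 <- read_tsv("window_stats.5kb.tsv.gz")
# number of windows passing both cutoffs; must be >= 5000 for sample_n(5000)
stats5 %>% filter(REL_COV >= 0.80, COV_HYP >= 50) %>% nrow()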
To parallelize the analysis, several index channels (for looping over batches of windows) are opened.
// git 13.12
// index for sub-channels
Channel.from( 1..50 ).into{ loop50_idx_ch1; loop50_idx_ch2; loop50_idx_ch3; loop50_idx_ch4 }
For the selected windows, the genotypes are extracted.
// git 13.13
// Extract windows from all BP
process extract_windows {
label 'L_20g2h_extract_windows'
input:
set val( idx ), val( kb_size ), file( windows ) from loop50_idx_ch1.combine( window_subset_ch )
output:
file( "*_v1_all.vcf.gz" ) into window_extract_loop
script:
"""
PER_TASK=100
START_NUM=\$(( (${idx} - 1) * \$PER_TASK + 1 ))
END_NUM=\$(( ${idx} * \$PER_TASK ))
echo This is task ${idx}, which will do runs \$START_NUM to \$END_NUM
for (( run=\$START_NUM ; run<=END_NUM ; run++ ))
do
echo This is task ${idx}, run number \$run
chr=\$(awk -v line="\$run" 'BEGIN { FS = "\\t" } ; NR==line+1 { print \$1 }' 5000x_5kb_v1.bed)
sta=\$(awk -v line="\$run" 'BEGIN { FS = "\\t" } ; NR==line+1 { print \$2 }' 5000x_5kb_v1.bed)
end=\$(awk -v line="\$run" 'BEGIN { FS = "\\t" } ; NR==line+1 { print \$3 }' 5000x_5kb_v1.bed)
printf -v i "%04d" \$run
vcftools \
--gzvcf \$BASE_DIR/1_genotyping/3_gatk_filtered/non_ref_byLG/filterd.allBP.non_ref."\$chr".vcf.gz \
--chr "\$chr" \
--from-bp "\$sta" \
--to-bp "\$end" \
--recode --stdout | \
grep -v "##" | bgzip > window_"\$i"_v1_all.vcf.gz
done
"""
}
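The PER_TASK bookkeeping splits the 5,000 windows into 50 batches of 100; the arithmetic, restated in R for clarity:
per_task <- 100
idx <- 3                                # example task index
start_num <- (idx - 1) * per_task + 1   # 201
end_num <- idx * per_task               # 300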
Again, a channel is created for parallelizing…
// git 13.14
// index for sub-channels
Channel.from( 1..20 ).set{ loop20_idx_ch }
// bundle the distributed sub-channels
window_extract_loop.collect().map{ [ it ] }.into{ window_extract_ch1; window_extract_ch2 }
…and again, the outgroup samples are removed from the genotypes.
// git 13.15
// Remove samples (all / noS data sets)
process remove_samples {
label 'L_20g2h_remove_samples'
input:
set val( idx ), file( vcf ) from loop20_idx_ch.combine( window_extract_ch1 )
output:
file( "*_noS.vcf.gz" ) into samples_removed_loop
script:
"""
PER_TASK=250
START_NUM=\$(( (${idx} - 1) * \$PER_TASK + 1 ))
END_NUM=\$(( ${idx} * \$PER_TASK ))
echo This is task ${idx}, which will do runs \$START_NUM to \$END_NUM
for (( run=\$START_NUM ; run<=END_NUM ; run++ ))
do
echo This is task ${idx}, run number \$run
printf -v i "%04d" \$run
# samples_Serr.txt (not created by this script) lists the Serranid outgroup samples, cf. git 13.5
vcftools \
--gzvcf window_"\$i"*.vcf.gz \
--remove samples_Serr.txt \
--recode \
--stdout | bgzip > window_"\$i"_v1_noS.vcf.gz
done
"""
}
// bundle the distributed sub-channels
samples_removed_loop.collect().map{ [ it ] }.set{ samples_removed_ch }
Then, we translate the genotypes from vcf to fasta format.
// git 13.16
// Convert to Fasta (IUPAC-encoded)
process fasta_convert {
label 'L_20g2h_convert_to_fasta'
input:
set val( idx ), file( vcf_noS ), file( vcf_all ) from loop50_idx_ch2.combine( samples_removed_ch ).combine( window_extract_ch2 )
output:
file( "*.fas" ) into fasta_convert_loop
script:
"""
PER_TASK=100
START_NUM=\$(( (${idx} - 1) * \$PER_TASK + 1 ))
END_NUM=\$(( ${idx} * \$PER_TASK ))
echo This is task ${idx}, which will do runs \$START_NUM to \$END_NUM
for (( run=\$START_NUM ; run<=END_NUM ; run++ ))
do
echo This is task ${idx}, run number \$run
printf -v i "%04d" \$run
for j in all noS
do
bgzip -cd window_"\$i"_v1_"\$j".vcf.gz | vcf-to-tab > window_"\$i"_v1_"\$j".tab
perl \$SFTWR/vcf-tab-to-fasta/vcf_tab_to_fasta_alignment.pl -i window_"\$i"_v1_"\$j".tab > window_"\$i"_v1_"\$j".fas
rm window_"\$i"_v1_"\$j".tab*
done
done
"""
}
// bundle the distributed sub-channels
fasta_convert_loop.collect().map{ [ it ] }.set{ fasta_convert_ch }
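The perl script collapses each diploid genotype into a single IUPAC ambiguity code. A toy R sketch of that encoding (lookup table taken from the standard IUPAC nucleotide codes; an illustration, not the perl script's actual implementation):
iupac <- c("AC" = "M", "AG" = "R", "AT" = "W", "CG" = "S", "CT" = "Y", "GT" = "K")
encode_genotype <- function(a1, a2){
  if(a1 == a2) return(a1)  # homozygous sites keep their base
  unname(iupac[paste(sort(c(a1, a2)), collapse = "")])
}
encode_genotype("G", "A") # "R"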
The fasta sequences are aligned.
// git 13.17
// Align sequences in windows
process fasta_align {
label 'L_20g2h_fasta_align'
input:
set val( idx ), file( fa ) from loop50_idx_ch3.combine( fasta_convert_ch )
output:
file( "*.aln" ) into fasta_align_loop
script:
"""
PER_TASK=100
START_NUM=\$(( (${idx} - 1) * \$PER_TASK + 1 ))
END_NUM=\$(( ${idx} * \$PER_TASK ))
echo This is task ${idx}, which will do runs \$START_NUM to \$END_NUM
for (( run=\$START_NUM ; run<=END_NUM ; run++ ))
do
echo This is task ${idx}, run number \$run
printf -v i "%04d" \$run
for j in all noS
do
mafft --auto window_"\$i"_v1_"\$j".fas > window_"\$i"_v1_"\$j".aln
done
done
"""
}
// bundle the distributed sub-channels
fasta_align_loop.collect().map{ [ it ] }.set{ fasta_align_ch }
Next, the maximum likelihood trees are inferred within each window (using PL17_160floflo as outgroup via -o, and a single thread via -T 1).
// git 13.18
// Infer local trees
process local_trees {
label 'L_20g2h_local_trees'
input:
set val( idx ), file( aln ) from loop50_idx_ch4.combine( fasta_align_ch )
output:
file( "*.treefile" ) into local_trees_loop
script:
"""
PER_TASK=100
START_NUM=\$(( (${idx} - 1) * \$PER_TASK + 1 ))
END_NUM=\$(( ${idx} * \$PER_TASK ))
echo This is task ${idx}, which will do runs \$START_NUM to \$END_NUM
for (( run=\$START_NUM ; run<=END_NUM ; run++ ))
do
echo This is task ${idx}, run number \$run
printf -v i "%04d" \$run
for j in all noS
do
iqtree2 -s window_"\$i"_v1_"\$j".aln --prefix locus_"\$i"_v1_"\$j" -o PL17_160floflo -T 1
done
done
"""
}
// bundle the distributed sub-channels
local_trees_loop.collect().map{ [ it ] }.set{ local_trees_ch }
Finally, the individual trees are summarized.
// git 13.19
// Calculate summary tree
process summary_trees {
label 'L_20g2h_summary_trees'
publishDir "../../2_analysis/astral/", mode: 'copy'
input:
file( tree ) from local_trees_ch
output:
set file( "astral*.tre" ), file( "astral*.log" ) into summary_trees_ch
script:
"""
cat ./*_noS.treefile > genetrees_5000x_5kb_v1_noS.tre
java -jar \$SFTWR/ASTRAL_5.7.5/astral.5.7.5.jar -i genetrees_5000x_5kb_v1_noS.tre -o astral_5000x_5kb_v1_noS.tre 2> astral_5000x_5kb_v1_noS.log
cat ./*_all.treefile > genetrees_5000x_5kb_v1_all.tre
java -jar \$SFTWR/ASTRAL_5.7.5/astral.5.7.5.jar -i genetrees_5000x_5kb_v1_all.tre -o astral_5000x_5kb_v1_all.tre 2> astral_5000x_5kb_v1_all.log
"""
}
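The resulting species trees can then be inspected, e.g. in R with the ape package (a minimal sketch; ape is not part of this pipeline):
library(ape)
astral_tree <- read.tree("astral_5000x_5kb_v1_noS.tre")
# ASTRAL annotates internal branches with local posterior probabilities
# (stored as node labels) by default
plot(astral_tree, cex = .4)
nodelabels(astral_tree$node.label, frame = "none", cex = .4)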