16 (git 15) Analysis XIII (diversity with and without outlier regions)
This pipeline can be executed as follows:
cd $BASE_DIR/nf/15_analysis_pi
source ../../sh/nextflow_alias.sh
nf_run_pi
16.1 Summary
The diversity is computed within the nextflow script analysis_pi.nf
(located under $BASE_DIR/nf/15_analysis_pi/
).
It takes the all BP data set and computes \(\pi\).
16.2 Details of analysis_pi.nf
16.2.1 Setup
The nextflow script starts by opening the genotype data and feeding it into a stream.
#!/usr/bin/env nextflow
// This pipeline includes the analysis run on the
// all callable sites data sheet (pi).
// git 15.1
// load genotypes
Channel
.fromFilePairs("../../1_genotyping/3_gatk_filtered/filterd.allBP.vcf.{gz,gz.tbi}")
.set{ vcf_pi_ch }
To split the analysis by linkage group, we initialize a LG-channel.
// git 15.2
// initialize LGs
Channel
.from( ('01'..'09') + ('10'..'19') + ('20'..'24') )
.set{ lg_pi_ch }
We also want to compute the diversity twice, once as is and once without the outlier regions.
// git 15.3
// set outlier window mode
Channel
.from( "", "_no_outlier" )
.set{ outlier_mode_ch }
Furthermore, we want to analyze diversity at two scales (10kb and 50kb).
// git 15.4
// init slining window resolutions
Channel
.from( 1, 5 )
.set{ kb_ch }
Then we filter the genotypes, depending on population, linkage group and outlier mode.
// git 15.5
// init all sampled populations (for pi)
Channel
.from('indbel', 'maybel', 'nigbel', 'puebel', 'unibel', 'abehon', 'gumhon', 'nighon', 'puehon', 'ranhon', 'unihon', 'nigpan', 'puepan', 'unipan')
.combine( vcf_pi_ch )
.combine( kb_ch )
.combine( lg_pi_ch )
.combine( outlier_mode_ch )
.set{ input_ch }
// filter genotypes and convert formats
process recode_genotypes {
label 'L_32g15h_recode'
tag "${spec}"
input:
set val( spec ), vcfId, file( vcf ), val( kb ), val( lg ), val( outlier ) from input_ch
output:
set val( spec ), val( kb ), val( lg ), val( outlier ), file( "*.geno.gz" ), file( "pop.txt" ) into geno_ch
script:
"""
if [ "${outlier}" == "_no_outlier" ];then
tail -n +2 \$BASE_DIR/2_analysis/summaries/fst_outliers_998.tsv | \
cut -f 2,3,4 > outlier.bed
SUBSET="--exclude-bed outlier.bed"
else
SUBSET=""
fi
vcfsamplenames ${vcf[0]} | \
grep ${spec} > pop.txt
vcftools --gzvcf ${vcf[0]} \
--keep pop.txt \
--chr LG${lg} \
\$SUBSET \
--recode \
--stdout | bgzip > ${spec}${outlier}.LG${lg}.vcf.gz
python \$SFTWR/genomics_general/VCF_processing/parseVCF.py \
-i ${spec}${outlier}.LG${lg}.vcf.gz | \
gzip > ${spec}${outlier}.LG${lg}.geno.gz
"""
}
Then we calculate pi for each population.
// git 15.6
// calculate pi per species
process pi_per_spec {
label 'L_32g15h_pi'
tag "${spec}"
input:
set val( spec ), val( kb ), val( lg ), val( outlier ), file( geno ), file( pop ) from geno_ch
output:
set val( "${spec}${outlier}.${kb}" ), val( kb ), file( "*0kb.csv.gz" ) into pi_lg_ch
script:
"""
awk '{print \$1"\\t"substr(\$1,length(\$1)-5,length(\$1)-1)}' ${pop} > ${spec}.pop
python \$SFTWR/genomics_general/popgenWindows.py \
-w ${kb}0000 \
-s ${kb}000 \
--popsFile ${spec}.pop \
-p ${spec} \
-g ${geno} \
-o pi.${spec}${outlier}.LG${lg}.${kb}0kb.csv.gz \
-f phased \
--writeFailedWindows \
-T 1
"""
}
Finally, we merge the output from the individual linkage groups.
process merge_pi {
label 'L_32g4h_pi_merge'
tag "${spec_outlier_kb}"
publishDir "../../2_analysis/pi/${kb[0]}0k", mode: 'copy'
input:
set val( spec_outlier_kb ), val( kb ), file( pi ) from pi_lg_ch.groupTuple()
output:
file( "pi.${spec_outlier_kb}0kb.tsv.gz" ) into pi_output_ch
script:
"""
echo -e "CHROM\\tBIN_START\\tBIN_END\\tBIN_MID_SITES\\tN_SITES\\tPI" > pi.${spec_outlier_kb}0kb.tsv
for k in \$(ls *.csv.gz); do
zcat \$k | \
grep -v "scaff" | \
sed -s "s/,/\t/g" >> pi.${spec_outlier_kb}0kb.tsv
done
gzip pi.${spec_outlier_kb}0kb.tsv
"""
}