10 (git 9) Analysis VII (hybridization)
This pipeline can be executed as follows:
cd $BASE_DIR/nf/09_analysis_hybridization
source ../../sh/nextflow_alias.sh
nf_run_hybrid
10.1 Summary
The hybridization classes are assigned within the nextflow script analysis_hybridization.nf
(located under $BASE_DIR/nf/09_analysis_hybridization/
), which runs on the SNPs only data set.
Below is an overview of the steps involved in the analysis.
(The green dot indicates the genotype input, red arrows depict output that is exported for further use.)
10.2 Details of analysis_hybridization.nf
10.2.1 Data preparation
The nextflow script starts by opening the genotype data.
#!/usr/bin/env nextflow
// git 9.1
// open genotype data
Channel
.fromFilePairs("../../1_genotyping/4_phased/phased_mac2.vcf.{gz,gz.tbi}")
.into{ vcf_loc1; vcf_loc2; vcf_loc3 }
Since we are going to work on the three sampling locations independently, we create channel for the locations.
// git 9.2
// initialize location channel
Channel
.from( "bel", "hon", "pan")
.set{ locations_ch }
Next, we define the species sets sampled at the individual locations.
// git 9.3
// define location specific sepcies set
Channel.from( [[1, "ind"], [2, "may"], [3, "nig"], [4, "pue"], [5, "uni"]] ).into{ bel_spec1_ch; bel_spec2_ch }
Channel.from( [[1, "abe"], [2, "gum"], [3, "nig"], [4, "pue"], [5, "ran"], [6, "uni"]] ).into{ hon_spec1_ch; hon_spec2_ch }
Channel.from( [[1, "nig"], [2, "pue"], [3, "uni"]] ).into{ pan_spec1_ch; pan_spec2_ch }
For each location, we create all possible species pairs and then merge the channels of the different locations.
// git 9.4
// prepare pairwise new_hybrids
// ------------------------------
/* (create all possible species pairs depending on location
and combine with genotype subset (for the respective location))*/
bel_pairs_ch = Channel.from( "bel" )
.combine( vcf_loc1 )
.combine(bel_spec1_ch)
.combine(bel_spec2_ch)
.filter{ it[3] < it[5] }
.map{ it[0,1,2,4,6]}
hon_pairs_ch = Channel.from( "hon" )
.combine( vcf_loc2 )
.combine(hon_spec1_ch)
.combine(hon_spec2_ch)
.filter{ it[3] < it[5] }
.map{ it[0,1,2,4,6]}
pan_pairs_ch = Channel.from( "pan" )
.combine( vcf_loc3 )
.combine(pan_spec1_ch)
.combine(pan_spec2_ch)
.filter{ it[3] < it[5] }
.map{ it[0,1,2,4,6]}
bel_pairs_ch.concat( hon_pairs_ch, pan_pairs_ch ).set { all_fst_pairs_ch }
We are going to run newhybrids
only on a subset of the most diverged SNPs for each species pair.
For this we first need to compute the \(F_{ST}\) on a SNP level for each species pair.
// git 9.5
// comute pairwise fsts for SNP filtering
process fst_run {
label 'L_20g45m_fst_run'
tag "${spec1}${loc}-${spec2}${loc}"
input:
set val( loc ), val( vcfidx ), file( vcf ), val( spec1 ), val( spec2 ) from all_fst_pairs_ch
output:
set val( loc ), val( spec1 ), val( spec2 ), file( "${vcf[0]}" ), file( "*.fst.tsv.gz" ), file( "${spec1}${loc}.pop"), file( "${spec2}${loc}.pop") into fst_SNPS
script:
"""
vcfsamplenames ${vcf[0]} | grep ${spec1}${loc} > ${spec1}${loc}.pop
vcfsamplenames ${vcf[0]} | grep ${spec2}${loc} > ${spec2}${loc}.pop
vcftools --gzvcf ${vcf[0]} \
--weir-fst-pop ${spec1}${loc}.pop \
--weir-fst-pop ${spec2}${loc}.pop \
--stdout | gzip > ${spec1}${loc}-${spec2}${loc}.fst.tsv.gz
"""
}
For each species pair, the 800 most diverged SNPs for this particular pair are selected.
// git 9.6
// select the 800 most differentiated SNPs for each population pair
process filter_fst {
label 'L_8g15m_filter_fst'
tag "${spec1}${loc}-${spec2}${loc}"
input:
set val( loc ), val( spec1 ), val( spec2 ), file( vcf ), file( fst ), file( pop1 ), file( pop2 ) from fst_SNPS
output:
set val( loc ), val( spec1 ), val( spec2 ), file( vcf ), file( pop1 ), file( pop2 ), file( "*SNPs.snps" ) into filter_SNPs
script:
"""
Rscript --vanilla \$BASE_DIR/R/filter_snps.R ${fst} 800 ${spec1}${loc}-${spec2}${loc}
"""
}
This pre-selection is then filtered to guarantee a minimum distance of 5 kb between all SNPs.
Of this filtered SNP set, 80 SNPs are randomly sampled and formateted for the newhybrid
analysis.
// git 9.7
// filter the SNP set by min distance (5kb), than randomly pick 80 SNPs
// then reformat newhybrid input
process prep_nh_input {
label 'L_8g15m_prep_nh'
tag "${spec1}${loc}-${spec2}${loc}"
input:
set val( loc ), val( spec1 ), val( spec2 ), file( vcf ), file( pop1 ), file( pop2 ), file( snps ) from filter_SNPs
output:
set val( loc ), val( spec1 ), val( spec2 ), file( "*_individuals.txt" ), file( "*.80SNPs.txt") into newhybrids_input
script:
"""
vcftools \
--gzvcf ${vcf} \
--keep ${pop1} \
--keep ${pop2} \
--thin 5000 \
--out newHyb.${spec1}${loc}-${spec2}${loc} \
--positions ${snps} \
--recode
grep '#' newHyb.${spec1}${loc}-${spec2}${loc}.recode.vcf > newHyb.${spec1}${loc}-${spec2}${loc}.80SNPs.vcf
grep -v '#' newHyb.${spec1}${loc}-${spec2}${loc}.recode.vcf | \
shuf -n 80 | \
sort -k 1 -k2 >> newHyb.${spec1}${loc}-${spec2}${loc}.80SNPs.vcf
grep '#CHROM' newHyb.${spec1}${loc}-${spec2}${loc}.80SNPs.vcf | \
cut -f 10- | \
sed 's/\\t/\\n/g' > newHyb.${spec1}${loc}-${spec2}${loc}.80SNPs_individuals.txt
/usr/bin/java -Xmx1024m -Xms512M \
-jar \$SFTWR/PGDSpider/PGDSpider2-cli.jar \
-inputfile newHyb.${spec1}${loc}-${spec2}${loc}.80SNPs.vcf \
-inputformat VCF \
-outputfile newHyb.${spec1}${loc}-${spec2}${loc}.80SNPs.txt \
-outputformat NEWHYBRIDS \
-spid \$BASE_DIR/ressources/vcf2nh.spid
"""
}
Using a prepared R
sript we then can run newhybrids
on the SNP selection.
// git 9.8
// Run new hybrids
// (copy of nh_input is needed because nh can't read links)
process run_nh {
label 'L_20g15h4x_run_nh'
tag "${spec1}${loc}-${spec2}${loc}"
publishDir "../../2_analysis/newhyb/", mode: 'copy'
input:
set val( loc ), val( spec1 ), val( spec2 ), file( inds ), file( snps ) from newhybrids_input
output:
set file( "nh_input/NH.Results/newHyb.*/*_individuals.txt" ), file( "nh_input/NH.Results/newHyb.*/*_PofZ.txt" ) into newhybrids_output
script:
"""
mkdir -p nh_input
cp ${snps} nh_input/${snps}
cp ${inds} nh_input/${inds}
Rscript --vanilla \$BASE_DIR/R/run_newhybrids.R
"""
}
Finally, we are done with the hybridization analysis.