4 (git 3) Analysis I (FST & GxP)
This pipeline can be executed as follows:
cd $BASE_DIR/nf/03_analysis_fst_gxp
source ../../sh/nextflow_alias.sh
nf_run_basic
4.1 Summary
The genetic differentiation, as well as the genotype x phenotype association, are computed within the nextflow script analysis_fst_gxp.nf
(located under $BASE_DIR/nf/03_analysis_fst_gxp/
).
It takes the SNPs only data set and computes \(F_{ST}\) and the GxP association.
Below is an overview of the steps involved in the analysis.
(The green dot indicates the genotype input, red arrows depict output that is exported for further use.)
4.2 Details of analysis_fst_gxp.nf
4.2.1 Setup
The nextflow script starts by opening the genotype data and feeding it into four different streams.
#!/usr/bin/env nextflow
// git 3.1
// open genotype data
// Load the phased, mac2-filtered SNP genotypes (vcf.gz plus .tbi index, as a
// file pair) and duplicate the stream four ways: per-location subsetting,
// hamlets-only filtering, the GxP section, and the within-species Fst section.
Channel
.fromFilePairs("../../1_genotyping/4_phased/phased_mac2.vcf.{gz,gz.tbi}")
.into{ vcf_locations; vcf_filter; vcf_gxp; vcf_adapt }
Since we are going to work on the three sampling locations independently, we create a channel for the locations.
// git 3.2
// initialize location channel
// One item per sampling location: Belize (bel), Honduras (hon), Panama (pan).
Channel
.from( "bel", "hon", "pan")
.set{ locations_ch }
Then we attach the genotypes to the locations.
// git 3.3
// attach genotypes to location
// Cross every location with the genotype file pair so each location gets its
// own copy of the input: [ loc, vcfId, [vcf.gz, vcf.gz.tbi] ].
locations_ch
.combine( vcf_locations )
.set{ vcf_location_combo }
Next, we define the species sets sampled at the individual locations.
// git 3.4
// define location specific species set
// Hamlet species sampled at each location as [index, species] tuples; each set
// is duplicated (spec1/spec2) so it can later be crossed with itself to build
// all species pairs for that location.
Channel.from( [[1, "ind"], [2, "may"], [3, "nig"], [4, "pue"], [5, "uni"]] ).into{ bel_spec1_ch; bel_spec2_ch }
Channel.from( [[1, "abe"], [2, "gum"], [3, "nig"], [4, "pue"], [5, "ran"], [6, "uni"]] ).into{ hon_spec1_ch; hon_spec2_ch }
Channel.from( [[1, "nig"], [2, "pue"], [3, "uni"]] ).into{ pan_spec1_ch; pan_spec2_ch }
For each location, the data is subset to include only local hamlets.
// git 3.5
// subset data to local hamlets
// For each location: keep only samples whose name contains the location tag,
// drop outgroup samples ("tor"/"tab" name tags), and re-apply a minor allele
// count filter (--mac 3) on the reduced sample set.
process subset_vcf_by_location {
label "L_20g2h_subset_vcf"
input:
set val( loc ), vcfId, file( vcf ) from vcf_location_combo
output:
// [ loc, location-subset vcf, sample list ] duplicated into three channels
set val( loc ), file( "${loc}.vcf.gz" ), file( "${loc}.pop" ) into ( vcf_loc_pair1, vcf_loc_pair2, vcf_loc_pair3 )
script:
"""
# sample names carry the location tag; keep those, drop the outgroups
vcfsamplenames ${vcf[0]} | \
grep ${loc} | \
grep -v tor | \
grep -v tab > ${loc}.pop
vcftools --gzvcf ${vcf[0]} \
--keep ${loc}.pop \
--mac 3 \
--recode \
--stdout | gzip > ${loc}.vcf.gz
"""
}
As we also want to compute global statistics, we create another subset including all sampled hamlets but excluding the outgroups.
// git 3.6
// subset the global data set to hamlets only
// Remove all outgroup samples (any sample whose name matches none of the
// hamlet species tags) from the global VCF, and build a population table
// mapping each sample to species and location (both parsed from the last
// six characters of the sample name: three for species, three for location).
process subset_vcf_hamlets_only {
label "L_20g15h_filter_hamlets_only"
publishDir "../../1_genotyping/4_phased/", mode: 'copy' , pattern: "*.vcf.gz"
//module "R3.5.2"
input:
set vcfId, file( vcf ) from vcf_filter
output:
file( "hamlets_only.vcf.gz*" ) into vcf_hamlets_only
set file( "hamlets_only.vcf.gz*" ), file( "hamlets_only.pop.txt" ) into vcf_multi_fst
script:
"""
# samples NOT matching any hamlet species tag are outgroups
vcfsamplenames ${vcf[0]} | \
grep -v "abe\\|gum\\|ind\\|may\\|nig\\|pue\\|ran\\|uni" > outgroup.pop
# sample -> species -> location table, derived from the name suffix
vcfsamplenames ${vcf[0]} | \
grep "abe\\|gum\\|ind\\|may\\|nig\\|pue\\|ran\\|uni" | \
awk '{print \$1"\\t"\$1}' | \
sed 's/\\t.*\\(...\\)\\(...\\)\$/\\t\\1\\t\\2/g' > hamlets_only.pop.txt
vcftools \
--gzvcf ${vcf[0]} \
--remove outgroup.pop \
--recode \
--stdout | gzip > hamlets_only.vcf.gz
"""
}
4.2.2 FST
Using these subsets, we compute the global differentiation among all hamlet populations.
// ----------- Fst section -----------
// git 3.7
// compute global fst
// Weir & Cockerham Fst across all 14 hamlet populations (species x location),
// computed per SNP and at two sliding-window resolutions (50 kb / 5 kb step
// and 10 kb / 1 kb step). Outlier windows are tabulated by
// table_fst_outliers.R (fst_outliers_998.tsv — presumably a 99.8 % quantile
// cutoff; confirm in the R script).
// FIX(review): the 10 kb run previously gzipped its output to
// "multi_fst_snp.tsv.gz", which never matched the "*.10k.tsv.gz" publishDir
// pattern, so the 10 kb windowed results were silently never exported.
// It now writes "multi_fst.10k.tsv.gz".
process fst_multi {
label 'L_20g15h_fst_multi'
publishDir "../../2_analysis/fst/50k/", mode: 'copy' , pattern: "*.50k.tsv.gz"
publishDir "../../2_analysis/fst/10k/", mode: 'copy' , pattern: "*.10k.tsv.gz"
publishDir "../../2_analysis/fst/logs/", mode: 'copy' , pattern: "*.log"
publishDir "../../2_analysis/summaries", mode: 'copy' , pattern: "fst_outliers_998.tsv"
//conda "$HOME/miniconda2/envs/py3"
//module "R3.5.2"
input:
set file( vcf ), file( pop ) from vcf_multi_fst
output:
file( "multi_fst*" ) into multi_fst_output
file( "fst_outliers_998.tsv" ) into fst_outlier_output
script:
"""
# collapse species+location columns into one population tag per sample
awk '{print \$1"\\t"\$2\$3}' ${pop} > pop.txt
# one sample list per population (14 species-by-location combinations)
for k in abehon gumhon indbel maybel nigbel nighon nigpan puebel puehon puepan ranhon unibel unihon unipan; do
grep \$k pop.txt | cut -f 1 > pop.\$k.txt
done
POP="--weir-fst-pop pop.abehon.txt \
--weir-fst-pop pop.gumhon.txt \
--weir-fst-pop pop.indbel.txt \
--weir-fst-pop pop.maybel.txt \
--weir-fst-pop pop.nigbel.txt \
--weir-fst-pop pop.nighon.txt \
--weir-fst-pop pop.nigpan.txt \
--weir-fst-pop pop.puebel.txt \
--weir-fst-pop pop.puehon.txt \
--weir-fst-pop pop.puepan.txt \
--weir-fst-pop pop.ranhon.txt \
--weir-fst-pop pop.unibel.txt \
--weir-fst-pop pop.unihon.txt \
--weir-fst-pop pop.unipan.txt"
# fst by SNP
# ----------
vcftools --gzvcf ${vcf} \
\$POP \
--stdout 2> multi_fst_snp.log | \
gzip > multi_fst.tsv.gz
# fst 50kb window
# ---------------
vcftools --gzvcf ${vcf} \
\$POP \
--fst-window-step 5000 \
--fst-window-size 50000 \
--stdout 2> multi_fst.50k.log | \
gzip > multi_fst.50k.tsv.gz
# fst 10kb window
# ---------------
vcftools --gzvcf ${vcf} \
\$POP \
--fst-window-step 1000 \
--fst-window-size 10000 \
--stdout 2> multi_fst.10k.log | \
gzip > multi_fst.10k.tsv.gz
Rscript --vanilla \$BASE_DIR/R/table_fst_outliers.R multi_fst.50k.tsv.gz
"""
}
Then, we set up the pair wise species comparisons…
// git 3.8
// prepare pairwise fsts
// ------------------------------
/* (create all possible species pairs depending on location
and combine with genotype subset (for the respective location))*/
// ------------------------------
/* channel content after joining:
set [0:val(loc), 1:file(vcf), 2:file(pop), 3:val(spec1), 4:val(spec2)]*/
// ------------------------------
// For each location: join the location tag to its genotype subset, cross the
// species set with itself, keep each unordered pair exactly once by comparing
// the species indices (it[3] < it[5]), then drop the indices.
bel_pairs_ch = Channel.from( "bel" )
.join( vcf_loc_pair1 )
.combine(bel_spec1_ch)
.combine(bel_spec2_ch)
.filter{ it[3] < it[5] }
.map{ it[0,1,2,4,6]}
hon_pairs_ch = Channel.from( "hon" )
.join( vcf_loc_pair2 )
.combine(hon_spec1_ch)
.combine(hon_spec2_ch)
.filter{ it[3] < it[5] }
.map{ it[0,1,2,4,6]}
pan_pairs_ch = Channel.from( "pan" )
.join( vcf_loc_pair3 )
.combine(pan_spec1_ch)
.combine(pan_spec2_ch)
.filter{ it[3] < it[5] }
.map{ it[0,1,2,4,6]}
// merge the three per-location pair streams into one
bel_pairs_ch.concat( hon_pairs_ch, pan_pairs_ch ).set { all_fst_pairs_ch }
… and run them.
// git 3.9
// compute pairwise fsts
// For each species pair within a location, run windowed Weir & Cockerham Fst
// at two resolutions (50 kb windows / 5 kb step and 10 kb / 1 kb). Only the
// 50 kb run's stderr is captured as a log file — it carries the genome-wide
// Fst estimates later parsed by fst_globals.
process fst_run {
label 'L_32g4h_fst_run'
publishDir "../../2_analysis/fst/50k/", mode: 'copy' , pattern: "*.50k.windowed.weir.fst.gz"
publishDir "../../2_analysis/fst/10k/", mode: 'copy' , pattern: "*.10k.windowed.weir.fst.gz"
publishDir "../../2_analysis/fst/logs/", mode: 'copy' , pattern: "${loc}-${spec1}-${spec2}.log"
input:
set val( loc ), file( vcf ), file( pop ), val( spec1 ), val( spec2 ) from all_fst_pairs_ch
output:
set val( loc ), file( "*.50k.windowed.weir.fst.gz" ), file( "${loc}-${spec1}-${spec2}.log" ) into fst_50k
file( "*.10k.windowed.weir.fst.gz" ) into fst_10k_output
file( "${loc}-${spec1}-${spec2}.log" ) into fst_logs
script:
"""
# the pop file holds one sample name per line; the species tag is part of
# the name, so a plain grep selects each species' samples
grep ${spec1} ${pop} > pop1.txt
grep ${spec2} ${pop} > pop2.txt
vcftools --gzvcf ${vcf} \
--weir-fst-pop pop1.txt \
--weir-fst-pop pop2.txt \
--fst-window-step 5000 \
--fst-window-size 50000 \
--out ${loc}-${spec1}-${spec2}.50k 2> ${loc}-${spec1}-${spec2}.log
vcftools --gzvcf ${vcf} \
--weir-fst-pop pop1.txt \
--weir-fst-pop pop2.txt \
--fst-window-size 10000 \
--fst-window-step 1000 \
--out ${loc}-${spec1}-${spec2}.10k
gzip *.windowed.weir.fst
"""
}
The genome-wide summaries are compiled from the log files of VCFtools.
// git 3.10
/* collect the VCFtools logs to create a table with the
genome wide fst values */
// Parse all pairwise 50k logs into one table. Per run, the vcftools log
// contains an '--out' line (naming the pair) plus the mean and weighted
// Fst-estimate lines; these are folded into one row of
// (location, species pair, mean Fst, weighted Fst — TODO confirm column
// order against a sample log).
process fst_globals {
label 'L_loc_fst_globals'
publishDir "../../2_analysis/summaries", mode: 'copy' , pattern: "fst_globals.txt"
//module "R3.5.2"
input:
file( log ) from fst_logs.collect()
output:
file( "fst_globals.txt" ) into fst_glob
script:
"""
# keep only the '--out' and Fst-estimate lines of the 50k runs, strip the
# vcftools phrasing, paste the three lines per run into one row, and split
# the three-letter location prefix off the pair name
cat *.log | \
grep -E 'Weir and Cockerham|--out' | \
grep -A 3 50k | \
sed '/^--/d; s/^.*--out //g; s/.50k//g; /^Output/d; s/Weir and Cockerham //g; s/ Fst estimate: /\t/g' | \
paste - - - | \
cut -f 1,3,5 | \
sed 's/^\\(...\\)-/\\1\\t/g' > fst_globals.txt
"""
}
4.2.3 GxP
The software used for the GxP association (gemma) uses genotypes in plink
file format, so the first step was to convert the input.
// ----------- G x P section -----------
// git 3.11
// reformat genotypes (1)
// Convert the phased VCF into plink text format (.map/.ped) for the GxP
// association run with gemma.
// NOTE(review): pop.txt and the 'plink --recode12 --out hapmap' output are
// not consumed by any downstream process visible in this script — presumably
// legacy steps; confirm before removing.
process plink12 {
label 'L_20g2h_plink12'
input:
set vcfId, file( vcf ) from vcf_gxp
output:
set file( "GxP_plink.map" ), file( "GxP_plink.ped" ) into plink_GxP
script:
"""
# sample -> species -> location table, excluding outgroups (tor/tab/flo tags)
vcfsamplenames ${vcf[0]} | \
grep -v "tor\\|tab\\|flo" | \
awk '{print \$1"\\t"\$1}' | \
sed 's/\\t.*\\(...\\)\\(...\\)\$/\\t\\1\\t\\2/g' > pop.txt
vcftools \
--gzvcf ${vcf[0]} \
--plink \
--out GxP_plink
plink \
--file GxP_plink \
--recode12 \
--out hapmap
"""
}
// git 3.12
// reformat genotypes (2)
// Convert the plink text files into binary format (.bed/.bim/.fam).
// The '--file GxP_plink' prefix is hard-coded; this works because the staged
// input files are named GxP_plink.map / GxP_plink.ped.
process GxP_run {
label 'L_20g2h_GxP_binary'
input:
set file( map ), file( ped ) from plink_GxP
output:
set file( "*.bed" ), file( "*.bim" ),file( "*.fam" ) into plink_binary
script:
"""
# convert genotypes into binary format (bed/bim/fam)
plink \
--noweb \
--file GxP_plink \
--make-bed \
--out GxP_plink_binary
"""
}
In addition to the genotypes, the phenotypes of the samples are needed.
// git 3.13
// import phenotypes
// Raw phenotype scores (.sc file — format assumed to be one row per sample;
// TODO confirm against phenotypes_pca.R).
Channel
.fromPath("../../metadata/phenotypes.sc")
.set{ phenotypes_raw }
As a legacy of the early analysis, we perform a PCA on the phenotypes and report the extended phenotype data including the scoring on PC1 & PC2.
// git 3.14
// run PCA on phenotypes
// Runs phenotypes_pca.R, which writes the extended phenotype table
// (phenotypes.txt, including the PC1/PC2 scores) plus PCA plots as PDFs.
// NOTE(review): the process name misspells "phenotype"; left unchanged to
// avoid touching anything that might affect resume/caching behavior.
process phenotye_pca {
label "L_loc_phenotype_pca"
publishDir "../../2_analysis/phenotype", mode: 'copy' , pattern: "*.txt"
//module "R3.5.2"
input:
file( sc ) from phenotypes_raw
output:
file( "phenotypes.txt" ) into phenotype_file
file( "phenotype_pca*.pdf" ) into phenotype_pca
script:
"""
Rscript --vanilla \$BASE_DIR/R/phenotypes_pca.R ${sc}
"""
}
We set up all the traits for which we want to perform a GxP association.
// git 3.15
// setup GxP traits
// Traits to associate with genotype (presumably column names in
// phenotypes.txt — confirm against assign_phenotypes.R).
Channel
.from("Bars", "Snout", "Peduncle")
.set{ traits_ch }
Then, we combine the reformatted genotypes with the phenotyes and the traits of interest.
// git 3.16
// bundle GxP input
// -> [ trait, bed, bim, fam, phenotypes.txt ], one item per trait.
traits_ch.combine( plink_binary ).combine( phenotype_file ).set{ trait_plink_combo }
Having collected all the input, we can now run the GxP.
// git 3.17
// actually run the GxP
// For each trait: write the trait values into the plink .fam file, compute a
// centered relatedness matrix (gemma -gk 1), then fit both a linear model
// (gemma -lm 4) and a linear mixed model (gemma -lmm 4) and reformat the
// per-SNP association tables into sorted, gzipped CHROM/POS tables.
process gemma_run {
label 'L_32g4h_GxP_run'
publishDir "../../2_analysis/GxP/bySNP/", mode: 'copy'
//module "R3.5.2"
input:
set val( pheno ), file( bed ), file( bim ), file( fam ), file( pheno_file ) from trait_plink_combo
output:
file("*.GxP.txt.gz") into gemma_results
script:
"""
# body.sh provides 'body': apply a command while preserving the header line
source \$BASE_DIR/sh/body.sh
BASE_NAME=\$(echo ${fam} | sed 's/.fam//g')
# keep a backup copy of the staged .fam before assign_phenotypes.R
# overwrites its phenotype column (NOTE(review): the mv+cp round trip only
# creates the -old.fam backup; confirm nothing reads it downstream)
mv ${fam} \$BASE_NAME-old.fam
cp \${BASE_NAME}-old.fam ${fam}
# 1) replace the phenotype values
Rscript --vanilla \$BASE_DIR/R/assign_phenotypes.R ${fam} ${pheno_file} ${pheno}
# 2) create relatedness matrix of samples using gemma
gemma -bfile \$BASE_NAME -gk 1 -o ${pheno}
# 3) fit linear model using gemma (-lm)
gemma -bfile \$BASE_NAME -lm 4 -miss 0.1 -notsnp -o ${pheno}.lm
# 4) fit linear mixed model using gemma (-lmm)
gemma -bfile \$BASE_NAME -k output/${pheno}.cXX.txt -lmm 4 -o ${pheno}.lmm
# 5) reformat output
# (split the "chrom:pos" rs id into CHROM and POS columns, keep the relevant
#  statistics, and sort by position while keeping the header via 'body')
sed 's/\\trs\\t/\\tCHROM\\tPOS\\t/g; s/\\([0-2][0-9]\\):/\\1\\t/g' output/${pheno}.lm.assoc.txt | \
cut -f 2,3,9-14 | body sort -k1,1 -k2,2n | gzip > ${pheno}.lm.GxP.txt.gz
sed 's/\\trs\\t/\\tCHROM\\tPOS\\t/g; s/\\([0-2][0-9]\\):/\\1\\t/g' output/${pheno}.lmm.assoc.txt | \
cut -f 2,3,8-10,13-15 | body sort -k1,1 -k2,2n | gzip > ${pheno}.lmm.GxP.txt.gz
"""
}
To smooth the GxP results, we initialize two resolutions (50 kb windows with 5 kb increments and 10 kb windows with 1 kb increments).
// git 3.18
// setup smoothing levels
// [window size, step size] in bp: 50 kb / 5 kb and 10 kb / 1 kb.
Channel
.from([[50000, 5000], [10000, 1000]])
.set{ gxp_smoothing_levels }
Then we apply all smoothing levels to the raw GxP output…
// git 3.19
// apply all smoothing levels
// -> [ lm result, lmm result, window, step ], one item per smoothing level.
gemma_results.combine( gxp_smoothing_levels ).set{ gxp_smoothing_input }
… and run the smoothing.
// git 3.20
// actually run the smoothing
// Apply the gxp_slider helper to both the lm and lmm per-SNP results at the
// given window/step resolution; results are published per window size under
// 2_analysis/GxP/<window>/.
process gemma_smooth {
label 'L_20g2h_GxP_smooth'
publishDir "../../2_analysis/GxP/${win}", mode: 'copy'
input:
set file( lm ), file( lmm ), val( win ), val( step ) from gxp_smoothing_input
output:
set val( win ), file( "*.lm.*k.txt.gz" ) into gxp_lm_smoothing_output
set val( win ), file( "*.lmm.*k.txt.gz" ) into gxp_lmm_smoothing_output
script:
"""
\$BASE_DIR/sh/gxp_slider ${lm} ${win} ${step}
\$BASE_DIR/sh/gxp_slider ${lmm} ${win} ${step}
"""
}
At this step we are done with differentiation and GxP.
4.2.4 FST within species (adaptation scenario)
To judge the independence of the individual populations of the hamlets species sampled at several locations, we also compute the \(F_{ST}\) among those populations. We start by defining the species-set for which we sampled multiple populations.
// Fst within species ---------------------------------------------------------
// git 3.21
// define species set
// Species sampled at more than one location (all three locations each).
Channel
.from( "nig", "pue", "uni")
.set{ species_ch }
Then, we define the sampling locations.
// git 3.22
// define location set
// [index, location] tuples, duplicated so they can be crossed into pairs.
Channel.from( [[1, "bel"], [2, "hon"], [3, "pan"]]).into{ locations_ch_1;locations_ch_2 }
Next, we create all population-pairs and bind it to the genotype file.
// git 3.23
// create location pairs
// Cross the locations with themselves, keep each unordered pair once by
// comparing indices, drop the indices, then attach species and genotypes:
// [ loc1, loc2, spec, vcfId, [vcf.gz, vcf.gz.tbi] ].
locations_ch_1
.combine(locations_ch_2)
.filter{ it[0] < it[2] }
.map{ it[1,3]}
.combine( species_ch )
.combine( vcf_adapt )
.set{ vcf_location_combo_adapt }
Last, we compute the pair wise \(F_{ST}\) among all populations.
// git 3.24
// compute pairwise fsts
// Per-SNP Weir & Cockerham Fst between the two populations of the same
// species at two different locations. Only the vcftools log (which contains
// the genome-wide mean and weighted Fst estimates) is published.
// FIX(review): with '--out <prefix>', vcftools itself writes <prefix>.log;
// the previous '2> <prefix>.log' redirect pointed the shell's stderr at the
// very same file, so two independent writers raced and could truncate or
// garble the published log. The redundant redirect is removed and vcftools'
// own log file is used as the output.
process fst_run_adapt {
label 'L_20g4h_fst_run_adapt'
publishDir "../../2_analysis/fst/adapt/", mode: 'copy' , pattern: "*.log"
input:
set val( loc1 ), val( loc2 ), val( spec ), val(vcf_indx), file( vcf ) from vcf_location_combo_adapt
output:
file( "adapt_${spec}${loc1}-${spec}${loc2}.log" ) into fst_adapt_logs
script:
"""
# sample names carry the species+location tag, so grep selects populations
vcfsamplenames ${vcf[0]} | grep ${spec}${loc1} > pop1.txt
vcfsamplenames ${vcf[0]} | grep ${spec}${loc2} > pop2.txt
vcftools --gzvcf ${vcf[0]} \
--weir-fst-pop pop1.txt \
--weir-fst-pop pop2.txt \
--out adapt_${spec}${loc1}-${spec}${loc2}
"""
}