4 (git 3) Analysis I (FST & GxP)

This pipeline can be executed as follows:

cd $BASE_DIR/nf/03_analysis_fst_gxp
source ../../sh/nextflow_alias.sh
nf_run_basic
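
The alias nf_run_basic is defined in sh/nextflow_alias.sh. Its exact definition is not reproduced here, but it is assumed to wrap a plain nextflow run call along these lines:

# hypothetical expansion of nf_run_basic (see sh/nextflow_alias.sh for
# the actual definition):
nextflow run analysis_fst_gxp.nf -c ../../nextflow.config -resume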

4.1 Summary

Both the genetic differentiation and the genotype × phenotype (GxP) association are computed within the nextflow script analysis_fst_gxp.nf (located under $BASE_DIR/nf/03_analysis_fst_gxp/). It takes the SNP-only data set and computes \(F_{ST}\) as well as the GxP association. Below is an overview of the steps involved in the analysis. (The green dot indicates the genotype input, red arrows depict output that is exported for further use.)

4.2 Details of analysis_fst_gxp.nf

4.2.1 Setup

The nextflow script starts by opening the genotype data and feeding it into four different streams, one per downstream use (location subsets, hamlet-only filtering, GxP and within-species \(F_{ST}\)).

#!/usr/bin/env nextflow
// git 3.1
// open genotype data
Channel
    .fromFilePairs("../../1_genotyping/4_phased/phased_mac2.vcf.{gz,gz.tbi}")
    .into{ vcf_locations; vcf_filter; vcf_gxp; vcf_adapt }
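
For fromFilePairs() to emit a single element, the phased genotypes and their index are expected to sit side by side on disk:

# files assumed to be present from the genotyping pipeline:
ls ../../1_genotyping/4_phased/phased_mac2.vcf.gz*
# phased_mac2.vcf.gz  phased_mac2.vcf.gz.tbi

Each emitted element has the form [ id, [ vcf, index ] ], which is why the process scripts below address the genotype file as ${vcf[0]}.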

Since we are going to work on the three sampling locations independently, we create a channel for the locations (bel: Belize, hon: Honduras, pan: Panama).

// git 3.2
// initialize location channel
Channel
    .from( "bel", "hon", "pan")
    .set{ locations_ch }

Then we attach the genotypes to the locations.

// git 3.3
// attach genotypes to location
locations_ch
    .combine( vcf_locations )
    .set{ vcf_location_combo }

Next, we define the species sets sampled at the individual locations. Each species is paired with a numeric index, which is later used (git 3.8) to keep only one ordering of every species pair; the three-letter codes abbreviate the Hypoplectrus species (e.g. nig = H. nigricans, pue = H. puella, uni = H. unicolor).

// git 3.4
// define location-specific species set
Channel.from( [[1, "ind"], [2, "may"], [3, "nig"], [4, "pue"], [5, "uni"]] ).into{ bel_spec1_ch; bel_spec2_ch }
Channel.from( [[1, "abe"], [2, "gum"], [3, "nig"], [4, "pue"], [5, "ran"], [6, "uni"]] ).into{ hon_spec1_ch; hon_spec2_ch }
Channel.from( [[1, "nig"], [2, "pue"], [3, "uni"]] ).into{ pan_spec1_ch; pan_spec2_ch }

For each location, the data is subset to include only local hamlets.

// git 3.5
// subset data to local hamlets
process subset_vcf_by_location {
    label "L_20g2h_subset_vcf"

    input:
    set val( loc ), vcfId, file( vcf ) from vcf_location_combo

    output:
    set val( loc ), file( "${loc}.vcf.gz" ), file( "${loc}.pop" ) into ( vcf_loc_pair1, vcf_loc_pair2, vcf_loc_pair3 )

    script:
    """
   vcfsamplenames ${vcf[0]} | \
       grep ${loc} | \
       grep -v tor | \
       grep -v tab > ${loc}.pop

   vcftools --gzvcf ${vcf[0]} \
       --keep ${loc}.pop \
       --mac 3 \
       --recode \
       --stdout | gzip > ${loc}.vcf.gz
   """
}
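
The grep chain builds the population file by keeping only samples from the focal location and dropping the outgroup samples (tor and tab). A minimal sketch of its effect, using hypothetical sample IDs that end in the six-letter species+location code:

# hypothetical sample IDs (the real IDs end in the species+location code):
printf '%s\n' PL17_01nigbel PL17_02puebel PL17_03torbel PL17_04puehon | \
    grep bel | grep -v tor | grep -v tab
# PL17_01nigbel
# PL17_02puebel

Note that vcftools re-applies a minor allele count filter (--mac 3), since allele counts change once the data set is restricted to a single location.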

As we also want to compute global statistics, we create another subset including all sampled hamlets but excluding the outgroups.

// git 3.6
// subset the global data set to hamlets only
process subset_vcf_hamlets_only {
    label "L_20g15h_filter_hamlets_only"
    publishDir "../../1_genotyping/4_phased/", mode: 'copy' , pattern: "*.vcf.gz"
    //module "R3.5.2"

    input:
    set vcfId, file( vcf ) from vcf_filter

    output:
    file( "hamlets_only.vcf.gz*" ) into vcf_hamlets_only
    set file( "hamlets_only.vcf.gz*" ), file( "hamlets_only.pop.txt" ) into vcf_multi_fst

    script:
    """
   vcfsamplenames ${vcf[0]} | \
       grep -v "abe\\|gum\\|ind\\|may\\|nig\\|pue\\|ran\\|uni" > outgroup.pop

   vcfsamplenames ${vcf[0]} | \
       grep "abe\\|gum\\|ind\\|may\\|nig\\|pue\\|ran\\|uni" | \
       awk '{print \$1"\\t"\$1}' | \
       sed 's/\\t.*\\(...\\)\\(...\\)\$/\\t\\1\\t\\2/g' > hamlets_only.pop.txt

   vcftools \
       --gzvcf ${vcf[0]} \
       --remove outgroup.pop \
       --recode \
       --stdout | gzip > hamlets_only.vcf.gz
   """
}
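
The awk/sed combination turns the plain sample list into a three-column population file (sample, species, location) by splitting off the trailing six-letter code. A sketch with a hypothetical sample ID (GNU sed):

echo "PL17_01nigbel" | awk '{print $1"\t"$1}' | \
    sed 's/\t.*\(...\)\(...\)$/\t\1\t\2/g'
# PL17_01nigbel	nig	bel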

4.2.2 FST

Using these subsets, we compute the global differentiation among all hamlet populations: \(F_{ST}\) per SNP as well as in 50 kb and 10 kb windows. An R script then tabulates the outlier windows (fst_outliers_998.tsv, presumably the windows above the 99.8th percentile of the windowed \(F_{ST}\) distribution).

// ----------- Fst section -----------
// git 3.7
// compute global fst
process fst_multi {
    label 'L_20g15h_fst_multi'
    publishDir "../../2_analysis/fst/50k/", mode: 'copy' , pattern: "*.50k.tsv.gz"
    publishDir "../../2_analysis/fst/10k/", mode: 'copy' , pattern: "*.10k.tsv.gz"
    publishDir "../../2_analysis/fst/logs/", mode: 'copy' , pattern: "*.log"
    publishDir "../../2_analysis/summaries", mode: 'copy' , pattern: "fst_outliers_998.tsv"
    //conda "$HOME/miniconda2/envs/py3"
    //module "R3.5.2"

    input:
    set file( vcf ), file( pop ) from vcf_multi_fst

    output:
    file( "multi_fst*" ) into multi_fst_output
    file( "fst_outliers_998.tsv" )  into fst_outlier_output

    script:
    """
   awk '{print \$1"\\t"\$2\$3}' ${pop} > pop.txt

   for k in abehon gumhon indbel maybel nigbel nighon nigpan puebel puehon puepan ranhon unibel unihon unipan; do
       grep \$k pop.txt | cut -f 1 > pop.\$k.txt
   done

   POP="--weir-fst-pop pop.abehon.txt \
   --weir-fst-pop pop.gumhon.txt \
   --weir-fst-pop pop.indbel.txt \
   --weir-fst-pop pop.maybel.txt \
   --weir-fst-pop pop.nigbel.txt \
   --weir-fst-pop pop.nighon.txt \
   --weir-fst-pop pop.nigpan.txt \
   --weir-fst-pop pop.puebel.txt \
   --weir-fst-pop pop.puehon.txt \
   --weir-fst-pop pop.puepan.txt \
   --weir-fst-pop pop.ranhon.txt \
   --weir-fst-pop pop.unibel.txt \
   --weir-fst-pop pop.unihon.txt \
   --weir-fst-pop pop.unipan.txt"

   # fst by SNP
   # ----------
   vcftools --gzvcf ${vcf} \
    \$POP \
   --stdout  2> multi_fst_snp.log | \
   gzip > multi_fst.tsv.gz

   # fst 50kb window
   # ---------------
   vcftools --gzvcf ${vcf} \
    \$POP \
   --fst-window-step 5000 \
   --fst-window-size 50000 \
   --stdout  2> multi_fst.50k.log | \
   gzip > multi_fst.50k.tsv.gz

   # fst 10kb window
   # ---------------
   vcftools --gzvcf ${vcf} \
    \$POP \
   --fst-window-step 1000 \
   --fst-window-size 10000 \
   --stdout  2> multi_fst.10k.log | \
   gzip > multi_fst.10k.tsv.gz

   Rscript --vanilla \$BASE_DIR/R/table_fst_outliers.R multi_fst.50k.tsv.gz
   """
}
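
Besides the windowed tables, each vcftools run leaves behind its stderr log, which ends in the genome-wide estimates that git 3.10 later parses (numbers shown are illustrative):

grep "Fst estimate" multi_fst.50k.log
# Weir and Cockerham mean Fst estimate: 0.0424
# Weir and Cockerham weighted Fst estimate: 0.0712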

Then, we set up the pairwise species comparisons…

// git 3.8
// prepare pairwise fsts
// ------------------------------
/* create all possible species pairs within each location
   and combine them with the genotype subset of that location */
// ------------------------------
/* channel content after joining:
  set [0:val(loc), 1:file(vcf), 2:file(pop), 3:val(spec1), 4:val(spec2)]*/
// ------------------------------
bel_pairs_ch = Channel.from( "bel" )
    .join( vcf_loc_pair1 )
    .combine(bel_spec1_ch)
    .combine(bel_spec2_ch)
    .filter{ it[3] < it[5] }
    .map{ it[0,1,2,4,6]}
hon_pairs_ch = Channel.from( "hon" )
    .join( vcf_loc_pair2 )
    .combine(hon_spec1_ch)
    .combine(hon_spec2_ch)
    .filter{ it[3] < it[5] }
    .map{ it[0,1,2,4,6]}
pan_pairs_ch = Channel.from( "pan" )
    .join( vcf_loc_pair3 )
    .combine(pan_spec1_ch)
    .combine(pan_spec2_ch)
    .filter{ it[3] < it[5] }
    .map{ it[0,1,2,4,6]}
bel_pairs_ch.concat( hon_pairs_ch, pan_pairs_ch  ).set { all_fst_pairs_ch }
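
The index filter (it[3] < it[5]) keeps each unordered species pair exactly once. The same enumeration in plain bash, shown for the Belize species set:

# equivalent pair enumeration for Belize (10 comparisons)
species=( ind may nig pue uni )
for (( i = 0; i < ${#species[@]}; i++ )); do
    for (( j = i + 1; j < ${#species[@]}; j++ )); do
        echo "bel: ${species[i]} vs ${species[j]}"
    done
done

Across all three locations this yields 10 + 15 + 3 = 28 pairwise comparisons.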

… and run them.

// git 3.9
// compute pairwise fsts
process fst_run {
    label 'L_32g4h_fst_run'
    publishDir "../../2_analysis/fst/50k/", mode: 'copy' , pattern: "*.50k.windowed.weir.fst.gz"
    publishDir "../../2_analysis/fst/10k/", mode: 'copy' , pattern: "*.10k.windowed.weir.fst.gz"
    publishDir "../../2_analysis/fst/logs/", mode: 'copy' , pattern: "${loc}-${spec1}-${spec2}.log"

    input:
    set val( loc ), file( vcf ), file( pop ), val( spec1 ), val( spec2 ) from all_fst_pairs_ch

    output:
    set val( loc ), file( "*.50k.windowed.weir.fst.gz" ), file( "${loc}-${spec1}-${spec2}.log" ) into fst_50k
    file( "*.10k.windowed.weir.fst.gz" ) into fst_10k_output
    file( "${loc}-${spec1}-${spec2}.log" ) into fst_logs

    script:
    """
   grep ${spec1} ${pop} > pop1.txt
   grep ${spec2} ${pop} > pop2.txt

   vcftools --gzvcf ${vcf} \
       --weir-fst-pop pop1.txt \
       --weir-fst-pop pop2.txt \
       --fst-window-step 5000 \
       --fst-window-size 50000 \
       --out ${loc}-${spec1}-${spec2}.50k 2> ${loc}-${spec1}-${spec2}.log

   vcftools --gzvcf ${vcf} \
       --weir-fst-pop pop1.txt \
       --weir-fst-pop pop2.txt \
       --fst-window-size 10000 \
       --fst-window-step 1000 \
       --out ${loc}-${spec1}-${spec2}.10k

   gzip *.windowed.weir.fst
   """
}
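
The windowed tables follow the standard vcftools layout; for a hypothetical Belize nig/pue pair (values illustrative):

zcat bel-nig-pue.50k.windowed.weir.fst.gz | head -n 2
# CHROM	BIN_START	BIN_END	N_VARIANTS	WEIGHTED_FST	MEAN_FST
# LG01	1	50000	214	0.0731	0.0512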

The genome-wide summaries are compiled from the log files of VCFtools.

// git 3.10
/* collect the VCFtools logs to create a table with the
   genome-wide fst values */
process fst_globals {
    label 'L_loc_fst_globals'
    publishDir "../../2_analysis/summaries", mode: 'copy' , pattern: "fst_globals.txt"
    //module "R3.5.2"

    input:
    file( log ) from fst_logs.collect()

    output:
    file( "fst_globals.txt" ) into fst_glob

    script:
    """
   cat *.log | \
       grep -E 'Weir and Cockerham|--out' | \
       grep -A 3 50k | \
       sed '/^--/d; s/^.*--out //g; s/.50k//g; /^Output/d; s/Weir and Cockerham //g; s/ Fst estimate: /\\t/g' | \
       paste - - - | \
       cut -f 1,3,5 | \
       sed 's/^\\(...\\)-/\\1\\t/g' > fst_globals.txt
   """
}
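
After this parsing, fst_globals.txt is expected to hold one row per species pair (location, pair, mean \(F_{ST}\), weighted \(F_{ST}\); values illustrative):

head -n 2 fst_globals.txt
# bel	ind-may	0.0211	0.0712
# bel	ind-nig	0.0450	0.1033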

4.2.3 GxP

The software used for the GxP association (GEMMA) expects genotypes in plink format, so the first step is to convert the input.

// ----------- G x P section -----------

// git 3.11
// reformat genotypes (1)
process plink12 {
    label 'L_20g2h_plink12'

    input:
    set vcfId, file( vcf ) from vcf_gxp

    output:
    set file( "GxP_plink.map" ), file( "GxP_plink.ped" ) into plink_GxP

    script:
    """
   vcfsamplenames ${vcf[0]} | \
       grep -v "tor\\|tab\\|flo" | \
       awk '{print \$1"\\t"\$1}' | \
       sed 's/\\t.*\\(...\\)\\(...\\)\$/\\t\\1\\t\\2/g' > pop.txt

   vcftools \
       --gzvcf ${vcf[0]} \
       --plink \
       --out GxP_plink

   plink \
       --file GxP_plink \
       --recode12 \
       --out hapmap
   """
}
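
For orientation, the plink text format produced by vcftools --plink consists of a marker map and a genotype table; the marker IDs of the form LG01:23648 are an inference from the reformatting step in git 3.17:

# GxP_plink.map: one marker per line (chromosome, marker ID, cM, position)
# LG01	LG01:23648	0	23648
#
# GxP_plink.ped: one sample per line (family ID, sample ID, father, mother,
# sex, phenotype, followed by two alleles per marker)

Note that the pop.txt and hapmap files created here are not declared as outputs; only GxP_plink.map and GxP_plink.ped travel on.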

// git 3.12
// reformat genotypes (2)
process GxP_run {
    label 'L_20g2h_GxP_binary'

    input:
    set file( map ), file( ped ) from plink_GxP

    output:
    set file( "*.bed" ), file( "*.bim" ), file( "*.fam" ) into plink_binary

    script:
    """
   # convert genotypes into binary format (bed/bim/fam)
   plink \
       --noweb \
       --file GxP_plink \
       --make-bed \
       --out GxP_plink_binary
   """
}

In addition to the genotypes, the phenotypes of the samples are needed.

// git 3.13
// import phenotypes
Channel
    .fromPath("../../metadata/phenotypes.sc")
    .set{ phenotypes_raw }

As a legacy of an earlier stage of the analysis, we perform a PCA on the phenotypes and export the extended phenotype data, including the scores on PC1 and PC2.

// git 3.14
// run PCA on phenotypes
process phenotype_pca {
    label "L_loc_phenotype_pca"
    publishDir "../../2_analysis/phenotype", mode: 'copy' , pattern: "*.txt"
    //module "R3.5.2"

    input:
    file( sc ) from phenotypes_raw

    output:
    file( "phenotypes.txt" ) into phenotype_file
    file( "phenotype_pca*.pdf" ) into  phenotype_pca

    script:
    """
   Rscript --vanilla \$BASE_DIR/R/phenotypes_pca.R ${sc}
   """
}

We set up all the traits for which we want to perform a GxP association.

// git 3.15
// setup GxP traits
Channel
    .from("Bars", "Snout", "Peduncle")
    .set{ traits_ch }

Then, we combine the reformatted genotypes with the phenotypes and the traits of interest.

// git 3.16
// bundle GxP input
traits_ch.combine( plink_binary ).combine( phenotype_file ).set{ trait_plink_combo }

Having collected all the input, we can now run the GxP association.

// git 3.17
// actually run the GxP
process gemma_run {
    label 'L_32g4h_GxP_run'
    publishDir "../../2_analysis/GxP/bySNP/", mode: 'copy'
    //module "R3.5.2"

    input:
    set val( pheno ), file( bed ), file( bim ), file( fam ), file( pheno_file ) from trait_plink_combo

    output:
    file( "*.GxP.txt.gz" ) into gemma_results

    script:
    """
   source \$BASE_DIR/sh/body.sh
   BASE_NAME=\$(echo  ${fam} | sed 's/.fam//g')

   mv ${fam} \$BASE_NAME-old.fam
   cp \${BASE_NAME}-old.fam ${fam}

   # 1) replace the phenotype values
   Rscript --vanilla \$BASE_DIR/R/assign_phenotypes.R ${fam} ${pheno_file} ${pheno}

   # 2) create relatedness matrix of samples using gemma
   gemma -bfile \$BASE_NAME -gk 1 -o ${pheno}

   # 3) fit linear model using gemma (-lm)
   gemma -bfile \$BASE_NAME -lm 4 -miss 0.1 -notsnp -o ${pheno}.lm

   # 4) fit linear mixed model using gemma (-lmm)
   gemma -bfile \$BASE_NAME -k output/${pheno}.cXX.txt -lmm 4 -o ${pheno}.lmm

   # 5) reformat output
   sed 's/\\trs\\t/\\tCHROM\\tPOS\\t/g; s/\\([0-2][0-9]\\):/\\1\\t/g' output/${pheno}.lm.assoc.txt | \
       cut -f 2,3,9-14 | body sort -k1,1 -k2,2n | gzip > ${pheno}.lm.GxP.txt.gz
   sed 's/\\trs\\t/\\tCHROM\\tPOS\\t/g; s/\\([0-2][0-9]\\):/\\1\\t/g' output/${pheno}.lmm.assoc.txt | \
       cut -f 2,3,8-10,13-15 | body sort -k1,1 -k2,2n | gzip > ${pheno}.lmm.GxP.txt.gz
   """
}
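
Two helpers in step 5 deserve a note: body (sourced from sh/body.sh) is assumed to apply the wrapped command (here sort) to everything below the header line, and the sed call splits gemma's rs column (CHROM:POS) into separate CHROM and POS columns. A sketch of the sed part on a single hypothetical marker ID (GNU sed):

echo "LG09:23648" | sed 's/\([0-2][0-9]\):/\1\t/g'
# LG09	23648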

To smooth the GxP results, we initialize two resolutions (50 kb windows with 5 kb increments and 10 kb windows with 1 kb increments).

// git 3.18
// setup smoothing levels
Channel
    .from([[50000, 5000], [10000, 1000]])
    .set{ gxp_smoothing_levels }

Then we apply all smoothing levels to the raw GxP output…

// git 3.19
// apply all smoothing levels
gemma_results.combine( gxp_smoothing_levels ).set{ gxp_smoothing_input }

… and run the smoothing.

// git 3.20
// actually run the smoothing
process gemma_smooth {
    label 'L_20g2h_GxP_smooth'
    publishDir "../../2_analysis/GxP/${win}", mode: 'copy'

    input:
    set file( lm ), file( lmm ), val( win ), val( step ) from gxp_smoothing_input

    output:
    set val( win ), file( "*.lm.*k.txt.gz" ) into gxp_lm_smoothing_output
    set val( win ), file( "*.lmm.*k.txt.gz" ) into gxp_lmm_smoothing_output

    script:
    """
   \$BASE_DIR/sh/gxp_slider ${lm} ${win} ${step}
   \$BASE_DIR/sh/gxp_slider ${lmm} ${win} ${step}
   """
}
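
The gxp_slider script is not reproduced here. Conceptually, it averages the association scores in windows of the given size, advanced by the given step. A much-simplified sketch of the idea, assuming a three-column (CHROM, POS, p) table and non-overlapping bins (window == step), which the real script generalizes to sliding windows:

# simplified windowing sketch (NOT the actual gxp_slider implementation):
# average -log10(p) per 50 kb bin; output order is unsorted
zcat trait.lm.GxP.txt.gz | tail -n +2 | \
    awk -v w=50000 '{
        key = $1 "\t" int($2 / w) * w      # chromosome + bin start
        sum[key] += -log($3) / log(10)     # accumulate -log10(p)
        n[key]++
    } END {
        for (k in sum) print k "\t" sum[k] / n[k]
    }'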

At this point, we are done with differentiation and GxP.

4.2.4 FST within species (adaptation scenario)

To judge the independence of the individual populations of those hamlet species sampled at several locations, we also compute the \(F_{ST}\) among these populations. We start by defining the species set for which we sampled multiple populations.

// Fst within species ---------------------------------------------------------
// git 3.21
// define species set
Channel
    .from( "nig", "pue", "uni")
    .set{ species_ch }

Then, we define the sampling locations.

// git 3.22
// define location set
Channel.from( [[1, "bel"], [2, "hon"], [3, "pan"]]).into{ locations_ch_1;locations_ch_2 }

Next, we create all population pairs (every location pair for every shared species) and bind them to the genotype file; the resulting comparisons are enumerated below.

// git 3.23
// create location pairs
locations_ch_1
    .combine(locations_ch_2)
    .filter{ it[0] < it[2] }
    .map{ it[1,3]}
    .combine( species_ch )
    .combine( vcf_adapt )
    .set{ vcf_location_combo_adapt }
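
The location filter (it[0] < it[2]) keeps each unordered location pair once, so every shared species is compared across three location pairs. The resulting nine comparisons, enumerated in plain bash:

# the nine population pairs fed into fst_run_adapt:
for spec in nig pue uni; do
    for pair in bel-hon bel-pan hon-pan; do
        echo "${spec}: ${pair}"
    done
done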

Last, we compute the pairwise \(F_{ST}\) for all of these population pairs.

// git 3.24
// compute pairwise fsts
process fst_run_adapt {
    label 'L_20g4h_fst_run_adapt'
    publishDir "../../2_analysis/fst/adapt/", mode: 'copy' , pattern: "*.log"

    input:
    set val( loc1 ), val( loc2 ), val( spec ), val(vcf_indx), file( vcf ) from vcf_location_combo_adapt

    output:
    file( "adapt_${spec}${loc1}-${spec}${loc2}.log" ) into fst_adapt_logs

    script:
    """
   vcfsamplenames ${vcf[0]} | grep ${spec}${loc1} > pop1.txt
   vcfsamplenames ${vcf[0]} | grep ${spec}${loc2} > pop2.txt

   vcftools --gzvcf ${vcf[0]} \
       --weir-fst-pop pop1.txt \
       --weir-fst-pop pop2.txt \
       --out adapt_${spec}${loc1}-${spec}${loc2} 2> adapt_${spec}${loc1}-${spec}${loc2}.log
   """
}