8 (git 7) Analysis V (Principal Component Analysis)

This pipeline can be executed as follows:

cd $BASE_DIR/nf/07_analysis_pca
source ../../sh/nextflow_alias.sh
nf_run_pca

8.1 Summary

The PCAs are produced within the nextflow script analysis_pca.nf (located under $BASE_DIR/nf/07_analysis_pca/). It takes the SNPs only data set and runs the PCAs (for all samples and within locations) both for the whole genome and for the highly differentiated regions excluded. Below is an overview of the steps involved in the analysis. (The green dots indicate the input files, red dots depict output that is exported for further use.)

8.2 Details of analysis_pca.nf

8.2.1 Setup

The nextflow script starts by setting the channel for data subset options.

#!/usr/bin/env nextflow
// git 7.1
// prepare subset modes (whole genome vs non-diverged regions)
Channel
    .from( "whg", "subset_non_diverged")
    .set{ subset_type_ch }

Also, the file with the genomic coordinates of the outlier regions is loaded.

// git 7.2
// load table with differentiation outlier regions
Channel
    .fromPath( "../../2_analysis/summaries/fst_outliers_998.tsv" )
    .set{ outlier_tab }

Then it combines the genotype file with the subset type and the outlier location file.

// git 7.3
// open genotype data
Channel
    .fromFilePairs("../../1_genotyping/4_phased/phased_mac2.vcf.{gz,gz.tbi}")
    .combine( outlier_tab )
    .combine( subset_type_ch )
    .set{ vcf_ch }

Depending on subset type, the outlier regions are excluded from the genotypes.

// git 7.4
// depending on subset mode, subset vcf
process subset_vcf_divergence_based {
    label "L_20g2h_subset_divergence"

    input:
    set  vcfId, file( vcf ), file( outlier_tab ), val( subset_type ) from vcf_ch

    output:
    set file( "${subset_type}.vcf.gz" ), file( "${subset_type}.vcf.gz.tbi" ), val( subset_type ) into ( vcf_locations, vcf_all_samples_pca )

    script:
    """
   if [ "${subset_type}" == "subset_non_diverged" ];then
       awk -v OFS="\\t" '{print \$2,\$3,\$4}' ${outlier_tab} > diverged_regions.bed
       SUBSET="--exclude-bed diverged_regions.bed"
   else
       SUBSET=""
   fi

   vcftools --gzvcf ${vcf[0]} \
       \$SUBSET \
       --recode \
       --stdout | bgzip > ${subset_type}.vcf.gz

   tabix ${subset_type}.vcf.gz
   """
}

The locations channel is set…

// git 7.5
// prepare location channel for separate pcas
Channel
    .from( "bel", "hon", "pan")
    .set{ locations_ch }

…and combined with the genotypes.

// git 7.6
// attach genotypes to location channel
locations_ch
    .combine( vcf_locations )
    .set{ vcf_location_combo }

Then, the genotypes are subset by location.

// git 7.7
// subset vcf by location
process subset_vcf_by_location {
    label "L_20g2h_subset_vcf"

    input:
    set val( loc ), file( vcf ), file( vcfidx ), val( subset_type ) from vcf_location_combo

    output:
    set val( loc ), file( "*.vcf.gz" ), file( "*.pop" ), val( subset_type ) into ( vcf_loc_pca )

    script:
    """
   vcfsamplenames ${vcf} | \
       grep ${loc} | \
       grep -v tor | \
       grep -v tab > ${loc}.${subset_type}.pop
   vcftools --gzvcf ${vcf} \
       --keep ${loc}.${subset_type}.pop \
       --mac 3 \
       --recode \
       --stdout | gzip > ${loc}.${subset_type}.vcf.gz
   """
}

Finally, the PCAs are run within the different locations…

// PCA section
// -----------
// git 7.8
// run pca by location
process pca_location {
    label "L_20g15h_pca_location"
    publishDir "../../figures/pca", mode: 'copy' , pattern: "*.pdf"
    publishDir "../../2_analysis/pca", mode: 'copy' , pattern: "*.gz"

    input:
    set val( loc ), file( vcf ), file( pop ), val( subset_type ) from vcf_loc_pca

    output:
    set file( "*.prime_pca.pdf" ), file( "*.pca.pdf" ), file( "*.exp_var.txt.gz" ), file( "*.scores.txt.gz" ) into pca_loc_out

    script:
    """
   awk '{print \$1"\\t"\$1}' ${loc}.${subset_type}.pop | \
       sed 's/\\t.*\\(...\\)\\(...\\)\$/\\t\\1\\t\\2/g' > ${loc}.${subset_type}.pop.txt
   Rscript --vanilla \$BASE_DIR/R/vcf2pca.R ${vcf} ${loc}.${subset_type}.pop.txt 6
   """
}

…and for the entire data set.

// git 7.9
// run pca for global data set
process pca_all {
    label "L_20g15h_pca_all"
    publishDir "../../figures/pca", mode: 'copy' , pattern: "*.pdf"
    publishDir "../../2_analysis/pca", mode: 'copy' , pattern: "*.txt.gz"
    publishDir "../../1_genotyping/4_phased/", mode: 'copy' , pattern: "*.vcf.gz"

    input:
    set file( vcf ), file( vcfidx ), val( subset_type ) from vcf_all_samples_pca

    output:
    set file( "*.prime_pca.pdf" ), file( "*.pca.pdf" ), file( "*.exp_var.txt.gz" ), file( "*.scores.txt.gz" ) into pca_all_out
    file( "hamlets_only.${subset_type}.vcf.gz" ) into vcf_hamlets_only
    set file( "hamlets_only.${subset_type}.vcf.gz" ), file( "hamlets_only.${subset_type}.pop.txt" ) into vcf_multi_fst

    script:
    """
   # complete PCA, all samples ------------
   vcfsamplenames ${vcf} | \
       awk '{print \$1"\\t"\$1}' | \
       sed 's/\\t.*\\(...\\)\\(...\\)\$/\\t\\1\\t\\2/g' > all.${subset_type}.pop.txt
   Rscript --vanilla \$BASE_DIR/R/vcf2pca.R ${vcf} all.${subset_type}.pop.txt 6

   # PCA without outgroups ---------------
   vcfsamplenames ${vcf} | \
       grep -v "abe\\|gum\\|ind\\|may\\|nig\\|pue\\|ran\\|uni\\|flo" > outgroup.${subset_type}.pop
   vcfsamplenames ${vcf} | \
       grep "abe\\|gum\\|ind\\|may\\|nig\\|pue\\|ran\\|uni\\|flo" | \
       awk '{print \$1"\\t"\$1}' | \
       sed 's/\\t.*\\(...\\)\\(...\\)\$/\\t\\1\\t\\2/g' > hamlets_only.${subset_type}.pop.txt
   vcftools \
       --gzvcf ${vcf} \
       --remove outgroup.${subset_type}.pop \
       --recode \
       --stdout | gzip > hamlets_only.${subset_type}.vcf.gz
   Rscript --vanilla \$BASE_DIR/R/vcf2pca.R hamlets_only.${subset_type}.vcf.gz hamlets_only.${subset_type}.pop.txt 6
   """
}