2 (git 1) Genotyping I (SNPs only)
This pipeline can be executed as follows:
cd $BASE_DIR/nf/01_genotyping
source ../../sh/nextflow_alias.sh
nf_run_gatk
2.1 Summary
The genotyping procedure is controlled by the Nextflow script genotyping.nf
(located under $BASE_DIR/nf/01_genotyping/
).
It takes the analysis from the raw sequencing data to the genotyped and phased SNPs.
Below is an overview of the steps involved in the genotyping process.
(The green dot indicates the raw data input, red arrows depict output that is exported for further use.)
2.2 Details of genotyping.nf
2.2.1 Data preparation
The Nextflow script starts with a small header and then opens the analysis by reading a table with metadata about the samples. The table is parsed and the values are stored in Nextflow variables.
#!/usr/bin/env nextflow
/* ===============================================================
Disclaimer: This pipeline needs a lot of time & memory to run:
All in all we used roughly 10 TB and ran for about one month
(mainly due to limited bandwidth on the cluster during the
"receive_tuple" step)
===============================================================
*/
// git 1.1
/* open the pipeline based on the metadata spread sheet that includes all
information necessary to assign read groups to the sequencing data,
split the spread sheet by row and feed it into a channel */
Channel
.fromPath('../../metadata/file_info.txt')
.splitCsv(header:true, sep:"\t")
.map{ row -> [ id:row.id, label:row.label, file_fwd:row.file_fwd, file_rev:row.file_rev, flowcell_id_fwd:row.flowcell_id_fwd, lane_fwd:row.lane_fwd, company:row.company] }
.set { samples_ch }
Below is a little preview of the table containing the sample metadata:
id | label | spec | geo | date | coord_N | coord_W | company | .. |
---|---|---|---|---|---|---|---|---|
16_21-30 | 16_21-30nigpan | nig | pan | 2016 | NA | NA | duke | … |
16_21-30 | 16_21-30nigpan | nig | pan | 2016 | NA | NA | duke | … |
16_31-40 | 16_31-40unipan | uni | pan | 2016 | NA | NA | duke | … |
16_31-40 | 16_31-40unipan | uni | pan | 2016 | NA | NA | duke | … |
17996 | 17996indbel | ind | bel | 2004-07-27 | 16.801 | -88.079 | novogene | … |
17997 | 17997indbel | ind | bel | 2004-07-27 | 16.801 | -88.079 | novogene | … |
… | … | … | … | … | … | … | … | …
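For a quick look at this table on the command line, something along these lines should work (a sketch, assuming the file is tab-separated as parsed by the channel above and that cut and column are available):
# hypothetical preview of the first columns of the sample sheet
cut -f 1-8 $BASE_DIR/metadata/file_info.txt | head -n 5 | column -t -s $'\t'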
The first step in preparing the data for the GATK best practices workflow is to convert the sample sequences from *.fq to *.bam format so that read groups can be assigned:
// git 1.2
/* for every sequencing file, convert into ubam format and assign read groups */
process split_samples {
label 'L_20g2h_split_samples'
input:
val x from samples_ch
output:
set val( "${x.label}.${x.lane_fwd}" ), file( "${x.label}.${x.lane_fwd}.ubam.bam" ) into ubams_mark, ubams_merge
script:
"""
echo -e "---------------------------------"
echo -e "Label:\t\t${x.label}\nFwd:\t\t${x.file_fwd}\nRev:\t\t${x.file_rev}"
echo -e "Flowcell:\t${x.flowcell_id_fwd}\nLane:\t\t${x.lane_fwd}"
echo -e "Read group:\t${x.flowcell_id_fwd}.${x.lane_fwd}\nCompany:\t${x.company}"
mkdir -p \$BASE_DIR/temp_files
gatk --java-options "-Xmx20G" \
FastqToSam \
-SM=${x.label} \
-F1=\$BASE_DIR/data/seqdata/${x.file_fwd} \
-F2=\$BASE_DIR/data/seqdata/${x.file_rev} \
-O=${x.label}.${x.lane_fwd}.ubam.bam \
-RG=${x.label}.${x.lane_fwd} \
-LB=${x.label}".lib1" \
-PU=${x.flowcell_id_fwd}.${x.lane_fwd} \
-PL=Illumina \
-CN=${x.company} \
--TMP_DIR=\$BASE_DIR/temp_files;
"""
}
The second step is marking the Illumina adapters.
// git 1.3
/* for every ubam file, mark Illumina adapters */
process mark_adapters {
label 'L_20g2h_mark_adapters'
tag "${sample}"
input:
set val( sample ), file( input ) from ubams_mark
output:
set val( sample ), file( "*.adapter.bam") into adapter_bams
file "*.adapter.metrics.txt" into adapter_metrics
script:
"""
gatk --java-options "-Xmx18G" \
MarkIlluminaAdapters \
-I=${input} \
-O=${sample}.adapter.bam \
-M=${sample}.adapter.metrics.txt \
-TMP_DIR=\$BASE_DIR/temp_files;
"""
}
We need to pass on the unaligned .bam file together with the file containing the adapter information, so the outputs of the first two processes are matched by their combined sample and sequencing-lane information.
// git 1.4
adapter_bams
.combine(ubams_merge, by:0)
.set {merge_input}
For the actual mapping, the sequences are transformed back into .fq format, aligned against the reference using bwa, and merged back with their original read group information.
// git 1.5
/* this step includes a 3 step pipeline:
* - re-transformation into fq format
* - mapping against the reference genome_file
* - merging with the unaligned bams (ubams) to include
read group information */
process map_and_merge {
label 'L_75g24h8t_map_and_merge'
tag "${sample}"
input:
set val( sample ), file( adapter_bam_input ), file( ubam_input ) from merge_input
output:
set val( sample ), file( "*.mapped.bam" ) into mapped_bams
script:
"""
set -o pipefail
gatk --java-options "-Xmx68G" \
SamToFastq \
-I=${adapter_bam_input} \
-FASTQ=/dev/stdout \
-INTERLEAVE=true \
-NON_PF=true \
-TMP_DIR=\$BASE_DIR/temp_files | \
bwa mem -M -t 8 -p \$BASE_DIR/ressources/HP_genome_unmasked_01.fa /dev/stdin |
gatk --java-options "-Xmx68G" \
MergeBamAlignment \
--VALIDATION_STRINGENCY SILENT \
--EXPECTED_ORIENTATIONS FR \
--ATTRIBUTES_TO_RETAIN X0 \
-ALIGNED_BAM=/dev/stdin \
-UNMAPPED_BAM=${ubam_input} \
-OUTPUT=${sample}.mapped.bam \
--REFERENCE_SEQUENCE=\$BASE_DIR/ressources/HP_genome_unmasked_01.fa.gz \
-PAIRED_RUN true \
--SORT_ORDER "unsorted" \
--IS_BISULFITE_SEQUENCE false \
--ALIGNED_READS_ONLY false \
--CLIP_ADAPTERS false \
--MAX_RECORDS_IN_RAM 2000000 \
--ADD_MATE_CIGAR true \
--MAX_INSERTIONS_OR_DELETIONS -1 \
--PRIMARY_ALIGNMENT_STRATEGY MostDistant \
--UNMAPPED_READ_STRATEGY COPY_TO_TAG \
--ALIGNER_PROPER_PAIR_FLAGS true \
--UNMAP_CONTAMINANT_READS true \
-TMP_DIR=\$BASE_DIR/temp_files
"""
}
Next, duplicates are marked.
// git 1.6
/* for every mapped sample, sort and mark duplicates
* (intermediate step is required to create .bai file) */
process mark_duplicates {
label 'L_32g30h_mark_duplicates'
publishDir "../../1_genotyping/0_sorted_bams/", mode: 'symlink'
tag "${sample}"
input:
set val( sample ), file( input ) from mapped_bams
output:
set val { sample - ~/\.(\d+)/ }, val( sample ), file( "*.dedup.bam") into dedup_bams
file "*.dedup.metrics.txt" into dedup_metrics
script:
"""
set -o pipefail
gatk --java-options "-Xmx30G" \
SortSam \
-I=${input} \
-O=/dev/stdout \
--SORT_ORDER="coordinate" \
--CREATE_INDEX=false \
--CREATE_MD5_FILE=false \
-TMP_DIR=\$BASE_DIR/temp_files \
| \
gatk --java-options "-Xmx30G" \
SetNmAndUqTags \
--INPUT=/dev/stdin \
--OUTPUT=intermediate.bam \
--CREATE_INDEX=true \
--CREATE_MD5_FILE=true \
-TMP_DIR=\$BASE_DIR/temp_files \
--REFERENCE_SEQUENCE=\$BASE_DIR/ressources/HP_genome_unmasked_01.fa.gz
gatk --java-options "-Xmx30G" \
MarkDuplicates \
-I=intermediate.bam \
-O=${sample}.dedup.bam \
-M=${sample}.dedup.metrics.txt \
-MAX_FILE_HANDLES=1000 \
-TMP_DIR=\$BASE_DIR/temp_files
rm intermediate*
"""
}
In preparation for the actual genotyping, the .bam files are indexed.
// git 1.7
/* index all bam files */
process index_bam {
label 'L_32g1h_index_bam'
tag "${sample}"
input:
set val( sample ), val( sample_lane ), file( input ) from dedup_bams
output:
set val( sample ), val( sample_lane ), file( input ), file( "*.bai") into ( indexed_bams, pir_bams )
script:
"""
gatk --java-options "-Xmx30G" \
BuildBamIndex \
-INPUT=${input}
"""
}
At this point the preparation of the sequencing data is done and we can start with the genotyping. (The output of the data preparation is split, and one copy is later also used to prepare the read-aware phasing in the process called extractPirs.)
2.2.2 Genotyping
Since some of our samples were split over several lanes, we now need to collect all .bam
files for each sample.
// git 1.8
/* collect all bam files for each sample */
indexed_bams
.groupTuple()
.set {tubbled}
Now, we can create the genotype likelihoods for each individual sample.
// git 1.9
/* create one *.g.vcf file per sample */
process receive_tuple {
label 'L_36g47h_receive_tuple'
publishDir "../../1_genotyping/1_gvcfs/", mode: 'symlink'
tag "${sample}"
input:
set sample, sample_lane, bam, bai from tubbled
output:
file( "*.g.vcf.gz") into gvcfs
file( "*.vcf.gz.tbi") into tbis
script:
"""
INPUT=\$(echo ${bam} | sed 's/\\[/-I /g; s/\\]//g; s/,/ -I/g')
gatk --java-options "-Xmx35g" HaplotypeCaller \
-R=\$BASE_DIR/ressources/HP_genome_unmasked_01.fa \
\$INPUT \
-O ${sample}.g.vcf.gz \
-ERC GVCF
"""
}
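The sed call inside the script turns the Groovy-style list of bam files for one sample into a string of -I arguments for HaplotypeCaller. A minimal stand-alone illustration (hypothetical file names):
# the grouped bam list interpolates as something like "[sampleA.1.dedup.bam, sampleA.2.dedup.bam]"
echo "[sampleA.1.dedup.bam, sampleA.2.dedup.bam]" | sed 's/\[/-I /g; s/\]//g; s/,/ -I/g'
# prints: -I sampleA.1.dedup.bam -I sampleA.2.dedup.bam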
The individual genotype likelihoods are collected and combined for the entire data set.
// git 1.10
/* collect and combine all *.g.vcf files */
process gather_gvcfs {
label 'L_O88g90h_gather_gvcfs'
publishDir "../../1_genotyping/1_gvcfs/", mode: 'symlink'
echo true
input:
file( gvcf ) from gvcfs.collect()
file( tbi ) from tbis.collect()
output:
set file( "cohort.g.vcf.gz" ), file( "cohort.g.vcf.gz.tbi" ) into ( gcvf_snps, gvcf_acs, gvcf_indel )
script:
"""
GVCF=\$(echo " ${gvcf}" | sed 's/ /-V /g; s/vcf.gz/vcf.gz /g')
gatk --java-options "-Xmx85g" \
CombineGVCFs \
-R=\$BASE_DIR/ressources/HP_genome_unmasked_01.fa \
\$GVCF \
-O cohort.g.vcf.gz
"""
}
All samples are jointly genotyped.
// git 1.11
/* actual genotyping step (variant sites only) */
process joint_genotype_snps {
label 'L_O88g90h_joint_genotype'
publishDir "../../1_genotyping/2_raw_vcfs/", mode: 'symlink'
input:
set file( vcf ), file( tbi ) from gcvf_snps
output:
set file( "raw_var_sites.vcf.gz" ), file( "raw_var_sites.vcf.gz.tbi" ) into ( raw_var_sites, raw_var_sites_to_metrics )
script:
"""
gatk --java-options "-Xmx85g" \
GenotypeGVCFs \
-R=\$BASE_DIR/ressources/HP_genome_unmasked_01.fa \
-V=${vcf} \
-O=intermediate.vcf.gz
gatk --java-options "-Xmx85G" \
SelectVariants \
-R=\$BASE_DIR/ressources/HP_genome_unmasked_01.fa \
-V=intermediate.vcf.gz \
--select-type-to-include=SNP \
-O=raw_var_sites.vcf.gz
rm intermediate.*
"""
}
The output of this process is split: one copy is used to collect genotype metrics that inform the hard filtering of SNPs, while the other passes the genotypes on to the process called filterSNPs.
At this point we create a channel containing all 24 hamlet linkage groups (LGs). It is used later (in the process called extractPirs), since all LGs are phased separately, and only sits at this part of the script for historical reasons (sorry :/).
// git 1.12
/* generate a LG channel */
Channel
.from( ('01'..'09') + ('10'..'19') + ('20'..'24') )
.into{ LG_ids1; LG_ids2 }
The metrics of the raw genotypes are collected.
// git 1.13
/* produce metrics table to determine filtering thresholds (oops - forgot to extract SNPs first) */
process joint_genotype_metrics {
label 'L_28g5h_genotype_metrics'
publishDir "../../1_genotyping/2_raw_vcfs/", mode: 'move'
input:
set file( vcf ), file( tbi ) from raw_var_sites_to_metrics
output:
file( "${vcf}.table.txt" ) into raw_metrics
script:
"""
gatk --java-options "-Xmx25G" \
VariantsToTable \
--variant=${vcf} \
--output=${vcf}.table.txt \
-F=CHROM -F=POS -F=MQ \
-F=QD -F=FS -F=MQRankSum -F=ReadPosRankSum \
--show-filtered
"""
}
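In practice, the thresholds applied in the next step were chosen after inspecting the distributions in this table. A minimal command-line sketch of such a check (the QD annotation is the fourth column, given the column order requested above; the file name follows the output pattern of the process):
# hypothetical quick summary of the QD annotation across all raw SNPs
tail -n +2 raw_var_sites.vcf.gz.table.txt | \
  awk -F '\t' '$4 != "NA" { s += $4; n++ } END { if (n) printf "QD: n=%d mean=%.2f\n", n, s/n }'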
Based on the thresholds derived from the genotype metrics, the genotypes are first tagged and then filtered. After this, the data is filtered for missingness and only bi-allelic SNPs are selected.
// git 1.14
/* filter SNPs based on locus annotations, missingness
and type (bi-allelic only) */
process filterSNPs {
label 'L_78g10h_filter_Snps'
publishDir "../../1_genotyping/3_gatk_filtered/", mode: 'symlink'
input:
set file( vcf ), file( tbi ) from raw_var_sites
output:
set file( "filterd_bi-allelic.vcf.gz" ), file( "filterd_bi-allelic.vcf.gz.tbi" ) into filtered_snps
script:
"""
gatk --java-options "-Xmx75G" \
VariantFiltration \
-R=\$BASE_DIR/ressources/HP_genome_unmasked_01.fa \
-V ${vcf} \
-O=intermediate.vcf.gz \
--filter-expression "QD < 2.5" \
--filter-name "filter_QD" \
--filter-expression "FS > 25.0" \
--filter-name "filter_FS" \
--filter-expression "MQ < 52.0 || MQ > 65.0" \
--filter-name "filter_MQ" \
--filter-expression "MQRankSum < -0.2 || MQRankSum > 0.2" \
--filter-name "filter_MQRankSum" \
--filter-expression "ReadPosRankSum < -2.0 || ReadPosRankSum > 2.0 " \
--filter-name "filter_ReadPosRankSum"
gatk --java-options "-Xmx75G" \
SelectVariants \
-R=\$BASE_DIR/ressources/HP_genome_unmasked_01.fa \
-V=intermediate.vcf.gz \
-O=intermediate.filterd.vcf.gz \
--exclude-filtered
vcftools \
--gzvcf intermediate.filterd.vcf.gz \
--max-missing-count 17 \
--max-alleles 2 \
--stdout \
--recode | \
bgzip > filterd_bi-allelic.vcf.gz
tabix -p vcf filterd_bi-allelic.vcf.gz
rm intermediate.*
"""
}
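As an optional sanity check of the missingness filter (a sketch, assuming vcftools is available), the per-site missingness of the exported call set can be summarized:
# writes filterd_bi-allelic.lmiss with per-site missingness statistics
vcftools --gzvcf filterd_bi-allelic.vcf.gz --missing-site --out filterd_bi-allelic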
At this point, the genotyping is done.
2.2.3 Phasing
To get from genotypes to haplotypes, we apply read-aware phasing using Shapeit. This takes the original sequencing reads into account, so in the first step the reads are screened for Phase Informative Reads (i.e. reads containing more than a single SNP).
This needs to be done for each LG independently, so we first need to split the genotypes before running extractPIRs
.
// git 1.15
// extract phase informative reads from
// alignments and SNPs
process extractPirs {
label 'L_78g10h_extract_pirs'
input:
val( lg ) from LG_ids2
set val( sample ), val( sample_lane ), file( input ), file( index ) from pir_bams.collect()
set file( vcf ), file( tbi ) from filtered_snps
output:
set val( lg ), file( "filterd_bi-allelic.LG${lg}.vcf.gz" ), file( "filterd_bi-allelic.LG${lg}.vcf.gz.tbi" ), file( "PIRsList-LG${lg}.txt" ) into pirs_lg
script:
"""
LG="LG${lg}"
awk -v OFS='\t' -v dir=\$PWD -v lg=\$LG '{print \$1,dir"/"\$2,lg}' \$BASE_DIR/metadata/bamlist_proto.txt > bamlist.txt
vcftools \
--gzvcf ${vcf} \
--chr \$LG \
--stdout \
--recode | \
bgzip > filterd_bi-allelic.LG${lg}.vcf.gz
tabix -p vcf filterd_bi-allelic.LG${lg}.vcf.gz
extractPIRs \
--bam bamlist.txt \
--vcf filterd_bi-allelic.LG${lg}.vcf.gz \
--out PIRsList-LG${lg}.txt \
--base-quality 20 \
--read-quality 15
"""
}
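The awk call above builds a per-LG bamlist with the three tab-separated columns extractPIRs expects: a sample identifier, the absolute path to the corresponding bam file, and the chromosome (LG) to extract from. An illustration with hypothetical paths:
# bamlist.txt (illustrative content only)
# 16_21-30nigpan	/path/to/16_21-30nigpan.1.dedup.bam	LG08
# 17996indbel	/path/to/17996indbel.1.dedup.bam	LG08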
Using those PIRs, we can then proceed with the actual phasing.
The resulting haplotypes are converted back into .vcf
format.
// git 1.16
// run the actual phasing
process run_shapeit {
label 'L_75g24h8t_run_shapeit'
input:
set val( lg ), file( vcf ), file( tbi ), file( pirs ) from pirs_lg
output:
file( "phased-LG${lg}.vcf.gz" ) into phased_lgs
script:
"""
LG="LG${lg}"
shapeit \
-assemble \
--input-vcf ${vcf} \
--input-pir ${pirs} \
--thread 8 \
-O phased-LG${lg}
shapeit \
-convert \
--input-hap phased-LG${lg} \
--output-vcf phased-LG${lg}.vcf
bgzip phased-LG${lg}.vcf
"""
}
After the phasing, we merge the LGs back together to get a single data set. We export a complete data set as well as one filtered for a minor allele count of at least two.
// git 1.17
// merge the phased LGs back together.
// the resulting vcf file represents
// the 'SNPs only' data set
process merge_phased {
label 'L_28g5h_merge_phased_vcf'
publishDir "../../1_genotyping/4_phased/", mode: 'move'
input:
file( vcf ) from phased_lgs.collect()
output:
set file( "phased.vcf.gz" ), file( "phased.vcf.gz.tbi" ) into phased_vcf
set file( "phased_mac2.vcf.gz" ), file( "phased_mac2.vcf.gz.tbi" ) into phased_mac2_vcf
script:
"""
vcf-concat \
phased-LG* | \
grep -v ^\$ | \
tee phased.vcf | \
vcftools --vcf - --mac 2 --recode --stdout | \
bgzip > phased_mac2.vcf.gz
bgzip phased.vcf
tabix -p vcf phased.vcf.gz
tabix -p vcf phased_mac2.vcf.gz
"""
}
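To get a feel for how much the mac 2 filter removes, the two exported call sets can be compared by their record counts (a sketch, run in the output directory):
# number of SNP records in the complete and in the mac >= 2 data set
zcat phased.vcf.gz | grep -vc '^#'
zcat phased_mac2.vcf.gz | grep -vc '^#'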
Finally, we are done with the entire genotyping procedure for the SNPs only data set.
2.2.4 Indel masks
The genotyping.nf
workflow contains an appendix that makes use of the genotype likelihoods created in step git 1.10 to create an indel mask, which is later used in the inference of the hamlet demographic history (git 8.x).
(This part is excluded from the initial visualization of this script)
We restart by reopening the joint-sample genotype likelihoods file and calling indels from it.
/* ========================================= */
/* appendix: generate indel masks for msmc: */
// git 1.18
// reopen the gvcf file to also genotype indels
process joint_genotype_indel {
label 'L_O88g90h_genotype_indel'
publishDir "../../1_genotyping/2_raw_vcfs/", mode: 'copy'
input:
set file( vcf ), file( tbi ) from gvcf_indel
output:
set file( "raw_var_indel.vcf.gz" ), file( "raw_var_indel.vcf.gz.tbi" ) into ( raw_indel, raw_indel_to_metrics )
script:
"""
gatk --java-options "-Xmx85g" \
GenotypeGVCFs \
-R=\$REF_GENOME \
-V=${vcf} \
-O=intermediate.vcf.gz
gatk --java-options "-Xmx85G" \
SelectVariants \
-R=\$REF_GENOME \
-V=intermediate.vcf.gz \
--select-type-to-include=INDEL \
-O=raw_var_indel.vcf.gz
rm intermediate.*
"""
}
We export the indel genotype metrics to determine cutoff values for the hard filtering step.
// git 1.19
// export indel metrics for filtering
process indel_metrics {
label 'L_28g5h_genotype_metrics'
publishDir "../../1_genotyping/2_raw_vcfs/", mode: 'copy'
input:
set file( vcf ), file( tbi ) from raw_indel_to_metrics
output:
file( "${vcf}.table.txt" ) into raw_indel_metrics
script:
"""
gatk --java-options "-Xmx25G" \
VariantsToTable \
--variant=${vcf} \
--output=${vcf}.table.txt \
-F=CHROM -F=POS -F=MQ \
-F=QD -F=FS -F=MQRankSum -F=ReadPosRankSum \
--show-filtered
"""
}
Based on the exported metrics, the indel genotypes are filtered.
// git 1.20
// hard filter indels and create mask
process filterIndels {
label 'L_78g10h_filter_indels'
publishDir "../../1_genotyping/3_gatk_filtered/", mode: 'copy'
input:
set file( vcf ), file( tbi ) from raw_indel
output:
set file( "filterd.indel.vcf.gz" ), file( "filterd.indel.vcf.gz.tbi" ) into filtered_indel
file( "indel_mask.bed.gz" ) into indel_mask_ch
/* FILTER THRESHOLDS NEED TO BE UPDATED */
script:
"""
gatk --java-options "-Xmx75G" \
VariantFiltration \
-R=\$REF_GENOME \
-V ${vcf} \
-O=intermediate.vcf.gz \
--filter-expression "QD < 2.5" \
--filter-name "filter_QD" \
--filter-expression "FS > 25.0" \
--filter-name "filter_FS" \
--filter-expression "MQ < 52.0 || MQ > 65.0" \
--filter-name "filter_MQ" \
--filter-expression "SOR > 3.0" \
--filter-name "filter_SOR" \
--filter-expression "InbreedingCoeff < -0.25" \
--filter-name "filter_InbreedingCoeff" \
--filter-expression "MQRankSum < -0.2 || MQRankSum > 0.2" \
--filter-name "filter_MQRankSum" \
--filter-expression "ReadPosRankSum < -2.0 || ReadPosRankSum > 2.0 " \
--filter-name "filter_ReadPosRankSum"
gatk --java-options "-Xmx75G" \
SelectVariants \
-R=\$REF_GENOME \
-V=intermediate.vcf.gz \
-O=filterd.indel.vcf.gz \
--exclude-filtered
zcat filterd.indel.vcf.gz | \
awk '! /\\#/' | \
awk '{if(length(\$4) > length(\$5)) print \$1"\\t"(\$2-6)"\\t"(\$2+length(\$4)+4); else print \$1"\\t"(\$2-6)"\\t"(\$2+length(\$5)+4)}' | \
gzip -c > indel_mask.bed.gz
rm intermediate.*
"""
}
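The awk command at the end of the script derives the mask interval directly from the indel coordinates: the interval starts six positions before the indel and ends the length of the longer allele plus four positions after it. A stand-alone worked example (hypothetical record):
# a deletion at LG01, position 1000, with REF=ACGTA and ALT=A
echo -e "LG01\t1000\t.\tACGTA\tA" | \
  awk '{if(length($4) > length($5)) print $1"\t"($2-6)"\t"($2+length($4)+4); else print $1"\t"($2-6)"\t"($2+length($5)+4)}'
# prints: LG01	994	1009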
Since we need one indel mask per linkage group, we create a channel of LGs.
// git 1.21
/* create channel of linkage groups */
Channel
.from( ('01'..'09') + ('10'..'19') + ('20'..'24') )
.map{ "LG" + it }
.into{ lg_ch }
The linkage group channel is combined with the filtered indels.
// git 1.22
// attach linkage groups to indel masks
lg_ch.combine( filtered_indel ).set{ filtered_indel_lg }
Finally, one mask per linkage group is created from the indel positions.
// git 1.23
// split indel mask by linkage group
process split_indel_mask {
label 'L_loc_split_indel_mask'
publishDir "../../ressources/indel_masks/", mode: 'copy'
input:
set val( lg ), file( bed ) from filtered_indel_lg
output:
set val( lg ), file( "indel_mask.${lg}.bed.gz" ) into lg_indel_mask
script:
"""
gzip -cd ${bed} | \
grep ${lg} | \
gzip -c > indel_mask.${lg}.bed.gz
"""
}
All indel masks are exported to the resources folder within the root directory.