17 (git 16) Analysis XIV (Identity by Descent)
This pipeline can be executed as follows:
cd $BASE_DIR/nf/16_analysis_ibd
nextflow run analysis_ibd.nf -c ../../nextflow.config -resume
17.1 Summary
Identity by descent is computed within the nextflow script analysis_ibd.nf
(located under $BASE_DIR/nf/16_analysis_ibd/
).
It takes the phased genotypes and computes the IBD segments.
Below is an overview of the steps involved in the analysis.
17.2 Details of analysis_ibd.nf
17.2.1 Setup
The nextflow script starts by opening the phased genotypes.
#!/usr/bin/env nextflow
// This pipeline includes the analysis calculating the pair-wise IBD
// git 16.1
// load genotypes
Channel
.fromFilePairs("../../1_genotyping/4_phased/phased_mac2.vcf.{gz,gz.tbi}")
.set{ genotypes_raw_ch }
Then, outgroup samples are removed.
// git 16.2
// drop outgroups
process drop_outgroups {
input:
set vcfId, file( vcf ) from genotypes_raw_ch
output:
file( "phased_mac2.no_outgroup.vcf.gz" ) into genotypes_ch
script:
"""
vcfsamplenames ${vcf[0]} | \
grep "tor\\|tab\\|flo" > outrgr.pop
vcftools --gzvcf ${vcf[0]} \
--remove outrgr.pop \
--recode \
--stdout | bgzip > phased_mac2.no_outgroup.vcf.gz
"""
}
The IDB analysis is run for several IBD fragment size thresholds, so here we are initializing the different thresholds.
// git 16.3
// Set IBD fragment sizes
Channel
.from([[ 25000, 10000, 7 ],
[ 15000, 7500, 10 ],
[ 10000, 5000, 8 ]])
.set{ seq_sizes_ch }
Also, the IBD analysis is additionally run with specific parts of the genome excluded - here the different exclusion models are initialized.
// git 16.4
// Set filter mode
Channel
.from([["direct", ""],
["bed", "95"]])
.set{ filtermode_ch }
Truffle is run to compute the IBD segments.
// git 16.5
// run truffle
process run_truffle {
publishDir "../../2_analysis/ibd/", mode: 'copy'
input:
set file( vcf ), val( sz1 ), val( sz2 ), val( sz3 ), val( mode ), val( excluding ) from genotypes_ch.combine( seq_sizes_ch ).combine( filtermode_ch )
output:
set val( sz3 ), file( "no_outgr_${mode}${excluding}_${sz3}.ibd.tsv" ), file( "no_outgr_${mode}${excluding}_${sz3}.segments.tsv" ) into truffle_result
script:
if( mode == 'direct' )
"""
truffle \
--vcf ${vcf} \
--segments \
--nofiltering \
--ibs1markers ${sz1} \
--ibs2markers ${sz2} \
--out no_outgr_${mode}${excluding}_${sz3} \
--cpu 8
sed 's/^\\s*//g; s/\\s\\+/\\t/g' no_outgr_${mode}${excluding}_${sz3}.ibd > no_outgr_${mode}${excluding}_${sz3}.ibd.tsv
sed 's/^\\s*//g; s/\\s\\+/\\t/g' no_outgr_${mode}${excluding}_${sz3}.segments > no_outgr_${mode}${excluding}_${sz3}.segments.tsv
"""
else if( mode == 'filter' )
"""
vcftools --gzvcf ${vcf} \
--not-chr ${excluding} \
--recode \
--stdout | bgzip > tmp.vcf.gz
truffle \
--vcf tmp.vcf.gz \
--segments \
--nofiltering \
--ibs1markers ${sz1} \
--ibs2markers ${sz2} \
--out no_outgr_${mode}${excluding}_${sz3} \
--cpu 8
sed 's/^\\s*//g; s/\\s\\+/\\t/g' no_outgr_${mode}${excluding}_${sz3}.ibd > no_outgr_${mode}${excluding}_${sz3}.ibd.tsv
sed 's/^\\s*//g; s/\\s\\+/\\t/g' no_outgr_${mode}${excluding}_${sz3}.segments > no_outgr_${mode}${excluding}_${sz3}.segments.tsv
rm tmp.vcf.gz
"""
else if( mode == 'bed' )
"""
vcftools --gzvcf ${vcf} \
--exclude-bed ../../../../../ressources/plugin/idb_above_${excluding}.bed \
--recode \
--stdout | bgzip > tmp.vcf.gz
truffle \
--vcf tmp.vcf.gz \
--segments \
--nofiltering \
--ibs1markers ${sz1} \
--ibs2markers ${sz2} \
--out no_outgr_${mode}${excluding}_${sz3} \
--cpu 8
sed 's/^\\s*//g; s/\\s\\+/\\t/g' no_outgr_${mode}${excluding}_${sz3}.ibd > no_outgr_${mode}${excluding}_${sz3}.ibd.tsv
sed 's/^\\s*//g; s/\\s\\+/\\t/g' no_outgr_${mode}${excluding}_${sz3}.segments > no_outgr_${mode}${excluding}_${sz3}.segments.tsv
rm tmp.vcf.gz
"""
}
Within R, the genomic coordinates of the IBD segments are converted to cM positions.
// git 16.6
// convert IBD segments to cM
process convert_to_cM {
publishDir "../../2_analysis/ibd/cM_converted", mode: 'copy'
input:
set val( sz3 ), file( truffle_summary ) , file( truffle_segments ) from truffle_result
output:
set file( "*.converted.tsv" ), file( "*.conv_summary.tsv" ), file( "*.conv_filterd.tsv" ) into cM_result
script:
"""
#!/usr/bin/env Rscript
base_dir <- Sys.getenv("BASE_DIR")
args <- c("ressources/recombination/",
"MAP1cm.txt", "MAP1bp.txt",
"MAP2cm.txt", "MAP2bp.txt",
"${truffle_segments}",
"${truffle_summary}")
renv::activate(base_dir)
library(GenomicOriginsScripts)
library(hypogen)
library(patchwork)
library(plyranges)
rec_path <- str_c(base_dir, as.character(args[1]))
hypo_map1_cm <- as.character(args[2])
hypo_map1_bp <- as.character(args[3])
hypo_map2_cm <- as.character(args[4])
hypo_map2_bp <- as.character(args[5])
segment_file <- as.character(args[6])
summary_file <- as.character(args[7])
truffle_conv <- segment_file %>% str_replace(pattern = ".segments.tsv", replacement = ".converted.tsv") %>% str_remove(".*/")
truffle_sum <- segment_file %>% str_replace(pattern = ".segments.tsv", replacement = ".conv_summary.tsv") %>% str_remove(".*/")
truffle_filt <- segment_file %>% str_replace(pattern = ".segments.tsv", replacement = ".conv_filterd.tsv") %>% str_remove(".*/")
read_maps <- function(cm_file, bp_file){
read_tsv(cm_file) %>%
group_by(LG) %>%
mutate(LGnm = as.roman(LG) %>% as.numeric(),
CHROM = str_c("LG", str_pad(LGnm, width = 2, pad = 0)),
cM = ifelse(LG == "VIII", max(cM)-cM,cM)) %>%
ungroup() %>%
dplyr::select(Loci, cM, CHROM) %>%
full_join(read_tsv(bp_file) %>%
filter(!(duplicated(Loci) | duplicated(Loci, fromlast = TRUE))),
by = c(Loci = "Loci", CHROM = "LG")) %>%
left_join(hypo_chrom_start) %>%
mutate(GPOS = GSTART + bp) %>%
filter(!is.na(GPOS),
!is.na(cM)) %>%
arrange(GPOS)
}
make_lg_seg <- function(lg = "LG08", n = 31, gmap = gmap1){
data_pos <- tibble(CHROM = rep(lg, n),
start = seq(from = hypo_karyotype\$GSTART[hypo_karyotype\$CHROM == lg],
to = hypo_karyotype\$GEND[hypo_karyotype\$CHROM == lg],
length = n) %>%
floor(),
GSTART = hypo_karyotype\$GSTART[hypo_karyotype\$CHROM == lg]) %>%
mutate(GPOS = start,
start = start - GSTART,
end = start) %>%
as_iranges()
map_pos <- gmap %>%
filter(CHROM == lg) %>%
attach_end(LG = lg) %>%
mutate(start = lag(bp, default = 0),
start_cM = lag(cM, default = 0)) %>%
dplyr::select(CHROM, start, end = bp, start_cM, end_cM = cM) %>%
mutate(start_bp = start, end_bp = end) %>%
as_iranges()
list(data = data_pos, map = map_pos)
}
attach_end <- function(data, LG = "LG01"){
data %>%
bind_rows(.,
data %>%
filter(row_number() == last(row_number())) %>%
mutate(GPOS = hypo_karyotype\$GEND[hypo_karyotype\$CHROM == CHROM],
bp = hypo_karyotype\$LENGTH[hypo_karyotype\$CHROM == CHROM],
Loci = as.numeric(
str_c("-99",
str_remove(string = CHROM, "LG"))
)
))
}
bin_rescaler <- function(bp, start_cM, end_cM, start_bp, end_bp,...){
scales::rescale(x = bp,
to = c(start_cM, end_cM),
from = c(start_bp, end_bp))
}
interpol_data <- function(lg, ...){
data_pair <- make_lg_seg(lg = lg, ...)
plyranges::join_overlap_inner(data_pair\$data,
data_pair\$map) %>%
as.data.frame() %>%
as_tibble() %>%
dplyr::select(CHROM = CHROM.x, bp = start, GSTART, GPOS, start_cM:end_bp) %>%
mutate(interpol_cM = pmap_dbl(cur_data(), bin_rescaler))
}
na_to_zero <- function(x){
x_type <- typeof(x)
if_else(is.na(x), as(0,Class = x_type), x) %>%
as.double() %>% as(Class = x_type)
}
convert_bp_to_cm <- function(data, lg = "LG08", gmap = gmap1){
gmap_in <- deparse(substitute(gmap))
data_pos <- data %>%
filter( CHROM == lg ) %>%
dplyr::select(PAIR, TYPE, CHROM, START, END, NMARKERS) %>%
mutate(seg_id = str_c(PAIR,"_",CHROM,"_",START)) %>%
pivot_longer(cols = START:END, names_to = "PART", values_to = "start") %>%
mutate(end = start) %>%
as_iranges()
map_pos <- gmap %>%
filter(CHROM == lg) %>%
attach_end(LG = lg) %>%
mutate(start = lag(bp, default = 0) + 1, # avoid overlapping segments - causes duplications in joining
start_cM = lag(cM, default = 0)) %>%
dplyr::select(start, end = bp, start_cM, end_cM = cM) %>%
mutate(start_bp = start, end_bp = end) %>%
as_iranges()
map_nr <- str_replace(gmap_in, pattern = "gmap", replacement = "_m")
plyranges::join_overlap_inner(data_pos,
map_pos) %>%
as.data.frame() %>%
as_tibble() %>%
dplyr::select(CHROM = CHROM, bp = start, PAIR, TYPE, PART, start_cM:end_bp, seg_id, NMARKERS) %>%
mutate(interpol_cM = pmap_dbl(cur_data(), bin_rescaler)) %>%
pivot_wider(id_cols = c(CHROM,PAIR,TYPE,seg_id, PART,NMARKERS),
values_from = c(bp,interpol_cM),
names_from = PART) %>%
dplyr::select(-seg_id) %>%
mutate(length_bp = bp_END - bp_START,
length_cM = interpol_cM_END - interpol_cM_START) %>%
set_names(value = c("CHROM", "PAIR", "TYPE", "NMARKERS", "bp_START", "bp_END",
str_c(c("interpol_cM_START", "interpol_cM_END"), map_nr),
"length_bp", str_c("length_cM", map_nr)))
}
# actual script -------------------
gmap1 <- read_maps(cm_file = str_c(rec_path, hypo_map1_cm),
bp_file = str_c(rec_path, hypo_map1_bp))
gmap2 <- read_maps(cm_file = str_c(rec_path, hypo_map2_cm),
bp_file = str_c(rec_path, hypo_map2_bp))
lgs <- 1:24 %>%
str_pad(width = 2, pad = 0) %>%
str_c("LG",.)
segments_individual_interpol_map1 <- lgs %>% map_dfr(interpol_data, n = 51)
segments_individual_interpol_map2 <- lgs %>% map_dfr(interpol_data, n = 51, gmap = gmap2)
segments_individual <- vroom::vroom(segment_file) %>%
mutate(LENGTH = LENGTH * 10^6,
START = POS * 10^6,
END = START + LENGTH,
PAIR = str_c(ID1, "-", ID2))
segments_summary <- vroom::vroom(summary_file) %>%
mutate(PAIR = str_c(ID1, "-", ID2))
bounds_gmap1 <- gmap1 %>%
group_by(CHROM) %>%
filter(cM == max(cM)) %>%
filter(bp == max(bp)) %>%
ungroup() %>%
dplyr::select(CHROM, cM) %>%
mutate(GSTART_cM = cumsum(lag(cM, default = 0)),
GEND_cM = cM + GSTART_cM,
GMID_cM = (GSTART_cM + GEND_cM) / 2,
grp = c("even", "odd")[ 1+row_number() %% 2 ])
bounds_gmap2 <- gmap2 %>%
group_by(CHROM) %>%
filter(cM == max(cM)) %>%
filter(bp == max(bp)) %>%
ungroup() %>%
dplyr::select(CHROM, cM) %>%
mutate(GSTART_cM = cumsum(lag(cM, default = 0)),
GEND_cM = cM + GSTART_cM,
GMID_cM = (GSTART_cM + GEND_cM) / 2,
grp = c("even", "odd")[ 1+row_number() %% 2 ])
hypo_all_starts <- hypo_karyotype %>%
dplyr::select(CHROM, GSTART, GEND) %>%
left_join(bounds_gmap1 %>%
dplyr::select(CHROM, GSTART_cM_m1 = GSTART_cM, GEND_cM_m1 = GEND_cM)) %>%
left_join(bounds_gmap2 %>%
dplyr::select(CHROM, GSTART_cM_m2 = GSTART_cM, GEND_cM_m2 = GEND_cM))
converted_segments <- lgs %>%
map_dfr(convert_bp_to_cm, data = segments_individual) %>%
left_join( lgs %>%
map_dfr(convert_bp_to_cm, data = segments_individual, gmap = gmap2) ) %>%
left_join(hypo_all_starts) %>%
mutate(G_SEG_START = GSTART + bp_START,
G_SEG_END = GSTART + bp_END,
G_SEG_START_cM_m1 = GSTART_cM_m1 + interpol_cM_START_m1,
G_SEG_END_cM_m1 = GSTART_cM_m1 + interpol_cM_END_m1,
G_SEG_START_cM_m2 = GSTART_cM_m2 + interpol_cM_START_m2,
G_SEG_END_cM_m2 = GSTART_cM_m2 + interpol_cM_END_m2)
hypo_cM_length_map1 <- max(hypo_all_starts\$GEND_cM_m1)
hypo_cM_length_map2 <- max(hypo_all_starts\$GEND_cM_m2)
hypo_bp_length <- hypo_karyotype\$GEND[hypo_karyotype\$CHROM == "LG24"]
control <- converted_segments %>%
ungroup() %>%
group_by(PAIR, TYPE) %>%
summarise(seq_length = sum(length_bp),
n_mark = sum(NMARKERS),
cm_length_m1 = sum(length_cM_m1),
cm_length_m2 = sum(length_cM_m2)) %>%
ungroup() %>%
pivot_wider(id_cols = PAIR, names_from = TYPE, values_from = seq_length:cm_length_m2, values_fill = 0) %>%
left_join(segments_summary, ., ) %>%
mutate(across(.cols = seq_length_IBD1:cm_length_m2_IBD2, .fns = na_to_zero)) %>%
mutate(IBD0_manual = (NMARK - (n_mark_IBD1 + n_mark_IBD2)) / NMARK,
IBD1_manual = n_mark_IBD1 / NMARK,
IBD2_manual = n_mark_IBD2 / NMARK,
icheck_0 = IBD0_manual - IBD0,
icheck_1 = IBD1_manual - IBD1,
icheck_2 = IBD2 - IBD2,
# compile ibd by sequence map
ibd0_bp = (hypo_bp_length - (seq_length_IBD1 + seq_length_IBD2)) / hypo_bp_length,
ibd1_bp = seq_length_IBD1 / hypo_bp_length,
ibd2_bp = seq_length_IBD2 / hypo_bp_length,
# compile ibd by genetic map 1
ibd0_cM_m1 = (hypo_cM_length_map1 - (cm_length_m1_IBD1 + cm_length_m1_IBD2)) / hypo_cM_length_map1,
ibd1_cM_m1 = cm_length_m1_IBD1 / hypo_cM_length_map1,
ibd2_cM_m1 = cm_length_m1_IBD2 / hypo_cM_length_map1,
# compile ibd by genetic map 2
ibd0_cM_m2 = (hypo_cM_length_map2 - (cm_length_m2_IBD1 + cm_length_m2_IBD2)) / hypo_cM_length_map2,
ibd1_cM_m2 = cm_length_m2_IBD1 / hypo_cM_length_map2,
ibd2_cM_m2 = cm_length_m2_IBD2 / hypo_cM_length_map2)
cM_treshold <- 0.2
summary_filterd <- converted_segments %>%
filter(length_cM_m1 > cM_treshold & length_cM_m2 > cM_treshold) %>%
ungroup() %>%
group_by(PAIR, TYPE) %>%
summarise(seq_length = sum(length_bp),
n_mark = sum(NMARKERS),
cm_length_m1 = sum(length_cM_m1),
cm_length_m2 = sum(length_cM_m2)) %>%
ungroup() %>%
pivot_wider(id_cols = PAIR, names_from = TYPE, values_from = seq_length:cm_length_m2, values_fill = 0) %>%
left_join(segments_summary, ., ) %>%
mutate(across(.cols = seq_length_IBD1:cm_length_m2_IBD2, .fns = na_to_zero)) %>%
mutate(IBD0_manual = (NMARK - (n_mark_IBD1 + n_mark_IBD2)) / NMARK,
IBD1_manual = n_mark_IBD1 / NMARK,
IBD2_manual = n_mark_IBD2 / NMARK,
icheck_0 = IBD0_manual - IBD0,
icheck_1 = IBD1_manual - IBD1,
icheck_2 = IBD2 - IBD2_manual,
# compile ibd by sequence map
ibd0_bp = (hypo_bp_length - (seq_length_IBD1 + seq_length_IBD2)) / hypo_bp_length,
ibd1_bp = seq_length_IBD1 / hypo_bp_length,
ibd2_bp = seq_length_IBD2 / hypo_bp_length,
# compile ibd by genetic map 1
ibd0_cM_m1 = (hypo_cM_length_map1 - (cm_length_m1_IBD1 + cm_length_m1_IBD2)) / hypo_cM_length_map1,
ibd1_cM_m1 = cm_length_m1_IBD1 / hypo_cM_length_map1,
ibd2_cM_m1 = cm_length_m1_IBD2 / hypo_cM_length_map1,
# compile ibd by genetic map 2
ibd0_cM_m2 = (hypo_cM_length_map2 - (cm_length_m2_IBD1 + cm_length_m2_IBD2)) / hypo_cM_length_map2,
ibd1_cM_m2 = cm_length_m2_IBD1 / hypo_cM_length_map2,
ibd2_cM_m2 = cm_length_m2_IBD2 / hypo_cM_length_map2)
write_tsv(x = converted_segments, file = truffle_conv)
write_tsv(x = control, file = truffle_sum)
write_tsv(x = summary_filterd, file = truffle_filt)
"""
}