17 (git 16) Analysis XIV (Identity by Descent)

This pipeline can be executed as follows:

cd $BASE_DIR/nf/16_analysis_ibd
nextflow run analysis_ibd.nf -c ../../nextflow.config -resume

17.1 Summary

Identity by descent is computed within the nextflow script analysis_ibd.nf (located under $BASE_DIR/nf/16_analysis_ibd/). It takes the phased genotypes and computes the IBD segments. Below is an overview of the steps involved in the analysis.

17.2 Details of analysis_ibd.nf

17.2.1 Setup

The nextflow script starts by opening the phased genotypes.

#!/usr/bin/env nextflow
// This pipeline includes the analysis calculating the pair-wise IBD

// git 16.1
// load genotypes
Channel
    .fromFilePairs("../../1_genotyping/4_phased/phased_mac2.vcf.{gz,gz.tbi}")
    .set{ genotypes_raw_ch }

Then, outgroup samples are removed.

// git 16.2
// drop outgroups
process drop_outgroups {
    
    input:
    set vcfId, file( vcf ) from genotypes_raw_ch

    output:
    file( "phased_mac2.no_outgroup.vcf.gz" ) into genotypes_ch

    script:
    """
   vcfsamplenames ${vcf[0]} | \
       grep "tor\\|tab\\|flo" > outrgr.pop

   vcftools --gzvcf ${vcf[0]} \
       --remove outrgr.pop \
       --recode \
       --stdout | bgzip > phased_mac2.no_outgroup.vcf.gz
   """
}

The IDB analysis is run for several IBD fragment size thresholds, so here we are initializing the different thresholds.

// git 16.3
// Set IBD fragment sizes
Channel
    .from([[ 25000, 10000, 7 ],
           [ 15000, 7500, 10 ],
           [ 10000, 5000, 8 ]])
    .set{ seq_sizes_ch }

Also, the IBD analysis is additionally run with specific parts of the genome excluded - here the different exclusion models are initialized.

// git 16.4
// Set filter mode
Channel
    .from([["direct", ""],
           ["bed", "95"]])
    .set{ filtermode_ch }

Truffle is run to compute the IBD segments.

// git 16.5
// run truffle
process run_truffle {
    publishDir "../../2_analysis/ibd/", mode: 'copy'

    input:
    set file( vcf ), val( sz1 ), val( sz2 ), val( sz3 ), val( mode ), val( excluding ) from genotypes_ch.combine( seq_sizes_ch ).combine( filtermode_ch )

    output:
    set val( sz3 ), file( "no_outgr_${mode}${excluding}_${sz3}.ibd.tsv" ), file( "no_outgr_${mode}${excluding}_${sz3}.segments.tsv" ) into truffle_result

    script:
    if( mode == 'direct' )
        """
       truffle \
           --vcf ${vcf} \
           --segments \
           --nofiltering \
           --ibs1markers ${sz1} \
           --ibs2markers ${sz2} \
           --out no_outgr_${mode}${excluding}_${sz3} \
           --cpu 8
       
       sed 's/^\\s*//g; s/\\s\\+/\\t/g' no_outgr_${mode}${excluding}_${sz3}.ibd > no_outgr_${mode}${excluding}_${sz3}.ibd.tsv
       sed 's/^\\s*//g; s/\\s\\+/\\t/g' no_outgr_${mode}${excluding}_${sz3}.segments > no_outgr_${mode}${excluding}_${sz3}.segments.tsv
       """
    else if( mode == 'filter' )
        """
       vcftools --gzvcf ${vcf} \
           --not-chr ${excluding} \
           --recode \
           --stdout | bgzip > tmp.vcf.gz 
       
       truffle \
           --vcf tmp.vcf.gz  \
           --segments \
           --nofiltering \
           --ibs1markers ${sz1} \
           --ibs2markers ${sz2} \
           --out no_outgr_${mode}${excluding}_${sz3} \
           --cpu 8
       
       sed 's/^\\s*//g; s/\\s\\+/\\t/g' no_outgr_${mode}${excluding}_${sz3}.ibd > no_outgr_${mode}${excluding}_${sz3}.ibd.tsv
       sed 's/^\\s*//g; s/\\s\\+/\\t/g' no_outgr_${mode}${excluding}_${sz3}.segments > no_outgr_${mode}${excluding}_${sz3}.segments.tsv
       rm tmp.vcf.gz 
       """
    else if( mode == 'bed' )
        """
       vcftools --gzvcf ${vcf} \
           --exclude-bed ../../../../../ressources/plugin/idb_above_${excluding}.bed \
           --recode \
           --stdout | bgzip > tmp.vcf.gz 
       
       truffle \
           --vcf tmp.vcf.gz  \
           --segments \
           --nofiltering \
           --ibs1markers ${sz1} \
           --ibs2markers ${sz2} \
           --out no_outgr_${mode}${excluding}_${sz3} \
           --cpu 8
       
       sed 's/^\\s*//g; s/\\s\\+/\\t/g' no_outgr_${mode}${excluding}_${sz3}.ibd > no_outgr_${mode}${excluding}_${sz3}.ibd.tsv
       sed 's/^\\s*//g; s/\\s\\+/\\t/g' no_outgr_${mode}${excluding}_${sz3}.segments > no_outgr_${mode}${excluding}_${sz3}.segments.tsv
       rm tmp.vcf.gz 
       """
}

Within R, the genomic coordinates of the IBD segments are converted to cM positions.

// git 16.6
// convert IBD segments to cM
process convert_to_cM {
    publishDir "../../2_analysis/ibd/cM_converted", mode: 'copy'

    input:
    set val( sz3 ), file( truffle_summary ) , file( truffle_segments ) from truffle_result

    output:
    set file( "*.converted.tsv" ), file( "*.conv_summary.tsv" ), file( "*.conv_filterd.tsv" ) into cM_result

    script:
    """
   #!/usr/bin/env Rscript
   base_dir <- Sys.getenv("BASE_DIR")

   args <- c("ressources/recombination/",
          "MAP1cm.txt", "MAP1bp.txt",
          "MAP2cm.txt", "MAP2bp.txt",
          "${truffle_segments}",
          "${truffle_summary}")
   
   renv::activate(base_dir)

   library(GenomicOriginsScripts)
   library(hypogen)
   library(patchwork)
   library(plyranges)

   rec_path <- str_c(base_dir, as.character(args[1]))
   hypo_map1_cm <- as.character(args[2])
   hypo_map1_bp <- as.character(args[3])
   hypo_map2_cm <- as.character(args[4])
   hypo_map2_bp <- as.character(args[5])
   segment_file <- as.character(args[6])
   summary_file <- as.character(args[7])

   truffle_conv <- segment_file %>% str_replace(pattern = ".segments.tsv", replacement = ".converted.tsv") %>% str_remove(".*/")
   truffle_sum <- segment_file %>% str_replace(pattern = ".segments.tsv", replacement = ".conv_summary.tsv") %>% str_remove(".*/")
   truffle_filt <- segment_file %>% str_replace(pattern = ".segments.tsv", replacement = ".conv_filterd.tsv") %>% str_remove(".*/")

   read_maps <- function(cm_file, bp_file){
   read_tsv(cm_file) %>% 
       group_by(LG) %>% 
       mutate(LGnm = as.roman(LG) %>% as.numeric(),
           CHROM = str_c("LG", str_pad(LGnm, width = 2, pad = 0)),
           cM = ifelse(LG == "VIII", max(cM)-cM,cM)) %>% 
       ungroup() %>% 
       dplyr::select(Loci, cM, CHROM)  %>% 
       full_join(read_tsv(bp_file) %>% 
                   filter(!(duplicated(Loci) | duplicated(Loci, fromlast = TRUE))),
               by = c(Loci = "Loci", CHROM = "LG")) %>% 
       left_join(hypo_chrom_start) %>% 
       mutate(GPOS = GSTART + bp) %>% 
       filter(!is.na(GPOS),
           !is.na(cM)) %>% 
       arrange(GPOS)
   }

   make_lg_seg <- function(lg = "LG08", n = 31, gmap = gmap1){
   data_pos <- tibble(CHROM = rep(lg, n),
                       start = seq(from = hypo_karyotype\$GSTART[hypo_karyotype\$CHROM == lg],
                                   to = hypo_karyotype\$GEND[hypo_karyotype\$CHROM == lg],
                                   length = n) %>%
                       floor(),
                       GSTART = hypo_karyotype\$GSTART[hypo_karyotype\$CHROM == lg]) %>% 
       mutate(GPOS = start,
           start = start - GSTART,
           end = start) %>% 
       as_iranges()
   
   map_pos <- gmap %>%
       filter(CHROM == lg) %>% 
       attach_end(LG = lg) %>% 
       mutate(start = lag(bp, default = 0),
           start_cM = lag(cM, default = 0)) %>% 
       dplyr::select(CHROM, start, end = bp, start_cM, end_cM = cM) %>%
       mutate(start_bp = start, end_bp = end) %>% 
       as_iranges()
   
   list(data = data_pos, map = map_pos)
   }

   attach_end <- function(data, LG = "LG01"){
   data %>% 
       bind_rows(., 
               data %>%
                   filter(row_number() == last(row_number())) %>% 
                   mutate(GPOS = hypo_karyotype\$GEND[hypo_karyotype\$CHROM == CHROM],
                       bp = hypo_karyotype\$LENGTH[hypo_karyotype\$CHROM == CHROM],
                       Loci = as.numeric(
                           str_c("-99",
                               str_remove(string = CHROM, "LG"))
                       )
                   ))
   }

   bin_rescaler <- function(bp, start_cM, end_cM, start_bp, end_bp,...){
   scales::rescale(x = bp,
                   to = c(start_cM, end_cM),
                   from = c(start_bp, end_bp))
   }

   interpol_data <- function(lg, ...){
   data_pair <- make_lg_seg(lg = lg, ...)
   
   plyranges::join_overlap_inner(data_pair\$data,
                                   data_pair\$map) %>% 
       as.data.frame() %>% 
       as_tibble() %>% 
       dplyr::select(CHROM = CHROM.x, bp = start, GSTART, GPOS, start_cM:end_bp) %>% 
       mutate(interpol_cM = pmap_dbl(cur_data(), bin_rescaler)) 
   }

   na_to_zero <- function(x){
   x_type <- typeof(x)
   if_else(is.na(x), as(0,Class = x_type), x) %>% 
       as.double() %>% as(Class = x_type)
   }

   convert_bp_to_cm <- function(data, lg = "LG08", gmap = gmap1){
   gmap_in <- deparse(substitute(gmap))
   
   data_pos <- data %>% 
       filter( CHROM == lg ) %>% 
       dplyr::select(PAIR, TYPE, CHROM, START, END, NMARKERS) %>% 
       mutate(seg_id = str_c(PAIR,"_",CHROM,"_",START)) %>% 
       pivot_longer(cols = START:END, names_to = "PART", values_to = "start") %>%
       mutate(end = start) %>% 
       as_iranges()
   
   map_pos <- gmap %>%
       filter(CHROM == lg) %>%
       attach_end(LG = lg) %>%
       mutate(start = lag(bp, default = 0) + 1, # avoid overlapping segments - causes duplications in joining
           start_cM = lag(cM, default = 0)) %>%
       dplyr::select(start, end = bp, start_cM, end_cM = cM) %>%
       mutate(start_bp = start, end_bp = end) %>%
       as_iranges()
   
   map_nr <- str_replace(gmap_in, pattern = "gmap", replacement = "_m")
   
   plyranges::join_overlap_inner(data_pos,
                                   map_pos) %>%
       as.data.frame() %>%
       as_tibble() %>%
       dplyr::select(CHROM = CHROM, bp = start, PAIR, TYPE, PART, start_cM:end_bp, seg_id, NMARKERS) %>%
       mutate(interpol_cM = pmap_dbl(cur_data(), bin_rescaler)) %>%
       pivot_wider(id_cols = c(CHROM,PAIR,TYPE,seg_id, PART,NMARKERS),
                   values_from = c(bp,interpol_cM),
                   names_from = PART) %>%
       dplyr::select(-seg_id) %>%
       mutate(length_bp = bp_END - bp_START,
           length_cM = interpol_cM_END - interpol_cM_START) %>% 
       set_names(value = c("CHROM", "PAIR", "TYPE", "NMARKERS", "bp_START", "bp_END",
                           str_c(c("interpol_cM_START", "interpol_cM_END"), map_nr),
                           "length_bp", str_c("length_cM", map_nr)))
   }

   # actual script -------------------
   gmap1 <- read_maps(cm_file = str_c(rec_path, hypo_map1_cm),
                   bp_file = str_c(rec_path, hypo_map1_bp))
   gmap2 <- read_maps(cm_file = str_c(rec_path, hypo_map2_cm),
                   bp_file = str_c(rec_path, hypo_map2_bp))


   lgs <- 1:24 %>%
   str_pad(width = 2, pad = 0) %>%
   str_c("LG",.)

   segments_individual_interpol_map1 <- lgs %>% map_dfr(interpol_data, n = 51)
   segments_individual_interpol_map2 <- lgs %>% map_dfr(interpol_data, n = 51, gmap = gmap2)

   segments_individual <- vroom::vroom(segment_file) %>%
   mutate(LENGTH = LENGTH * 10^6,
           START = POS * 10^6,
           END = START + LENGTH,
           PAIR = str_c(ID1, "-", ID2))

   segments_summary <- vroom::vroom(summary_file) %>%
   mutate(PAIR = str_c(ID1, "-", ID2))

   bounds_gmap1 <- gmap1 %>% 
   group_by(CHROM) %>% 
   filter(cM == max(cM)) %>% 
   filter(bp == max(bp)) %>% 
   ungroup() %>% 
   dplyr::select(CHROM, cM) %>%
   mutate(GSTART_cM = cumsum(lag(cM, default = 0)),
           GEND_cM = cM + GSTART_cM,
           GMID_cM = (GSTART_cM + GEND_cM) / 2,
           grp = c("even", "odd")[ 1+row_number() %% 2 ])

   bounds_gmap2 <- gmap2 %>% 
   group_by(CHROM) %>% 
   filter(cM == max(cM)) %>% 
   filter(bp == max(bp)) %>% 
   ungroup() %>% 
   dplyr::select(CHROM, cM) %>%
   mutate(GSTART_cM = cumsum(lag(cM, default = 0)),
           GEND_cM = cM + GSTART_cM,
           GMID_cM = (GSTART_cM + GEND_cM) / 2,
           grp = c("even", "odd")[ 1+row_number() %% 2 ])

   hypo_all_starts <- hypo_karyotype %>% 
   dplyr::select(CHROM, GSTART, GEND) %>% 
   left_join(bounds_gmap1 %>% 
               dplyr::select(CHROM, GSTART_cM_m1 = GSTART_cM, GEND_cM_m1 = GEND_cM)) %>% 
   left_join(bounds_gmap2 %>% 
               dplyr::select(CHROM, GSTART_cM_m2 = GSTART_cM, GEND_cM_m2 = GEND_cM))

   converted_segments <- lgs %>% 
   map_dfr(convert_bp_to_cm, data = segments_individual) %>% 
   left_join( lgs %>% 
               map_dfr(convert_bp_to_cm, data = segments_individual, gmap = gmap2) ) %>%
   left_join(hypo_all_starts) %>% 
   mutate(G_SEG_START = GSTART + bp_START,
           G_SEG_END = GSTART + bp_END,
           G_SEG_START_cM_m1 = GSTART_cM_m1 + interpol_cM_START_m1,
           G_SEG_END_cM_m1 = GSTART_cM_m1 + interpol_cM_END_m1,
           G_SEG_START_cM_m2 = GSTART_cM_m2 + interpol_cM_START_m2,
           G_SEG_END_cM_m2 = GSTART_cM_m2 + interpol_cM_END_m2)

   hypo_cM_length_map1 <- max(hypo_all_starts\$GEND_cM_m1)
   hypo_cM_length_map2 <- max(hypo_all_starts\$GEND_cM_m2)
   hypo_bp_length <- hypo_karyotype\$GEND[hypo_karyotype\$CHROM == "LG24"]

   control <- converted_segments %>%
   ungroup() %>%
   group_by(PAIR, TYPE) %>%
   summarise(seq_length = sum(length_bp),
               n_mark = sum(NMARKERS),
               cm_length_m1 = sum(length_cM_m1),
               cm_length_m2 = sum(length_cM_m2)) %>%
   ungroup() %>%
   pivot_wider(id_cols = PAIR, names_from = TYPE, values_from = seq_length:cm_length_m2, values_fill = 0) %>% 
   left_join(segments_summary, .,  ) %>% 
   mutate(across(.cols = seq_length_IBD1:cm_length_m2_IBD2, .fns = na_to_zero)) %>% 
   mutate(IBD0_manual = (NMARK - (n_mark_IBD1 + n_mark_IBD2)) / NMARK,
           IBD1_manual = n_mark_IBD1 / NMARK,
           IBD2_manual = n_mark_IBD2 / NMARK,
           icheck_0 = IBD0_manual - IBD0,
           icheck_1 = IBD1_manual - IBD1,
           icheck_2 = IBD2 - IBD2,
           # compile ibd by sequence map
           ibd0_bp = (hypo_bp_length - (seq_length_IBD1 + seq_length_IBD2)) / hypo_bp_length,
           ibd1_bp = seq_length_IBD1 / hypo_bp_length,
           ibd2_bp = seq_length_IBD2 / hypo_bp_length,
           # compile ibd by genetic map 1
           ibd0_cM_m1 = (hypo_cM_length_map1 - (cm_length_m1_IBD1 + cm_length_m1_IBD2)) / hypo_cM_length_map1,
           ibd1_cM_m1 = cm_length_m1_IBD1 / hypo_cM_length_map1,
           ibd2_cM_m1 = cm_length_m1_IBD2 / hypo_cM_length_map1,
           # compile ibd by genetic map 2
           ibd0_cM_m2 = (hypo_cM_length_map2 - (cm_length_m2_IBD1 + cm_length_m2_IBD2)) / hypo_cM_length_map2,
           ibd1_cM_m2 = cm_length_m2_IBD1 / hypo_cM_length_map2,
           ibd2_cM_m2 = cm_length_m2_IBD2 / hypo_cM_length_map2)

   cM_treshold <- 0.2
   summary_filterd <- converted_segments %>%
   filter(length_cM_m1 > cM_treshold & length_cM_m2 > cM_treshold) %>% 
   ungroup() %>%
   group_by(PAIR, TYPE) %>%
   summarise(seq_length = sum(length_bp),
               n_mark = sum(NMARKERS),
               cm_length_m1 = sum(length_cM_m1),
               cm_length_m2 = sum(length_cM_m2)) %>%
   ungroup() %>%
   pivot_wider(id_cols = PAIR, names_from = TYPE, values_from = seq_length:cm_length_m2, values_fill = 0) %>% 
   left_join(segments_summary, .,  ) %>% 
   mutate(across(.cols = seq_length_IBD1:cm_length_m2_IBD2, .fns = na_to_zero)) %>% 
   mutate(IBD0_manual = (NMARK - (n_mark_IBD1 + n_mark_IBD2)) / NMARK,
           IBD1_manual = n_mark_IBD1 / NMARK,
           IBD2_manual = n_mark_IBD2 / NMARK,
           icheck_0 = IBD0_manual - IBD0,
           icheck_1 = IBD1_manual - IBD1,
           icheck_2 = IBD2 - IBD2_manual,
           # compile ibd by sequence map
           ibd0_bp = (hypo_bp_length - (seq_length_IBD1 + seq_length_IBD2)) / hypo_bp_length,
           ibd1_bp = seq_length_IBD1 / hypo_bp_length,
           ibd2_bp = seq_length_IBD2 / hypo_bp_length,
           # compile ibd by genetic map 1
           ibd0_cM_m1 = (hypo_cM_length_map1 - (cm_length_m1_IBD1 + cm_length_m1_IBD2)) / hypo_cM_length_map1,
           ibd1_cM_m1 = cm_length_m1_IBD1 / hypo_cM_length_map1,
           ibd2_cM_m1 = cm_length_m1_IBD2 / hypo_cM_length_map1,
           # compile ibd by genetic map 2
           ibd0_cM_m2 = (hypo_cM_length_map2 - (cm_length_m2_IBD1 + cm_length_m2_IBD2)) / hypo_cM_length_map2,
           ibd1_cM_m2 = cm_length_m2_IBD1 / hypo_cM_length_map2,
           ibd2_cM_m2 = cm_length_m2_IBD2 / hypo_cM_length_map2)

   write_tsv(x = converted_segments, file = truffle_conv)
   write_tsv(x = control, file = truffle_sum)
   write_tsv(x = summary_filterd, file = truffle_filt)
   """
}