#' Parse one protein-level change and build the mutated sequence
#'
#' Reads a change string such as `"p.A2G"`, `"p.Ala2_Asp4del"` or
#' `"p.A2_D4delinsTrpVal"`, checks the stated reference residue(s) against
#' `wt.sequence`, applies the change, and finally trims both sequences to a
#' window of at most `seq.lim` residues around the changed position.
#'
#' The first two characters of `aaChg` are discarded as a prefix (e.g. `"p."`).
#' Residues may be given as one-letter or three-letter codes.
#'
#' Supported forms:
#' * no position: `"p.="`, `"p.(=)"`, `"_wt"` return the wild type unchanged;
#' * one position: substitution, `del` / `~` (remove the residue),
#'   `*` / `Ter` (truncate before the position), `=` (unchanged);
#' * two positions: `del` and `delins<residues>` over the inclusive range;
#' * two numbers plus `fs`: reported with `ref`/`pos` only, no sequence.
#'
#' Anything else, or a reference residue that does not match `wt.sequence`,
#' yields `NA` for `alt` and for the sequence fields.
#'
#' @param aaChg Character scalar describing the change.
#' @param wt.sequence Wild-type sequence as a single string of one-letter
#'   codes; `NA` returns a list of `NA`s.
#' @param offset Number added to every position read from `aaChg`.
#' @param seq.lim Maximum window length. Trimming happens only when a sequence
#'   was built, a position is known, and the mutant or wild type is longer
#'   than `seq.lim`.
#' @return A list with elements `ref`, `pos`, `alt`, `wt`, `sequence`,
#'   `sequence.len`, `seq.start`, `seq.end` (all relative to the window) and
#'   `pos.orig`, `sequence.orig`, `wt.orig`, `sequence.len.orig` (untrimmed).
parse_one_substitute <- function(aaChg, wt.sequence, offset = 0, seq.lim = 1001) {
  if (is.na(wt.sequence)) {
    return(list(
      ref = NA, pos = NA, alt = NA,
      wt = NA,
      sequence = NA,
      sequence.len = NA,
      seq.start = NA,
      seq.end = NA,
      pos.orig = NA,
      sequence.orig = NA,
      wt.orig = NA,
      sequence.len.orig = NA
    ))
  }

  three_to_one <- c(
    "Ala" = "A", "Arg" = "R", "Asn" = "N", "Asp" = "D", "Cys" = "C",
    "Gln" = "Q", "Glu" = "E", "Gly" = "G", "His" = "H", "Ile" = "I",
    "Leu" = "L", "Lys" = "K", "Met" = "M", "Phe" = "F", "Pro" = "P",
    "Pyl" = "O", "Ser" = "S", "Sec" = "U", "Thr" = "T", "Trp" = "W",
    "Tyr" = "Y", "Val" = "V", "Asx" = "B", "Glx" = "Z", "Xaa" = "X",
    "Xle" = "J"
  )

  # Convert a residue string to one-letter codes. A single character is kept
  # as-is; longer strings must be either a run of known three-letter codes or
  # a run of known one-letter codes. Returns NA_character_ otherwise.
  to_one_letter <- function(residues) {
    if (is.na(residues) || !nzchar(residues)) {
      return(NA_character_)
    }
    n_chars <- nchar(residues)
    if (n_chars == 1) {
      return(residues)
    }
    if (n_chars %% 3 == 0) {
      starts <- seq(1, n_chars, by = 3)
      translated <- three_to_one[substring(residues, starts, starts + 2)]
      if (!anyNA(translated)) {
        return(paste(translated, collapse = ""))
      }
    }
    if (all(strsplit(residues, "")[[1]] %in% three_to_one)) {
      return(residues)
    }
    NA_character_
  }

  # Split the change string at its digit runs. Splitting on the match
  # locations (rather than on the digit text) keeps "1" from also matching
  # inside "10"; `invert = TRUE` always yields one more piece than numbers.
  digit_runs <- gregexpr("[0-9]+", aaChg)
  pos_raw <- regmatches(aaChg, digit_runs)[[1]]
  pieces <- regmatches(aaChg, digit_runs, invert = TRUE)[[1]]
  pos <- as.numeric(pos_raw) + offset
  n_pos <- length(pos)
  wt_len <- nchar(wt.sequence)

  ref <- NA
  alt <- NA
  # Character NA so that a missing sequence never gets a numeric length.
  newseq <- NA_character_

  if (n_pos == 0) {
    # No position: only the explicit "unchanged" spellings are understood.
    pos <- NA
    if (aaChg %in% c("p.=", "p.(=)", "_wt")) {
      newseq <- wt.sequence
    }
  } else if (n_pos == 2 && grepl("fs", aaChg, fixed = TRUE)) {
    # Frameshift: report where it starts, but build no sequence.
    pos <- pos[1]
    ref <- substr(wt.sequence, pos, pos)
  } else if (n_pos == 2) {
    # Range change: <ref><start>_<ref><end>del | delins<residues>
    first <- pos[1]
    last <- pos[2]
    ref_start <- to_one_letter(substr(pieces[1], 3, nchar(pieces[1])))
    ref_end <- to_one_letter(substr(pieces[2], 2, nchar(pieces[2])))
    change <- pieces[3]
    ref <- substr(wt.sequence, first, last)
    matches_wt <- first <= last &&
      isTRUE(ref_start == substr(wt.sequence, first, first)) &&
      isTRUE(ref_end == substr(wt.sequence, last, last))
    if (matches_wt) {
      pos <- first
      if (change == "del") {
        alt <- ""
      } else if (startsWith(change, "delins")) {
        alt <- to_one_letter(substr(change, 7, nchar(change)))
      }
      if (!is.na(alt)) {
        # Splice: everything before the range, the replacement, everything
        # after it. Works for empty and multi-residue replacements alike.
        newseq <- paste0(
          substr(wt.sequence, 1, first - 1),
          alt,
          substr(wt.sequence, last + 1, wt_len)
        )
      }
    } else {
      pos <- NA
    }
  } else if (n_pos > 2) {
    # More than two numbers is not a form this parser understands.
    pos <- NA
  } else if (pos > wt_len) {
    # Position lies beyond the wild type: keep pos, leave the rest NA.
    pos <- pos
  } else {
    # Single-position change: <ref><pos><change>
    stated_ref <- to_one_letter(substr(pieces[1], 3, nchar(pieces[1])))
    change <- pieces[2]
    ref <- substr(wt.sequence, pos, pos)
    if (isTRUE(stated_ref == ref)) {
      if (change %in% c("~", "del")) {
        newseq <- paste0(
          substr(wt.sequence, 1, pos - 1),
          substr(wt.sequence, pos + 1, wt_len)
        )
      } else if (change %in% c("*", "Ter")) {
        newseq <- substr(wt.sequence, 1, pos - 1)
      } else if (change == "=") {
        newseq <- wt.sequence
      } else {
        alt <- to_one_letter(change)
        if (!is.na(alt) && nchar(alt) == 1) {
          newseq <- wt.sequence
          substr(newseq, pos, pos) <- alt
        } else {
          # A substitution must name exactly one recognisable residue.
          alt <- NA
        }
      }
    }
  }

  # Sequences of length 0 or 1 are treated as "no sequence".
  if (!is.na(newseq) && nchar(newseq) <= 1) {
    newseq <- NA_character_
  }

  sequence.len.orig <- if (is.na(newseq)) NA_integer_ else nchar(newseq)
  sequence.orig <- newseq
  pos.orig <- pos
  wt.orig <- wt.sequence

  needs_trim <- !is.na(sequence.len.orig) && !is.na(pos.orig) &&
    (sequence.len.orig > seq.lim || wt_len > seq.lim)

  if (needs_trim) {
    half_window <- (seq.lim - 1) / 2
    if (pos.orig < (seq.lim + 1) / 2) {
      # Change is near the start: take the first seq.lim residues.
      seq.start <- 1
      seq.end <- seq.lim
    } else if (pos.orig + half_window > sequence.len.orig) {
      # Change is near the end of the mutant: take its last seq.lim residues.
      # NOTE(review): if the mutant is shorter than seq.lim while the wild
      # type is longer, seq.start is <= 0 here. substr() clamps the start, but
      # seq.start, pos and sequence.len are reported unclamped - confirm
      # whether callers depend on that.
      seq.start <- sequence.len.orig - seq.lim + 1
      seq.end <- sequence.len.orig
    } else {
      # Otherwise centre the window on the change.
      seq.start <- pos.orig - half_window
      seq.end <- pos.orig + half_window
    }
    # Both sequences are cut with the same coordinates.
    sequence <- substr(sequence.orig, seq.start, seq.end)
    wt <- substr(wt.orig, seq.start, seq.end)
    pos <- pos.orig - seq.start + 1
    sequence.len <- seq.lim
  } else {
    sequence.len <- sequence.len.orig
    sequence <- sequence.orig
    wt <- wt.sequence
    pos <- pos.orig
    seq.start <- 1
    seq.end <- sequence.len.orig
  }

  list(
    ref = ref, pos = pos, alt = alt,
    wt = wt,
    sequence = sequence,
    sequence.len = sequence.len,
    seq.start = seq.start,
    seq.end = seq.end,
    pos.orig = pos.orig,
    sequence.orig = sequence.orig,
    wt.orig = wt.orig,
    sequence.len.orig = sequence.len.orig
  )
}