contact_analyzer.r

Contact analyzer

Discovery

Function to discover contacts between donor and acceptor atoms Includes a ligand residue number and a list of corresponding donor and acceptor atoms

Use a pdb file with hydrogens May be lengthy for larger molecules/larger numbers of frame It is not necessary to include all frames in the discovery step Fewer steps and lower frequency reduces calculation time and false negative results

In [1]:

contactr <- function(pdb, dist.thresh = 3.5, freq.thresh = 0.1,
                     lgd.resno = 38,
                     lgd.donors = c('O2', 'O1', 'N1'),
                     lgd.acceptors = NA,
                     donor.atoms = c('N', 'O', 'S', 'F'),
                     acceptor.atoms = c('O', 'N', 'S', 'F', 'Cl', 'Br', 'I')) {


  donor_atoms <- donor.atoms
  acceptor_atoms <- acceptor.atoms

  donors <- list(
    Gua = c('N2', 'N1'),
    Ade = c('N6'),
    Thy = c('N3'),
    Cyt = c('N4')
  )

  acceptors <- list(
    Gua = c('O6', 'N3', 'N7', "O4'"),
    Ade = c('N1', 'N3', 'N7', "O4'"),
    Thy = c('O4', 'O2', "O4'"),
    Cyt = c('N3', 'O2', "O4'")
  )

  all.donor <- unique(c(donors$G, donors$A, donors$T, donors$C))
  all.acceptor <- unique(c(acceptors$G, acceptors$A, acceptors$T, acceptors$C))

  print(paste('Looking for contacts within', dist.thresh, 'angstroms in',
              nrow(pdb$xyz), 'frames'))

  lapply(
    1:nrow(pdb$xyz),
    function(i) {
      dist <- dist.xyz(a = pdb$xyz[i,], all.pairs = TRUE) %>%
        lazy_dt() %>%
        mutate(atom1 = 1:nrow(.), .before = 1) %>%
        pivot_longer(cols = -atom1, names_to = "atom2", values_to = "value") %>%
        mutate(atom2 = as.numeric(atom2)) %>%
        # only keeps half of the matrix
        filter(atom1 < atom2) %>%
        # remove distance above threshold
        filter(value < dist.thresh) %>%
        left_join(pdb$atom %>%
                    select(eleno, resno, resid, elety, elesy),
                  by = c("atom1" = "eleno")) %>%
        rename(resno1 = resno, resid1 = resid, elety1 = elety, elesy1 = elesy) %>%
        left_join(pdb$atom %>%
                    select(eleno, resno, resid, elety, elesy),
                  by = c("atom2" = "eleno")) %>%
        rename(resno2 = resno, resid2 = resid, elety2 = elety, elesy2 = elesy) %>%
        # removes intra-residue contacts
        filter(resno1 != resno2) %>%
        # selects atoms capable of forming hydrogen bonds
        filter(elesy1 %in% donor_atoms & elesy2 %in% acceptor_atoms |
                 elesy1 %in% acceptor_atoms & elesy2 %in% donor_atoms) %>%
        #remove 'D' from residue name
        mutate(
          resid1 = ifelse(substr(resid1, 1, 1) == "D", substr(resid1, 2, nchar(resid1)), resid1),
          resid2 = ifelse(substr(resid2, 1, 1) == "D", substr(resid2, 2, nchar(resid2)), resid2)
        ) %>%
        # only keep atoms in donor or acceptor groups depending of the residue
        filter(
          (resid1 == 'G' & elety1 %in% donors$Gua &
             ((resid2 == 'G' & elety2 %in% acceptors$Gua) |
                (resid2 == 'C' & elety2 %in% acceptors$Cyt) |
                (resid2 == 'A' & elety2 %in% acceptors$Ade) |
                (resid2 == 'T' & elety2 %in% acceptors$Thy)
             )) |
            (resid2 == 'G' & elety2 %in% donors$Gua &
               ((resid1 == 'G' & elety1 %in% acceptors$Gua) |
                  (resid1 == 'C' & elety1 %in% acceptors$Cyt) |
                  (resid1 == 'A' & elety1 %in% acceptors$Ade) |
                  (resid1 == 'T' & elety1 %in% acceptors$Thy)
               )) |
            (resid1 == 'C' & elety1 %in% donors$Cyt &
               ((resid2 == 'G' & elety2 %in% acceptors$Gua) |
                  (resid2 == 'C' & elety2 %in% acceptors$Cyt) |
                  (resid2 == 'A' & elety2 %in% acceptors$Ade) |
                  (resid2 == 'T' & elety2 %in% acceptors$Thy)
               )) |
            (resid2 == 'C' & elety2 %in% donors$Cyt &
               ((resid1 == 'G' & elety1 %in% acceptors$Gua) |
                  (resid1 == 'C' & elety1 %in% acceptors$Cyt) |
                  (resid1 == 'A' & elety1 %in% acceptors$Ade) |
                  (resid1 == 'T' & elety1 %in% acceptors$Thy)
               )) |
            (resid1 == 'A' & elety1 %in% donors$Ade &
               ((resid2 == 'G' & elety2 %in% acceptors$Gua) |
                  (resid2 == 'C' & elety2 %in% acceptors$Cyt) |
                  (resid2 == 'A' & elety2 %in% acceptors$Ade) |
                  (resid2 == 'T' & elety2 %in% acceptors$Thy)
               )) |
            (resid2 == 'A' & elety2 %in% donors$Ade &
               ((resid1 == 'G' & elety1 %in% acceptors$Gua) |
                  (resid1 == 'C' & elety1 %in% acceptors$Cyt) |
                  (resid1 == 'A' & elety1 %in% acceptors$Ade) |
                  (resid1 == 'T' & elety1 %in% acceptors$Thy)
               )) |
            (resid1 == 'T' & elety1 %in% donors$Thy &
               ((resid2 == 'G' & elety2 %in% acceptors$Gua) |
                  (resid2 == 'C' & elety2 %in% acceptors$Cyt) |
                  (resid2 == 'A' & elety2 %in% acceptors$Ade) |
                  (resid2 == 'T' & elety2 %in% acceptors$Thy)
               )) |
            (resid2 == 'T' & elety2 %in% donors$Thy &
               ((resid1 == 'G' & elety1 %in% acceptors$Gua) |
                  (resid1 == 'C' & elety1 %in% acceptors$Cyt) |
                  (resid1 == 'A' & elety1 %in% acceptors$Ade) |
                  (resid1 == 'T' & elety1 %in% acceptors$Thy)
               )) |
            !(resid1 %in% c('G', 'A', 'T', 'C')) |
            !(resid2 %in% c('G', 'A', 'T', 'C'))
        ) %>%
        as.data.table()

      return(dist)
    }
  ) %>%
    rbindlist(.) %>%
    group_by(resid1, resno1, elety1,
             resid2, resno2, elety2) %>%
    # keep a single row per contact, outputs value and frequency within ensemble
    summarise(
      value = mean(value, na.rm = TRUE),
      n = n(),
      freq = n/nrow(pdb$xyz)
    ) %>%
    # only keep contacts with a frequency above threshold
    filter(freq >= freq.thresh) %>%
    ungroup() %>%
    # assign donors/acceptors
    mutate(
      donor.resid = case_when(
        resid1 == 'G' & elety1 %in% donors$Gua ~ resid1,
        resid1 == 'A' & elety1 %in% donors$Ade ~ resid1,
        resid1 == 'T' & elety1 %in% donors$Thy ~ resid1,
        resid1 == 'C' & elety1 %in% donors$Cyt ~ resid1,
        resno1 == lgd.resno & elety1 %in% lgd.donors ~ resid1,
        TRUE ~ resid2
      ),
      donor.resno = case_when(
        resid1 == 'G' & elety1 %in% donors$Gua ~ resno1,
        resid1 == 'A' & elety1 %in% donors$Ade ~ resno1,
        resid1 == 'T' & elety1 %in% donors$Thy ~ resno1,
        resid1 == 'C' & elety1 %in% donors$Cyt ~ resno1,
        resno1 == lgd.resno & elety1 %in% lgd.donors ~ resno1,
        TRUE ~ resno2
      ),
      donor.elety = case_when(
        resid1 == 'G' & elety1 %in% donors$Gua ~ elety1,
        resid1 == 'A' & elety1 %in% donors$Ade ~ elety1,
        resid1 == 'T' & elety1 %in% donors$Thy ~ elety1,
        resid1 == 'C' & elety1 %in% donors$Cyt ~ elety1,
        resno1 == lgd.resno & elety1 %in% lgd.donors ~ elety1,
        TRUE ~ elety2
      ),
      acceptor.resid = case_when(
        resid1 == 'G' & elety1 %in% acceptors$Gua ~ resid1,
        resid1 == 'A' & elety1 %in% acceptors$Ade ~ resid1,
        resid1 == 'T' & elety1 %in% acceptors$Thy ~ resid1,
        resid1 == 'C' & elety1 %in% acceptors$Cyt ~ resid1,
        resno1 == lgd.resno & elety1 %in% lgd.acceptors ~ resid1,
        TRUE ~ resid2
      ),
      acceptor.resno = case_when(
        resid1 == 'G' & elety1 %in% acceptors$Gua ~ resno1,
        resid1 == 'A' & elety1 %in% acceptors$Ade ~ resno1,
        resid1 == 'T' & elety1 %in% acceptors$Thy ~ resno1,
        resid1 == 'C' & elety1 %in% acceptors$Cyt ~ resno1,
        resno1 == lgd.resno & elety1 %in% lgd.acceptors ~ resno1,
        TRUE ~ resno2
      ),
      acceptor.elety = case_when(
        resid1 == 'G' & elety1 %in% acceptors$Gua ~ elety1,
        resid1 == 'A' & elety1 %in% acceptors$Ade ~ elety1,
        resid1 == 'T' & elety1 %in% acceptors$Thy ~ elety1,
        resid1 == 'C' & elety1 %in% acceptors$Cyt ~ elety1,
        resno1 == lgd.resno & elety1 %in% lgd.acceptors ~ elety1,
        TRUE ~ elety2
      )
    ) %>%
    # remove useless columns
    select(-c(resid1, resno1, elety1, resid2, resno2, elety2)) %>%
    # generate pair names
    mutate(
      pair = paste0(donor.resid, donor.resno, ':', donor.elety, '-',
                    acceptor.resid, acceptor.resno, ':', acceptor.elety),
      .before = 1
    ) %>%
    # arrange pairs
    arrange(donor.resno, acceptor.resno)

}

Atom-atom distance calculation and smoothing

In [2]:

atom.distr <- function(pair, pdb.input, span = 0.005, info = info.long) {

  pair.a <- str_split(pair, '-')[[1]][1]
  pair.b <- str_split(pair, '-')[[1]][2]

  dist <- data.frame(
    frame = 1:nrow(pdb.input$xyz),
    dist = dist.xyz(
      a = com(
        pdb.input,
        inds = atom.select(
          pdb.input,
          resno = as.numeric(gsub("\\D", "", str_split(pair.a, ':')[[1]][1])),
          elety = str_split(pair.a, ':')[[1]][2]
        )
      ),
      b = com(
        pdb.input,
        inds = atom.select(
          pdb.input,
          resno = as.numeric(gsub("\\D", "", str_split(pair.b, ':')[[1]][1])),
          elety = str_split(pair.b, ':')[[1]][2]
        )
      ),
      all.pairs = FALSE
    )
  ) %>%
    cbind(
      .,
      dist.loess = loess(
        dist ~ frame,
        data = .,
        span = span
      )$fitted
    ) %>%
    mutate(pair = pair, .before = 1) %>%
    mutate(
      t = frame * info$time.per.frame,
      pair = str_replace(pair, '-', '•')
    )

  return(dist)
}

Bond angle calculation and smoothing

REQUIRES A PDB FILE WITH HYDROGENS!!! ALSO REQUIRES TO CUSTOMISE LIGAND HYDROGEN ATOMS SWITCH

In [3]:

bond.angler <- function(pair, pdb.input, info, span = 0.005, clean = TRUE, rectify = 50) {

  # pair <- pair
  # print(pair)

  print(paste('Processing pair:', pair))

  res.1 <- str_split(pair, '-')[[1]][1]
  res.3 <- str_split(pair, '-')[[1]][2]

  elety.1 <- str_split(res.1, ':')[[1]][2]
  elety.3 <- str_split(res.3, ':')[[1]][2]

  res.1 <- str_split(res.1, ':')[[1]][1]
  res.3 <- str_split(res.3, ':')[[1]][1]

  resno.1 <- as.numeric(gsub("\\D", "", res.1))
  resno.2 <- resno.1
  resno.3 <- as.numeric(gsub("\\D", "", res.3))

  res.1 <- gsub("\\d", "", res.1)
  res.3 <- gsub("\\d", "", res.3)

  elety.2 <- switch(
    res.1,
    'G' = switch(
      elety.1,
      'N1' = c('H1'),
      'N2' = c('H21', 'H22')
    ),
    'A' = c('H61', 'H62'),
    'C' = c('H41', 'H42'),
    'T' = c('H3'),
    'LDP' = switch(
      elety.1,
      'N1' = c('HN11', 'HN12', 'HN13'),
      'O1' = c('HO1'),
      'O2' = c('HO2')
    )
  )

  print(paste('Residues:', res.1, res.1, res.3))
  print(paste('Numbers:', resno.1, resno.2, resno.3))
  print(paste('Elements:', elety.1, elety.2, elety.3))

  # Function that calculates the bond angle between three atoms along all frames
  base.function <- function(hydrogen) {

    print(paste('Processing hydrogen:', hydrogen))

    # Select the atoms
    angle.inds <- combine.select(
      atom.select(
        pdb.input,
        resno = resno.1,
        elety =  elety.1
      ),
      atom.select(
        pdb.input,
        resno = resno.2,
        elety = hydrogen
      ),
      atom.select(
        pdb.input,
        resno = resno.3,
        elety = elety.3
      ),
      operator = 'OR'
    )

    # Calculate the bond angles
    angles <- sapply(
      1:nrow(pdb.input$xyz), function(x) {
        angle.xyz(
          pdb.input$xyz[x,][angle.inds$xyz]
        )
      }
    ) %>%
      as_tibble() %>%
      rename('angle' = 'value') %>%
      # add the residue names
      mutate(
        frame = 1:nrow(.),
        pair = pair,
        h = hydrogen, #track info of which hydrogen binds, useful if several interconverting
        angle = ifelse(angle < rectify, 180 - angle, angle)
      ) %>%
      # add the loess fit
      cbind(
        .,
        angle.loess = loess(
          angle ~ frame,
          data = .,
          span = span
        )$fitted
      ) %>%
      mutate(t = frame * info$time.per.frame, .after = 1)

    return(angles)
  }

  # Apply the function to the list of hydrogen from the pair
  angle.list <- lapply(
    elety.2,
    base.function
  ) %>%
    do.call(rbind, .)

  print(paste('Length of angle list before cleaning:', nrow(angle.list)))

  # If clean is TRUE, keep only the maximum angle for each frame
  # and reapply the loess fit
  if (clean) {
    angle.list <- angle.list %>%
      group_by(pair, t) %>%
      filter(angle == max(angle)) %>%
      ungroup() %>%
      select(-angle.loess) %>%
      cbind(
        .,
        angle.loess = loess(
          angle ~ t,
          data = .,
          span = span
        )$fitted
      ) %>%
      mutate(pair = paste0(res.1, resno.1, ':', elety.1, '•', res.3, resno.3, ':', elety.3))

    print(paste('Length of angle list after cleaning:', nrow(angle.list)))
  }

  print(paste('Finished angle list nrow', nrow(angle.list)))
  return(angle.list)
}

Bond formation frequency calculation and plotting

Bond frequency calculation

In [4]:

bond.freqr <- function(dist, angl, dist.cutoff = 3.0, angl.cutoff = 120, dist.lax.cutoff = 3.2, angl.lax.cutoff = 100){

  bond.index <- angl %>%
    left_join(
      dist,
      by = c('frame', 'pair', 't')
    ) %>%
    mutate(
      bond.type = case_when(
        dist < dist.cutoff & angle > angl.cutoff ~ 'canonical',
        dist < dist.lax.cutoff & angle > angl.lax.cutoff ~ 'non canonical',
        TRUE ~ 'no bond'
      ),
      bond.type = factor(bond.type, levels = c('no bond', 'non canonical', 'canonical')),
      h.bond.index = angle/dist
    ) %>%
    select(pair, frame, t, dist, angle, bond.type, h.bond.index, everything())

  bond.freq <- bond.index %>%
    # group_by(pair) %>%
    group_by(frame, t, pair) %>%
    # summarise the frequency of formation of canonical and non-canonical bonds
    summarise(
      canonical = sum(bond.type == 'canonical') ,
      non.canonical = sum(bond.type == 'non canonical'),
      no.bond = sum(bond.type == 'no bond')
    ) %>%
    group_by(pair) %>%
    summarise(
      canonical = sum(canonical)/n(),
      non.canonical = sum(non.canonical)/n(),
      no.bond = sum(no.bond)/n()
    ) %>%
    pivot_longer(-c(pair), names_to = 'bond.type', values_to = 'frequency') %>%
    mutate(bond.type = factor(gsub('\\.', ' ', bond.type), levels = c('no bond', 'non canonical', 'canonical')))

  return(list(bond.index = bond.index, bond.freq = bond.freq))
}

Bond formation frequency plotting

In [5]:

hbond.cumul.plotr <- function(bond.freq = hbond.rstslt, freq.cutoff = 0.25,
                              facet = FALSE, main.exp = 'rMD', exp.order = c('rMD', 'uMD', 'no LDP'),
                              highlight = 'LDP', highlight.region = c(11:18, 33:37)){

  df <- lapply(
    list(
      bond.freq$bond.index,
      bond.freq$bond.freq
    ),
    function(x){
      x %>%
        mutate(
          pair = if_else(
            str_detect(pair, highlight),
            paste0("<b><span style='color: hotpink;'>", pair, "</span></b>"),
            pair
          ),
          # highlight = if_else(str_detect(pair, highlight), TRUE, FALSE),
          highlight = case_when(
            str_detect(pair, highlight) ~ '2.highlight',
            as.numeric(str_extract(pair, '\\d+')) %in% highlight.region ~ '3.region',
            TRUE ~ '1.no highlight'
          ),
          pair = if_else(
            as.numeric(str_extract(pair, '\\d+')) %in% highlight.region,
            paste0("<b><span style='color: steelblue;'>", pair, "</span></b>"),
            pair
          )
        )
    }
  )

  if (isFALSE(facet)) {
    pair.levels <- df[[2]] %>%
      filter(bond.type == 'canonical') %>%
      group_by(pair) %>%
      summarise(freq = sum(frequency)) %>%
      filter(freq >= freq.cutoff) %>%
      arrange((freq)) %>%
      pull(pair)
  } else {
    pair.levels <- df[[2]] %>%
      filter(
        exp == main.exp,
        bond.type == 'canonical') %>%
      group_by(pair) %>%
      summarise(freq = sum(frequency)) %>%
      filter(freq >= freq.cutoff) %>%
      arrange((freq)) %>%
      pull(pair)
  }

  if (isFALSE(facet)) {
    (
      df[[1]] %>%
        filter(pair %in% pair.levels) %>%
        ggplot(., aes(x = t, y = factor(pair, levels = pair.levels),
                      alpha = h.bond.index, fill = highlight)) +
        geom_tile(color = NA) +
        scale_fill_manual(values = c('grey20', '#ff748c', '#b0c4de')) +
        scale_alpha_continuous(range = c(0,1), name = 'H-bond index', limits = c(30, NA)) +
        scale_x_continuous(expand = c(0,0), name = 't (ns)') +
        scale_y_discrete(expand = c(0,0), name = '') +
        custom.theme(1)
    ) + (
      df[[2]] %>%
        filter(pair %in% pair.levels) %>%
        ggplot(., aes(x = frequency, y = factor(pair, levels = pair.levels),
                      alpha = bond.type, fill = highlight)) +
        geom_col(position = 'stack') +
        scale_alpha_manual(values = c(0, 0.35, 1)) +
        scale_fill_manual(values = c('grey40', '#ff748c', '#b0c4de')) +
        scale_x_continuous(expand = c(0,0)) +
        scale_y_discrete(expand = c(0,0), name = '') +
        custom.theme(1)
    ) +
      plot_layout(guides = 'auto') &
      theme(legend.position = 'none')
  } else {

    (
      df[[1]] %>%
        filter(pair %in% pair.levels) %>%
        ggplot(., aes(x = t, y = factor(pair, levels = pair.levels),
                      alpha = h.bond.index, fill = highlight)) +
        facet_wrap(~factor(exp, levels = exp.order), nrow = 1) +
        geom_tile(color = NA) +
        scale_fill_manual(values = c('grey20', '#ff748c', '#b0c4de')) +
        scale_alpha_continuous(range = c(0,1), name = 'H-bond index', limits = c(30, NA)) +
        scale_x_continuous(expand = c(0,0), name = 't (ns)') +
        scale_y_discrete(expand = c(0,0), name = '', drop = FALSE) +
        custom.theme(1)
    ) +
      (
        df[[2]] %>%
          filter(pair %in% pair.levels) %>%
          ggplot(., aes(x = frequency, y = factor(pair, levels = pair.levels),
                        alpha = bond.type, fill = highlight)) +
          facet_wrap(~factor(exp, levels = exp.order), nrow = 1) +
          geom_col(position = 'stack') +
          scale_alpha_manual(values = c(0, 0.35, 1)) +
          scale_fill_manual(values = c('grey40', '#ff748c', '#b0c4de')) +
          scale_x_continuous(expand = c(0,0)) +
          scale_y_discrete(expand = c(0,0), name = '', drop = FALSE) +
          custom.theme(1)

      ) +
      plot_layout(guides = 'auto', ncol = 1) &
      theme(legend.position = 'none')

  }



}

Distances to rings

Unrelated to contact finder as the latter only ouputs H-bond contacts. Here pairs to be analyzed are given as input with residue1-residue2 format.

In [6]:

ring.distr <- function(pair, pdb.input = pdb.long, span = 0.005) {

  res.a <- str_split(pair, '-')[[1]][1]
  res.b <- str_split(pair, '-')[[1]][2]

  eletyr <- function(res) {
    switch(
      res,
      'G' = c('N1', 'C2', 'N3', 'C4', 'C5', 'C6', 'N7', 'C8', 'N9'),
      'A' = c('N1', 'C2', 'N3', 'C4', 'C5', 'C6', 'N7', 'C8', 'N9'),
      'T' = c('N1', 'C2', 'N3', 'C4', 'C5', 'C6'),
      'C' = c('N1', 'C2', 'N3', 'C4', 'C5', 'C6'),
      'LDP' = c('C1', 'C2', 'C3', 'C4', 'C5', 'C6'),
      stop(paste(res, 'not recognized'))
    )
  }

  elety.a <- eletyr(gsub("\\d", "", res.a))
  elety.b <- eletyr(gsub("\\d", "", res.b))

  dist <- data.frame(
    frame = 1:nrow(pdb.input$xyz),
    dist = dist.xyz(
      a = com(
        pdb.input,
        inds = atom.select(
          pdb.input,
          resno = as.numeric(gsub("\\D", "", res.a)),
          elety = elety.a
        )
      ),
      b = com(
        pdb.input,
        inds = atom.select(
          pdb.input,
          resno = as.numeric(gsub("\\D", "", res.b)),
          elety = elety.b
        )
      ),
      all.pairs = FALSE
    )
  ) %>%
    cbind(
      .,
      dist.loess = loess(
        dist ~ frame,
        data = .,
        span = span
      )$fitted
    ) %>%
    mutate(pair = pair, .before = 1)

  return(dist)
}