rst.error.r

Restraint errors

batch calculations for restraint errors

In [1]:

rst.error.batchr <- function(input.file, rst = 'data/unambig_8columns.tbl',
                             states = 20,
                             error.thresh = 0.1, occurence.thresh = 0.2,
                             sort = 'error',
                             offset = 0){

  df <- read_table(
    input.file,
    col_names = FALSE,
    skip = states*4 + 29,
    comment = "#") %>%
    mutate_all(~str_remove(., ";")) %>%
    mutate_all(~str_remove(., ":")) %>%
    mutate_all(~str_remove(., '>')) %>%
    rename(
      atom.1 = X1,
      res.1.name = X2,
      res.1 = X3,
      atom.2 = X5,
      res.2.name = X6,
      res.2 = X7,
      target = X9,
      occurence = ncol(.),
      max.error = ncol(.) - 1,
      min.error = ncol(.) - 3,
      std.error = ncol(.) - 4,
      mean.error = ncol(.) - 6
    ) %>%
    select(-X4, -X8, -X10,-(ncol(.) - 2), -(ncol(.) - 5)) %>%
    mutate(
      res.1 = as.numeric(res.1) + 10,
      res.2 = as.numeric(res.2) + 10
    )

  if (nrow(df) == 0) {
    stop("No violations found")
  }

  for (i in 1:states) {
    # rename column in position 7+i by the column.name
    df <- df %>%
      rename_with(~paste0("state.", i), starts_with(paste0("X", 10 + i)))
  }


  df <- df %>%
    mutate_at(vars(res.1, res.2, occurence, max.error, min.error,
                   std.error, mean.error, target), as.numeric) %>%
    #all columns starting by state to numeric
    mutate_at(vars(starts_with("state")), as.numeric) %>%
    filter(
      mean.error > error.thresh,
      occurence > occurence.thresh * states
    ) %>%
    mutate_at(vars(atom.1, atom.2), ~str_remove(., "\\*")) %>%
    mutate(
      res.1.name = case_when(
        res.1.name == 'DT' | res.1.name == 'DT5' | res.1.name == 'DT3' ~ 'T',
        res.1.name == 'DC' | res.1.name == 'DC5' | res.1.name == 'DC3' ~ 'C',
        res.1.name == 'DG' | res.1.name == 'DG5' | res.1.name == 'DG3' ~ 'G',
        res.1.name == 'DA' | res.1.name == 'DA5' | res.1.name == 'DA3' ~ 'A',
        TRUE ~ res.1.name
      ),
      res.2.name = case_when(
        res.2.name == 'DT' | res.2.name == 'DT5' | res.2.name == 'DT3' ~ 'T',
        res.2.name == 'DC' | res.2.name == 'DC5' | res.2.name == 'DC3' ~ 'C',
        res.2.name == 'DG' | res.2.name == 'DG5' | res.2.name == 'DG3' ~ 'G',
        res.2.name == 'DA' | res.2.name == 'DA5' | res.2.name == 'DA3' ~ 'A',
        TRUE ~ res.2.name
      )
    ) %>%
    mutate(
      # import these two case_when from R/super.mapr.R
      map.atom.1 = case_when(
        res.1.name == 'LDP' & atom.1 %in% c('HAL', 'HAM', 'HAN') ~ 'HM',
        res.1.name == 'T' & atom.1 %in% c('H71', 'H72', 'H73') ~ 'M7',
        res.1.name == 'G' & atom.1 %in% c('H21', 'H22') ~ 'HN2',
        res.1.name == 'T' & atom.1 %in% c("H2''", "H2'") ~ "Q2'",
        res.1.name == 'G' & atom.1 %in% c("H2''", "H2'") ~ "Q2'",
        res.1.name == 'G' & atom.1 %in% c("H5''", "H5'") ~ "Q5'",
        TRUE ~ atom.1
      ),
      map.atom.2 = case_when(
        res.2.name == 'LDP' & atom.2 %in% c('HAL', 'HAM', 'HAN') ~ 'HM',
        res.2.name == 'T' & atom.2 %in% c('H71', 'H72', 'H73') ~ 'M7',
        res.2.name == 'G' & atom.2 %in% c('H21', 'H22') ~ 'HN2',
        res.2.name == 'T' & atom.2 %in% c("H2''", "H2'") ~ "Q2'",
        res.2.name == 'G' & atom.2 %in% c("H2''", "H2'") ~ "Q2'",
        res.2.name == 'G' & atom.2 %in% c("H5''", "H5'") ~ "Q5'",
        TRUE ~ atom.2
      )
    )

  if (nrow(df) == 0) {
    stop(paste("No violations found for error threshold", error.thresh))
  }

  ## Import and cleanup of unambiguous restraints (8 column file)

  rst.df <- read_table(
    rst,
    col_names = FALSE
  ) %>%
    rename(
      map.atom.1 = X3,
      res.1.name = X2,
      res.1 = X1,
      map.atom.2 = X6,
      res.2.name = X5,
      res.2 = X4,
      lower.bound = X7,
      upper.bound = X8
    )


  ## Joining

  df <- left_join(
    df,
    rst.df,
    by = join_by(res.1, res.2, map.atom.1, map.atom.2, res.1.name, res.2.name)
  )

  if (sort == 'error') {
    df <- df %>% arrange(desc(mean.error))
  } else if (sort == 'residue') {
    df <- df %>% arrange(res.1, res.2)
  } else {
    df <- df
  }

  df.summary <- df %>%
    mutate(
      atom.1 = if_else(
        res.1.name != 'LDP',
        paste0(res.1.name, res.1, '@', atom.1),
        paste0(res.1.name, '@', map.atom.1)
      ),
      atom.2 = if_else(
        res.2.name != 'LDP',
        paste0(res.2.name, res.2, '@', atom.2),
        paste0(res.2.name, '@', map.atom.2)
      )
    ) %>%
    select(atom.1, atom.2, target, mean.error, std.error, occurence)


  return(
    list(
      violations = df,
      summary = df.summary
    )
  )


}

rstslt.violations <- rst.error.batchr(input.file = ‘data/nmr_viol_doprstslt.txt’, error.thresh = 0.15, occurence.thresh = 0) rstslt.violations$summary %>% filter(str_detect(atom.1, ‘LDP28’) | str_detect(atom.2, ‘LDP28’)) %>% nrow(.)