# Markus Meier, March 2017, University of Manitoba
# Encoding: Unicode UTF-8

# Example script showing how to extract data from the JSON file created by x3dna-dssr
# This script takes advantage of the new "nt_resnum" field introduced in x3dna-dssr-v1.6.6-2017feb20
# and requires this version or higher.
# This example uses PDBID 2M4P (www.rcsb.org) which is an NMR ensemble with 10 models
# To generate the JSON file, execute "x3dna-dssr -i=2m4p.pdb -o=2m4p.json --more --nmr --json"

# Comments
# You can use the following code to record a chosen level of hierarchy into a file for inspection
#  sink("R_json_struct.txt")
#  print(attr(myData, "JSON"))
#  sink()

# Setup R environment
  # Load the packages used below: dplyr supplies filter() and the %>% pipe,
  # tidyjson supplies read_json() and the JSON navigation/extraction verbs.
  # This script deliberately does not call rm(list = ls()) or graphics.off():
  # it creates every object it uses, so wiping the caller's workspace and
  # closing their plot windows is unnecessary and destructive.
  library("dplyr")
  library("tidyjson")

  # Read JSON file generated by x3dna-dssr.
  # The path is relative to the current working directory; fail early with a
  # clear message if the file is not there.
  json_file <- "2m4p.json"
  if (!file.exists(json_file)) {
    stop("JSON file not found: ", json_file, call. = FALSE)
  }
  myData <- read_json(json_file, format = "json")

#  sink("R_json_struct.txt")
#  print(attr(myData, "JSON"))
#  sink()
  
  # Navigate through the data structure
  # x3dna-dssr generates a JSON data structure of a different depth,
  # depending on whether the --nmr option is used or not
  
  # For JSON generated with --nmr option, use this code:
  # Descend to the per-nucleotide records of every model:
  #   models[] -> parameters -> nts[]
  # Each gather_array() call is given an explicit index column name. Left at
  # the default, both index columns would be named "array.index", so the model
  # number could not be told apart from the nucleotide position.
  nts_array <- myData %>%
    enter_object("models") %>%
    gather_array(column.name = "model") %>%
    enter_object("parameters") %>%
    enter_object("nts") %>%
    gather_array(column.name = "nt_index")
    
  # For JSON generated without --nmr option, use this code, instead:
#  nts_array <- myData  %>%
#    enter_object("nts") %>%
#    gather_array()

#  sink("R_json_struct_nts_array.txt")
#  print(attr(nts_array, "JSON"))
#  sink()
  
  # Now we have collapsed the JSON data enough that we can extract what we want to the dplyr tbl:
  # (chain ID, nucleotide number, nucleotide name, backbone torsion angles, sugar conformation ...)
  # Pull the per-nucleotide fields of interest into ordinary columns.
  # spread_values() accepts any number of `name = extractor` pairs, so one
  # call is enough; the columns are created in the order listed here.
  tidyData <- nts_array %>%
    spread_values(
      chain = jstring("chain_name"),
      id = jstring("nt_id"),
      # The residue number is compared against numeric literals further down,
      # so extract it as a number rather than as text.
      # NOTE(review): assumes x3dna-dssr writes "nt_resnum" as a JSON number
      # -- confirm against your JSON file.
      resi = jnumber("nt_resnum"),
      resn = jstring("nt_name"),
      # Backbone torsion angles and the glycosidic angle chi
      alpha = jnumber("alpha"),
      beta = jnumber("beta"),
      gamma = jnumber("gamma"),
      delta = jnumber("delta"),
      epsilon = jnumber("epsilon"),
      zeta = jnumber("zeta"),
      chi = jnumber("chi"),
      # Sugar conformation descriptors
      baseSugar_conf = jstring("baseSugar_conf"),
      sugar_class = jstring("sugar_class")
    )
   
  # Use dplyr's filter() to select the nucleotides we are interested in:
  # All nucleotides named "DG"
  G4_all <- filter(tidyData, resn == "DG")
  # The subset of those with residue number 3 or 5 ...
  G4_flankBulge <- filter(G4_all, resi == 3 | resi == 5)
  # ... and the complementary subset (neither 3 nor 5)
  G4_others <- filter(G4_all, resi != 3 & resi != 5)

  # Print the residue 3/5 subset for inspection
  print(G4_flankBulge)