#!/usr/bin/perl
use strict;
use warnings;

# Written by Hugh Heldenbrand (hheldenbrand at gmail.com)

# This script takes a PDB file created by OpenBabel from a Gaussian output file
# and attempts to reformat it to the standard PDB format readable by 3DNA

# It was written based on output from GaussView 5.0 and OpenBabel 2.2.3

# WARNING: this script needs the nucleobase atoms from the Gaussian output file
# to be in the order used by Gaussview's "biological fragments."

# WARNING: this script throws away all atoms that are not nucleobase atoms.

die "Usage: $0  OpenBabel_generated_PDB converted_PDB\n" unless @ARGV == 2;

# Read the OpenBabel generated PDB file into an array.

open BABEL, "<$ARGV[0]" || die "$!";
my @lines = <BABEL>;
close BABEL;

open OUT, ">$ARGV[1]" || die "$!";

# Parse the array and put the atom coordinates/types into a hash.

my %atoms = ();
my $index = 0;

foreach my $line (@lines) {
   chomp $line;
   if ($line =~ /^.*\s(-?\d+\.\d+)\s+(-?\d+\.\d+)\s+(-?\d+\.\d+)\s+1.00  0.00\s+(\w+)\s+$/) {
      $atoms{$index}{'x'} = $1;
      $atoms{$index}{'y'} = $2;
      $atoms{$index}{'z'} = $3;
      $atoms{$index}{'type'} = $4;
      $index++;
      if (length($4) == 2) {$index++} # Keep 2-letter types from messing up indexes later
   }
}

# Concatenate all the types into one long string

my $types = "";

foreach my $index (sort {$a <=> $b} keys %atoms) {
   $types = $types.$atoms{$index}{'type'};
}

# Search the long string for patterns matching nucleobases
# Loop until no more matches are found

my $match = "true";
my $chainID = 1;
   
while ($match eq "true") {
   if ($types =~ /(.*)NCHNCCNHHNCHNC(.*)/) {
   # Adenine
      my $start = length($1);
      my $end = $start + 14;
      $types = $1."XXXXXXXXXXXXXX".$2;
      $atoms{$start + 0}{'name'} = "N9"; 
      $atoms{$start + 1}{'name'} = "C8"; 
      $atoms{$start + 2}{'name'} = "H8"; 
      $atoms{$start + 3}{'name'} = "N7"; 
      $atoms{$start + 4}{'name'} = "C5"; 
      $atoms{$start + 5}{'name'} = "C6"; 
      $atoms{$start + 6}{'name'} = "N6"; 
      $atoms{$start + 7}{'name'} = "H61"; 
      $atoms{$start + 8}{'name'} = "H62"; 
      $atoms{$start + 9}{'name'} = "N1"; 
      $atoms{$start + 10}{'name'} = "C2"; 
      $atoms{$start + 11}{'name'} = "H2"; 
      $atoms{$start + 12}{'name'} = "N3"; 
      $atoms{$start + 13}{'name'} = "C4"; 
      for (my $index = $start; $index < $end; $index++) {
         $atoms{$index}{'resname'} = "A"; 
         $atoms{$index}{'chain'} = $chainID; 
      }
      $chainID++;
   } elsif ($types =~ /(.*)NCHCHCNHHNCO(.*)/) {
   # Cytosine
      my $start = length($1);
      my $end = $start + 12;
      $types = $1."XXXXXXXXXXXX".$2;
      $atoms{$start + 0}{'name'} = "N1";
      $atoms{$start + 1}{'name'} = "C6";
      $atoms{$start + 2}{'name'} = "H6";
      $atoms{$start + 3}{'name'} = "C5";
      $atoms{$start + 4}{'name'} = "H5";
      $atoms{$start + 5}{'name'} = "C4";
      $atoms{$start + 6}{'name'} = "N4";
      $atoms{$start + 7}{'name'} = "H41";
      $atoms{$start + 8}{'name'} = "H42";
      $atoms{$start + 9}{'name'} = "N3";
      $atoms{$start + 10}{'name'} = "C2";
      $atoms{$start + 11}{'name'} = "O2";
      for (my $index = $start; $index < $end; $index++) {
         $atoms{$index}{'resname'} = "C"; 
         $atoms{$index}{'chain'} = $chainID; 
      }
      $chainID++;
   } elsif ($types =~ /(.*)NCHNCCONHCNHHNC(.*)/) {
   # Guanine
      my $start = length($1);
      my $end = $start + 15;
      $types = $1."XXXXXXXXXXXXXXX".$2;
      $atoms{$start + 0}{'name'} = "N9";
      $atoms{$start + 1}{'name'} = "C8";
      $atoms{$start + 2}{'name'} = "H8";
      $atoms{$start + 3}{'name'} = "N7";
      $atoms{$start + 4}{'name'} = "C5";
      $atoms{$start + 5}{'name'} = "C6";
      $atoms{$start + 6}{'name'} = "O6";
      $atoms{$start + 7}{'name'} = "N1";
      $atoms{$start + 8}{'name'} = "H1";
      $atoms{$start + 9}{'name'} = "C2";
      $atoms{$start + 10}{'name'} = "N2";
      $atoms{$start + 11}{'name'} = "H21";
      $atoms{$start + 12}{'name'} = "H22";
      $atoms{$start + 13}{'name'} = "N3";
      $atoms{$start + 14}{'name'} = "C4";
      for (my $index = $start; $index < $end; $index++) {
         $atoms{$index}{'resname'} = "G"; 
         $atoms{$index}{'chain'} = $chainID; 
      }
      $chainID++;
   } elsif ($types =~ /(.*)NCHCCHHHCONHCO(.*)/) {
   # Thymine
      my $start = length($1);
      my $end = $start + 14;
      $types = $1."XXXXXXXXXXXXXX".$2;
      $atoms{$start + 0}{'name'} = "N1";
      $atoms{$start + 1}{'name'} = "C6";
      $atoms{$start + 2}{'name'} = "H6";
      $atoms{$start + 3}{'name'} = "C5";
      $atoms{$start + 4}{'name'} = "C7";
      $atoms{$start + 5}{'name'} = "H71";
      $atoms{$start + 6}{'name'} = "H72";
      $atoms{$start + 7}{'name'} = "H73";
      $atoms{$start + 8}{'name'} = "C4";
      $atoms{$start + 9}{'name'} = "O4";
      $atoms{$start + 10}{'name'} = "N3";
      $atoms{$start + 11}{'name'} = "H3";
      $atoms{$start + 12}{'name'} = "C2";
      $atoms{$start + 13}{'name'} = "O2";
      for (my $index = $start; $index < $end; $index++) {
         $atoms{$index}{'resname'} = "T"; 
         $atoms{$index}{'chain'} = $chainID; 
      }
      $chainID++;
   } elsif ($types =~ /(.*)NCHCHCONHCO(.*)/) {
   # Uracil
      my $start = length($1);
      my $end = $start + 11;
      $types = $1."XXXXXXXXXXX".$2;
      $atoms{$start + 0}{'name'} = "N1";
      $atoms{$start + 1}{'name'} = "C6";
      $atoms{$start + 2}{'name'} = "H6";
      $atoms{$start + 3}{'name'} = "C5";
      $atoms{$start + 4}{'name'} = "H5";
      $atoms{$start + 5}{'name'} = "C4";
      $atoms{$start + 6}{'name'} = "O4";
      $atoms{$start + 7}{'name'} = "N3";
      $atoms{$start + 8}{'name'} = "H3";
      $atoms{$start + 9}{'name'} = "C2";
      $atoms{$start + 10}{'name'} = "O2";
      for (my $index = $start; $index < $end; $index++) {
         $atoms{$index}{'resname'} = "U"; 
         $atoms{$index}{'chain'} = $chainID; 
      }
      $chainID++;
   } else {
      $match = "false";
   } 
}

# Print out the new pdb file.
# Reindex the atoms

# Each nucleobase gets its own chain ID and will be residue number 1 in the new pdb

print OUT "$lines[0]\n";
print OUT "AUTHOR    GENERATED BY g09babel2pdb.pl\n";

my $newindex = 1;

foreach my $index (sort {$a <=> $b} keys %atoms) {
   if (defined $atoms{$index}{'resname'}) {
      $newindex = sprintf("%5d", $newindex);
      # The name should be modified if any 4-character atom names are added.
      my $name = sprintf("%-4s", " $atoms{$index}{'name'}");
      my $resname = sprintf("%3s", "$atoms{$index}{'resname'}");
      my $chainID = sprintf("%1s", "$atoms{$index}{'chain'}");
      my $resnum = sprintf("%4d", 1);
      my $x = sprintf("%8.3f", "$atoms{$index}{'x'}");
      my $y = sprintf("%8.3f", "$atoms{$index}{'y'}");
      my $z = sprintf("%8.3f", "$atoms{$index}{'z'}");
#            1----67------11 13-16 18----20 ---22---23---26    31--5455--6061--66 77-7879-80
      print OUT "ATOM  $newindex $name $resname $chainID$resnum    $x$y$z  1.00  0.00           $atoms{$index}{'type'}  \n";
      $newindex++;
   }
}

close OUT;
