From a3549337b254d74db16c551937a881422ed6e219 Mon Sep 17 00:00:00 2001
From: AntoineLabeeuw <antoine.labeeuw@gmail.com>
Date: Tue, 4 Jun 2019 11:24:24 +0200
Subject: [PATCH] - removed the error 'lis index out of range - added and
 option to avoid names duplicates in the header of every sequences in the .pir
 file

---
 scripts/hhmakemodel.py | 476 +++++++++++++++++++++--------------------
 1 file changed, 246 insertions(+), 230 deletions(-)
 mode change 100755 => 100644 scripts/hhmakemodel.py

diff --git a/scripts/hhmakemodel.py b/scripts/hhmakemodel.py
old mode 100755
new mode 100644
index 8e226bb9..2375d90e
--- a/scripts/hhmakemodel.py
+++ b/scripts/hhmakemodel.py
@@ -2,18 +2,19 @@
 
 from hh_reader import read_result
 from copy import deepcopy
+from glob import glob
 from pdbx.reader.PdbxReader import PdbxReader
 from pdbx.writer.PdbxWriter import PdbxWriter
-import re, os, sys, tempfile, glob
+import re, os, sys, tempfile
 
 from operator import itemgetter  # hzhu
 from itertools import groupby    # hzhu
 
-EMPTY = '*'
+EMPTY = '.'
 GAP = '-'
 DEBUG_MODE = False
 
-class Gap: 
+class Gap:
     """ A gap is a continuous stretch of indels.
         It is defined by a opening position and a size/length
     """
@@ -23,27 +24,27 @@ def __init__(self, open_pos, size):
 
     def __repr__(self):
         return 'Gap opening pos = %d, size = %d' % (self.open_pos, self.size)
-        
+
 class Grid:
     """
     Implementation of 2D grid of cells
     Includes boundary handling
     """
-    
+
     def __init__(self, grid_height, grid_width):
         """
         Initializes grid to be empty, take height and width of grid as parameters
         Indexed by rows (left to right), then by columns (top to bottom)
         """
-        
+
         self._grid_height = grid_height
         self._grid_width = grid_width
         self._cells = [ [ EMPTY for dummy_col in range(self._grid_width) ]
                        for dummy_row in range(self._grid_height)]
-                
+
     def __str__(self):
         """ Return multi-line string represenation for grid """
-        
+
         ans = ''
         for row in range(self._grid_height):
             ans += ''.join(self._cells[row])
@@ -52,29 +53,29 @@ def __str__(self):
 
     def clear(self):
         """ Clears grid to be empty """
-        
+
         self._cells = [[EMPTY for dummy_col in range(self._grid_width)]
                        for dummy_row in range(self._grid_height)]
-    
+
     def get_grid_height(self):
         """ Return the height of the grid """
-        
+
         return self._grid_height
 
     def get_grid_width(self):
         """ Return the width of the grid """
-        
+
         return self._grid_width
-    
+
     def get_cell(self, row, col):
         return self._cells[row][col]
 
     def get_seq_start(self, row):
         """ Returns the start position of the sequence """
-        
+
         index = 0
         for pos in self._cells[row]:
-            if pos != EMPTY: 
+            if pos != EMPTY:
                 return index
             index += 1
 
@@ -82,23 +83,23 @@ def get_seq_start(self, row):
 
     def get_seq_end(self, row):
         """ Returns the end position of the sequence """
-        
+
         index = 0
         for pos in reversed(self._cells[row]):
-            if pos != EMPTY: 
+            if pos != EMPTY:
                 return self.get_grid_width() - index
             index += 1
 
         return None
 
-    def get_gaps(self, row): 
+    def get_gaps(self, row):
         """ Return the position of gaps in a row """
-        
+
         gaps = list()
 
         index = 0
         for pos in self._cells[row]:
-            if pos == GAP: 
+            if pos == GAP:
                 gaps.append(index)
             index += 1
 
@@ -109,7 +110,7 @@ def get_gaps_ref_gapless(self, row):
             The opening positions of the gaps are wrt. the gapless seq
         """
         # get all the indels
-        indels = self.get_gaps(row)  
+        indels = self.get_gaps(row)
         gaps = []
         # combine continuous indels into a gap
         for k,i in groupby( enumerate(indels), lambda x: x[0]-x[1] ):
@@ -120,9 +121,9 @@ def get_gaps_ref_gapless(self, row):
         for i in range(1, len(gaps)):
             # offset by total gap number before
             gaps[i].open_pos -= sum([gaps[j].size for j in range(i)])
-            
+
         return gaps  # a list of Gap instances
-    
+
     def get_seq_indeces(self, row):
 
         seq = list()
@@ -131,7 +132,7 @@ def get_seq_indeces(self, row):
                 seq.append(pos)
 
         return seq
-            
+
     ## def get_gap_list(self):  # hzhu commented this out. wrote a new version
     ##     """ Returns a list of list of all gap positions in the sequence grid. """
     ##     gap_pos = set()
@@ -142,17 +143,17 @@ def get_seq_indeces(self, row):
 
     ##     gap_pos = list(sorted(gap_pos))
     ##     boundaries = [ (x + 1) for x, y in zip(gap_pos, gap_pos[1:]) if y - x != 1 ]
- 
+
     ##     gap_list = list()
     ##     prev = 0
- 
+
     ##     for boundary in boundaries:
     ##         sub_list = [ pos for pos in gap_pos[prev:] if pos < boundary ]
     ##         gap_list.append(sub_list)
     ##         prev += len(sub_list)
- 
+
     ##     gap_list.append([ x for x in gap_pos[prev:]])
-    
+
     ##     return gap_list
 
     def get_gap_list(self):
@@ -165,52 +166,51 @@ def get_gap_list(self):
         for row in range(self.get_grid_height()):
             gap_pos = []
             gaps = self.get_gaps_ref_gapless(row)
-             
+
             for g in gaps:
                 if g.open_pos in gap_dict: # if there is already gaps at this open pos
                     if g.size > gap_dict[g.open_pos].size: # if new gap is bigger
                         gap_dict[g.open_pos] = g  # keep the larger gap as they overlap
                 else:
                     gap_dict[g.open_pos] = g
-                    
+
         gap_list = sorted(list(gap_dict.values()), key=lambda x: x.open_pos) # sort according to start position
         return gap_list  # a list of Gap instances
-        
+
     def set_gap(self, row, col):
         """ Set cell with index (row, col) to be a gap """
-        
+
         self._cells[row][col] = GAP
 
     def set_empty(self, row, col):
         """ Set cell with index (row, col) to be a gap """
-        
+
         self._cells[row][col] = EMPTY
-    
+
     def set_cell(self, row, col, res):
         """ Set cell with index (row, col) to be full """
-        
+
         self._cells[row][col] = res
 
     def is_empty(self, row, col):
         """ Checks whether cell with index (row, col) is empty """
-        
         return self._cells[row][col] == EMPTY
 
     def is_gap(self, row, col):
         """ Checks whetehr cell with indxex (row, col) is a gap """
-        
+
         return self._cells[row][col] == GAP
 
     def insert_gaps(self, cols):
         """ Inserts a gaps into a column of the template grid """
-        
+
         for col in cols:
             for row in range(self._grid_height):
                 if col >= self.get_seq_start(row) and col < self.get_seq_end(row):
                     self._cells[row].insert(col, GAP)
                 else:
                     self._cells[row].insert(col, EMPTY)
-            
+
             self._grid_width += 1
 
     def insert_gaps_row(self, cols, row):
@@ -220,7 +220,7 @@ def insert_gaps_row(self, cols, row):
                 self._cells[row].insert(col, GAP)
             else:
                 self._cells[row].insert(col, EMPTY)
-            # NOTE: grid_with should not be changed after every row is updated.    
+            # NOTE: grid_with should not be changed after every row is updated.
             #self._grid_width += 1
 
     def clean_trail_empty(self):
@@ -233,8 +233,8 @@ def clean_trail_empty(self):
                     break
             if i+1 > max_width:
                 max_width = i+1
-                
-        # delete excessive EMPTY        
+
+        # delete excessive EMPTY
         for row in range(self._grid_height):
             del self._cells[row][max_width:]
 
@@ -243,7 +243,7 @@ def clean_trail_empty(self):
                                  for row in range(self._grid_height) if len(self._cells[row]) < max_width]
         self._grid_width = max_width
         return
-    
+
     def remove_gaps(self, keep_width=True): # hzhu add keep_width option
         """ Removes all gaps from the grid. """
 
@@ -261,32 +261,31 @@ def remove_gaps(self, keep_width=True): # hzhu add keep_width option
 
         if not keep_width: # hzhu if width is not kept, make sure width is consistent
             self.clean_trail_empty()
-                    
+
         return
-                
+
 
 class QueryGrid(Grid):
 
     def __init__(self, grid_height, grid_width):
         Grid.__init__(self, grid_height, grid_width)
-    
+
     def get_query_start(self, row):
         """ Returns the query start position """
         return self.get_seq_start(row) + 1
 
     def get_query_end(self, row):
         """ Returns the query end postion """
-        
+
         return self.get_seq_end(row) - len(self.get_gaps(row))
 
     def get_col_residue(self, col):
         """ Tries to find a the query residue in a given column. Used by derive_global_seq() to
         identify the global query sequence """
-        
+
         for row in range(self.get_grid_height()):
             if not self.is_empty(row, col):
                 return self._cells[row][col]
-
         return GAP
 
 class TemplateGrid(Grid):
@@ -300,20 +299,33 @@ def __init__(self, grid_height, grid_width):
         self._chain = list()
         self._organism = list()
         self._resolution = list()
-        
-    def display(self):
+
+    def display(self, noduplicates):
         """ Return multi-line string represenation for grid """
-        
         ans = ''
-        for row in range(self._grid_height):
-            ans += '>P1;{p}\nstructure:{p}:{s}:{c}:{e}:{c}::{o}:{r}:\n{a}*\n'.format(
-                p = self._pdb_code[row],
-                s = add_white_space_end(self.get_template_start(row), 4),
-                e = add_white_space_end(self.get_template_end(row), 4),
-                c = self._chain[row],
-                o = self._organism[row],
-                r = self._resolution[row], 
-                a = ''.join(self._cells[row]).replace(EMPTY, GAP).replace('#', GAP))
+        if noduplicates:
+            for row in range(self._grid_height):
+                ans += '>P1;{p}_{c}_{line}{start}{end}\nstructure:{p}:{s}:{c}:{e}:{c}::{o}:{r}:\n{a}*\n'.format(
+                    p = self._pdb_code[row],
+                    s = add_white_space_end(self.get_template_start(row), 4),
+                    e = add_white_space_end(self.get_template_end(row), 4),
+                    c = self._chain[row],
+                    o = self._organism[row],
+                    r = self._resolution[row],
+                    a = ''.join(self._cells[row]).replace(EMPTY, GAP).replace('#', GAP),
+    		        line = str(row),
+    		        start = self.get_template_start(row),
+    		        end = self.get_template_end(row))
+        else:
+            for row in range(self._grid_height):
+                ans += '>P1;{p}\nstructure:{p}:{s}:{c}:{e}:{c}::{o}:{r}:\n{a}*\n'.format(
+                    p = self._pdb_code[row],
+                    s = add_white_space_end(self.get_template_start(row), 4),
+                    e = add_white_space_end(self.get_template_end(row), 4),
+                    c = self._chain[row],
+                    o = self._organism[row],
+                    r = self._resolution[row],
+                    a = ''.join(self._cells[row]).replace(EMPTY, GAP).replace('#', GAP))
 
         return ans
 
@@ -330,11 +342,11 @@ def debug(self, row):
             g2 = ', '.join([str(gap) for gap in self.get_gaps(row)]),
             seq = ''.join(self._cells[row]))
 
-        return ans 
+        return ans
 
     def set_metadata(self, row, start, end, pdb_code, chain, organism, resolution):
         """ Used by create_template_grid() to setup metadata of pir template """
-        
+
         self._start.append(start)
         self._end.append(end)
         self._pdb_code.append(pdb_code)
@@ -343,23 +355,23 @@ def set_metadata(self, row, start, end, pdb_code, chain, organism, resolution):
         self._resolution.append(resolution)
 
     def set_map(self, row, start, end):
-        
+
         self._start[row] = start
         self._end[row] = end
 
     def get_template_start(self, row):
         """ Returns the template start position """
-        
+
         return self._start[row]
 
     def get_template_end(self, row):
         """ Return sthe template end position """
-        
+
         return self._end[row]
 
     def del_row(self, row):
         """ Removes a complete template entry from the grid """
-        
+
         del self._cells[row]
         del self._start[row]
         del self._end[row]
@@ -373,45 +385,45 @@ def del_row(self, row):
 
 def add_white_space_end(string, length):
     """ Adds whitespaces to a string until it has the wished length"""
-    
+
     edited_string = str(string)
-    
+
     if len(edited_string) >= length:
         return string
     else:
         while len(edited_string) != length:
             edited_string += ' '
-    
+
     return edited_string
 
 def convert_aa_code(three_letter, convert):
     """
     Assumes a string that contains a three letter aminoacid code and
     returns the corresponding one letter code.
-    """    
-    
+    """
+
     aa_code = {
-        'CYS': 'C', 
-        'ASP': 'D', 
-        'SER': 'S', 
-        'GLN': 'Q', 
+        'CYS': 'C',
+        'ASP': 'D',
+        'SER': 'S',
+        'GLN': 'Q',
         'LYS': 'K',
-        'ILE': 'I', 
-        'PRO': 'P', 
-        'THR': 'T', 
-        'PHE': 'F', 
-        'ASN': 'N', 
-        'GLY': 'G', 
-        'HIS': 'H', 
-        'LEU': 'L', 
-        'ARG': 'R', 
-        'TRP': 'W', 
-        'ALA': 'A', 
-        'VAL': 'V', 
-        'GLU': 'E', 
-        'TYR': 'Y', 
+        'ILE': 'I',
+        'PRO': 'P',
+        'THR': 'T',
+        'PHE': 'F',
+        'ASN': 'N',
+        'GLY': 'G',
+        'HIS': 'H',
+        'LEU': 'L',
+        'ARG': 'R',
+        'TRP': 'W',
+        'ALA': 'A',
+        'VAL': 'V',
+        'GLU': 'E',
+        'TYR': 'Y',
         'MET': 'M',
-      } 
+      }
 
     non_canonical = {
         'MSE': 1,
@@ -501,12 +513,12 @@ def get_query_name(hhr_file):
         for line in fh:
             if line.startswith('Query'):
                 # match the PDB Code
-                m = re.search('(\d[A-Z0-9]{3})_(\S)', line) 
+                m = re.search(r'(\d[A-Z0-9]{3})_(\S)', line)
 
-                if m: 
+                if m:
                     pdb_code = m.group(1)
                     chain = m.group(2)
-                else: 
+                else:
                     pdb_code = 'UKNP'
                     chain = 'A'
                     # raise ValueError('Input HHR-File Does not seem to be a PDB-Structure')
@@ -517,7 +529,7 @@ def get_query_name(hhr_file):
 
 def get_cif_files(folder):
     """ Gets all cif files located in folder. """
-    
+
     return glob(os.path.join(folder, '*.cif'))
 
 def open_cif(cif_file):
@@ -529,7 +541,7 @@ def open_cif(cif_file):
         reader.read(data)
 
     block = data[0]
-    
+
     return block
 
 def get_pdb_entry_id(block):
@@ -537,7 +549,7 @@ def get_pdb_entry_id(block):
 
     entry = block.getObj('entry')
     entry_id = entry.getValue('id')
-    
+
     return entry_id
 
 
@@ -549,7 +561,7 @@ def template_id_to_pdb(template_id):
     m = re.match(r'/^(\d[A-Za-z0-9]{3})$', template_id)
     if m:
         return m.group(1).upper(), 'A'
-    
+
     # PDB CODE with chain Identifier
     m = re.match(r'^(\d[A-Za-z0-9]{3})_(\S)$', template_id)
     if m:
@@ -559,7 +571,7 @@ def template_id_to_pdb(template_id):
     m = re.match(r'^(\d[A-Za-z0-9]{3})([A-Za-z0-9]?)_\d+$', template_id)
     if m:
         return m.group(1).upper(), m.group(2).upper()
-    
+
     # No PDB code and chain identified
     return None, None
 
@@ -585,17 +597,17 @@ def create_template_grid(hhr_data):
         # Get pdb_code and chain identifier of template
         pdb_code, chain =  template_id_to_pdb(template.template_id)
 
-        m = re.search("(\d+.\d+)A", template.template_info) # try to extract resolution of the structure
-        
-        if m: 
+        m = re.search(r"(\d+.\d+)A", template.template_info) # try to extract resolution of the structure
+
+        if m:
             resolution = m.group(1)
-        else: 
+        else:
             resolution = ""
 
-        m = re.search("\{(.*)\}", template.template_info) # try to extract the organism
-        if m: 
+        m = re.search(r"\{(.*)\}", template.template_info) # try to extract the organism
+        if m:
             organism = m.group(1).replace(":", " ") # make sure that no colons are in the organism
-        else: 
+        else:
             organism = ""
 
         template_grid.set_metadata(row, start, end, pdb_code, chain, organism, resolution)
@@ -612,11 +624,11 @@ def to_seq(ali):
         return ''.join(ali)
     else:
         return ali
-    
+
 
 def create_query_grid(hhr_data):
     """ Creates a Query Grid """
-    
+
     total_seq = len(hhr_data)
     query_max = max( [ hhr.start[0] + len(to_seq(hhr.query_ali)) for hhr in hhr_data ] ) - 1
 
@@ -639,7 +651,7 @@ def create_gapless_grid(grid):
     """ Returns a gapless grid """
 
     gapless = deepcopy(grid)
-    gapless.remove_gaps(keep_width=False)  # hzhu:  shrink grid
+    gapless.remove_gaps(keep_width=True)  # hzhu:  shrink grid
 
     return gapless
 
@@ -648,23 +660,20 @@ def process_query_grid(query_grid, gapless_grid):
     """
     gaplist = query_grid.get_gap_list()
     off_set = 0
-    
+
     for g in gaplist:
         gapless_grid.insert_gaps([ p + off_set for p in range(g.open_pos, g.open_pos+g.size) ])
         off_set += g.size
-        
-    return gapless_grid
 
-def derive_global_seq(processed_query_grid, query_name, query_chain):
+    return gapless_grid
 
+def derive_global_seq(processed_query_grid, query_name, query_chain, processed_grid_length_max):
     global_seq = list()
-
-    for col in range(processed_query_grid.get_grid_width()):
+    for col in range(processed_grid_length_max):
         global_seq.append(processed_query_grid.get_col_residue(col))
-
     # this is the query entry
     header = '>P1;{q}\nsequence:{q}:1    :{c}:{l}  :{c}::::\n'.format(
-        q = query_name, 
+        q = query_name,
         l = len(global_seq),
         c = query_chain)
 
@@ -675,9 +684,9 @@ def process_template_grid(query_grid, template_grid):
         Only add gaps from **other** query_grids into template grid (NOT gapless)
     """
     gaplist = query_grid.get_gap_list()  # use this to keep the offset
-    
+
     for row in range(template_grid.get_grid_height()):
-        # do NOT consider gaps in current query row         
+        # do NOT consider gaps in current query row
         gaplist_row = query_grid.get_gaps_ref_gapless(row)
         gapdict_row = dict(zip([g.open_pos for g in gaplist_row],
                                [g.size     for g in gaplist_row]))
@@ -686,35 +695,35 @@ def process_template_grid(query_grid, template_grid):
             # if there is a gap with same opening position in the current row,
             # only consider g if it is larger than the on in the current row
             if g.open_pos in gapdict_row:
-                if g.size > gapdict_row[g.open_pos]: 
+                if g.size > gapdict_row[g.open_pos]:
                     template_grid.insert_gaps_row([ p + off_set for p in range(g.open_pos,
                                                                               g.open_pos+g.size-gapdict_row[g.open_pos]) ], row)
             else:
                 template_grid.insert_gaps_row([ p + off_set for p in range(g.open_pos, g.open_pos+g.size) ], row)
-                 
+
             off_set += g.size  # even if the gaps are not inserted, the offset should be adjusted
 
     template_grid.clean_trail_empty()  # clean the redundant trailing EMPTY char
-    
+
     return template_grid
 
 def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
     """
-    Compare the PIR Alignment with Atomsection of a mmCIF file. To make the ATOM-Section of 
+    Compare the PIR Alignment with Atomsection of a mmCIF file. To make the ATOM-Section of
     a mmCIF file compatible with MODELLER, each residue has in the ATOM-Section has to match
     corresponding positions in the PIR-Alignment
     """
 
     # glob the mmCif files from given directory and map the PDB identifier to the path
-    cif_files = glob.glob(os.path.join(folder, '*.cif'))
+    cif_files = glob(os.path.join(folder, '*.cif'))
     cif_paths = { path.split('/')[-1].split('.')[0].upper() : path for path in cif_files }
     cif_edits = dict()
-    
+
 
     # create the path where renumbered cifs are saved to
     if not os.path.exists(output_path):
         os.mkdir(output_path)
-    
+
     # if the cif does not contain any residue of the por alignment we delete it
     del_row = list()
 
@@ -745,16 +754,17 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
 
         cif_seq = dict()
         # For the case that we have to rename a chain
-        cif_chains = set([]) 
+        cif_chains = set([])
 
         # Iterate through the atomsection of the cif file
         for atom_row in range(0, atom_site.getRowCount()):
-            
+
             try:
                 if atom_site.getValue('label_comp_id', atom_row) == 'HOH':
                     continue
                 cif_chain = atom_site.getValue('label_asym_id', atom_row)
                 pdb_chain = atom_site.getValue('auth_asym_id', atom_row)   # use PDB chain ID
+                #cif_chain = pdb_chain
             except IndexError:
                 pass
 
@@ -763,19 +773,19 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
             # We do not care about the residues apart from the chain
             #if cif_chain != chain: # hzhu
             if pdb_chain != chain:  # hhr uses PDB chain, not the cif chain! hzhu
-                continue     
+                continue
             # and update the chain id from pdb_chain to cif_chain
             if atom_site.getValue('group_PDB', atom_row).startswith('ATOM'):  # hzhu in case HETATM ruins ch id
                 template_grid._chain[row] = cif_chain
-                
-            # get the residue and the residue number 
+
+            # get the residue and the residue number
             try:
                 res_num = int(atom_site.getValue("label_seq_id", atom_row))
             except ValueError:
                 continue
 
             residue = atom_site.getValue('label_comp_id', atom_row)
-            residue = convert_aa_code(residue, convert)         
+            residue = convert_aa_code(residue, convert)
 
 
             if res_num not in cif_seq.keys():
@@ -784,7 +794,7 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
                 continue
             elif res_num in cif_seq.keys() and cif_seq[res_num] != residue:
                 cif_seq[res_num] = '-'
-                
+
                 if DEBUG_MODE:
                     print ('! {p} {c}: mmCIF contains a residue position that is assigned {cr} to two residues. Removing it.'.format(
                         p = pdb_code,
@@ -795,35 +805,35 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
         ## Rename chain if necessary                                          ##
         ########################################################################
 
-        chain_idx = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-
-        if len(template_grid._chain[row]) != 1:
-            i = 0
-            new_chain = 0
-            
-            while i < len(chain_idx):
-                if chain_idx[i] in cif_chains:
-                    if DEBUG_MODE:
-                        print ('! {p} {c}: Chain identifier {i} is already taken.'.format(
-                            p = pdb_code,
-                            c = chain,
-                            i = chain_idx[i]))
-                    i += 1
-                else:
-                    new_chain = chain_idx[i]
-                    break
-
-            if new_chain == 0:
-                if DEBUG_MODE:
-                    print ('! {p} {c}: Could not use {p}. The chain identifier {c} is not compatible with MODELLER (2 letters) and could not be renanmed.'.format(
-                        p = pdb_code, 
-                        c = chain))
-                
-                del_row.append(row)
-                continue
-
-            if new_chain != 0:
-                print ('Selected new chain name {c}'.format(c = new_chain))
+        # chain_idx = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+        #
+        # if len(template_grid._chain[row]) != 1:
+        #     i = 0
+        #     new_chain = 0
+        #
+        #     while i < len(chain_idx):
+        #         if chain_idx[i] in cif_chains:
+        #             if DEBUG_MODE:
+        #                 print ('! {p} {c}: Chain identifier {i} is already taken.'.format(
+        #                     p = pdb_code,
+        #                     c = chain,
+        #                     i = chain_idx[i]))
+        #             i += 1
+        #         else:
+        #             new_chain = chain_idx[i]
+        #             break
+        #
+        #     if new_chain == 0:
+        #         if DEBUG_MODE:
+        #             print ('! {p} {c}: Could not use {p}. The chain identifier {c} is not compatible with MODELLER (2 letters) and could not be renanmed.'.format(
+        #                 p = pdb_code,
+        #                 c = chain))
+        #
+        #         del_row.append(row)
+        #         continue
+        #
+        #     if new_chain != 0:
+        #         print ('Selected new chain name {c}'.format(c = new_chain))
 
             #TODO
 
@@ -835,7 +845,7 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
         mod_pos = dict()
         mapping = dict()
 
-        for pos_cif, pos_tem in zip(range(template_grid.get_template_start(row), 
+        for pos_cif, pos_tem in zip(range(template_grid.get_template_start(row),
             template_grid.get_template_end(row) + 1), template_grid.get_seq_indeces(row)):
 
             res_tem = template_grid.get_cell(row, pos_tem)
@@ -844,7 +854,7 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
                 res_cif = cif_seq[pos_cif]
             except KeyError:
                 res_cif = -1
-                
+
 
             match = True if res_tem == res_cif else False
 
@@ -1127,7 +1137,7 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
 
                     mod_pos[pos_cif] = 56
                     mapping[(pos_tem, res_tem)] = (pos_cif, 'E')
-                
+
                 elif res_cif == 57 and res_tem == 'F':
 
                     mod_pos[pos_cif] = 57
@@ -1207,16 +1217,16 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
                     # insert a gap
                     template_grid.set_empty(row, pos_tem)
                     mapping[(pos_tem, res_tem)] = (pos_cif, res_cif)
-                    
+
                     if DEBUG_MODE:
-                        print ('! {p} {c}: template pos {pt} ({rt}) does not match cif pos {pc} ({rc}). Replacing with gap.'.format( 
-                            p = pdb_code, 
+                        print ('! {p} {c}: template pos {pt} ({rt}) does not match cif pos {pc} ({rc}). Replacing with gap.'.format(
+                            p = pdb_code,
                             c = chain,
-                            pt = pos_tem, 
-                            rt = res_tem, 
-                            pc = pos_cif, 
+                            pt = pos_tem,
+                            rt = res_tem,
+                            pc = pos_cif,
                             rc = res_cif if res_cif != -1 else 'DNE'))
-                    
+
                     if res_cif != -1:
                         del_pos.append(pos_cif)
             else:
@@ -1233,8 +1243,8 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
         except IndexError:
             # This exception handles cases in which all residues were deleted
             if DEBUG_MODE:
-                print ('! {p} {c}: Removing {p} from alignment. No residues matched the alignment sequence.'.format( 
-                    p = pdb_code, 
+                print ('! {p} {c}: Removing {p} from alignment. No residues matched the alignment sequence.'.format(
+                    p = pdb_code,
                     c = chain))
 
             del_row.append(row)
@@ -1245,7 +1255,7 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
         ########################################################################
 
         if threshold > 0:
-            
+
             gaps = 0
             res = 0
 
@@ -1259,18 +1269,18 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
                     res += 1
 
             ratio = res/float(gaps + res)
-        
+
             if ratio > threshold:
-                print ('! Template {p} successfully passed residue ratio ({r:.2f} / {t}).'.format( 
+                print ('! Template {p} successfully passed residue ratio ({r:.2f} / {t}).'.format(
                     p = pdb_code,
                     r = ratio,
                     t = threshold ))
             else:
-                print ('! Template {p} did not passed residue ratio ({r:.2f} / {t}). Removing it from pir Alignment.'.format( 
+                print ('! Template {p} did not passed residue ratio ({r:.2f} / {t}). Removing it from pir Alignment.'.format(
                     p = pdb_code,
                     r = ratio,
-                    t = threshold ))            
-            
+                    t = threshold ))
+
                 if row not in del_row:
                     del_row.append(row)
                     continue
@@ -1281,10 +1291,10 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
 
         rem_row = list() # verbosity: saves information about removed residues
         mod_row = list() # verbosity: saves information about modified residues
-        cha_row = list() # verbosity: saves any other changes 
+        cha_row = list() # verbosity: saves any other changes
 
         for atom_row in reversed(range(0, atom_site.getRowCount())):
-            
+
             try:
                 cif_chain = atom_site.getValue('label_asym_id', atom_row)
             except IndexError:
@@ -1292,9 +1302,9 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
 
             # We do not care about the residues apart from the chain
             if cif_chain != chain:
-                continue    
+                continue
 
-            # get the residue number 
+            # get the residue number
             try:
                 res_num = int(atom_site.getValue("label_seq_id", atom_row))
             except ValueError:
@@ -1313,21 +1323,21 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
 
             group_PDB = atom_site.getValue('group_PDB', atom_row)
             residue = atom_site.getValue('label_comp_id', atom_row)
-            residue = convert_aa_code(residue, convert)         
+            residue = convert_aa_code(residue, convert)
 
-            # MODELLER accepts only structures if pdbx_PDB_model_num is set to 1   
+            # MODELLER accepts only structures if pdbx_PDB_model_num is set to 1
             if model_num != 1:
 
                 if (res_num, residue, 'model_num') not in cha_row:
-                    cha_row.append((res_num, residue, 'model_num'))             
-                
+                    cha_row.append((res_num, residue, 'model_num'))
+
                 atom_site.setValue(1, "pdbx_PDB_model_num", atom_row)
 
             if ins_code != '?':
-                
+
                 if (res_num, residue, 'ins_code') not in cha_row:
                     cha_row.append((res_num, residue, 'ins_code'))
-                
+
                 atom_site.setValue('?', "pdbx_PDB_ins_code", atom_row)
 
             if group_PDB != 'ATOM':
@@ -1335,7 +1345,7 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
                 if (res_num, residue, 'group_PDB') not in cha_row:
                     cha_row.append((res_num, residue, 'group_PDB'))
 
-                atom_site.setValue('ATOM', 'group_PDB', atom_row)               
+                atom_site.setValue('ATOM', 'group_PDB', atom_row)
 
             ########################################################################
             ## Delete residues                                                    ##
@@ -1344,7 +1354,7 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
             if res_num in del_pos:
                 if (res_num, residue) not in rem_row:
                     rem_row.append((res_num, residue))
-                
+
                 atom_site.removeRow(atom_row)
 
             ########################################################################
@@ -1352,16 +1362,16 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
             ########################################################################
 
             if res_num in mod_pos.keys():
-                
+
                 # Get the data
                 type_symbol = atom_site.getValue('type_symbol', atom_row)
                 label_atom_id = atom_site.getValue('label_atom_id', atom_row)
                 auth_atom_id = atom_site.getValue('auth_atom_id', atom_row)
 
-                if mod_pos[res_num] == 1: # try to convert MSE to M 
-                                        
+                if mod_pos[res_num] == 1: # try to convert MSE to M
+
                     atom_site.setValue('MET', 'label_comp_id', atom_row)
-                    
+
                     try:
                         atom_site.setValue('MET', 'auth_comp_id', atom_row)
                     except IndexError:
@@ -1378,11 +1388,11 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
                         mod_row.append((res_num, residue, 'MSE -> MET'))
 
                 elif mod_pos[res_num] == 2: # try to convert HYP to PRO
-                    # apparently it is enough to rename the label_comp_id to PRO to get 
+                    # apparently it is enough to rename the label_comp_id to PRO to get
                     # MODELLER working with Hydroxyprolines (HYP)
 
                     atom_site.setValue('PRO', 'label_comp_id', atom_row)
-                    
+
                     try:
                         atom_site.setValue('PRO', 'auth_comp_id', atom_row)
                     except IndexError:
@@ -1394,7 +1404,7 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
                 elif mod_pos[res_num] == 3: # try to convert MLY to LYS
 
                     atom_site.setValue('LYS', 'label_comp_id', atom_row)
-                    
+
                     try:
                         atom_site.setValue('LYS', 'auth_comp_id', atom_row)
                     except IndexError:
@@ -2227,7 +2237,7 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
             mod_model_num = len([ msg for msg in cha_row if msg[2] == 'model_num' ])
             mod_ins_code = len([ msg for msg in cha_row if msg[2] == 'ins_code' ])
             mod_group_PDB = len([ msg for msg in cha_row if msg[2] == 'group_PDB' ])
-            
+
             if mod_model_num != 0:
                 print ('! {p} {c}: modified atom_site.pdbx_PDB_model_num for {cr} residues to 1.'.format(
                         p = pdb_code,
@@ -2248,7 +2258,7 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
 
             for residue in reversed(mod_row):
                 print ('! {p} {c}: modified cif pos {cr} ({nr}).'.format(
-                    p = pdb_code, 
+                    p = pdb_code,
                     c = chain,
                     cr = residue[0],
                     ca = residue[1],
@@ -2257,7 +2267,7 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
 
             for residue in reversed(rem_row):
                 print ('! {p} {c}: removed cif pos {cr} ({ca})'.format(
-                    p = pdb_code, 
+                    p = pdb_code,
                     c = chain,
                     cr = residue[0],
                     ca = residue[1]))
@@ -2278,9 +2288,9 @@ def compare_with_cifs(template_grid, folder, output_path, convert, threshold):
 
 def remove_self_alignment(template_grid, query_name):
     """ Removes a self alignment from the final pir alignment to prevent clashes with MODELLER """
-    
+
     to_delete = list()
-    
+
     for row in range(template_grid.get_grid_height()):
         if template_grid._pdb_code[row] == query_name:
             to_delete.append(row)
@@ -2292,7 +2302,7 @@ def remove_self_alignment(template_grid, query_name):
 
 def write_to_file(line_list, fname):
     """ Writes the final pir file """
-    
+
     with open(fname, 'w+') as fout:
         for line in line_list:
             fout.write(line + "\n")
@@ -2303,9 +2313,9 @@ def arg():
     epilog= '2016 Harald Voehringer.'
     # Initiate a ArgumentParser Class
     parser = argparse.ArgumentParser(description = description, epilog = epilog)
-    
+
     # Call add_options to the parser
-    parser.add_argument('input', help = 'results file from HHsearch with hit list and alignment', metavar = 'FILE')    
+    parser.add_argument('input', help = 'results file from HHsearch with hit list and alignment', metavar = 'FILE')
     parser.add_argument('cifs', help = 'path to the folder containing cif files', metavar = 'DIR')
     parser.add_argument('pir', help = 'output file (PIR-formatted multiple alignment)', metavar = 'FILE')
     parser.add_argument('output', help = 'path to the folder where modified cif files should be written to', metavar = 'DIR')
@@ -2316,8 +2326,9 @@ def arg():
     parser.add_argument('-r', type = float, help = 'residue ratio (filter alignments that have contribute at least residues according to the specified ratio).',
         default = 0, metavar = 'FLOAT')
 
-    parser.add_argument('-c', help = 'convert non-canonical residues (default = True)', action = 'store_true', default = True)
-    
+    parser.add_argument('-c', help = 'convert non-canonical residues (default = True)', action = 'store_true', default = True),
+    parser.add_argument('-noduplicates', help='modify the header name to avoid name duplicates (default = False)', action= 'store_true', default = False)
+
 
     return parser
 
@@ -2326,9 +2337,9 @@ def main():
     import sys
     parser = arg()
     args = parser.parse_args(sys.argv[1:])
-    
+
     global DEBUG_MODE
-    
+
     if args.verbose:
         DEBUG_MODE = True
 
@@ -2340,7 +2351,7 @@ def main():
     if args.m and not args.e:
         selection = map(lambda x: int(x), args.m)
         print ('Selected templates {st}.'.format(st = ', '.join(args.m)))
-        
+
         for i in selection:
             tmp_info = str(data[i - 1].template_info.split('>')[1])
             print ('{i}: {t}'.format(
@@ -2351,7 +2362,7 @@ def main():
 
     elif args.e and not args.m:
         print ('Selected templates satisfying E-val <= {e}'.format(e = args.e))
-        
+
         e_values = { float(j.evalue):i for i, j in enumerate(data) }
         selection = sorted([ val for key, val in e_values.items() if key <= args.e ])
 
@@ -2368,32 +2379,37 @@ def main():
         sys.exit()
     else:
         selected_templates = data
-        
+
         print ('Creating pir file using all templates ({n})'.format(
             n = len(selected_templates)))
-
     query_grid = create_query_grid(selected_templates) # load query grid
-    print ('query_grid')
-    print(query_grid)
+    #print ('query_grid')
+    #print(query_grid)
     gapless_query_grid = create_gapless_grid(query_grid) # remove gaps
-    print ('gapless_query_grid')
-    print(gapless_query_grid)
+    #print ('gapless_query_grid')
+    #print(gapless_query_grid)
     processed_query_grid = process_query_grid(query_grid, gapless_query_grid) # insert gaps
     ##processed_query_grid = process_query_grid(query_grid, query_grid) # insert gaps
-    print ('processed_query_grid')
-    print (processed_query_grid)
-    glob_seq = derive_global_seq(processed_query_grid, query_name, query_chain) # derive query sequence
+    #print ('processed_query_grid')
+    #print (processed_query_grid)
     template_grid = create_template_grid(selected_templates) # create template grid
-    print ('template_grid')
-    print (template_grid)
-    processed_template_grid = process_template_grid(query_grid, template_grid) # insert gaps to template sequnces
-    print ('processed_query_grid')
-    print (processed_query_grid)
-    print ('hzhu processed_template_grid')
-    print (processed_template_grid)
+    #print(glob_seq)
+    #print ('template_grid')
+    #print (template_grid)
+    processed_template_grid = process_template_grid(query_grid, template_grid) # insert gaps to template sequences
+    #print ('processed_query_grid')
+    #print (processed_query_grid)
+    #print ('hzhu processed_template_grid')
+    #print (processed_template_grid)
     final_grid = compare_with_cifs(processed_template_grid, args.cifs, args.output, args.c, args.r) # compare with atom section of cifs
+    processed_grid_lengths = []
+    for row in range(final_grid.get_grid_height()):
+        processed_grid_lengths.append(len(final_grid._cells[row]))
+        if len(final_grid._cells[row][-1]) != 1:
+            final_grid._cells[row][-1] = (len(final_grid._cells[row][-1])) * '-'
     remove_self_alignment(final_grid, query_name) # remove self alignment if any
-    write_to_file([glob_seq, final_grid.display()], args.pir)
+    glob_seq = derive_global_seq(processed_query_grid, query_name, query_chain, max(processed_grid_lengths)) # derive query sequence
+    write_to_file([glob_seq.replace(EMPTY, GAP), final_grid.display(args.noduplicates)], args.pir)
 
 
 if __name__ == "__main__":