Bioplib
Protein Structure C Library
 All Data Structures Files Functions Variables Typedefs Macros Pages
PDB2Seq.c
Go to the documentation of this file.
1 /************************************************************************/
2 /**
3 
4  \file PDB2Seq.c
5 
6  \version V1.15
7  \date 01.12.15
8  \brief Conversion from PDB to sequence and other sequence
9  related routines
10 
11  \copyright (c) UCL / Dr. Andrew C. R. Martin 1993-2015
12  \author Dr. Andrew C. R. Martin
13  \par
14  Institute of Structural & Molecular Biology,
15  University College London,
16  Gower Street,
17  London.
18  WC1E 6BT.
19  \par
20  andrew@bioinf.org.uk
21  andrew.martin@ucl.ac.uk
22 
23 **************************************************************************
24 
25  This code is NOT IN THE PUBLIC DOMAIN, but it may be copied
26  according to the conditions laid out in the accompanying file
27  COPYING.DOC.
28 
29  The code may be modified as required, but any modifications must be
30  documented so that the person responsible can be identified.
31 
32  The code may not be sold commercially or included as part of a
33  commercial product except as described in the file COPYING.DOC.
34 
35 **************************************************************************
36 
37  Description:
38  ============
39 
40 
41 **************************************************************************
42 
43  Usage:
44  ======
45 
46 **************************************************************************
47 
48  Revision History:
49  =================
50 - V1.0 29.09.92 Original
51 - V1.1 07.06.93 Corrected allocation
52 - V1.2 18.06.93 Handles multi-chains and skips NTER and CTER residues.
53  Added SplitSeq()
54 - V1.3 09.07.93 SplitSeq() cleans up properly if allocation failed
55 - V1.4 11.05.94 Added TrueSeqLen()
56 - V1.5 13.05.94 Fixed bug in PDB2Seq().
57  Added KnownSeqLen().
58 - V1.6 07.09.94 Fixed allocation bug in SplitSeq()
59 - V1.7 19.07.95 Added check for ATOM records
60 - V1.8 24.01.96 Fixed bug when no ATOM records in linked list
61  Returns a blank string
62 - V1.9 26.08.97 Renamed DoPDB2Seq() with handling of Asx/Glx and
63  protein-only. Added macros to recreate the
64  old PDB2Seq() interface and similar new calls
65 - V1.10 02.10.00 Added NoX option
66 - V1.11 30.05.02 Changed PDB field from 'junk' to 'record_type'
67 - V1.12 10.06.05 Fixed bug - was undercounting by 1 for CA-only chains
68 - V1.13 04.02.14 Use CHAINMATCH By: CTP
69 - V1.14 07.07.14 Use bl prefix for functions By: CTP
70 - V1.15 01.12.15 Added blDoPDB2SeqByChain() By: ACRM
71 
72 *************************************************************************/
73 /* Doxygen
74  -------
75  #GROUP Handling PDB Data
76  #SUBGROUP Conversions
77  #FUNCTION blDoPDB2Seq()
78  malloc()'s an array containing the 1-letter sequence corresponding to
79  an input PDB linked list.
80 
81  #FUNCTION blDoPDB2SeqByChain()
82  Creates a hash indexed by chain label containing the 1-letter code
83  sequence from an input PDB linked list.
84 */
85 /************************************************************************/
86 /* Includes
87 */
88 #include <stdlib.h>
89 #include <string.h>
90 
91 #include "macros.h"
92 #include "pdb.h"
93 #include "seq.h"
94 
95 
96 /************************************************************************/
97 /* Defines and macros
98 */
99 
100 /************************************************************************/
101 /* Globals
102 */
103 
104 /************************************************************************/
105 /* Prototypes
106 */
107 
108 /************************************************************************/
109 /*>char *blDoPDB2Seq(PDB *pdb, BOOL DoAsxGlx, BOOL ProtOnly, BOOL NoX)
110  -------------------------------------------------------------------
111 *//**
112 
113  \param[in] *pdb PDB linked list
114  \param[in] DoAsxGlx Handle Asx and Glx as B and Z rather than X
115  \param[in] ProtOnly Don't do DNA/RNA; these simply don't get
116  done rather than being handled as X
117  \param[in] NoX Skip amino acids which would be assigned as X
118  \return Allocated character array containing sequence
119 
120  malloc()'s an array containing the 1-letter sequence corresponding to
121  an input PDB linked list. Returns NULL if given a NULL parameter or
122  memory allocation fails. Puts *'s in the sequence for multi-chains.
123 
124  This routine is normally called via the macro interfaces:
125  PDB2Seq(pdb), PDB2SeqX(pdb), PDBProt2Seq(pdb), PDBProt2SeqX(pdb)
126  Those with Prot in their names handle protein only; those with
127  X handle Asx/Glx as B/Z rather than as X
128 
129 - 29.09.92 Original By: ACRM
130 - 07.06.93 Corrected allocation.
131 - 18.06.93 Handles multi-chains and skips NTER and CTER residues
132 - 13.05.94 Check for chain change *before* copy residue (!)
133  (Bug reported by Bob MacCullum)
134 - 19.07.95 Added check for ATOM records
135 - 24.01.96 Returns blank string (rather than core dumping!) if the
136  linked list contained no ATOM records
137 - 26.08.97 Changed to doPDB2Seq with extra parameters (DoAsxGlx &
138  ProtOnly). The old calling forms have now become macros
139 - 02.10.00 Added NoX
140 - 10.06.05 Changed the initialization of rescount, resnum, etc. so
141  it correctly points to the first residue. This solves a
142  bug with CA-only chains where it was undercounting by 1
143 - 04.02.14 Use CHAINMATCH By: CTP
144 - 07.07.14 Use bl prefix for functions By: CTP
145 */
146 char *blDoPDB2Seq(PDB *pdb, BOOL DoAsxGlx, BOOL ProtOnly, BOOL NoX)
147 {
148  int resnum,
149  rescount,
150  NBreak = 0;
151  char insert,
152  chain[8],
153  *sequence = NULL;
154  PDB *p = NULL;
155 
156  /* Sanity check */
157  if(pdb==NULL) return(NULL);
158 
159  /* First step through the pdb linked list to see how many residues
160  and chains.
161  10.06.05 Fixed bug - was undercounting by one for CA-only chains
162  */
163  rescount = 1;
164  resnum = pdb->resnum;
165  insert = pdb->insert[0];
166  strcpy(chain,pdb->chain);
167 
168  for(p=pdb->next; p!=NULL; NEXT(p))
169  {
170  if(p->resnum != resnum || p->insert[0] != insert)
171  {
172  if(strncmp(p->resnam,"NTER",4) &&
173  strncmp(p->resnam,"CTER",4) &&
174  !strncmp(p->record_type,"ATOM ",6)) /* V1.7 */
175  rescount++;
176 
177  resnum = p->resnum;
178  insert = p->insert[0];
179 
180  /* Check for chain change */
181  if(!CHAINMATCH(chain,p->chain))
182  {
183  NBreak++;
184  strcpy(chain,p->chain);
185  }
186  }
187  }
188 
189  if(NBreak) rescount += NBreak;
190 
191  /* Allocate memory for the sequence array */
192  sequence = malloc((rescount + 1) * sizeof(char));
193  if(sequence == NULL) return(NULL);
194 
195  /* Step through the pdb linked list again, setting sequence array */
196  p = pdb;
197 
198  /* Skip an NTER residue */
199  /* 24.01.96 Added NULL check; occurs when no ATOM records present */
200  while(p!=NULL &&
201  (!strncmp(p->resnam,"NTER",4) ||
202  strncmp(p->record_type,"ATOM ",6)))
203  NEXT(p);
204  if(p==NULL)
205  {
206  sequence[0] = '\0';
207  return(sequence);
208  }
209 
210  sequence[0] = ((DoAsxGlx)?blThronex(p->resnam):blThrone(p->resnam));
211  if((!ProtOnly) || (!gBioplibSeqNucleicAcid))
212  rescount = 1;
213  else
214  rescount = 0;
215 
216  /* 02.10.00 Reset count if it's an X character and we are ignoring
217  them
218  */
219  if(NoX && sequence[0] == 'X')
220  rescount = 0;
221 
222  resnum = p->resnum;
223  insert = p->insert[0];
224  strcpy(chain,p->chain);
225 
226  for(p=p->next; p!=NULL; NEXT(p))
227  {
228  if(!strncmp(p->record_type,"ATOM ",6)) /* V1.7 */
229  {
230  if(p->resnum != resnum || p->insert[0] != insert)
231  {
232  /* Check for chain change */
233  if(!CHAINMATCH(chain,p->chain))
234  {
235  sequence[rescount++] = '*';
236  strcpy(chain,p->chain);
237  }
238 
239  /* 06.02.03 Fixed bug - was incrementing recount even when
240  it was NTER/CTER
241  */
242  if(strncmp(p->resnam,"NTER",4) && strncmp(p->resnam,"CTER",4))
243  {
244  sequence[rescount] = ((DoAsxGlx) ?
245  blThronex(p->resnam):
246  blThrone(p->resnam));
247  if((!ProtOnly) || (!gBioplibSeqNucleicAcid))
248  rescount++;
249 
250  /* 02.10.00 Reset count if it's an X character and we are
251  ignoring them
252  */
253  if(NoX && sequence[rescount-1] == 'X')
254  rescount--;
255  }
256 
257  resnum = p->resnum;
258  insert = p->insert[0];
259  }
260  }
261  }
262 
263  sequence[rescount] = '\0';
264 
265  return(sequence);
266 }
267 
268 
269 /************************************************************************/
270 /*>HASHTABLE *blDoPDB2SeqByChain(PDB *pdb, BOOL DoAsxGlx, BOOL ProtOnly,
271  BOOL NoX)
272  ---------------------------------------------------------------------
273 *//**
274 
275  \param[in] *pdb PDB linked list
276  \param[in] DoAsxGlx Handle Asx and Glx as B and Z rather than X
277  \param[in] ProtOnly Don't do DNA/RNA; these simply don't get
278  done rather than being handled as X
279  \param[in] NoX Skip amino acids which would be assigned as X
280  \return A hash of 1-letter code sequences indexed by
281  chain label
282 
283  Reads sequence from ATOM records in 1-letter code, storing the
284  results in a hash indexed by chain label.
285 
286  This routine is normally called via the macro interfaces:
287  PDB2SeqByCHain(pdb), PDB2SeqXByCHain(pdb), PDBProt2SeqByChain(pdb),
288  PDBProt2SeqXByChain(pdb)
289  Those with Prot in their names handle protein only; those with
290  X handle Asx/Glx as B/Z rather than as X
291 
292 - 30.11.15 Original based on blDoPDB2Seq() By: ACRM
293 */
294 HASHTABLE *blDoPDB2SeqByChain(PDB *pdb, BOOL DoAsxGlx, BOOL ProtOnly,
295  BOOL NoX)
296 {
297  int lastresnum,
298  nres = 0,
299  ArraySize = ALLOCSIZE;
300  char lastinsert[blMAXCHAINLABEL],
301  lastchain[blMAXCHAINLABEL],
302  *sequence = NULL;
303  PDB *p = NULL;
304  HASHTABLE *hash = NULL;
305 
306  /* Ensure fist residue will be recognized as different */
307  lastresnum = (-1000);
308  strcpy(lastinsert, "z");
309 
310  /* Sanity check */
311  if(pdb==NULL) return(NULL);
312 
313  /* Initialize hash with 11 bins */
314  if((hash = blInitializeHash(11))==NULL)
315  return(NULL);
316 
317  /* Initialize string to store the sequence */
318  if((sequence=(char *)malloc(ArraySize * sizeof(char)))==NULL)
319  return(NULL);
320  sequence[0] = '\0';
321 
322  /* Initialize chain label */
323  strncpy(lastchain, pdb->chain, blMAXCHAINLABEL);
324 
325  /* Step through the PDB linked list */
326  for(p=pdb; p!=NULL; NEXT(p))
327  {
328  /* Only interested in ATOM records */
329  if(!strncmp(p->record_type, "ATOM ", 6))
330  {
331  /* If chain has changed */
332  if(!CHAINMATCH(p->chain, lastchain))
333  {
334  if(blHashKeyDefined(hash, lastchain))
335  {
336  /*** TODO ***/
337  }
338  else
339  {
340  blSetHashValueString(hash, lastchain, sequence);
341  }
342  nres = 0;
343  }
344 
345  /* If residue has changed */
346  if((p->resnum != lastresnum) ||
347  !INSERTMATCH(p->insert, lastinsert) ||
348  !CHAINMATCH(p->chain, lastchain))
349  {
350  if(strncmp(p->resnam,"NTER",4) &&
351  strncmp(p->resnam,"CTER",4))
352  {
353  sequence[nres]=((DoAsxGlx) ? blThronex((p)->resnam) :
354  blThrone((p)->resnam));
355 
356  /* Increments count if it's not protein only or it's not
357  a nucleic acid AND we aren't skipping Xs or it's not
358  an X
359  */
360  if(((!ProtOnly) || (!gBioplibSeqNucleicAcid)) &&
361  (!NoX || (sequence[nres] != 'X')))
362  nres++;
363 
364  /* Increase the array size if needed */
365  if(nres >= ArraySize)
366  {
367  ArraySize += ALLOCSIZE;
368  if((sequence=(char *)realloc(sequence, ArraySize))==NULL)
369  {
370  blFreeHash(hash);
371  return(NULL);
372  }
373  }
374 
375  /* Terminate the string */
376  sequence[nres] = '\0';
377  }
378 
379  /* Updated information on last residue */
380  lastresnum = p->resnum;
381  strcpy(lastinsert, p->insert);
382  strcpy(lastchain, p->chain);
383  }
384  }
385  }
386 
387  if(nres)
388  blSetHashValueString(hash, lastchain, sequence);
389 
390  if(sequence != NULL)
391  free(sequence);
392 
393  return(hash);
394 }
395 
396 #ifdef TEST
397 #include <stdio.h>
398 int main(int argc, char **argv)
399 {
400  PDB *pdb;
401  int natoms;
402  char *seq;
403  FILE *fp;
404  HASHTABLE *seq2;
405 
406  if((fp = fopen(argv[1], "r"))!=NULL)
407  {
408  pdb=blReadPDB(fp, &natoms);
409  fclose(fp);
410 
411  if((seq = blDoPDB2Seq(pdb, FALSE, FALSE, FALSE))!=NULL)
412  printf("blDoPDB2Seq(): %s\n", seq);
413 
414  if((seq2 = blDoPDB2SeqByChain(pdb, FALSE, FALSE, FALSE))!=NULL)
415  {
416  char **chains = NULL;
417 
418  printf("blDoPDB2SeqByChain()\n");
419  if((chains = blGetHashKeyList(seq2))!=NULL)
420  {
421  int i;
422 
423  for(i=0; chains[i]!=NULL; i++)
424  {
425  printf("%s : %s\n", chains[i], blGetHashValueString(seq2, chains[i]));
426  }
427 
428  blFreeHashKeyList(chains);
429  }
430 
431 
432  }
433 
434 
435 
436 
437  }
438  return(0);
439 }
440 #endif
441 
int main(int argc, char **argv)
Definition: test.c:4
Include file for PDB routines.
int resnum
Definition: pdb.h:310
char blThronex(char *three)
Definition: throne.c:188
short BOOL
Definition: SysDefs.h:64
#define NULL
Definition: array2.c:99
char ** blGetHashKeyList(HASHTABLE *hashtable)
Definition: hash.c:226
Definition: pdb.h:298
BOOL gBioplibSeqNucleicAcid
Definition: throne.c:130
char * blGetHashValueString(HASHTABLE *hashtable, char *key)
Definition: hash.c:620
#define FALSE
Definition: macros.h:223
#define NEXT(x)
Definition: macros.h:249
char record_type[8]
Definition: pdb.h:315
BOOL blSetHashValueString(HASHTABLE *hashtable, char *key, char *value)
Definition: hash.c:452
Useful macros.
char resnam[8]
Definition: pdb.h:319
char * blDoPDB2Seq(PDB *pdb, BOOL DoAsxGlx, BOOL ProtOnly, BOOL NoX)
Definition: PDB2Seq.c:146
HASHTABLE * blInitializeHash(ULONG hashsize)
Definition: hash.c:163
PDB * blReadPDB(FILE *fp, int *natom)
Definition: ReadPDB.c:419
Header file for sequence handling.
Definition: hash.h:85
void blFreeHashKeyList(char **keylist)
Definition: hash.c:200
#define INSERTMATCH(ins1, ins2)
Definition: pdb.h:496
void blFreeHash(HASHTABLE *hashtable)
Definition: hash.c:293
BOOL blHashKeyDefined(HASHTABLE *hashtable, char *key)
Definition: hash.c:775
char blThrone(char *three)
Definition: throne.c:153
#define CHAINMATCH(chain1, chain2)
Definition: pdb.h:495
#define blMAXCHAINLABEL
Definition: pdb.h:248
struct pdb_entry * next
Definition: pdb.h:307
char chain[blMAXCHAINLABEL]
Definition: pdb.h:321
HASHTABLE * blDoPDB2SeqByChain(PDB *pdb, BOOL DoAsxGlx, BOOL ProtOnly, BOOL NoX)
Definition: PDB2Seq.c:294
char insert[8]
Definition: pdb.h:320
#define ALLOCSIZE