Bioplib
Protein Structure C Library
 All Data Structures Files Functions Variables Typedefs Macros Pages
RdSeqPDB.c
Go to the documentation of this file.
1 /************************************************************************/
2 /**
3 
4  \file RdSeqPDB.c
5 
6  \version V1.3
7  \date 26.02.15
8  \brief Read sequence from SEQRES records in a PDB file
9 
10  \copyright (c) UCL / Dr. Andrew C. R. Martin 1996-2015
11  \author Dr. Andrew C. R. Martin
12  \par
13  Institute of Structural & Molecular Biology,
14  University College London,
15  Gower Street,
16  London.
17  WC1E 6BT.
18  \par
19  andrew@bioinf.org.uk
20  andrew.martin@ucl.ac.uk
21 
22 **************************************************************************
23 
24  This code is NOT IN THE PUBLIC DOMAIN, but it may be copied
25  according to the conditions laid out in the accompanying file
26  COPYING.DOC.
27 
28  The code may be modified as required, but any modifications must be
29  documented so that the person responsible can be identified.
30 
31  The code may not be sold commercially or included as part of a
32  commercial product except as described in the file COPYING.DOC.
33 
34 **************************************************************************
35 
36  Description:
37  ============
38 
39 
40 **************************************************************************
41 
42  Usage:
43  ======
44 
45 **************************************************************************
46 
47  Revision History:
48  =================
49 - V1.0 14.10.96 Original By: ACRM
50 - V1.1 25.03.14 Added CHAINMATCH. By: CTP
51 - V1.2 07.07.14 Use bl prefix for functions By: CTP
52 - V1.3 26.02.15 Added blReadSeqresWholePDB() By: ACRM
53 
54 *************************************************************************/
55 /* Doxygen
56  -------
57  #GROUP Handling PDB Data
58  #SUBGROUP File IO
59  #FUNCTION blReadSeqresPDB()
60  Reads the sequence from the SEQRES records of a PDB file
61 
62  #FUNCTION blReadSeqresWholePDB()
63  Reads the sequence from the SEQRES records from header data stored in
64  a WHOLEPDB structure
65 */
66 /************************************************************************/
67 /* Includes
68 */
69 #include <stdlib.h>
70 #include "general.h"
71 #include "seq.h"
72 #include "macros.h"
73 #include "fsscanf.h"
74 
75 /************************************************************************/
76 /* Defines and macros
77 */
78 #define MAXBUFF 160
79 
80 /************************************************************************/
81 /* Globals
82 */
83 
84 /************************************************************************/
85 /* Prototypes
86 */
87 static STRINGLIST *RdSeqRes(FILE *fp);
88 static STRINGLIST *RdSeqResHeader(WHOLEPDB *wpdb);
89 
90 
91 /************************************************************************/
92 /*>char **blReadSeqresPDB(FILE *fp, int *nchains)
93  ----------------------------------------------
94 *//**
95 
96  \param[in] *fp PDB file pointer
97  \param[out] *nchains Number of chains found
98  \return Array of sequence strings
99 
100  Reads the sequence from the SEQRES records of a PDB file. Creates
101  an array of malloc()'d character arrays in which the sequence is
102  stored. Can therefore cope with any size of sequence information
103  from the PDB file.
104 
105  This is not normally recommended to get the sequence for a PDB file
106  this way, but is useful to detect discrepancies compared with the
107  sequence described by the ATOM records.
108 
109 - 14.10.96 Original By: ACRM
110 - 25.03.14 Added CHAINMATCH. Chain IDs handled as strings. By: CTP
111 - 07.07.14 Use bl prefix for functions By: CTP
112 */
113 char **blReadSeqresPDB(FILE *fp, int *nchains)
114 {
115  STRINGLIST *seqres = NULL,
116  *s;
117  char currchain[2] = " ",
118  chain[2] = " ",
119  **seqs,
120  res[13][8];
121  int chainnum = 0,
122  nres = 0,
123  i;
124 
125  *nchains = 0;
126 
127  /* First read the SEQRES records into a linked list */
128  if((seqres = RdSeqRes(fp))==NULL)
129  return(NULL);
130 
131  /* FIRST PASS: See how many chains there are */
132  strncpy(currchain,&(seqres->string[11]),1);
133  *nchains = 1;
134  for(s=seqres; s!=NULL; NEXT(s))
135  {
136  strncpy(chain,&(s->string[11]),1);
137  if(!CHAINMATCH(chain,currchain))
138  {
139  strncpy(currchain,chain,1);
140  (*nchains)++;
141  }
142  }
143 
144  /* Allocate an array of character pointers to store this number of
145  strings
146  */
147  if((seqs=(char **)malloc((*nchains) * sizeof(char *)))==NULL)
148  {
149  FREELIST(seqres, STRINGLIST);
150  return(NULL);
151  }
152 
153  /* SECOND PASS: Allocate space to store each chain */
154  chainnum = 0;
155  strcpy(currchain,"");
156  for(s=seqres; s!=NULL; NEXT(s))
157  {
158  fsscanf(s->string,"%11x%1s%5d",chain,&nres);
159  if(!CHAINMATCH(chain,currchain))
160  {
161  strcpy(currchain,chain);
162  if((seqs[chainnum]=(char *)malloc((nres+1)*sizeof(char)))
163  == NULL)
164  {
165  FREELIST(seqres, STRINGLIST);
166  return(NULL);
167  }
168  chainnum++;
169  }
170  }
171 
172  /* THIRD PASS: Store the sequence */
173  chainnum = 0;
174  nres = 0;
175  strncpy(currchain,&(seqres->string[11]),1);
176  for(s=seqres; s!=NULL; NEXT(s))
177  {
178  fsscanf(s->string,"%11x%1s%7x%4s%4s%4s%4s%4s%4s%4s%4s%4s%4s%4s%4s%4s",
179  chain,res[0],res[1],res[2],res[3],res[4],res[5],res[6],
180  res[7],res[8],res[9],res[10],res[11],res[12]);
181  if(!CHAINMATCH(chain,currchain))
182  {
183  /* Start of new chain, terminate last one */
184  seqs[chainnum][nres] = '\0';
185  strcpy(currchain,chain);
186  nres = 0;
187  chainnum++;
188  }
189 
190  /* Store these sequence data */
191  for(i=0; i<13; i++)
192  {
193  /* Break out if not all positions were filled in */
194  if(res[i][0] == ' ')
195  break;
196  seqs[chainnum][nres++] = blThrone(res[i]);
197  }
198  }
199  /* Terminate last chain */
200  seqs[chainnum][nres] = '\0';
201 
202  FREELIST(seqres, STRINGLIST);
203 
204  return(seqs);
205 }
206 
207 
208 /************************************************************************/
209 /*>char **blReadSeqresWholePDB(WHOLEPDB *wpdb, int *nchains)
210  ---------------------------------------------------------
211 *//**
212 
213  \param[in] wpdb WHOLEPDB structure
214  \param[out] *nchains Number of chains found
215  \return Array of sequence strings
216 
217  Reads the sequence from the SEQRES records from the PDB header
218  stored in a WHOLEPDB structure. Creates an array of malloc()'d
219  character arrays in which the sequence is stored. Can therefore
220  cope with any size of sequence information from the PDB file.
221 
222  This is not normally recommended to get the sequence for a PDB file
223  this way, but is useful to detect discrepancies compared with the
224  sequence described by the ATOM records.
225 
226 - 26.02.15 Original based on blReadSeqresPDB() By: ACRM
227 */
228 char **blReadSeqresWholePDB(WHOLEPDB *wpdb, int *nchains)
229 {
230  STRINGLIST *seqres = NULL,
231  *s;
232  char currchain[2] = " ",
233  chain[2] = " ",
234  **seqs,
235  res[13][8];
236  int chainnum = 0,
237  nres = 0,
238  i;
239 
240  *nchains = 0;
241 
242  /* First read the SEQRES records into a linked list */
243  if((seqres = RdSeqResHeader(wpdb))==NULL)
244  return(NULL);
245 
246  /* FIRST PASS: See how many chains there are */
247  strncpy(currchain,&(seqres->string[11]),1);
248  *nchains = 1;
249  for(s=seqres; s!=NULL; NEXT(s))
250  {
251  strncpy(chain,&(s->string[11]),1);
252  if(!CHAINMATCH(chain,currchain))
253  {
254  strncpy(currchain,chain,1);
255  (*nchains)++;
256  }
257  }
258 
259  /* Allocate an array of character pointers to store this number of
260  strings
261  */
262  if((seqs=(char **)malloc((*nchains) * sizeof(char *)))==NULL)
263  {
264  FREELIST(seqres, STRINGLIST);
265  return(NULL);
266  }
267 
268  /* SECOND PASS: Allocate space to store each chain */
269  chainnum = 0;
270  strcpy(currchain,"");
271  for(s=seqres; s!=NULL; NEXT(s))
272  {
273  fsscanf(s->string,"%11x%1s%5d",chain,&nres);
274  if(!CHAINMATCH(chain,currchain))
275  {
276  strcpy(currchain,chain);
277  if((seqs[chainnum]=(char *)malloc((nres+1)*sizeof(char)))
278  == NULL)
279  {
280  FREELIST(seqres, STRINGLIST);
281  return(NULL);
282  }
283  chainnum++;
284  }
285  }
286 
287  /* THIRD PASS: Store the sequence */
288  chainnum = 0;
289  nres = 0;
290  strncpy(currchain,&(seqres->string[11]),1);
291  for(s=seqres; s!=NULL; NEXT(s))
292  {
293  fsscanf(s->string,"%11x%1s%7x%4s%4s%4s%4s%4s%4s%4s%4s%4s%4s%4s%4s%4s",
294  chain,res[0],res[1],res[2],res[3],res[4],res[5],res[6],
295  res[7],res[8],res[9],res[10],res[11],res[12]);
296  if(!CHAINMATCH(chain,currchain))
297  {
298  /* Start of new chain, terminate last one */
299  seqs[chainnum][nres] = '\0';
300  strcpy(currchain,chain);
301  nres = 0;
302  chainnum++;
303  }
304 
305  /* Store these sequence data */
306  for(i=0; i<13; i++)
307  {
308  /* Break out if not all positions were filled in */
309  if(res[i][0] == ' ')
310  break;
311  seqs[chainnum][nres++] = blThrone(res[i]);
312  }
313  }
314  /* Terminate last chain */
315  seqs[chainnum][nres] = '\0';
316 
317  FREELIST(seqres, STRINGLIST);
318 
319  return(seqs);
320 }
321 
322 
323 
324 /************************************************************************/
325 /*>static STRINGLIST *RdSeqRes(FILE *fp)
326  -------------------------------------
327 *//**
328 
329  \param[in] *fp PDB File pointer
330  \return Linked list of SEQRES records
331 
332  Used by ReadSeqresPDB() to read the SEQRES records into a linked list.
333 
334 - 14.10.96 Original By: ACRM
335 - 07.07.14 Use bl prefix for functions By: CTP
336 */
337 static STRINGLIST *RdSeqRes(FILE *fp)
338 {
339  static STRINGLIST *seqres = NULL;
340  char buffer[MAXBUFF];
341 
342  while(fgets(buffer, MAXBUFF, fp))
343  {
344  if(!strncmp(buffer,"SEQRES",6))
345  {
346  if((seqres = blStoreString(seqres, buffer)) == NULL)
347  {
348  FREELIST(seqres, STRINGLIST);
349  return(NULL);
350  }
351  }
352  }
353 
354  return(seqres);
355 }
356 
357 /************************************************************************/
358 /*>static STRINGLIST *RdSeqResHeader(WHOLEPDB *wpdb)
359  -------------------------------------------------
360 *//**
361 
362  \param[in] *fp PDB File pointer
363  \return Linked list of SEQRES records
364 
365  Used by ReadSeqresPDB() to read the SEQRES records into a linked list.
366 
367 - 26.02.15 Original based on RdSeqRes() By: ACRM
368 */
369 static STRINGLIST *RdSeqResHeader(WHOLEPDB *wpdb)
370 {
371  STRINGLIST *seqres = NULL,
372  *s;
373 
374  for(s=wpdb->header; s!=NULL; NEXT(s))
375  {
376  if(!strncmp(s->string,"SEQRES",6))
377  {
378  if((seqres = blStoreString(seqres, s->string)) == NULL)
379  {
380  FREELIST(seqres, STRINGLIST);
381  return(NULL);
382  }
383  }
384  }
385 
386  return(seqres);
387 }
388 
#define NULL
Definition: array2.c:99
char ** blReadSeqresWholePDB(WHOLEPDB *wpdb, int *nchains)
Definition: RdSeqPDB.c:228
char ** blReadSeqresPDB(FILE *fp, int *nchains)
Definition: RdSeqPDB.c:113
STRINGLIST * header
Definition: pdb.h:375
Definition: pdb.h:372
#define NEXT(x)
Definition: macros.h:249
Useful macros.
int fsscanf(char *buffer, char *format,...)
Definition: fsscanf.c:177
Header file for sequence handling.
Include file for fsscanf()
char blThrone(char *three)
Definition: throne.c:153
STRINGLIST * blStoreString(STRINGLIST *StringList, char *string)
Definition: StoreString.c:131
Header file for general purpose routines.
#define CHAINMATCH(chain1, chain2)
Definition: pdb.h:495
#define FREELIST(y, z)
Definition: macros.h:264
char * string
Definition: general.h:85
#define MAXBUFF
Definition: RdSeqPDB.c:78