Bioplib
Protein Structure C Library
 All Data Structures Files Functions Variables Typedefs Macros Pages
ReadPIR.c
Go to the documentation of this file.
1 /************************************************************************/
2 /**
3 
4  \file ReadPIR.c
5 
6  \version V2.8
7  \date 07.07.14
8  \brief Read a PIR sequence file
9 
10  \copyright (c) UCL / Dr. Andrew C. R. Martin 1991-2014
11  \author Dr. Andrew C. R. Martin
12  \par
13  Institute of Structural & Molecular Biology,
14  University College London,
15  Gower Street,
16  London.
17  WC1E 6BT.
18  \par
19  andrew@bioinf.org.uk
20  andrew.martin@ucl.ac.uk
21 
22 **************************************************************************
23 
24  This code is NOT IN THE PUBLIC DOMAIN, but it may be copied
25  according to the conditions laid out in the accompanying file
26  COPYING.DOC.
27 
28  The code may be modified as required, but any modifications must be
29  documented so that the person responsible can be identified.
30 
31  The code may not be sold commercially or included as part of a
32  commercial product except as described in the file COPYING.DOC.
33 
34 **************************************************************************
35 
36  Description:
37  ============
38 
39 
40 **************************************************************************
41 
42  Usage:
43  ======
44 
45 \code
46  int blReadPIR(FILE *fp, BOOL DoInsert, char **seqs, int maxchain,
47  SEQINFO *seqinfo, BOOL *punct, BOOL *error)
48 \endcode
49 
50  This version attempts to read any PIR file following the PIR
51  specifications. It also accepts a few non-standard features:
52  lower case sequence, no star at end of last chain, dashes in the
53  sequence to indicate insertions.
54 
55  See also:
56  - int blSimpleReadPIR(FILE *fp, int maxres, char **seqs)
57  - int blReadRawPIR(FILE *fp, BOOL DoInsert, char **seqs, int maxchain,
58  SEQINFO *seqinfo, BOOL *punct, BOOL *error)
59 
60 **************************************************************************
61 
62  Revision History:
63  =================
64 - V1.0 01.06.92 Original
65 - V2.0 08.03.94 Changed name of ReadPIR() to ReadSimplePIR()
66  Added new ReadPIR().
67 - V2.1 18.03.94 getc() -> fgetc()
68 - V2.2 11.05.94 Changes to ReadPIR() for better compatibility with
69  PIR V38.0 and V39.0
70 - V2.3 28.02.95 Added ReadRawPIR()
71 - V2.4 13.03.95 Fixed bug in reading text lines in ReadRawPIR()
72 - V2.5 26.07.95 Removed unused variables
73 - V2.6 30.10.95 Cosmetic
74 - V2.7 06.02.96 Removes trailing spaces from comment line
75 - V2.8 07.07.14 Use bl prefix for functions By: CTP
76 
77 *************************************************************************/
78 /* Doxygen
79  -------
80  #GROUP Handling Sequence Data
81  #SUBGROUP File IO
82  This is an all-singing, all-dancing PIR reader
83 */
84 /************************************************************************/
85 /* Includes
86 */
87 #include <string.h>
88 #include <stdio.h>
89 #include <stdlib.h>
90 #include <ctype.h>
91 
92 #include "SysDefs.h"
93 #include "macros.h"
94 #include "seq.h"
95 
96 /************************************************************************/
97 /* Defines and macros
98 */
99 
100 /************************************************************************/
101 /* Globals
102 */
103 
104 /************************************************************************/
105 /* Prototypes
106 */
107 
108 /************************************************************************/
109 /*>int blReadPIR(FILE *fp, BOOL DoInsert, char **seqs, int maxchain,
110  SEQINFO *seqinfo, BOOL *punct, BOOL *error)
111  ------------------------------------------------------------------
112 *//**
113 
114  \param[in] *fp File pointer
115  \param[in] DoInsert TRUE Read - characters into the sequence
116  FALSE Skip - characters
117  \param[in] maxchain Max number of chains to read. This is the
118  dimension of the seqs array.
119  N.B. THIS SHOULD BE AT LEAST 1 MORE THAN
120  THE EXPECTED MAXIMUM NUMBER OF SEQUENCES
121  \param[out] **seqs Array of character pointers which will
122  be filled in with sequence information.
123  Memory will be allocated for any sequence
124  length.
125  \param[out] *seqinfo This structure will be filled in with
126  extra information about the sequence.
127  Header & title information and details
128  of any punctuation.
129  \param[out] *punct TRUE if any punctuation found.
130  \param[out] *error TRUE if an error occured (e.g. memory
131  allocation)
132  \return Number of chains in this sequence.
133  0 if file ended, or no valid sequence
134  entries found.
135 
136  This is an all-singing, all-dancing PIR reader which should handle
137  all legal PIR files and some (slightly) incorrect ones. The only
138  requirements of the code are that the PIR file should have 2 title
139  lines per entry, the first line starting with a > sign.
140 
141  The routine will handle multiple sequence files. Successive calls
142  will return information on the next entry. The routine will return
143  0 when there are no more entries.
144 
145  Header line: Must start with >. Will handle files which don't have
146  the proper P1; or F1; parts of the header as well as those which
147  do.
148 
149  Title line: Will read the name and source fields if correctly
150  separated by a -, otherwise copies all information into the name.
151 
152  Sequence: May contain allowed puctuation. This will set the punct
153  flag and information on the types found will be placed in seqinfo.
154  White space and line breaks are ignored. Each chain should end with
155  a *, but the routine will accept the last chain of an entry with no
156  *. While the standard requires upper case text, this routine will
157  handle lower case and convert it to upper case. While the routine
158  does pretty well at last chains not terminated with a *, a last
159  chain ending with a / not followed by a * but followed by a text
160  line will be identified as incomplete rather than truncated.
161  If the DoInsert flag is set, - signs in the sequence will be
162  read as part of the sequence, otherwise they will be skipped. This
163  is an addition to the PIR standard.
164 
165  Text lines: Text lines after an entry (beginning with R;, C;, A;,
166  N; or F;) are ignored.
167 
168 - 02.03.94 Original By: ACRM
169 - 03.03.94 Added / and = handling, upcasing, strcpy()->strncpy(),
170  header lines without semi-colon, title lines without -
171 - 07.03.94 Added sequence insertion handling and DoInsert parameter.
172 - 11.05.94 buffer is now 504 characters (V38.0 spec allows 500 chars)
173  Removes leading spaces from entry code and terminates at
174  first space (V39.0 spec allows comments after the code).
175 - 28.02.95 Added check that buffer doesn't overflow. Check on nseq
176  changed to >=
177 - 06.02.96 Removes trailing spaces from comment line
178 - 07.07.14 Use bl prefix for functions By: CTP
179 */
180 int blReadPIR(FILE *fp, BOOL DoInsert, char **seqs, int maxchain,
181  SEQINFO *seqinfo, BOOL *punct, BOOL *error)
182 {
183  int ch,
184  i,
185  chpos,
186  nseq = 0,
187  ArraySize,
188  SeqPos;
189  char buffer[504],
190  *ptr;
191  BOOL InParen,
192  GotStar;
193 
194  /* Initialise error and punct outputs */
195  *error = FALSE;
196  *punct = FALSE;
197 
198  /* Initialise seqinfo structure */
199  if(seqinfo != NULL)
200  {
201  seqinfo->code[0] = '\0';
202  seqinfo->name[0] = '\0';
203  seqinfo->source[0] = '\0';
204  seqinfo->fragment = FALSE;
205  seqinfo->paren = FALSE;
206  seqinfo->DotInParen = FALSE;
207  seqinfo->NonExpJoin = FALSE;
208  seqinfo->UnknownPos = FALSE;
209  seqinfo->Incomplete = FALSE;
210  seqinfo->Juxtapose = FALSE;
211  seqinfo->Truncated = FALSE;
212  }
213 
214  /* Skip over any characters until the first > sign */
215  while((ch=fgetc(fp)) != EOF && ch != '>') ;
216 
217  /* Check for end of file */
218  if(ch==EOF) return(0);
219 
220  /* Read the rest of this line into a buffer */
221  i = 0;
222  while((ch=fgetc(fp)) != EOF && ch != '\n' && i<503)
223  buffer[i++] = (char)ch;
224  buffer[i] = '\0';
225 
226  /* Check for end of file */
227  if(ch==EOF) return(0);
228 
229  /* Set information in the seqinfo structure */
230  if(seqinfo != NULL)
231  {
232  /* Fragment flag */
233  if(buffer[2] == ';' && buffer[0] == 'F')
234  seqinfo->fragment = TRUE;
235  else
236  seqinfo->fragment = FALSE;
237 
238  /* Entry code */
239  if(buffer[2] == ';')
240  {
241  KILLLEADSPACES(ptr,(buffer+3));
242  }
243  else
244  {
245  KILLLEADSPACES(ptr,buffer);
246  }
247 
248  strncpy(seqinfo->code, ptr, 16);
249  seqinfo->code[15] = '\0';
250 
251  /* Terminate entry code at first space since comments are allowed
252  after the entry code (V39.0 spec)
253  */
254  for(i=0; seqinfo->code[i]; i++)
255  {
256  if(seqinfo->code[i] == ' ' || seqinfo->code[i] == '\t')
257  {
258  seqinfo->code[i] = '\0';
259  break;
260  }
261  }
262  }
263 
264  /* Now read the title line */
265  if(!fgets(buffer,240,fp))
266  return(0);
267  buffer[240] = '\0';
268 
269  /* 06.02.96 Remove any trailing spaces */
270  KILLTRAILSPACES(buffer);
271 
272  /* Set information in the seqinfo structure */
273  if(seqinfo)
274  {
275  TERMINATE(buffer);
276  /* If it's a fully legal PIR file, there will be a - in the midle
277  of the title line to separate name from source. If we don't
278  find one, we copy the whole line into the name
279  */
280  if((ptr = strstr(buffer," - ")) != NULL)
281  {
282  *ptr = '\0';
283  strncpy(seqinfo->source, ptr+3, 160);
284  seqinfo->source[159] = '\0';
285  }
286  strncpy(seqinfo->name, buffer, 160);
287  seqinfo->name[159] = '\0';
288  /* 06.02.96 Remove any trailing spaces */
289  KILLTRAILSPACES(seqinfo->name);
290  }
291 
292  /* Read the actual sequence info. */
293  chpos = 0;
294  for(;;)
295  {
296  GotStar = FALSE;
297  InParen = FALSE;
298 
299  /* Allocate some space for the sequence */
300  ArraySize = ALLOCSIZE;
301  if((seqs[nseq] = (char *)malloc(ArraySize * sizeof(char)))==NULL)
302  {
303  *error = TRUE;
304  return(0);
305  }
306 
307  SeqPos = 0;
308 
309  /* Read characters, storing sequence and handling any
310  punctuation
311  */
312  while((ch = fgetc(fp)) != EOF && ch != '*' && ch != '>')
313  {
314  chpos++;
315 
316  if(isalpha(ch) || (ch == '-' && DoInsert))
317  {
318  /* This is a sequence entry (probably!) */
319  seqs[nseq][SeqPos++] = (isupper(ch) ? ch : toupper(ch));
320 
321  /* If necessary, expand the sequence array */
322  if(SeqPos >= ArraySize)
323  {
324  ArraySize += ALLOCSIZE;
325  seqs[nseq] = (char *)realloc((void *)(seqs[nseq]),
326  ArraySize);
327  if(seqs[nseq] == NULL)
328  {
329  *error = TRUE;
330  return(0);
331  }
332  }
333  }
334  else if(ch == '/')
335  {
336  /* Sequence is incomplete or truncated */
337  *punct = TRUE;
338 
339  if(seqinfo != NULL)
340  {
341  if(SeqPos == 0) /* It's the first character in a chain */
342  {
343 
344  seqinfo->Truncated = TRUE;
345  }
346  else /* Not first, is it last? */
347  {
348  /* Skip spaces and newlines till we get the next real
349  character
350  */
351  while((ch = fgetc(fp)) != EOF &&
352  (ch == ' ' || ch == '\t' || ch == '\n')) ;
353  /* Replace the character in the input stream */
354  ungetc(ch,fp);
355 
356  if(ch == '*' ||
357  ch == EOF ||
358  ch == '>') /* End of chain */
359  seqinfo->Truncated = TRUE;
360  else /* Middle of chain */
361  seqinfo->Incomplete = TRUE;
362  }
363  }
364  }
365  else if(ch == '=')
366  {
367  /* Parts of the sequence may be juxtaposed */
368  *punct = TRUE;
369  if(seqinfo != NULL) seqinfo->Juxtapose = TRUE;
370  }
371  else if(ch == '(')
372  {
373  /* Start of a region in parentheses */
374  InParen = TRUE;
375  *punct = TRUE;
376  if(seqinfo != NULL) seqinfo->paren = TRUE;
377  }
378  else if(ch == ')')
379  {
380  /* End of region in parentheses */
381  InParen = FALSE;
382  *punct = TRUE;
383  if(seqinfo != NULL) seqinfo->paren = TRUE;
384  }
385  else if(ch == '.')
386  {
387  *punct = TRUE;
388 
389  if(InParen)
390  {
391  /* Previous aa >90% certain in position */
392  if(seqinfo != NULL) seqinfo->DotInParen = TRUE;
393  }
394  else
395  {
396  /* Join in sequence not known experimentally but is clear
397  from sequence homology.
398  */
399  if(seqinfo != NULL) seqinfo->NonExpJoin = TRUE;
400  }
401  }
402  else if(ch == ',')
403  {
404  /* Position of previous aa not known with confidence */
405  if(seqinfo != NULL) seqinfo->UnknownPos = TRUE;
406  }
407  else if(ch == '\n')
408  {
409  /* Start of new line, relevant to check on ; */
410  chpos = 0;
411  }
412  else if(ch == ';' && chpos == 2)
413  {
414  /* This is a text line, so the previous character wasn't
415  a sequence item
416  */
417  SeqPos--;
418 
419  /* Ignore the rest of this line and reset chpos */
420  while((ch = fgetc(fp))!=EOF && ch != '\n') ;
421  chpos = 0;
422  }
423  } /* Reading this sequence */
424 
425  /* Test the exit conditions from the read character loop */
426  if(ch == '*')
427  {
428  /* End of chain */
429  seqs[nseq][SeqPos] = '\0';
430  GotStar = TRUE;
431  if(++nseq >= maxchain)
432  {
433  *error = TRUE;
434  return(nseq);
435  }
436  }
437  else if(ch == '>')
438  {
439  /* Start of new entry */
440  ungetc(ch,fp);
441  break; /* Out of read for this sequence */
442  }
443  else if(ch == EOF)
444  {
445  /* End of file */
446  break; /* Out of read for this sequence */
447  }
448  } /* Loop on with this sequence (next chain) */
449 
450 
451  /* Now tidy up if we have an unfinished sequence */
452  if(!GotStar)
453  {
454  seqs[nseq][SeqPos] = '\0';
455  if(!strlen(seqs[nseq]))
456  free(seqs[nseq]);
457  else
458  nseq++;
459  }
460 
461  return(nseq);
462 }
463 
464 
char code[16]
Definition: seq.h:100
short BOOL
Definition: SysDefs.h:64
#define NULL
Definition: array2.c:99
BOOL UnknownPos
Definition: seq.h:92
#define KILLTRAILSPACES(x)
Definition: macros.h:414
BOOL fragment
Definition: seq.h:92
#define FALSE
Definition: macros.h:223
Definition: seq.h:90
Useful macros.
#define TERMINATE(x)
Definition: macros.h:366
Header file for sequence handling.
int blReadPIR(FILE *fp, BOOL DoInsert, char **seqs, int maxchain, SEQINFO *seqinfo, BOOL *punct, BOOL *error)
Definition: ReadPIR.c:180
BOOL Incomplete
Definition: seq.h:92
BOOL DotInParen
Definition: seq.h:92
BOOL NonExpJoin
Definition: seq.h:92
#define TRUE
Definition: macros.h:219
BOOL paren
Definition: seq.h:92
char source[160]
Definition: seq.h:100
#define KILLLEADSPACES(y, x)
Definition: macros.h:408
System-type variable type definitions.
BOOL Juxtapose
Definition: seq.h:92
char name[160]
Definition: seq.h:100
#define ALLOCSIZE
BOOL Truncated
Definition: seq.h:92