Bioplib
Protein Structure C Library
 All Data Structures Files Functions Variables Typedefs Macros Pages
Data Structures | Macros | Functions | Variables
seq.h File Reference

Header file for sequence handling. More...

#include "MathType.h"
#include "SysDefs.h"
#include "pdb.h"
#include "hash.h"
#include "deprecated.h"

Go to the source code of this file.

Data Structures

struct  SEQINFO
 

Macros

#define blMAXPIRLABEL   160
 
#define ALLOCSIZE
 
#define blPDB2Seq(x)   blDoPDB2Seq((x), FALSE, FALSE, FALSE)
 
#define blPDB2SeqX(x)   blDoPDB2Seq((x), TRUE, FALSE, FALSE)
 
#define blPDB2SeqNoX(x)   blDoPDB2Seq((x), FALSE, FALSE, TRUE)
 
#define blPDB2SeqXNoX(x)   blDoPDB2Seq((x), TRUE, FALSE, TRUE)
 
#define blPDBProt2Seq(x)   blDoPDB2Seq((x), FALSE, TRUE, FALSE)
 
#define blPDBProt2SeqX(x)   blDoPDB2Seq((x), TRUE, TRUE, FALSE)
 
#define blPDBProt2SeqNoX(x)   blDoPDB2Seq((x), FALSE, TRUE, TRUE)
 
#define blPDBProt2SeqXNoX(x)   blDoPDB2Seq((x), TRUE, TRUE, TRUE)
 
#define blPDB2SeqByChain(x)   blDoPDB2SeqByChain((x), FALSE, FALSE, FALSE)
 
#define blPDB2SeqXByChain(x)   blDoPDB2SeqByChain((x), TRUE, FALSE, FALSE)
 
#define blPDB2SeqNoXByChain(x)   blDoPDB2SeqByChain((x), FALSE, FALSE, TRUE)
 
#define blPDB2SeqXNoXByChain(x)   blDoPDB2SeqByChain((x), TRUE, FALSE, TRUE)
 
#define blPDBProt2SeqByChain(x)   blDoPDB2SeqByChain((x), FALSE, TRUE, FALSE)
 
#define blPDBProt2SeqXByChain(x)   blDoPDB2SeqByChain((x), TRUE, TRUE, FALSE)
 
#define blPDBProt2SeqNoXByChain(x)   blDoPDB2SeqByChain((x), FALSE, TRUE, TRUE)
 
#define blPDBProt2SeqXNoXByChain(x)   blDoPDB2SeqByChain((x), TRUE, TRUE, TRUE)
 
#define _SEQ_H_DEPRECATED
 

Functions

char blThrone (char *three)
 
char blThronex (char *three)
 
char * blOnethr (char one)
 
char * blDoPDB2Seq (PDB *pdb, BOOL DoAsxGlx, BOOL ProtOnly, BOOL NoX)
 
HASHTABLE * blDoPDB2SeqByChain (PDB *pdb, BOOL DoAsxGlx, BOOL ProtOnly, BOOL NoX)
 
int blSplitSeq (char *LinearSeq, char **seqs)
 
int blReadSimplePIR (FILE *fp, int maxres, char **seqs)
 
int blReadPIR (FILE *fp, BOOL DoInsert, char **seqs, int maxchain, SEQINFO *seqinfo, BOOL *punct, BOOL *error)
 
int blReadRawPIR (FILE *fp, char **seqs, int maxchain, BOOL upcase, SEQINFO *seqinfo, BOOL *error)
 
int blAlign (char *seq1, int length1, char *seq2, int length2, BOOL verbose, BOOL identity, int penalty, char *align1, char *align2, int *align_len)
 
int blAffinealign (char *seq1, int length1, char *seq2, int length2, BOOL verbose, BOOL identity, int penalty, int penext, char *align1, char *align2, int *align_len)
 
int blCalcMDMScore (char resa, char resb)
 
int blAffinealignuc (char *seq1, int length1, char *seq2, int length2, BOOL verbose, BOOL identity, int penalty, int penext, char *align1, char *align2, int *align_len)
 
int blCalcMDMScoreUC (char resa, char resb)
 
BOOL blReadMDM (char *mdmfile)
 
int blZeroMDM (void)
 
char blDNAtoAA (char *dna)
 
int blTrueSeqLen (char *sequence)
 
int blKnownSeqLen (char *sequence)
 
BOOL blNumericReadMDM (char *mdmfile)
 
int blNumericCalcMDMScore (int resa, int resb)
 
int blNumericAffineAlign (int *seq1, int length1, int *seq2, int length2, BOOL verbose, BOOL identity, int penalty, int penext, int *align1, int *align2, int *align_len)
 
void blSetMDMScoreWeight (char resa, char resb, REAL weight)
 
void blWriteOneStringPIR (FILE *out, char *label, char *title, char *sequence, char **chains, BOOL ByChain, BOOL doFasta)
 

Variables

BOOL gBioplibSeqNucleicAcid
 

Detailed Description

Header file for sequence handling.

Version
V2.16
Date
30.11.15
Author
Dr. Andrew C. R. Martin
Institute of Structural & Molecular Biology, University College London, Gower Street, London. WC1E 6BT.
andre.nosp@m.w@bi.nosp@m.oinf..nosp@m.org..nosp@m.uk andre.nosp@m.w.ma.nosp@m.rtin@.nosp@m.ucl..nosp@m.ac.uk

This code is NOT IN THE PUBLIC DOMAIN, but it may be copied according to the conditions laid out in the accompanying file COPYING.DOC.

The code may be modified as required, but any modifications must be documented so that the person responsible can be identified.

The code may not be sold commercially or included as part of a commercial product except as described in the file COPYING.DOC.

Description:

Usage:

Revision History:

Definition in file seq.h.

Macro Definition Documentation

#define _SEQ_H_DEPRECATED

Definition at line 168 of file seq.h.

#define ALLOCSIZE
Value:
80 /* ReadPIR() uses this as a chunk size for
allocating memory
*/

Definition at line 86 of file seq.h.

#define blMAXPIRLABEL   160

Definition at line 85 of file seq.h.

#define blPDB2Seq (   x)    blDoPDB2Seq((x), FALSE, FALSE, FALSE)

Definition at line 107 of file seq.h.

#define blPDB2SeqByChain (   x)    blDoPDB2SeqByChain((x), FALSE, FALSE, FALSE)

Definition at line 117 of file seq.h.

#define blPDB2SeqNoX (   x)    blDoPDB2Seq((x), FALSE, FALSE, TRUE)

Definition at line 109 of file seq.h.

#define blPDB2SeqNoXByChain (   x)    blDoPDB2SeqByChain((x), FALSE, FALSE, TRUE)

Definition at line 119 of file seq.h.

#define blPDB2SeqX (   x)    blDoPDB2Seq((x), TRUE, FALSE, FALSE)

Definition at line 108 of file seq.h.

#define blPDB2SeqXByChain (   x)    blDoPDB2SeqByChain((x), TRUE, FALSE, FALSE)

Definition at line 118 of file seq.h.

#define blPDB2SeqXNoX (   x)    blDoPDB2Seq((x), TRUE, FALSE, TRUE)

Definition at line 110 of file seq.h.

#define blPDB2SeqXNoXByChain (   x)    blDoPDB2SeqByChain((x), TRUE, FALSE, TRUE)

Definition at line 120 of file seq.h.

#define blPDBProt2Seq (   x)    blDoPDB2Seq((x), FALSE, TRUE, FALSE)

Definition at line 112 of file seq.h.

#define blPDBProt2SeqByChain (   x)    blDoPDB2SeqByChain((x), FALSE, TRUE, FALSE)

Definition at line 122 of file seq.h.

#define blPDBProt2SeqNoX (   x)    blDoPDB2Seq((x), FALSE, TRUE, TRUE)

Definition at line 114 of file seq.h.

#define blPDBProt2SeqNoXByChain (   x)    blDoPDB2SeqByChain((x), FALSE, TRUE, TRUE)

Definition at line 124 of file seq.h.

#define blPDBProt2SeqX (   x)    blDoPDB2Seq((x), TRUE, TRUE, FALSE)

Definition at line 113 of file seq.h.

#define blPDBProt2SeqXByChain (   x)    blDoPDB2SeqByChain((x), TRUE, TRUE, FALSE)

Definition at line 123 of file seq.h.

#define blPDBProt2SeqXNoX (   x)    blDoPDB2Seq((x), TRUE, TRUE, TRUE)

Definition at line 115 of file seq.h.

#define blPDBProt2SeqXNoXByChain (   x)    blDoPDB2SeqByChain((x), TRUE, TRUE, TRUE)

Definition at line 125 of file seq.h.

Function Documentation

int blAffinealign ( char *  seq1,
int  length1,
char *  seq2,
int  length2,
BOOL  verbose,
BOOL  identity,
int  penalty,
int  penext,
char *  align1,
char *  align2,
int *  align_len 
)
Parameters
[in] *seq1: First sequence
[in] length1: First sequence length
[in] *seq2: Second sequence
[in] length2: Second sequence length
[in] verbose: Display N&W matrix
[in] identity: Use identity matrix
[in] penalty: Gap insertion penalty value
[in] penext: Extension penalty
[out] *align1: Sequence 1 aligned
[out] *align2: Sequence 2 aligned
[out] *align_len: Alignment length
Returns
Alignment score (0 on error)

Perform simple N&W alignment of seq1 and seq2. No window is used, so will be slow for long sequences.

Note that you must allocate sufficient memory for the aligned sequences. The easy way to do this is to ensure that align1 and align2 are of length (length1+length2).

  • 07.10.92 Adapted from original written while at NIMR
  • 08.10.92 Split into separate routines
  • 09.10.92 Changed best structure to simple integers, moved SearchForBest() into TraceBack()
  • 21.08.95 Was only filling in the bottom right cell at initialisation rather than all the right hand column and bottom row
  • 11.07.96 Changed calls to calcscore() to CalcMDMScore()
  • 06.03.00 Changed name to affinealign() (the routine align() is provided as a backwards compatible wrapper). Added penext parameter. Now supports affine gap penalties with separate opening and extension penalties. The code now maintains the path as it goes.
  • 07.07.14 Use bl prefix for functions By: CTP
      NOTE: ANY CHANGES SHOULD BE PROPAGATED TO affinealignuc()   ******

Definition at line 275 of file align.c.

int blAffinealignuc ( char *  seq1,
int  length1,
char *  seq2,
int  length2,
BOOL  verbose,
BOOL  identity,
int  penalty,
int  penext,
char *  align1,
char *  align2,
int *  align_len 
)
Parameters
[in] *seq1: First sequence
[in] length1: First sequence length
[in] *seq2: Second sequence
[in] length2: Second sequence length
[in] verbose: Display N&W matrix
[in] identity: Use identity matrix
[in] penalty: Gap insertion penalty value
[in] penext: Extension penalty
[out] *align1: Sequence 1 aligned
[out] *align2: Sequence 2 aligned
[out] *align_len: Alignment length
Returns
Alignment score (0 on error)

Perform simple N&W alignment of seq1 and seq2. No window is used, so will be slow for long sequences.

Note that you must allocate sufficient memory for the aligned sequences. The easy way to do this is to ensure that align1 and align2 are of length (length1+length2).

  • 07.10.92 Adapted from original written while at NIMR
  • 08.10.92 Split into separate routines
  • 09.10.92 Changed best structure to simple integers, moved SearchForBest() into TraceBack()
  • 21.08.95 Was only filling in the bottom right cell at initialisation rather than all the right hand column and bottom row
  • 11.07.96 Changed calls to calcscore() to CalcMDMScore()
  • 06.03.00 Changed name to affinealign() (the routine align() is provided as a backwards compatible wrapper). Added penext parameter. Now supports affine gap penalties with separate opening and extension penalties. The code now maintains the path as it goes.
  • 27.02.07 Exactly as affinealign() but upcases characters before comparison
  • 07.07.14 Use bl prefix for functions By: CTP
      NOTE: ANY CHANGES SHOULD BE PROPAGATED TO affinealign()    ******

Definition at line 583 of file align.c.

int blAlign ( char *  seq1,
int  length1,
char *  seq2,
int  length2,
BOOL  verbose,
BOOL  identity,
int  penalty,
char *  align1,
char *  align2,
int *  align_len 
)
Parameters
[in] *seq1: First sequence
[in] length1: First sequence length
[in] *seq2: Second sequence
[in] length2: Second sequence length
[in] verbose: Display N&W matrix
[in] identity: Use identity matrix
[in] penalty: Gap insertion penalty value
[out] *align1: Sequence 1 aligned
[out] *align2: Sequence 2 aligned
[out] *align_len: Alignment length
Returns
Alignment score (0 on error)

Perform simple N&W alignment of seq1 and seq2. No window is used, so will be slow for long sequences.

A single gap penalty is used, so gap extension incurs no further penalty.

Note that you must allocate sufficient memory for the aligned sequences. The easy way to do this is to ensure that align1 and align2 are of length (length1+length2).

  • 06.03.00 Implemented as a wrapper to affinealign() which is the old align() routine, plus support for affine gap penalties, plus new traceback code based on storing the path as we go
  • 07.07.14 Use bl prefix for functions By: CTP

Definition at line 214 of file align.c.

int blCalcMDMScore ( char  resa,
char  resb 
)
Parameters
[in] resa: First residue
[in] resb: Second residue
Returns
score

Calculate score from static globally stored mutation data matrix

If both residues are set as '\0' it will simply silence all warnings

  • 07.10.92 Adapted from NIMR-written original
  • 24.11.94 Only gives 10 warnings
  • 28.02.95 Modified to use sMDMSize
  • 24.08.95 If a residue was not found was doing an out-of-bounds array reference causing a potential core dump
  • 11.07.96 Name changed from calcscore() and now non-static
  • 07.07.14 Use bl prefix for functions By: CTP
  • 04.01.16 Added special call with both residues set to '\0' to silence warnings. Also warnings now go to stderr

Definition at line 1220 of file align.c.

int blCalcMDMScoreUC ( char  resa,
char  resb 
)
Parameters
[in] resa: First residue
[in] resb: Second residue
Returns
score

Calculate score from static globally stored mutation data matrix

  • 07.10.92 Adapted from NIMR-written original
  • 24.11.94 Only gives 10 warnings
  • 28.02.95 Modified to use sMDMSize
  • 24.08.95 If a residue was not found was doing an out-of-bounds array reference causing a potential core dump
  • 11.07.96 Name changed from calcscore() and now non-static
  • 27.02.07 As CalcMDMScore() but upcases characters before comparison
  • 07.07.14 Use bl prefix for functions By: CTP
  • 04.01.16 Added special call with both residues set to '\0' to silence warnings. Also warnings now go to stderr

Definition at line 1293 of file align.c.

char blDNAtoAA ( char *  dna)
Parameters
[in] *dna: DNA/RNA codon
Returns
1-letter amino acid code (X=termination)

Converts a nucleic acid codon to the 1-letter amino acid equivalent. Termination codons are returned as X. No special action is taken for initiation codons.

  • 18.04.94 Original By: ACRM
  • 07.07.14 Use bl prefix for functions By: CTP

Definition at line 110 of file DNAtoAA.c.

char* blDoPDB2Seq ( PDB *  pdb,
BOOL  DoAsxGlx,
BOOL  ProtOnly,
BOOL  NoX 
)
Parameters
[in] *pdb: PDB linked list
[in] DoAsxGlx: Handle Asx and Glx as B and Z rather than X
[in] ProtOnly: Don't do DNA/RNA; these simply don't get done rather than being handled as X
[in] NoX: Skip amino acids which would be assigned as X
Returns
Allocated character array containing sequence

malloc()'s an array containing the 1-letter sequence corresponding to an input PDB linked list. Returns NULL if given a NULL parameter or memory allocation fails. Puts *'s in the sequence for multi-chains.

This routine is normally called via the macro interfaces: blPDB2Seq(pdb), blPDB2SeqX(pdb), blPDBProt2Seq(pdb), blPDBProt2SeqX(pdb). Those with Prot in their names handle protein only; those with X handle Asx/Glx as B/Z rather than as X.

  • 29.09.92 Original By: ACRM
  • 07.06.93 Corrected allocation.
  • 18.06.93 Handles multi-chains and skips NTER and CTER residues
  • 13.05.94 Check for chain change before copy residue (!) (Bug reported by Bob MacCullum)
  • 19.07.95 Added check for ATOM records
  • 24.01.96 Returns blank string (rather than core dumping!) if the linked list contained no ATOM records
  • 26.08.97 Changed to doPDB2Seq with extra parameters (DoAsxGlx & ProtOnly). The old calling forms have now become macros
  • 02.10.00 Added NoX
  • 10.06.05 Changed the initialization of rescount, resnum, etc. so it correctly points to the first residue. This solves a bug with CA-only chains where it was undercounting by 1
  • 04.02.14 Use CHAINMATCH By: CTP
  • 07.07.14 Use bl prefix for functions By: CTP

Definition at line 146 of file PDB2Seq.c.

HASHTABLE* blDoPDB2SeqByChain ( PDB *  pdb,
BOOL  DoAsxGlx,
BOOL  ProtOnly,
BOOL  NoX 
)
Parameters
[in] *pdb: PDB linked list
[in] DoAsxGlx: Handle Asx and Glx as B and Z rather than X
[in] ProtOnly: Don't do DNA/RNA; these simply don't get done rather than being handled as X
[in] NoX: Skip amino acids which would be assigned as X
Returns
A hash of 1-letter code sequences indexed by chain label

Reads sequence from ATOM records in 1-letter code, storing the results in a hash indexed by chain label.

This routine is normally called via the macro interfaces: blPDB2SeqByChain(pdb), blPDB2SeqXByChain(pdb), blPDBProt2SeqByChain(pdb), blPDBProt2SeqXByChain(pdb). Those with Prot in their names handle protein only; those with X handle Asx/Glx as B/Z rather than as X.

Definition at line 294 of file PDB2Seq.c.

int blKnownSeqLen ( char *  sequence)
Parameters
[in] *sequence: A sequence containing deletions
Returns
Length without deletions

Scans a 1-letter code sequence and calculates the length without `-', ` ' or '?' residues

  • 13.05.94 Original By: ACRM
  • 07.07.14 Use bl prefix for functions By: CTP

Definition at line 107 of file KnownSeqLen.c.

int blNumericAffineAlign ( int *  seq1,
int  length1,
int *  seq2,
int  length2,
BOOL  verbose,
BOOL  identity,
int  penalty,
int  penext,
int *  align1,
int *  align2,
int *  align_len 
)
Parameters
[in] *seq1: First sequence of tokens
[in] length1: First sequence length
[in] *seq2: Second sequence of tokens
[in] length2: Second sequence length
[in] verbose: Display N&W matrix
[in] identity: Use identity matrix
[in] penalty: Gap insertion penalty value
[in] penext: Extension penalty
[out] *align1: Sequence 1 aligned
[out] *align2: Sequence 2 aligned
[out] *align_len: Alignment length
Returns
Alignment score (0 on error)

Perform simple N&W alignment of seq1 and seq2. No window is used, so will be slow for long sequences.

The sequences come as integer arrays containing numeric tokens

Note that you must allocate sufficient memory for the aligned sequences. The easy way to do this is to ensure that align1 and align2 are of length (length1+length2).

Identical to align.c/affinealign(), but uses integer arrays

  • 08.03.00 Original based on align.c/affinealign() 06.03.00 By: ACRM
  • 07.07.14 Use bl prefix for functions By: CTP

Definition at line 412 of file NumericAlign.c.

int blNumericCalcMDMScore ( int  resa,
int  resb 
)
Parameters
[in] resa: First token
[in] resb: Second token
Returns
score

Calculate score from static globally stored mutation data matrix

Identical to align.c/CalcMDMScore(), but uses a different static score array and takes integer parameters. These are used as direct lookups into the score array rather than being searched.

  • 08.03.00 Original based on align.c/CalcMDMScore() 11.07.96 By: ACRM
  • 07.07.14 Use bl prefix for functions By: CTP

Definition at line 342 of file NumericAlign.c.

BOOL blNumericReadMDM ( char *  mdmfile)
Parameters
[in] *mdmfile: Mutation data matrix filename
Returns
Success?

Read mutation data matrix into static global arrays. The matrix may have comments at the start introduced with a ! in the first column. The matrix must be complete (i.e. a triangular matrix will not work). A line describing the residue types must appear, and may be placed before or after the matrix itself

Identical to align.c/ReadMDM() but reads into a different static 2D array and doesn't read a symbol identifier line from the file as the symbols are numeric and always start from 1 (0 is used as the insert character)

  • 08.03.00 Original based on align.c/ReadMDM() 26.07.95 By: ACRM
  • 06.02.03 Fixed for new version of GetWord()
  • 07.07.14 Use bl prefix for functions By: CTP

Definition at line 258 of file NumericAlign.c.

char* blOnethr ( char  one)
Parameters
[in] one: One letter code
Returns
Three letter code (padded to 4 chars with a space)

Converts 1-letter code to 3-letter code (actually as 4 chars).

  • 07.06.93 Original By: ACRM
  • 25.07.95 If the gBioplibSeqNucleicAcid flag is set, assumes nucleic acids rather than amino acids
  • 03.02.09 Fixed nucleic search - j was incrementing instead of decrementing!
  • 07.07.14 Use bl prefix for functions By: CTP

Definition at line 223 of file throne.c.

BOOL blReadMDM ( char *  mdmfile)
Parameters
[in] *mdmfile: Mutation data matrix filename
Returns
Success?

Read mutation data matrix into static global arrays. The matrix may have comments at the start introduced with a ! in the first column. The matrix must be complete (i.e. a triangular matrix will not work). A line describing the residue types must appear, and may be placed before or after the matrix itself

  • 07.10.92 Original
  • 18.03.94 getc() -> fgetc()
  • 24.11.94 Automatically looks in DATAENV if not found in current directory
  • 28.02.95 Modified to read any size MDM and allow comments Also allows the list of aa types before or after the actual matrix
  • 26.07.95 Removed unused variables
  • 06.02.03 Fixed for new version of GetWord()
  • 07.04.09 Completely re-written to allow it to read BLAST style matrix files as well as the ones used previously Allow comments introduced with # as well as ! Uses MAXWORD rather than hardcoded 16
  • 07.07.14 Use bl prefix for functions By: CTP

Definition at line 871 of file align.c.

int blReadPIR ( FILE *  fp,
BOOL  DoInsert,
char **  seqs,
int  maxchain,
SEQINFO *  seqinfo,
BOOL *  punct,
BOOL *  error 
)
Parameters
[in] *fp: File pointer
[in] DoInsert: TRUE: read - characters into the sequence; FALSE: skip - characters
[in] maxchain: Max number of chains to read. This is the dimension of the seqs array. N.B. THIS SHOULD BE AT LEAST 1 MORE THAN THE EXPECTED MAXIMUM NUMBER OF SEQUENCES
[out] **seqs: Array of character pointers which will be filled in with sequence information. Memory will be allocated for any sequence length.
[out] *seqinfo: This structure will be filled in with extra information about the sequence. Header & title information and details of any punctuation.
[out] *punct: TRUE if any punctuation found.
[out] *error: TRUE if an error occurred (e.g. memory allocation)
Returns
Number of chains in this sequence. 0 if file ended, or no valid sequence entries found.

This is an all-singing, all-dancing PIR reader which should handle all legal PIR files and some (slightly) incorrect ones. The only requirements of the code are that the PIR file should have 2 title lines per entry, the first line starting with a > sign.

The routine will handle multiple sequence files. Successive calls will return information on the next entry. The routine will return 0 when there are no more entries.

Header line: Must start with >. Will handle files which don't have the proper P1; or F1; parts of the header as well as those which do.

Title line: Will read the name and source fields if correctly separated by a -, otherwise copies all information into the name.

Sequence: May contain allowed punctuation. This will set the punct flag and information on the types found will be placed in seqinfo. White space and line breaks are ignored. Each chain should end with a *, but the routine will accept the last chain of an entry with no *. While the standard requires upper case text, this routine will handle lower case and convert it to upper case. While the routine does pretty well at last chains not terminated with a *, a last chain ending with a / not followed by a * but followed by a text line will be identified as incomplete rather than truncated. If the DoInsert flag is set, - signs in the sequence will be read as part of the sequence, otherwise they will be skipped. This is an addition to the PIR standard.

Text lines: Text lines after an entry (beginning with R;, C;, A;, N; or F;) are ignored.

  • 02.03.94 Original By: ACRM
  • 03.03.94 Added / and = handling, upcasing, strcpy()->strncpy(), header lines without semi-colon, title lines without -
  • 07.03.94 Added sequence insertion handling and DoInsert parameter.
  • 11.05.94 buffer is now 504 characters (V38.0 spec allows 500 chars) Removes leading spaces from entry code and terminates at first space (V39.0 spec allows comments after the code).
  • 28.02.95 Added check that buffer doesn't overflow. Check on nseq changed to >=
  • 06.02.96 Removes trailing spaces from comment line
  • 07.07.14 Use bl prefix for functions By: CTP

Definition at line 180 of file ReadPIR.c.

int blReadRawPIR ( FILE *  fp,
char **  seqs,
int  maxchain,
BOOL  upcase,
SEQINFO *  seqinfo,
BOOL *  error 
)
Parameters
[in] *fp: File pointer
[in] maxchain: Max number of chains to read. This is the dimension of the seqs array. N.B. THIS SHOULD BE AT LEAST 1 MORE THAN THE EXPECTED MAXIMUM NUMBER OF SEQUENCES
[in] upcase: Should lower-case letters be upcased?
[out] **seqs: Array of character pointers which will be filled in with sequence information. Memory will be allocated for any sequence length.
[out] *seqinfo: This structure will be filled in with extra information about the sequence. Header & title information and details of any punctuation.
[out] *error: TRUE if an error occurred (e.g. memory allocation)
Returns
Number of chains in this sequence. 0 if file ended, or no valid sequence entries found.

This is based on ReadPIR(), but reads all characters into the sequence arrays (i.e. all punctuation characters are read as is). This is useful when punctuation has been used to indicate consensus sequence features.

The only requirements of the code are that the PIR file should have 2 title lines per entry, the first line starting with a > sign. The routine will handle multiple sequence files. Successive calls will return information on the next entry. The routine will return 0 when there are no more entries.

Header line: Must start with >. Will handle files which don't have the proper P1; or F1; parts of the header as well as those which do.

Title line: Will read the name and source fields if correctly separated by a -, otherwise copies all information into the name.

White space and line breaks are ignored. Each chain should end with a *, but the routine will accept the last chain of an entry with no *. While the standard requires upper case text, this routine will handle lower case and convert it to upper case. While the routine does pretty well at last chains not terminated with a *, a last chain ending with a / not followed by a * but followed by a text line will be identified as incomplete rather than truncated. If the DoInsert flag is set, - signs in the sequence will be read as part of the sequence, otherwise they will be skipped. This is an addition to the PIR standard.

Text lines: Text lines after an entry (beginning with R;, C;, A;, N; or F;) are ignored.

  • 28.02.95 Original based on ReadPIR() By: ACRM
  • 13.03.95 chpos++ had got moved wrongly when adapting from ReadPIR(). Put it back fixing handling of text lines.
  • 26.07.95 Removed unused variables
  • 06.02.96 Remove any trailing spaces

Definition at line 169 of file ReadRawPIR.c.

int blReadSimplePIR ( FILE *  fp,
int  maxres,
char **  seqs 
)
Parameters
[in] *fp: File pointer
[in] maxres: Max number of residues in chain.
[out] **seqs: Array of pointers to sequences
Returns
Number of chains. 0 if error

Read a PIR file containing multiple chains of up to maxres amino acids. Each chain is returned in seqs[]. The number of chains is returned by the routine. 0 is returned if a memory allocation failed

  • 01.06.91 Original
  • 03.03.94 Added check on case before toupper(). Changed name.
  • 18.03.94 Changed getc() to fgetc()
  • 07.07.14 Use bl prefix for functions By: CTP

Definition at line 121 of file ReadSimplePIR.c.

void blSetMDMScoreWeight ( char  resa,
char  resb,
REAL  weight 
)
Parameters
[in] resa: First residue
[in] resb: Second residue
[in] weight: Weight to apply

Apply a weight to a particular amino acid substitution

  • 26.08.14 Original By: ACRM

Definition at line 1408 of file align.c.

int blSplitSeq ( char *  LinearSeq,
char **  seqs 
)
Parameters
[in] *LinearSeq: Array containing sequence with chains terminated by *'s
[out] **seqs: Allocated set of character arrays containing one chain per array
Returns
Number of chains found

Splits a sequence stored as a linear array with each chain separated by a * into an array of sequences. Returns the number of chains found.

  • 18.06.93 Original By: ACRM
  • 09.07.93 Cleans up properly if allocation failed
  • 07.09.94 Sequence space was being allocated one too small
  • 07.07.14 Use bl prefix for functions By: CTP

Definition at line 115 of file SplitSeq.c.

char blThrone ( char *  three)
Parameters
[in] *three: Three letter code
Returns
One letter code

Converts 3-letter code to 1-letter code. Handles ASX and GLX as X

  • 29.09.92 Original By: ACRM
  • 11.03.94 Modified to handle ASX and GLX in the tables
  • 25.07.95 Added handling of gBioplibSeqNucleicAcid
  • 07.07.14 Use bl prefix for functions By: CTP

Definition at line 153 of file throne.c.

char blThronex ( char *  three)
Parameters
[in] *three: Three letter code
Returns
One letter code

Converts 3-letter code to 1-letter code. Handles ASX and GLX as B and Z.

  • 29.09.92 Original By: ACRM
  • 25.07.95 Added handling of gBioplibSeqNucleicAcid
  • 07.07.14 Use bl prefix for functions By: CTP

Definition at line 188 of file throne.c.

int blTrueSeqLen ( char *  sequence)
Parameters
[in] *sequence: A sequence containing deletions
Returns
Length without deletions

Scans a 1-letter code sequence and calculates the length without `-' or ` ' residues

  • 14.04.94 Original By: ACRM
  • 07.07.14 Use bl prefix for functions By: CTP

Definition at line 106 of file TrueSeqLen.c.

void blWriteOneStringPIR ( FILE *  out,
char *  label,
char *  title,
char *  sequence,
char **  chainLabels,
BOOL  ByChain,
BOOL  doFasta 
)
Parameters
[in] *out: File pointer
[in] *label: Sequence label
[in] *title: Sequence title
[in] *sequence: Sequence (1-letter code) with chains separated by *
[in] **chainLabels: Chain labels (may be set to NULL unless ByChain is set)
[in] ByChain: Print a separate header for each chain
[in] doFasta: Output FASTA format instead of PIR

Writes a PIR sequence file from a 1-letter code sequence. Multiple chains are split with '*'. If ByChain is set then the chainLabels array must be non-NULL and contain labels for each chain. Adds a terminating * if required.

  • 10.05.94 Original By: ACRM
  • 22.08.97 Can now handle chains separately
  • 26.08.97 If chains are handled separately, don't bother writing out an empty chain
  • 10.08.98 Basically a total rewrite to fix a bug which caused the header not to be printed with -c -p for a chain after one which was non-protein. Much simplified the code by printing the header at the beginning of a chain rather than end of previous chain.
  • 18.10.00 Added code to write FASTA as well
  • 11.06.15 Reset count=0 after a * - tidies up the output!

Definition at line 116 of file WritePIR.c.

int blZeroMDM ( void  )
Returns
Maximum value in modified matrix

Modifies all values in the MDM such that the minimum value is 0

  • 17.09.96 Original
  • 07.07.14 Use bl prefix for functions By: CTP

Definition at line 1358 of file align.c.

Variable Documentation

BOOL gBioplibSeqNucleicAcid

Definition at line 130 of file throne.c.