lib/dnaseq.c

Go to the documentation of this file.
00001 /* dnaSeq.c - stuff to manage DNA sequences. 
00002  *
00003  * This file is copyright 2002 Jim Kent, but license is hereby
00004  * granted for all use - public, private or commercial. */
00005 
00006 #include "common.h"
00007 #include "dnaseq.h"
00008 #include "bits.h"
00009 #include "hash.h"
00010 #include "obscure.h"
00011 
00012 static char const rcsid[] = "$Id: dnaseq.c,v 1.19 2005/11/10 03:40:34 kent Exp $";
00013 
00014 
00015 struct dnaSeq *newDnaSeq(DNA *dna, int size, char *name)
00016 /* Create a new DNA seq. */
00017 {
00018 struct dnaSeq *seq;
00019 
00020 seq = needMem(sizeof(*seq));
00021 if (name != NULL)
00022     seq->name = cloneString(name);
00023 seq->dna = dna;
00024 seq->size = size;
00025 seq->mask = NULL;
00026 return seq;
00027 }
00028 
00029 struct dnaSeq *cloneDnaSeq(struct dnaSeq *orig)
00030 /* Duplicate dna sequence in RAM. */
00031 {
00032 struct dnaSeq *seq = CloneVar(orig);
00033 seq->name = cloneString(seq->name);
00034 seq->dna = needHugeMem(seq->size+1);
00035 memcpy(seq->dna, orig->dna, seq->size+1);
00036 seq->mask = NULL;
00037 if (orig->mask != NULL)
00038     {
00039     seq->mask = bitClone(orig->mask, seq->size);
00040     }
00041 return seq;
00042 }
00043 
00044 void freeDnaSeq(struct dnaSeq **pSeq)
00045 /* Free up DNA seq. (And unlink underlying resource node.) */
00046 {
00047 struct dnaSeq *seq = *pSeq;
00048 if (seq == NULL)
00049     return;
00050 freeMem(seq->name);
00051 freeMem(seq->dna);
00052 bitFree(&seq->mask);
00053 freez(pSeq);
00054 }
00055 
00056 void freeDnaSeqList(struct dnaSeq **pSeqList)
00057 /* Free up list of DNA sequences. */
00058 {
00059 struct dnaSeq *seq, *next;
00060 
00061 for (seq = *pSeqList; seq != NULL; seq = next)
00062     {
00063     next = seq->next;
00064     freeDnaSeq(&seq);
00065     }
00066 *pSeqList = NULL;
00067 }
00068 
00069 boolean seqIsLower(bioSeq *seq)
00070 /* Return TRUE if sequence is all lower case. */
00071 {
00072 int size = seq->size, i;
00073 char *poly = seq->dna;
00074 for (i=0; i<size; ++i)
00075     if (!islower(poly[i]))
00076         return FALSE;
00077 return TRUE;
00078 }
00079 
00080 boolean seqIsDna(bioSeq *seq)
00081 /* Make educated guess whether sequence is DNA or protein. */
00082 {
00083 return isDna(seq->dna, seq->size);
00084 }
00085 
00086 
00087 aaSeq *translateSeqN(struct dnaSeq *inSeq, unsigned offset, unsigned inSize, boolean stop)
00088 /* Return a translated sequence.  Offset is position of first base to
00089  * translate. If size is 0 then use length of inSeq. */
00090 {
00091 aaSeq *seq;
00092 DNA *dna = inSeq->dna;
00093 AA *pep, aa;
00094 int i, lastCodon;
00095 int actualSize = 0;
00096 
00097 assert(offset <= inSeq->size);
00098 if ((inSize == 0) || (inSize > (inSeq->size - offset)))
00099     inSize = inSeq->size - offset;
00100 lastCodon = offset + inSize - 3;
00101 
00102 AllocVar(seq);
00103 seq->dna = pep = needLargeMem(inSize/3+1);
00104 for (i=offset; i <= lastCodon; i += 3)
00105     {
00106     aa = lookupCodon(dna+i);
00107     if (aa == 0)
00108         {
00109         if (stop)
00110             break;
00111         else
00112             aa = 'Z';
00113         }
00114     *pep++ = aa;
00115     ++actualSize;
00116     }
00117 *pep = 0;
00118 assert(actualSize <= inSize/3+1);
00119 seq->size = actualSize;
00120 seq->name = cloneString(inSeq->name);
00121 return seq;
00122 }
00123 
00124 aaSeq *translateSeq(struct dnaSeq *inSeq, unsigned offset, boolean stop)
00125 /* Return a translated sequence.  Offset is position of first base to
00126  * translate. If stop is TRUE then stop at first stop codon.  (Otherwise 
00127  * represent stop codons as 'Z'). */
00128 {
00129 return translateSeqN(inSeq, offset, 0, stop);
00130 }
00131 
00132 bioSeq *whichSeqIn(bioSeq **seqs, int seqCount, char *letters)
00133 /* Figure out which if any sequence letters is in. */
00134 {
00135 aaSeq *seq;
00136 int i;
00137 
00138 for (i=0; i<seqCount; ++i)
00139     {
00140     seq = seqs[i];
00141     if (seq->dna <= letters && letters < seq->dna + seq->size)
00142         return seq;
00143     }
00144 internalErr();
00145 return NULL;
00146 }
00147 
00148 Bits *maskFromUpperCaseSeq(bioSeq *seq)
00149 /* Allocate a mask for sequence and fill it in based on
00150  * sequence case. */
00151 {
00152 int size = seq->size, i;
00153 char *poly = seq->dna;
00154 Bits *b = bitAlloc(size);
00155 for (i=0; i<size; ++i)
00156     {
00157     if (isupper(poly[i]))
00158         bitSetOne(b, i);
00159     }
00160 return b;
00161 }
00162 
00163 struct hash *dnaSeqHash(struct dnaSeq *seqList)
00164 /* Return hash of sequences keyed by name. */
00165 {
00166 int size = slCount(seqList)+1;
00167 int sizeLog2 = digitsBaseTwo(size);
00168 struct hash *hash = hashNew(sizeLog2);
00169 struct dnaSeq *seq;
00170 for (seq = seqList; seq != NULL; seq = seq->next)
00171     hashAddUnique(hash, seq->name, seq);
00172 return hash;
00173 }
00174 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2