00001
00002
00003
00004
00005
00006 #include "common.h"
00007 #include "errabort.h"
00008 #include "hash.h"
00009 #include "portable.h"
00010 #include "dnautil.h"
00011 #include "dnaseq.h"
00012 #include "fa.h"
00013 #include "linefile.h"
00014
00015 static char const rcsid[] = "$Id: fa.c,v 1.37 2007/02/20 18:50:21 kent Exp $";
00016
00017 boolean faReadNext(FILE *f, char *defaultName, boolean mustStartWithComment,
00018 char **retCommentLine, struct dnaSeq **retSeq)
00019
00020
00021
00022
00023
00024 {
00025 return faReadMixedNext(f, 0, defaultName, mustStartWithComment,
00026 retCommentLine, retSeq);
00027 }
00028
00029 boolean faReadMixedNext(FILE *f, boolean preserveCase, char *defaultName,
00030 boolean mustStartWithComment, char **retCommentLine, struct dnaSeq **retSeq)
00031
00032
00033
00034
00035 {
00036 char lineBuf[1024];
00037 int lineSize;
00038 char *words[1];
00039 int c;
00040 off_t offset = ftello(f);
00041 size_t dnaSize = 0;
00042 DNA *dna, *sequence;
00043 char *name = defaultName;
00044
00045 if (name == NULL)
00046 name = "";
00047 dnaUtilOpen();
00048 if (retCommentLine != NULL)
00049 *retCommentLine = NULL;
00050 *retSeq = NULL;
00051
00052
00053 for (;;)
00054 {
00055 if(fgets(lineBuf, sizeof(lineBuf), f) == NULL)
00056 {
00057 if (ferror(f))
00058 errnoAbort("read of fasta file failed");
00059 return FALSE;
00060 }
00061 lineSize = strlen(lineBuf);
00062 if (lineBuf[0] == '>')
00063 {
00064 if (retCommentLine != NULL)
00065 *retCommentLine = cloneString(lineBuf);
00066 offset = ftello(f);
00067 chopByWhite(lineBuf, words, ArraySize(words));
00068 name = words[0]+1;
00069 break;
00070 }
00071 else if (!mustStartWithComment)
00072 {
00073 if (fseeko(f, offset, SEEK_SET) < 0)
00074 errnoAbort("fseek on fasta file failed");
00075 break;
00076 }
00077 else
00078 offset += lineSize;
00079 }
00080
00081 for (;;)
00082 {
00083 c = fgetc(f);
00084 if (c == EOF || c == '>')
00085 break;
00086 if (isalpha(c))
00087 {
00088 ++dnaSize;
00089 }
00090 }
00091
00092 if (dnaSize == 0)
00093 {
00094 warn("Invalid fasta format: sequence size == 0 for element %s",name);
00095 }
00096
00097
00098 dna = sequence = needHugeMem(dnaSize+1);
00099 if (fseeko(f, offset, SEEK_SET) < 0)
00100 errnoAbort("fseek on fasta file failed");
00101 for (;;)
00102 {
00103 c = fgetc(f);
00104 if (c == EOF || c == '>')
00105 break;
00106 if (isalpha(c))
00107 {
00108
00109 if (ntChars[c] == 0)
00110 {
00111 *dna++ = preserveCase ? 'N' : 'n';
00112 }
00113 else
00114 {
00115 *dna++ = preserveCase ? c : ntChars[c];
00116 }
00117 }
00118 }
00119 if (c == '>')
00120 ungetc(c, f);
00121 *dna = 0;
00122
00123 *retSeq = newDnaSeq(sequence, dnaSize, name);
00124 if (ferror(f))
00125 errnoAbort("read of fasta file failed");
00126 return TRUE;
00127 }
00128
00129
00130 struct dnaSeq *faReadOneDnaSeq(FILE *f, char *defaultName, boolean mustStartWithComment)
00131
00132
00133 {
00134 struct dnaSeq *seq;
00135 if (!faReadNext(f, defaultName, mustStartWithComment, NULL, &seq))
00136 return NULL;
00137 else
00138 return seq;
00139 }
00140
00141 static bioSeq *nextSeqFromMem(char **pText, boolean isDna, boolean doFilter)
00142
00143
00144 {
00145 char *name = "";
00146 char *s, *d;
00147 struct dnaSeq *seq;
00148 int size = 0;
00149 char c;
00150 char *filter = (isDna ? ntChars : aaChars);
00151 char *text = *pText;
00152 char *p = skipLeadingSpaces(text);
00153 if (p == NULL)
00154 return NULL;
00155 dnaUtilOpen();
00156 if (*p == '>')
00157 {
00158 char *end;
00159 s = strchr(p, '\n');
00160 if (s != NULL) ++s;
00161 name = skipLeadingSpaces(p+1);
00162 end = skipToSpaces(name);
00163 if (end >= s || name >= s)
00164 errAbort("No name in line starting with '>'");
00165 if (end != NULL)
00166 *end = 0;
00167 }
00168 else
00169 {
00170 s = p;
00171 if (s == NULL || s[0] == 0)
00172 return NULL;
00173 }
00174 name = cloneString(name);
00175
00176 d = text;
00177 if (s != NULL)
00178 {
00179 for (;;)
00180 {
00181 c = *s;
00182 if (c == 0 || c == '>')
00183 break;
00184 ++s;
00185 if (!isalpha(c))
00186 continue;
00187 if (doFilter)
00188 {
00189 if ((c = filter[(int)c]) == 0)
00190 {
00191 if (isDna)
00192 c = 'n';
00193 else
00194 c = 'X';
00195 }
00196 }
00197 d[size++] = c;
00198 }
00199 }
00200 d[size] = 0;
00201
00202
00203 AllocVar(seq);
00204 seq->name = name;
00205 seq->dna = text;
00206 seq->size = size;
00207 *pText = s;
00208 return seq;
00209 }
00210
00211 bioSeq *faNextSeqFromMemText(char **pText, boolean isDna)
00212
00213
00214 {
00215 return nextSeqFromMem(pText, isDna, TRUE);
00216 }
00217
00218 bioSeq *faNextSeqFromMemTextRaw(char **pText)
00219
00220
00221 {
00222 return nextSeqFromMem(pText, TRUE, FALSE);
00223 }
00224
00225 bioSeq *faSeqListFromMemText(char *text, boolean isDna)
00226
00227 {
00228 bioSeq *seqList = NULL, *seq;
00229 while ((seq = faNextSeqFromMemText(&text, isDna)) != NULL)
00230 {
00231 slAddHead(&seqList, seq);
00232 }
00233 slReverse(&seqList);
00234 return seqList;
00235 }
00236
00237 bioSeq *faSeqListFromMemTextRaw(char *text)
00238
00239
00240 {
00241 bioSeq *seqList = NULL, *seq;
00242 while ((seq = faNextSeqFromMemTextRaw(&text)) != NULL)
00243 {
00244 slAddHead(&seqList, seq);
00245 }
00246 slReverse(&seqList);
00247 return seqList;
00248 }
00249
00250
00251 bioSeq *faSeqFromMemText(char *text, boolean isDna)
00252
00253 {
00254 return faNextSeqFromMemText(&text, isDna);
00255 }
00256
00257 struct dnaSeq *faFromMemText(char *text)
00258
00259
00260
00261
00262
00263 {
00264 return faNextSeqFromMemText(&text, TRUE);
00265 }
00266
00267 struct dnaSeq *faReadSeq(char *fileName, boolean isDna)
00268
00269 {
00270 int maxSize = fileSize(fileName);
00271 int fd;
00272 DNA *s;
00273
00274 if (maxSize < 0)
00275 errAbort("can't open %s", fileName);
00276 s = needHugeMem(maxSize+1);
00277 fd = open(fileName, O_RDONLY);
00278 read(fd, s, maxSize);
00279 close(fd);
00280 s[maxSize] = 0;
00281 return faSeqFromMemText(s, isDna);
00282 }
00283
00284 struct dnaSeq *faReadDna(char *fileName)
00285
00286 {
00287 return faReadSeq(fileName, TRUE);
00288 }
00289
00290 struct dnaSeq *faReadAa(char *fileName)
00291
00292 {
00293 return faReadSeq(fileName, FALSE);
00294 }
00295
00296 static unsigned faFastBufSize = 0;
00297 static DNA *faFastBuf;
00298
00299 static void expandFaFastBuf(int bufPos, int minExp)
00300
00301 {
00302 if (faFastBufSize == 0)
00303 {
00304 faFastBufSize = 64 * 1024;
00305 while (minExp > faFastBufSize)
00306 faFastBufSize <<= 1;
00307 faFastBuf = needHugeMem(faFastBufSize);
00308 }
00309 else
00310 {
00311 DNA *newBuf;
00312 unsigned newBufSize = faFastBufSize + faFastBufSize;
00313 while (newBufSize < minExp)
00314 {
00315 newBufSize <<= 1;
00316 if (newBufSize <= 0)
00317 errAbort("expandFaFastBuf: integer overflow when trying to "
00318 "increase buffer size from %u to a min of %u.",
00319 faFastBufSize, minExp);
00320 }
00321 newBuf = needHugeMem(newBufSize);
00322 memcpy(newBuf, faFastBuf, bufPos);
00323 freeMem(faFastBuf);
00324 faFastBuf = newBuf;
00325 faFastBufSize = newBufSize;
00326 }
00327 }
00328
00329 void faFreeFastBuf()
00330
00331 {
00332 freez(&faFastBuf);
00333 faFastBufSize = 0;
00334 }
00335
00336 boolean faFastReadNext(FILE *f, DNA **retDna, int *retSize, char **retName)
00337
00338
00339
00340 {
00341 int c;
00342 int bufIx = 0;
00343 static char name[256];
00344 int nameIx = 0;
00345 boolean gotSpace = FALSE;
00346
00347
00348 dnaUtilOpen();
00349 name[0] = 0;
00350 for (;;)
00351 {
00352 if ((c = fgetc(f)) == EOF)
00353 {
00354 *retDna = NULL;
00355 *retSize = 0;
00356 *retName = NULL;
00357 return FALSE;
00358 }
00359 if (!gotSpace && nameIx < ArraySize(name)-1)
00360 {
00361 if (isspace(c))
00362 gotSpace = TRUE;
00363 else if (c != '>')
00364 {
00365 name[nameIx++] = c;
00366 }
00367 }
00368 if (c == '\n')
00369 break;
00370 }
00371 name[nameIx] = 0;
00372
00373 for (;;)
00374 {
00375 c = fgetc(f);
00376 if (c == EOF || c == '>')
00377 c = 0;
00378 else if (!isalpha(c))
00379 continue;
00380 else
00381 {
00382 c = ntChars[c];
00383 if (c == 0) c = 'n';
00384 }
00385 if (bufIx >= faFastBufSize)
00386 expandFaFastBuf(bufIx, 0);
00387 faFastBuf[bufIx++] = c;
00388 if (c == 0)
00389 {
00390 *retDna = faFastBuf;
00391 *retSize = bufIx-1;
00392 *retName = name;
00393 return TRUE;
00394 }
00395 }
00396 }
00397
00398
00399 void faWriteNext(FILE *f, char *startLine, DNA *dna, int dnaSize)
00400
00401 {
00402 if (dnaSize == 0)
00403 return;
00404 if (startLine != NULL)
00405 fprintf(f, ">%s\n", startLine);
00406 writeSeqWithBreaks(f, dna, dnaSize, 50);
00407 }
00408
00409 void faWrite(char *fileName, char *startLine, DNA *dna, int dnaSize)
00410
00411 {
00412 FILE *f = mustOpen(fileName, "w");
00413 faWriteNext(f, startLine, dna, dnaSize);
00414 if (fclose(f) != 0)
00415 errnoAbort("fclose failed");
00416 }
00417
00418 void faWriteAll(char *fileName, bioSeq *seqList)
00419
00420 {
00421 FILE *f = mustOpen(fileName, "w");
00422 bioSeq *seq;
00423
00424 for (seq=seqList; seq != NULL; seq = seq->next)
00425 faWriteNext(f, seq->name, seq->dna, seq->size);
00426 if (fclose(f) != 0)
00427 errnoAbort("fclose failed");
00428 }
00429
00430 boolean faMixedSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName)
00431
00432
00433 {
00434 char c;
00435 int bufIx = 0;
00436 static char name[512];
00437 int lineSize, i;
00438 char *line;
00439
00440 dnaUtilOpen();
00441
00442
00443
00444 name[0] = 0;
00445 if (!lineFileNext(lf, &line, &lineSize))
00446 {
00447 *retDna = NULL;
00448 *retSize = 0;
00449 return FALSE;
00450 }
00451 if (line[0] == '>')
00452 {
00453 line = firstWordInLine(skipLeadingSpaces(line+1));
00454 if (line == NULL)
00455 errAbort("Expecting sequence name after '>' line %d of %s", lf->lineIx, lf->fileName);
00456 strncpy(name, line, sizeof(name));
00457 name[sizeof(name)-1] = '\0';
00458 }
00459 else
00460 {
00461 errAbort("Expecting '>' line %d of %s", lf->lineIx, lf->fileName);
00462 }
00463
00464 for (;;)
00465 {
00466 if (!lineFileNext(lf, &line, &lineSize))
00467 break;
00468 if (line[0] == '>')
00469 {
00470 lineFileReuse(lf);
00471 break;
00472 }
00473 if (bufIx + lineSize >= faFastBufSize)
00474 expandFaFastBuf(bufIx, lineSize);
00475 for (i=0; i<lineSize; ++i)
00476 {
00477 c = line[i];
00478 if (isalpha(c) || c == '-')
00479 faFastBuf[bufIx++] = c;
00480 }
00481 }
00482 if (bufIx >= faFastBufSize)
00483 expandFaFastBuf(bufIx, 0);
00484 faFastBuf[bufIx] = 0;
00485 *retDna = faFastBuf;
00486 *retSize = bufIx;
00487 *retName = name;
00488 if (bufIx == 0)
00489 {
00490 warn("Invalid fasta format: sequence size == 0 for element %s",name);
00491 }
00492
00493 return TRUE;
00494 }
00495
00496 void faToProtein(char *poly, int size)
00497
00498
00499
00500 {
00501 int i;
00502 char c;
00503 dnaUtilOpen();
00504 for (i=0; i<size; ++i)
00505 {
00506 if ((c = aaChars[(int)poly[i]]) == 0)
00507 c = 'X';
00508 poly[i] = c;
00509 }
00510 }
00511
00512 void faToDna(char *poly, int size)
00513
00514
00515
00516 {
00517 int i;
00518 char c;
00519 dnaUtilOpen();
00520 for (i=0; i<size; ++i)
00521 {
00522 if ((c = ntChars[(int)poly[i]]) == 0)
00523 c = 'n';
00524 poly[i] = c;
00525 }
00526 }
00527
00528 boolean faSomeSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName, boolean isDna)
00529
00530 {
00531 char *poly;
00532 int size;
00533
00534 if (!faMixedSpeedReadNext(lf, retDna, retSize, retName))
00535 return FALSE;
00536 size = *retSize;
00537 poly = *retDna;
00538 if (isDna)
00539 faToDna(poly, size);
00540 else
00541 faToProtein(poly, size);
00542 return TRUE;
00543 }
00544
00545 boolean faPepSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName)
00546
00547 {
00548 return faSomeSpeedReadNext(lf, retDna, retSize, retName, FALSE);
00549 }
00550
00551 boolean faSpeedReadNext(struct lineFile *lf, DNA **retDna, int *retSize, char **retName)
00552
00553
00554
00555
00556 {
00557 return faSomeSpeedReadNext(lf, retDna, retSize, retName, TRUE);
00558 }
00559
00560 static struct dnaSeq *faReadAllMixableInLf(struct lineFile *lf,
00561 boolean isDna, boolean mixed)
00562
00563
00564
00565 {
00566 struct dnaSeq *seqList = NULL, *seq;
00567 DNA *dna;
00568 char *name;
00569 int size;
00570 boolean ok;
00571
00572 for (;;)
00573 {
00574 if (mixed)
00575 ok = faMixedSpeedReadNext(lf, &dna, &size, &name);
00576 else
00577 ok = faSomeSpeedReadNext(lf, &dna, &size, &name, isDna);
00578 if (!ok)
00579 break;
00580 AllocVar(seq);
00581 seq->name = cloneString(name);
00582 seq->size = size;
00583 seq->dna = cloneMem(dna, size+1);
00584 slAddHead(&seqList, seq);
00585 }
00586 slReverse(&seqList);
00587 faFreeFastBuf();
00588 return seqList;
00589 }
00590
00591 static struct dnaSeq *faReadAllSeqMixable(char *fileName, boolean isDna, boolean mixed)
00592
00593
00594
00595 {
00596 struct lineFile *lf = lineFileOpen(fileName, FALSE);
00597 struct dnaSeq *seqList = faReadAllMixableInLf(lf, isDna, mixed);
00598 lineFileClose(&lf);
00599 return seqList;
00600 }
00601
00602 struct hash *faReadAllIntoHash(char *fileName, enum dnaCase dnaCase)
00603
00604 {
00605 boolean isDna = (dnaCase == dnaLower);
00606 boolean isMixed = (dnaCase == dnaMixed);
00607 struct dnaSeq *seqList = faReadAllSeqMixable(fileName, isDna, isMixed);
00608 struct hash *hash = hashNew(18);
00609 struct dnaSeq *seq;
00610 for (seq = seqList; seq != NULL; seq = seq->next)
00611 {
00612 if (hashLookup(hash, seq->name))
00613 errAbort("%s duplicated in %s", seq->name, fileName);
00614 hashAdd(hash, seq->name, seq);
00615 }
00616 return hash;
00617 }
00618
00619
00620 struct dnaSeq *faReadAllSeq(char *fileName, boolean isDna)
00621
00622 {
00623 return faReadAllSeqMixable(fileName, isDna, FALSE);
00624 }
00625
00626 struct dnaSeq *faReadAllDna(char *fileName)
00627
00628 {
00629 return faReadAllSeq(fileName, TRUE);
00630 }
00631
00632 struct dnaSeq *faReadAllPep(char *fileName)
00633
00634 {
00635 return faReadAllSeq(fileName, FALSE);
00636 }
00637
00638 struct dnaSeq *faReadAllMixed(char *fileName)
00639
00640 {
00641 return faReadAllSeqMixable(fileName, FALSE, TRUE);
00642 }
00643
00644 struct dnaSeq *faReadAllMixedInLf(struct lineFile *lf)
00645
00646 {
00647 return faReadAllMixableInLf(lf, FALSE, TRUE);
00648 }