jkOwnLib/gfPcrLib.c

Go to the documentation of this file.
00001 /* gfPcrLib - Routines to help do in silico PCR.
00002  * Copyright 2004-2005 Jim Kent.  All rights reserved. */
00003 
00004 #include "common.h"
00005 #include "linefile.h"
00006 #include "hash.h"
00007 #include "options.h"
00008 #include "dystring.h"
00009 #include "fa.h"
00010 #include "net.h"
00011 #include "genoFind.h"
00012 #include "sqlNum.h"
00013 #include "gfInternal.h"
00014 #include "gfPcrLib.h"
00015 
00016 static char const rcsid[] = "$Id: gfPcrLib.c,v 1.8 2006/03/15 18:36:16 angie Exp $";
00017 
00018 /**** Input and Output Handlers *****/
00019 
00020 void gfPcrOutputFree(struct gfPcrOutput **pOut)
00021 /* Free up a gfPcrOutput structure. */
00022 {
00023 struct gfPcrOutput *out = *pOut;
00024 if (pOut != NULL)
00025     {
00026     freeMem(out->name);
00027     freeMem(out->fPrimer);
00028     freeMem(out->rPrimer);
00029     freeMem(out->seqName);
00030     freeMem(out->dna);
00031     freez(pOut);
00032     }
00033 }
00034 
00035 void gfPcrOutputFreeList(struct gfPcrOutput **pList)
00036 /* Free up a list of gfPcrOutput structures. */
00037 {
00038 struct gfPcrOutput *el, *next;
00039 
00040 for (el = *pList; el != NULL; el = next)
00041     {
00042     next = el->next;
00043     gfPcrOutputFree(&el);
00044     }
00045 *pList = NULL;
00046 }
00047 
00048 void gfPcrInputStaticLoad(char **row, struct gfPcrInput *ret)
00049 /* Load a row from gfPcrInput table into ret.  The contents of ret will
00050  * be replaced at the next call to this function. */
00051 {
00052 ret->name = row[0];
00053 ret->fPrimer = row[1];
00054 ret->rPrimer = row[2];
00055 }
00056 
00057 struct gfPcrInput *gfPcrInputLoad(char **row)
00058 /* Load a gfPcrInput from row fetched with select * from gfPcrInput
00059  * from database.  Dispose of this with gfPcrInputFree(). */
00060 {
00061 struct gfPcrInput *ret;
00062 
00063 AllocVar(ret);
00064 ret->name = cloneString(row[0]);
00065 ret->fPrimer = cloneString(row[1]);
00066 ret->rPrimer = cloneString(row[2]);
00067 return ret;
00068 }
00069 
00070 struct gfPcrInput *gfPcrInputLoadAll(char *fileName) 
00071 /* Load all gfPcrInput from a whitespace-separated file.
00072  * Dispose of this with gfPcrInputFreeList(). */
00073 {
00074 struct gfPcrInput *list = NULL, *el;
00075 struct lineFile *lf = lineFileOpen(fileName, TRUE);
00076 char *row[3];
00077 
00078 while (lineFileRow(lf, row))
00079     {
00080     el = gfPcrInputLoad(row);
00081     slAddHead(&list, el);
00082     }
00083 lineFileClose(&lf);
00084 slReverse(&list);
00085 return list;
00086 }
00087 
00088 
00089 void gfPcrInputFree(struct gfPcrInput **pEl)
00090 /* Free a single dynamically allocated gfPcrInput such as created
00091  * with gfPcrInputLoad(). */
00092 {
00093 struct gfPcrInput *el;
00094 
00095 if ((el = *pEl) == NULL) return;
00096 freeMem(el->name);
00097 freeMem(el->fPrimer);
00098 freeMem(el->rPrimer);
00099 freez(pEl);
00100 }
00101 
00102 void gfPcrInputFreeList(struct gfPcrInput **pList)
00103 /* Free a list of dynamically allocated gfPcrInput's */
00104 {
00105 struct gfPcrInput *el, *next;
00106 
00107 for (el = *pList; el != NULL; el = next)
00108     {
00109     next = el->next;
00110     gfPcrInputFree(&el);
00111     }
00112 *pList = NULL;
00113 }
00114 
00115 
00116 
00117 /**** Handle Refinement (after Index has given us approximate position) *****/
00118 
00119 static boolean goodMatch(char *a, char *b, int size)
00120 /* Return TRUE if there are 2 matches per mismatch. */
00121 {
00122 int score = 0, i;
00123 for (i=0; i<size; ++i)
00124    {
00125    if (a[i] == b[i])
00126        score += 1;
00127    else
00128        score -= 2;
00129    }
00130 return score >= 0;
00131 }
00132 
00133 static void upperMatch(char *dna, char *primer, int size)
00134 /* Uppercase DNA where it matches primer. */
00135 {
00136 int i;
00137 for (i=0; i<size; ++i)
00138     {
00139     if (dna[i] == primer[i])
00140         dna[i] = toupper(dna[i]);
00141     }
00142 }
00143 
00144 static void outputFa(struct gfPcrOutput *out, FILE *f, char *url)
00145 /* Output match in fasta format. */
00146 {
00147 int fPrimerSize = strlen(out->fPrimer);
00148 int rPrimerSize = strlen(out->rPrimer);
00149 int productSize = out->rPos - out->fPos;
00150 char *dna = cloneStringZ(out->dna, productSize);
00151 char *rrPrimer = cloneString(out->rPrimer);
00152 char *ffPrimer = cloneString(out->fPrimer);
00153 struct dyString *faLabel = newDyString(0);
00154 char *name = out->name;
00155 
00156 /* Create fasta header with position, possibly empty name, and upper cased primers with position optionally hyperlinked. */
00157 touppers(rrPrimer);
00158 touppers(ffPrimer);
00159 if (url != NULL)
00160     {
00161     dyStringAppend(faLabel, "<A HREF=\"");
00162     dyStringPrintf(faLabel, url, out->seqName, out->fPos+1, out->rPos);
00163     dyStringAppend(faLabel, "\">");
00164     }
00165 dyStringPrintf(faLabel, "%s:%d%c%d", 
00166         out->seqName, out->fPos+1, out->strand, out->rPos);
00167 if (url != NULL)
00168     dyStringAppend(faLabel, "</A>");
00169 if (name != NULL)
00170     dyStringPrintf(faLabel, " %s", name);
00171 dyStringPrintf(faLabel, " %dbp %s %s",
00172         out->rPos - out->fPos, ffPrimer, rrPrimer);
00173 
00174 /* Flip reverse primer to be in same direction and case as sequence. */
00175 reverseComplement(rrPrimer, rPrimerSize);
00176 tolowers(rrPrimer);
00177 
00178 /* Capitalize where sequence and primer match, and write out sequence. */
00179 upperMatch(dna, out->fPrimer, fPrimerSize);
00180 upperMatch(dna + productSize - rPrimerSize, rrPrimer, rPrimerSize);
00181 faWriteNext(f, faLabel->string, dna, productSize);
00182 
00183 /* Clean up. */
00184 freez(&dna);
00185 freez(&rrPrimer);
00186 freez(&ffPrimer);
00187 dyStringFree(&faLabel)
00188 }
00189 
00190 static int countMatch(char *a, char *b, int size)
00191 /* Count number of letters in a, b that match */
00192 {
00193 int count = 0, i;
00194 for (i=0; i<size; ++i)
00195     if (a[i] == b[i])
00196        ++count;
00197 return count;
00198 }
00199 
00200 static void outputBed(struct gfPcrOutput *out, FILE *f, char *url)
00201 /* Output match in BED format. */
00202 {
00203 int match;
00204 int size = out->rPos - out->fPos;
00205 int fPrimerSize = strlen(out->fPrimer);
00206 int rPrimerSize = strlen(out->rPrimer);
00207 char *name = out->name;
00208 if (name == NULL) name = "n/a";
00209 fprintf(f, "%s\t%d\t%d\t", out->seqName, out->fPos, out->rPos);
00210 fprintf(f, "%s\t", name);
00211 match = countMatch(out->dna, out->fPrimer, fPrimerSize);
00212 assert(size > 0);
00213 reverseComplement(out->rPrimer, rPrimerSize);
00214 match += countMatch(out->dna + size - rPrimerSize, out->rPrimer, rPrimerSize);
00215 reverseComplement(out->rPrimer, rPrimerSize);
00216 fprintf(f, "%d\t", round(1000.0 * match / (double)(fPrimerSize + rPrimerSize) ));
00217 fprintf(f, "%c\n", out->strand);
00218 }
00219 
00220 static void outputPsl(struct gfPcrOutput *out, FILE *f, char *url)
00221 /* Output match in PSL format. */
00222 {
00223 int match;
00224 int size = out->rPos - out->fPos;
00225 int fPrimerSize = strlen(out->fPrimer);
00226 int rPrimerSize = strlen(out->rPrimer);
00227 int bothSize = fPrimerSize + rPrimerSize;
00228 int gapSize = size - bothSize;
00229 char *name = out->name;
00230 if (name == NULL) name = "n/a";
00231 match = countMatch(out->dna, out->fPrimer, fPrimerSize);
00232 reverseComplement(out->rPrimer, rPrimerSize);
00233 assert(size > 0);
00234 match += countMatch(out->dna + size - rPrimerSize, out->rPrimer, rPrimerSize);
00235 reverseComplement(out->rPrimer, rPrimerSize);
00236 
00237 fprintf(f, "%d\t", match);
00238 fprintf(f, "%d\t", bothSize - match);
00239 fprintf(f, "0\t0\t");   /* repMatch, nCount. */
00240 fprintf(f, "1\t%d\t", gapSize);   /* qNumInsert, qBaseInsert */
00241 fprintf(f, "1\t%d\t", gapSize);   /* tNumInsert, tBaseInsert */
00242 fprintf(f, "%c\t", out->strand);
00243 fprintf(f, "%s\t", name);
00244 fprintf(f, "%d\t", size);
00245 fprintf(f, "0\t%d\t", size);    /* qStart, qEnd */
00246 fprintf(f, "%s\t%d\t", out->seqName, out->seqSize);
00247 fprintf(f, "%d\t%d\t", out->fPos, out->rPos);
00248 fprintf(f, "2\t");
00249 fprintf(f, "%d,%d,\t", fPrimerSize, rPrimerSize);
00250 fprintf(f, "%d,%d,\t", 0,size - rPrimerSize);
00251 fprintf(f, "%d,%d,\n", out->fPos, out->rPos - rPrimerSize);
00252 }
00253 
00254 
00255 void gfPcrOutputWriteList(struct gfPcrOutput *outList, char *outType, 
00256         char *url, FILE *f)
00257 /* Write list of outputs in specified format (either "fa" or "bed") 
00258  * to file.  If url is non-null it should be a printf formatted
00259  * string that takes %s, %d, %d for chromosome, start, end. */
00260 {
00261 struct gfPcrOutput *out;
00262 void (*output)(struct gfPcrOutput *out, FILE *f, char *url) = NULL;
00263 if (sameWord(outType, "fa"))
00264     output = outputFa;
00265 else if (sameWord(outType, "bed"))
00266     output = outputBed;
00267 else if (sameWord(outType, "psl"))
00268     output = outputPsl;
00269 else
00270     errAbort("Unrecognized pcr output type %s", outType);
00271 for (out = outList; out != NULL; out = out->next)
00272     output(out, f, url);
00273 }
00274 
00275 void gfPcrOutputWriteAll(struct gfPcrOutput *outList, 
00276         char *outType, char *url, char *fileName)
00277 /* Create file of outputs in specified format (either "fa" or "bed") 
00278  * to file.  If url is non-null it should be a printf formatted
00279  * string that takes %s, %d, %d for chromosome, start, end. */
00280 {
00281 FILE *f = mustOpen(fileName, "w");
00282 gfPcrOutputWriteList(outList, outType, url, f);
00283 carefulClose(&f);
00284 }
00285 
00286 
00287 static void pcrLocalStrand(char *pcrName, 
00288         struct dnaSeq *seq,  int seqOffset, char *seqName, int seqSize,
00289         int maxSize, char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize,
00290         int minPerfect, int minGood,
00291         char strand, struct gfPcrOutput **pOutList)
00292 /* Do detailed PCR scan on one strand and report results. */
00293 {
00294 char *fDna = seq->dna, *rDna;
00295 char *endDna = fDna + seq->size;
00296 char *fpPerfect = fPrimer + fPrimerSize - minPerfect;
00297 char *rpPerfect = rPrimer;
00298 char *fpMatch, *rpMatch;
00299 int sizeLeft;
00300 int goodSize = minGood - minPerfect;
00301 struct gfPcrOutput  *out;
00302 int matchSize;
00303 
00304 reverseComplement(rPrimer, rPrimerSize);
00305 for (;;)
00306     {
00307     fpMatch = memMatch(fpPerfect, minPerfect, fDna, endDna - fDna);
00308     if (fpMatch == NULL)
00309         break;
00310     sizeLeft = endDna - fpMatch;
00311     rDna = fpMatch;
00312     for (;;)
00313         {
00314         int fPos, rPos, fGoodPos, rGoodPos;
00315         rpMatch = memMatch(rpPerfect, minPerfect, rDna, endDna - rDna);
00316         if (rpMatch == NULL)
00317             break;
00318         fPos = fpMatch - (fPrimerSize - minPerfect) - seq->dna;
00319         rPos = rpMatch + rPrimerSize - seq->dna;
00320         fGoodPos = fpMatch - goodSize - seq->dna;
00321         rGoodPos = rpMatch + minPerfect - seq->dna;
00322         matchSize = rPos - fPos;
00323         if (rPos >= 0 && fPos >= 0 && fPos < seq->size && matchSize <= maxSize)
00324             {
00325             /* If matches well enough create output record. */
00326             if (goodMatch(seq->dna + fGoodPos, fpPerfect - goodSize, goodSize) &&
00327                 goodMatch(seq->dna + rGoodPos, rpPerfect + minPerfect, goodSize))
00328                 {
00329                 AllocVar(out);
00330                 out->name  = cloneString(pcrName);
00331                 out->fPrimer = cloneString(fPrimer);
00332                 out->rPrimer = cloneString(rPrimer);
00333                 reverseComplement(out->rPrimer, rPrimerSize);
00334                 out->seqName = cloneString(seqName);
00335                 out->seqSize = seqSize;
00336                 out->fPos = fPos + seqOffset;
00337                 out->rPos = rPos + seqOffset;
00338                 out->strand = strand;
00339                 out->dna = cloneStringZ(seq->dna + fPos, matchSize);
00340 
00341                 /* Dealing with the strand of darkness....  Here we just have to swap
00342                  * forward and reverse primers to flip strands, and reverse complement
00343                  * the amplified area.. */
00344                 if (strand == '-')
00345                     {
00346                     char *temp = out->rPrimer;
00347                     out->rPrimer = out->fPrimer;
00348                     out->fPrimer = temp;
00349                     reverseComplement(out->dna, matchSize);
00350                     }
00351                 slAddHead(pOutList, out);
00352                 }
00353             }
00354         rDna = rpMatch+1;
00355         }
00356     fDna = fpMatch + 1;
00357     }
00358 reverseComplement(rPrimer, rPrimerSize);
00359 }
00360 
00361 void gfPcrLocal(char *pcrName, 
00362         struct dnaSeq *seq, int seqOffset, char *seqName, int seqSize,
00363         int maxSize, char *fPrimer, int fPrimerSize, char *rPrimer, int rPrimerSize,
00364         int minPerfect, int minGood, char strand, struct gfPcrOutput **pOutList)
00365 /* Do detailed PCR scan on DNA already loaded into memory and put results
00366  * (in reverse order) on *pOutList. */
00367 {
00368 /* For PCR primers reversing search strand just means switching
00369  * order of primers. */
00370 if (strand == '-')
00371     pcrLocalStrand(pcrName, seq, seqOffset, seqName, seqSize, maxSize, 
00372         rPrimer, rPrimerSize, fPrimer, fPrimerSize, 
00373         minPerfect, minGood, strand, pOutList);
00374 else
00375     pcrLocalStrand(pcrName, seq, seqOffset, seqName, seqSize, maxSize, 
00376         fPrimer, fPrimerSize, rPrimer, rPrimerSize, 
00377         minPerfect, minGood, strand, pOutList);
00378 }
00379 
00380 struct gfRange *gfPcrGetRanges(char *host, char *port, char *fPrimer, char *rPrimer,
00381         int maxSize)
00382 /* Query gfServer with primers and convert response to a list of gfRanges. */
00383 {
00384 char buf[256];
00385 int conn = gfConnect(host, port);
00386 struct gfRange *rangeList = NULL, *range;
00387 
00388 /* Query server and put results into rangeList. */
00389 safef(buf, sizeof(buf), "%spcr %s %s %d", gfSignature(), fPrimer, rPrimer, maxSize);
00390 write(conn, buf, strlen(buf));
00391 for (;;)
00392     {
00393     if (netGetString(conn, buf) == NULL)
00394         break;
00395     if (sameString(buf, "end"))
00396         break;
00397     else if (startsWith("Error:", buf))
00398         errAbort(buf);
00399     else
00400         {
00401         char *s = buf;
00402         char *name, *start, *end, *strand;
00403         name = nextWord(&s);
00404         start = nextWord(&s);
00405         end = nextWord(&s);
00406         strand = nextWord(&s);
00407         if (strand == NULL)
00408             errAbort("Truncated gfServer response");
00409         AllocVar(range);
00410         range->tName = cloneString(name);
00411         range->tStart = atoi(start);
00412         range->tEnd = atoi(end);
00413         range->tStrand = strand[0];
00414         slAddHead(&rangeList, range);
00415         }
00416     }
00417 close(conn);
00418 slReverse(&rangeList);
00419 return rangeList;
00420 }
00421 
00422 static void gfPcrOneViaNet(
00423         char *host, char *port, char *seqDir,
00424         char *pcrName, char *fPrimer, char *rPrimer, int maxSize,
00425         int minPerfect, int minGood,
00426         struct hash *tFileCache, struct gfPcrOutput **pOutList)
00427 /* Query gfServer for likely primer hits, load DNA to do detailed
00428  * examination, and output hits to head of *pOutList.. */
00429 {
00430 struct gfRange *rangeList = NULL, *range;
00431 int fPrimerSize = strlen(fPrimer);
00432 int rPrimerSize = strlen(rPrimer);
00433 int maxPrimerSize = max(fPrimerSize, rPrimerSize);
00434 int minPrimerSize = min(fPrimerSize, rPrimerSize);
00435 
00436 tolowers(fPrimer);
00437 tolowers(rPrimer);
00438 
00439 /* Check for obvious user mistake. */
00440 if (minPrimerSize < minGood || minPrimerSize < minPerfect)
00441     errAbort("Need longer primer in pair %s (%dbp) %s (%dbp).  Minimum size is %d\n",
00442         fPrimer, fPrimerSize, rPrimer, rPrimerSize, minGood);
00443 
00444 /* Load ranges and do more detailed snooping. */
00445 rangeList = gfPcrGetRanges(host, port, fPrimer, rPrimer, maxSize);
00446 for (range = rangeList; range != NULL; range = range->next)
00447     {
00448     int tSeqSize;
00449     char seqName[PATH_LEN];
00450     struct dnaSeq *seq = gfiExpandAndLoadCached(range,
00451         tFileCache, seqDir,  0, &tSeqSize, FALSE, FALSE, maxPrimerSize);
00452     gfiGetSeqName(range->tName, seqName, NULL);
00453     gfPcrLocal(pcrName, seq, range->tStart, seqName, tSeqSize, maxSize, 
00454             fPrimer, fPrimerSize, rPrimer, rPrimerSize, 
00455             minPerfect, minGood, range->tStrand, pOutList);
00456     dnaSeqFree(&seq);
00457     }
00458 gfRangeFreeList(&rangeList);
00459 }
00460 
00461 
00462 struct gfPcrOutput *gfPcrViaNet(char *host, char *port, char *seqDir, 
00463         struct gfPcrInput *inList, int maxSize, int minPerfect, int minGood)
00464 /* Do PCRs using gfServer index, returning list of results. */
00465 {
00466 struct hash *tFileCache = gfFileCacheNew();
00467 struct gfPcrInput *in;
00468 struct gfPcrOutput *outList = NULL;
00469 
00470 for (in = inList; in != NULL; in = in->next)
00471     {
00472     gfPcrOneViaNet(host, port, seqDir,
00473         in->name, in->fPrimer, in->rPrimer, maxSize,
00474         minPerfect, minGood,
00475         tFileCache, &outList);
00476     }
00477 gfFileCacheFree(&tFileCache);
00478 slReverse(&outList);
00479 return outList;
00480 }
00481 
00482 char *gfPcrMakePrimer(char *s)
00483 /* Make primer (lowercased DNA) out of text.  Complain if
00484  * it is too short or too long. */
00485 {
00486 int size = dnaFilteredSize(s);
00487 int realSize;
00488 char *primer = needMem(size+1);
00489 dnaFilter(s, primer);
00490 realSize = size - countChars(primer, 'n');
00491 if (realSize < 10 || realSize < size/2)
00492    errAbort("%s does not seem to be a good primer", s);
00493 return primer;
00494 }
00495 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2