lib/fof.c

Go to the documentation of this file.
00001 /* fofFa - create a fof index for a list of FA files. 
00002  *
00003  * This file is copyright 2002 Jim Kent, but license is hereby
00004  * granted for all use - public, private or commercial. */
00005 #include "common.h"
00006 #include "localmem.h"
00007 #include "sig.h"
00008 #include "fof.h"
00009 
00010 static char const rcsid[] = "$Id: fof.c,v 1.7 2006/03/10 17:43:36 angie Exp $";
00011 
00012 struct fofRecord
00013 /* This holds a record of an index file. */
00014     {
00015     bits32 offset;  /* Start offset within file. Must be first element.*/
00016     bits32 size;    /* Sizer within file. */
00017     UBYTE fileIx;   /* Which file it's in. */
00018     char name[1];   /* Dynamically allocated to fit actual size. */
00019     };
00020 
00021 struct fof
00022 /* Manage a file offset file - an index which includes the file,
00023  * the offset, and the size of each item. */
00024     {
00025     char *name;                  /* Name of fof given to fofOpen. */
00026     char *relDir;                /* Directory to apply to index files. */
00027     char **fileNames;            /* Names of files being indexed. */
00028     FILE **files;                /* Possibly null file handles of files being indexed. */
00029     FILE *f;                     /* Index file handle. */
00030     int fileCount;               /* The number of files being indexed. */
00031     int endIx;                   /* Last index. */
00032     int maxNameSize;             /* Size allocated for index field in index. */
00033     int itemSize;                /* Size of index record. */
00034     long headSize;               /* Offset to first index record. */
00035     struct fofRecord *rec;       /* Buffer space for one index record. */
00036     struct fofRecord *first;     /* First record. */
00037     struct fofRecord *last;      /* Last record. */
00038     };
00039 
00040 
00041 static void readStringZ(FILE *f, char *s, int maxLen)
00042 /* Read in a zero terminated string from file. */
00043 {
00044 int c;
00045 int ix;
00046 maxLen -= 1;    /* room for zero tag. */
00047 
00048 for (ix = 0; ix <maxLen; ++ix)
00049     {
00050     if ((c = fgetc(f)) == EOF)
00051         errAbort("Unexpected EOF in readStringZ");
00052     if (c == 0)
00053         break;
00054     s[ix] = c;
00055     }
00056 if (ix == maxLen)
00057     errAbort("String too long in readStringZ");
00058 s[ix] = 0;
00059 }
00060 
00061 
00062 struct fof *fofOpen(char *fofName, char *fofDir)
00063 /* Open up the named fof. fofDir may be NULL.  It should include 
00064  * trailing '/' if non-null. */
00065 {
00066 bits32 sig, elCount;
00067 bits16 fileCount, maxNameSize;
00068 FILE *f;
00069 char nameBuf[512];
00070 char pathBuf[512];
00071 struct fof *fof;
00072 int i;
00073 
00074 /* Handle directory either being something or NULL, and
00075  * either ending with a slash or not. */
00076 if (fofDir == NULL)
00077     {
00078     fofDir = "";
00079     }
00080 
00081 /* Open file, verify signature. */
00082 safef(pathBuf, sizeof(pathBuf), "%s%s", fofDir, fofName);
00083 f = mustOpen(pathBuf, "rb");
00084 mustReadOne(f, sig);
00085 if (sig != fofSig)
00086     errAbort("Bad signature on %s", pathBuf);
00087 mustReadOne(f, elCount);
00088 
00089 /* Read size info and allocate basic fof structure. */
00090 mustReadOne(f, fileCount);
00091 if (fileCount > 12)
00092     warn("%d files indexed in fof %s!?", fileCount, fofName);
00093 mustReadOne(f, maxNameSize);
00094 if (maxNameSize > 40)
00095     warn("%d maxName size in fof %s!?", maxNameSize, fofName);
00096 AllocVar(fof);
00097 fof->name = cloneString(fofName);
00098 fof->relDir = cloneString(fofDir);
00099 fof->fileNames = needMem(fileCount * sizeof(fof->fileNames[0]));
00100 fof->files = needMem(fileCount * sizeof(fof->files[0]));
00101 fof->f = f;
00102 fof->fileCount = fileCount;
00103 fof->endIx = elCount-1;
00104 fof->maxNameSize = maxNameSize;
00105 fof->itemSize = sizeof(bits32) +sizeof(bits32) + sizeof(UBYTE) + maxNameSize;
00106 fof->rec = needMem(sizeof(*fof->rec) + maxNameSize);
00107 fof->first = needMem(sizeof(*fof->rec) + maxNameSize);
00108 fof->last = needMem(sizeof(*fof->rec) + maxNameSize);
00109 
00110 /* Read in names of files being indexed and figure header size. */
00111 for (i=0; i<fileCount; ++i)
00112     {
00113     readStringZ(f, nameBuf, sizeof(nameBuf));
00114     safef(pathBuf, sizeof(pathBuf), "%s%s", fofDir, nameBuf);
00115     fof->fileNames[i] = cloneString(pathBuf);
00116     }
00117 fof->headSize = ftell(f);
00118 
00119 /* Read in first and last records. */
00120 mustRead(f, fof->first, fof->itemSize);
00121 fseek(f, fof->headSize + fof->endIx*fof->itemSize, SEEK_SET);
00122 mustRead(f, fof->last, fof->itemSize);
00123 
00124 /* All done (files will be opened as needed, not here). */
00125 return fof;
00126 }
00127 
00128 
00129 void fofClose(struct fof **pFof)
00130 /* Close down the named fof. */
00131 {
00132 struct fof *fof = *pFof;
00133 if (fof != NULL)
00134     {
00135     int fileCount = fof->fileCount;
00136     int i;
00137 
00138     for (i=0; i<fileCount; ++i)
00139         {
00140         freeMem(fof->fileNames[i]);
00141         carefulClose(&fof->files[i]);
00142         }
00143     freeMem(fof->name);
00144     freeMem(fof->fileNames);
00145     freeMem(fof->files);
00146     freeMem(fof->rec);
00147     freeMem(fof->first);
00148     freeMem(fof->last);
00149     carefulClose(&fof->f);
00150     freez(pFof);
00151     }
00152 }
00153 
00154 int fofElementCount(struct fof *fof)
00155 /* How many names are in fof file? */
00156 {
00157 return fof->endIx + 1;
00158 }
00159 
00160 static void fofRecToPos(struct fof *fof, int ix, struct fofRecord *rec, struct fofPos *pos)
00161 /* Convert from record to position, opening file for entry if necessary. */
00162 {
00163 int fileIx = rec->fileIx;
00164 FILE *f;
00165 
00166 pos->indexIx = ix;
00167 pos->offset = rec->offset;
00168 pos->size = rec->size;
00169 pos->fileName = fof->fileNames[fileIx];
00170 if ((f = fof->files[fileIx]) != NULL)
00171     {
00172     pos->f = f;
00173     }
00174 else
00175     {
00176     pos->f = fof->files[fileIx] = mustOpen(fof->fileNames[fileIx], "rb");
00177     }
00178 return;
00179 }
00180 
00181 
00182 static int fofCmp(char *prefix, char *name, int maxSize, boolean isPrefix)
00183 /* Compare either prefix of whole string to name. */
00184 {
00185 if (isPrefix)
00186     return memcmp(prefix, name, maxSize);
00187 else
00188     return strcmp(prefix, name);
00189 }
00190 
00191 static boolean fofSearch(struct fof *fof, char *name, int nameSize, 
00192     boolean isPrefix, struct fofPos *retPos)
00193 /* Find index of name by binary search.
00194  * Returns FALSE if no such name in the index file. */
00195  {
00196 struct fofRecord *rec = fof->rec;
00197 int startIx, endIx, midIx;
00198 int cmp;
00199 int itemSize = fof->itemSize;
00200 FILE *f = fof->f;
00201 int headSize = fof->headSize;
00202 
00203 /* Truncate name size if necessary. */
00204 if (nameSize > fof->maxNameSize)
00205     nameSize = fof->maxNameSize;
00206 
00207 /* Set up endpoints of binary search */
00208 startIx = 0;
00209 endIx = fof->endIx;
00210 
00211 /* Check for degenerate initial case */
00212 if (fofCmp(name, fof->first->name, nameSize, isPrefix) == 0)
00213     {
00214     fofRecToPos(fof, startIx, fof->first, retPos);
00215     return TRUE;
00216     }
00217 if (fofCmp(name, fof->last->name, nameSize, isPrefix) == 0)
00218     {
00219     fofRecToPos(fof, endIx, fof->last, retPos);
00220     return TRUE;
00221     }
00222 
00223 /* Do binary search. */
00224 for (;;)
00225     {
00226     midIx = (startIx + endIx ) / 2;
00227     if (midIx == startIx || midIx == endIx)
00228         return FALSE;
00229     fseek(f, headSize + midIx*itemSize, SEEK_SET);
00230     mustRead(f, rec, itemSize);
00231     cmp = fofCmp(name, rec->name, nameSize, isPrefix);
00232     if (cmp == 0)
00233         {
00234         fofRecToPos(fof, midIx, rec, retPos);
00235         return TRUE;
00236         }
00237     else if (cmp > 0)
00238         {
00239         startIx = midIx;
00240         }
00241     else
00242         {
00243         endIx = midIx;
00244         }
00245     }
00246 }
00247 
00248 boolean fofFindFirst(struct fof *fof, char *prefix, 
00249     int prefixSize, struct fofPos *retPos)
00250 /* Find first element with key starting with prefix. */
00251 {
00252 int ix;
00253 struct fofRecord *rec = fof->rec;
00254 FILE *f = fof->f;
00255 int itemSize = fof->itemSize;
00256 int headSize = fof->headSize;
00257 
00258 /* Find some record that starts with prefix. */
00259 if (!fofSearch(fof, prefix, prefixSize, TRUE, retPos))
00260     return FALSE;
00261 
00262 /* Backtrack until find one that doesn't start with prefix. */
00263 ix = retPos->indexIx;
00264 while (--ix >= 0)
00265     {
00266     fseek(f, headSize + ix*itemSize, SEEK_SET);
00267     mustRead(f, rec, itemSize);
00268     if (memcmp(prefix, rec->name, prefixSize) != 0)
00269         break;
00270     }
00271 
00272 /* Return the first record that does start with prefix. */
00273 ++ix;
00274 fseek(f, headSize + ix*itemSize, SEEK_SET);
00275 mustRead(f, rec, itemSize);
00276 fofRecToPos(fof, ix, rec, retPos);
00277 return TRUE;
00278 }
00279 
00280 
00281 boolean fofFind(struct fof *fof, char *name, struct fofPos *retPos)
00282 /* Find element corresponding with name.  Returns FALSE if no such name
00283  * in the index file. */
00284 {
00285 return fofSearch(fof, name, strlen(name), FALSE, retPos);
00286 }
00287 
00288 void *fofFetch(struct fof *fof, char *name, int *retSize)
00289 /* Lookup element in index, allocate memory for it, and read
00290  * it.  Returns buffer with element in it, which should be
00291  * freeMem'd when done. Aborts if element isn't there. */
00292 {
00293 struct fofPos pos;
00294 void *s;
00295 
00296 if (!fofFind(fof, name, &pos))
00297     errAbort("Couldn't find %s in %s", name, fof->name);
00298 s = needLargeMem(pos.size);
00299 fseek(pos.f, pos.offset, SEEK_SET);
00300 mustRead(pos.f, s, pos.size);
00301 *retSize = pos.size;
00302 return s;
00303 }
00304 
00305 char *fofFetchString(struct fof *fof, char *name, int *retSize)
00306 /* Lookup element in index, allocate memory for it, read it.
00307  * Returns zero terminated string with element in it, which 
00308  * should be freeMem'd when done. Aborts if element isn't there. */
00309 {
00310 struct fofPos pos;
00311 char *s;
00312 
00313 if (!fofFind(fof, name, &pos))
00314     errAbort("Couldn't find %s in %s", name, fof->name);
00315 s = needLargeMem(pos.size+1);
00316 fseek(pos.f, pos.offset, SEEK_SET);
00317 mustRead(pos.f, s, pos.size);
00318 s[pos.size] = 0;
00319 *retSize = pos.size;
00320 return s;
00321 }
00322 
00323 /* ------------------- Batch read ------------------------*/
00324 static int cmpOnKey(const void *va, const void *vb)
00325 /* Comparison function for qsort on an array of offset pointers.
00326  * Sorts on key. */
00327 {
00328 const struct fofBatch *a = *((struct fofBatch **)va);
00329 const struct fofBatch *b = *((struct fofBatch **)vb);
00330 return strcmp(a->key, b->key);
00331 }
00332 
00333 static int cmpOnFilePos(const void *va, const void *vb)
00334 /* Comparison function for qsort on an array of offset pointers.
00335  * Sorts on file then file offset. */
00336 {
00337 const struct fofBatch *a = *((struct fofBatch **)va);
00338 const struct fofBatch *b = *((struct fofBatch **)vb);
00339 int dif = a->f - b->f;
00340 if (dif == 0)
00341     dif = a->offset - b->offset;
00342 return dif;
00343 }
00344 
00345 static void elFromRec(struct fof *fof, struct fofRecord *rec, struct fofBatch *el)
00346 /* Fill in a batch element from record. */
00347 {
00348 FILE *ff;
00349 int fileIx = rec->fileIx;
00350 if ((ff = fof->files[fileIx]) != NULL)
00351     {
00352     el->f = ff;
00353     }
00354 else
00355     {
00356     el->f = fof->files[fileIx] = mustOpen(fof->fileNames[fileIx], "rb");
00357     }
00358 el->offset = rec->offset;
00359 el->size = rec->size;
00360 }
00361 
00362 struct fofBatch *fofBatchFind(struct fof *fof, struct fofBatch *list)
00363 /* Look up all of members on list. */
00364 {
00365 struct fofBatch *el;
00366 FILE *f = fof->f;
00367 struct fofRecord *rec = fof->rec;
00368 int itemSize = fof->itemSize;
00369 char *lastKey = "";
00370 
00371 slSort(&list, cmpOnKey);
00372 fseek(f, fof->headSize, SEEK_SET);
00373 for (el = list; el != NULL; el = el->next)
00374     {
00375     char *key = el->key;
00376     if (sameString(key, lastKey))
00377         {
00378         elFromRec(fof, rec, el);
00379         }
00380     else
00381         {
00382         for (;;)
00383             {
00384             mustRead(f, rec, itemSize);
00385             if (sameString(key, rec->name))
00386                 {
00387                 elFromRec(fof, rec, el);
00388                 lastKey = key;
00389                 break;
00390                 }
00391             }
00392         }
00393     }
00394 slSort(&list, cmpOnFilePos);
00395 return list;
00396 }
00397 
00398 
00399 /* ------------------- Write Side ------------------------*/
00400 
00401 struct lm *localMem;    /* Local (fast) memory pool. */
00402 
00403 struct fofRecList
00404 /* This holds a list of records for an index file. */
00405     {
00406     struct fofRecList *next;
00407     struct fofRecord rec;
00408     };
00409 
00410 static int cmpRecList(const void *va, const void *vb)
00411 /* Comparison function for qsort on an array of offset pointers.
00412  * Sorts first on name, then on fileIx, then on offset. */
00413 {
00414 const struct fofRecList *a = *((struct fofRecList **)va);
00415 const struct fofRecList *b = *((struct fofRecList **)vb);
00416 int dif;
00417 dif = strcmp(a->rec.name, b->rec.name);
00418 if (dif == 0)
00419     {
00420     UBYTE ao = a->rec.fileIx;
00421     UBYTE bo = b->rec.fileIx;
00422     if (ao < bo)
00423         dif =  -1;
00424     else if (ao > bo)
00425         dif = 1;
00426     else
00427         {
00428         bits32 ao = a->rec.offset;
00429         bits32 bo = b->rec.offset;
00430         if (ao < bo)
00431             dif =  -1;
00432         else if (ao == bo)
00433             dif = 0;
00434         else
00435             dif = 1;
00436         }
00437     }
00438 return dif;
00439 }
00440 
00441 static bits16 maxNameSize;      /* Maximum size we've seen. */
00442 
00443 struct fofRecList *newFofRecEl(int fileIx, long offset, long size, char *name, int nameLen)
00444 /* Create a new offset list element. */
00445 {
00446 struct fofRecList *fr;
00447 
00448 if (maxNameSize < nameLen)
00449     maxNameSize = nameLen;
00450 fr = lmAlloc(localMem, sizeof(*fr) + nameLen);
00451 fr->rec.offset = offset;
00452 fr->rec.size = size;
00453 fr->rec.fileIx = fileIx;
00454 memcpy(fr->rec.name, name, nameLen);
00455 return fr;
00456 }
00457 
00458 void fofMake(char *inFiles[], int inCount, char *outName, 
00459     boolean (*readHeader)(FILE *inFile, void *data),
00460     boolean (*nextRecord)(FILE *inFile, void *data, char **rName, int *rNameLen), 
00461     void *data, boolean dupeOk)
00462 /* Make an index file
00463  * Inputs:
00464  *     inFiles - List of files that you're indexing with header read and verified.
00465  *     inCount - Size of file list.
00466  *     outName - name of index file to create
00467  *     readHeader - function that sets up file to read first record.  May be NULL.
00468  *     nextRecord - function that reads next record in file you're indexing
00469  *                  and returns the name of that record.  Returns FALSE at
00470  *                  end of file.  Can set *rNameLen to zero you want indexer
00471  *                  to ignore the record. 
00472  *     data - void pointer passed through to nextRecord.
00473  *     dupeOk - set to TRUE if you want dupes to not cause squawking
00474  */
00475 {
00476 FILE *out;
00477 bits32 sig = fofSig;
00478 bits32 elCount = 0;
00479 bits16 fileCount = inCount;
00480 struct fofRecList *recList = NULL, *rl;
00481 int i, fileIx, itemSize;
00482 char *lastName = "";
00483 int maxMod = 10000;
00484 
00485 /* Initialize. */
00486 localMem = lmInit(0);
00487 maxNameSize = 0;
00488 
00489 /* Read in all records and sort by name. */
00490 for (fileIx = 0; fileIx<inCount; ++fileIx)
00491     {
00492     char *inName = inFiles[fileIx];
00493     FILE *in = mustOpen(inName, "rb");
00494     bits32 start, end;
00495     char *name;
00496     int nameLen;
00497     int mod = maxMod;
00498 
00499     printf("Processing %s\n", inName);
00500     if (readHeader)
00501         readHeader(in, data);
00502     start = ftell(in);
00503     while (nextRecord(in, data, &name, &nameLen))
00504         {
00505         if (--mod == 0)
00506             {
00507             putc('.', stdout);
00508             fflush(stdout);
00509             mod = maxMod;
00510             }
00511         end = ftell(in);
00512         if (nameLen > 0)
00513             {
00514             rl = newFofRecEl(fileIx, start, end-start, name, nameLen);
00515             slAddHead(&recList, rl);
00516             }
00517         start = end;
00518         }
00519     fclose(in);
00520     printf("\n");
00521     }
00522 
00523 printf("sorting\n");
00524 slSort(&recList, cmpRecList);
00525 
00526 /* Count up names. */
00527 if (dupeOk)
00528     elCount = slCount(recList);
00529 else
00530     {
00531     lastName = "";
00532     for (rl = recList; rl != NULL; rl = rl->next)
00533         {
00534         char *name = rl->rec.name;
00535         if (!sameString(name, lastName))
00536             {
00537             ++elCount;
00538             lastName = name;
00539             }
00540         }
00541     }
00542 
00543 /* Write out index file. */
00544 printf("Writing %s\n", outName);
00545 out = mustOpen(outName, "wb");
00546 writeOne(out, sig);
00547 writeOne(out, elCount);
00548 writeOne(out, fileCount);
00549 writeOne(out, maxNameSize);
00550 itemSize = sizeof(bits32) +sizeof(bits32) + sizeof(UBYTE) + maxNameSize;
00551 for (i=0; i<inCount; ++i)
00552     {
00553     char *name = inFiles[i];
00554     int len = strlen(name)+1;
00555     mustWrite(out, name, len);
00556     }
00557 lastName = "";
00558 for (rl = recList; rl != NULL; rl = rl->next)
00559     {
00560     if (!dupeOk)
00561         {
00562         char *name = rl->rec.name;
00563         if (sameString(name, lastName))
00564             {
00565             warn("Duplicate %s only saving first.", name);
00566             continue;
00567             }
00568         else
00569             lastName = name;
00570         }
00571     writeOne(out, rl->rec.offset);
00572     writeOne(out, rl->rec.size);
00573     writeOne(out, rl->rec.fileIx);
00574     mustWrite(out, rl->rec.name, maxNameSize);
00575     }
00576 if (fclose(out) != 0)
00577     errnoAbort("fclose failed");
00578 /* Clean up. */
00579 lmCleanup(&localMem);
00580 }
00581 

Generated on Tue Dec 25 18:39:30 2007 for blat by  doxygen 1.5.2