include/readindex.h File Reference

#include <stdio.h>

Include dependency graph for readindex.h:

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  index_scanner
struct  lo_info
struct  listinfo

Functions

struct index_scanner * open_index (char *index_prefix)
struct listinfo * get_next_list (struct index_scanner *iscn, struct listinfo *info)
struct listinfo * get_list_at (struct index_scanner *iscn, struct listinfo *info, long offset)
struct listinfo * get_next_biglist (struct index_scanner *iscn, struct listinfo *info)
struct lo_info * get_next_lo_info (struct index_scanner *iscn, struct lo_info *lo_info)
void info_free (struct listinfo *info)
struct index_scanner * reset_index (struct index_scanner *scn)
struct index_scanner * close_index (struct index_scanner *scn)


Detailed Description

Functions for sequentially extracting postings lists from lucy index files, intended primarily for use with the spex indices from the deco algorithm.

Yaniv Bernstein 2004

Definition in file readindex.h.


Function Documentation

struct index_scanner* close_index ( struct index_scanner * scn )  [read]

Close all the file pointers to the index

Definition at line 166 of file readindex.c.

References index_scanner::idx_file.

Referenced by cluster_clusterSequences().

00167 {
00168    fclose(scn->idx_file);
00169 
00170    free(scn);
00171 
00172    return scn;
00173 }

Here is the caller graph for this function:

struct listinfo* get_list_at ( struct index_scanner * iscn,
struct listinfo * info,
long  offset 
) [read]

Definition at line 65 of file readindex.c.

References listinfo::doc_count, listinfo::doc_numbers, index_scanner::idx_file, info_free(), listinfo::phrase_frequency, listinfo::phrase_offsets, listinfo::size, vbyte_read(), vec_free(), vec_getvbyte(), vec_init(), and vec::vector.

Referenced by get_next_biglist(), and get_next_list().

00066 {
00067    struct vec *vector;
00068    unsigned long num, last_doc = -1;
00069    int c, d;
00070    FILE *idx = iscn->idx_file;
00071    unsigned long veclen,
00072                  numdocs,
00073                  occurs,
00074                  last_offset;
00075 
00076    if (offset >= 0)
00077       fseek(idx, offset, SEEK_SET);
00078 
00079    /* There is nothing left to read; free structures */
00080    if (!( vbyte_read(idx, &numdocs)
00081             && vbyte_read(idx, &occurs)
00082             && vbyte_read(idx, &veclen) ))
00083    {
00084       info_free(info);
00085       return NULL;
00086    }
00087 
00088    /* Initialise the info structure if we weren't already given one */
00089    if (info == NULL)
00090    {
00091       info = malloc(sizeof(struct listinfo));
00092       info->doc_numbers = malloc(numdocs * sizeof(unsigned long));
00093       info->phrase_frequency = malloc(numdocs * sizeof(unsigned long));
00094       info->phrase_offsets = malloc(numdocs * sizeof(unsigned long *));
00095       memset(info->phrase_offsets, 0, numdocs * sizeof(unsigned long *));
00096       info->size = numdocs;
00097    }
00098    
00099    /* We may need to enlarge various aspects of the structure if it is not
00100     * big enough to hold the current postings list */
00101    if (info->size < numdocs)
00102    {
00103       info->doc_numbers = realloc(info->doc_numbers, numdocs * sizeof(unsigned long));
00104       info->phrase_frequency = realloc(info->phrase_frequency, numdocs * sizeof(unsigned long));
00105       info->phrase_offsets = realloc(info->phrase_offsets, numdocs * sizeof(unsigned long *));
00106       memset(info->phrase_offsets + info->size, 0, (numdocs - info->size) * sizeof(unsigned long *));
00107       info->size = numdocs;
00108    }
00109 
00110    info->doc_count = numdocs;
00111 
00112    vector = vec_init(veclen);
00113    vector->len = veclen;
00114 
00115    /* Populate the vector's block of memory from the appropriate location
00116     * in the vector file */
00117    fread(vector->vector, veclen, 1, idx);
00118 
00119    /* Read in the stats for each document in the postings list */
00120    for (c = 0; c < numdocs; c++)
00121    {
00122       /* Get the document number (stored as a d-gap) */
00123       vec_getvbyte(vector, &num);
00124       info->doc_numbers[c] = last_doc + num + 1;
00125       last_doc += num + 1;
00126 
00127       /* Read the number of times the phrase occurs in this document */
00128       vec_getvbyte(vector, &info->phrase_frequency[c]);
00129       if (info->phrase_offsets[c] != NULL)
00130          free(info->phrase_offsets[c]);
00131 
00132       /* Prepare the memory for the appropriate number of offsets */
00133       info->phrase_offsets[c] = malloc(info->phrase_frequency[c] * sizeof(unsigned long));
00134       memset(info->phrase_offsets[c], 0, info->phrase_frequency[c] * sizeof(unsigned long));
00135       
00136       /* Read in all the document offsets */
00137       last_offset = -1;
00138       for (d = 0; d < info->phrase_frequency[c]; d++)
00139       {
00140          vec_getvbyte(vector, info->phrase_offsets[c] + d);
00141          last_offset += (info->phrase_offsets[c])[d] + 1;
00142          (info->phrase_offsets[c])[d] = last_offset;
00143       }
00144    }
00145 
00146    vec_free(vector);
00147 
00148    return info;
00149 }

Here is the call graph for this function:

Here is the caller graph for this function:

struct listinfo* get_next_biglist ( struct index_scanner * iscn,
struct listinfo * info 
) [read]

Definition at line 36 of file readindex.c.

References get_list_at(), get_next_lo_info(), and lo_info::offset.

Referenced by cluster_clusterSequences().

00037 {
00038    struct lo_info *lo_info;
00039 
00040    lo_info = get_next_lo_info(iscn, NULL);
00041 
00042    if (!lo_info)
00043       return NULL;
00044 
00045    return get_list_at(iscn, info, lo_info->offset);
00046 }

Here is the call graph for this function:

Here is the caller graph for this function:

struct listinfo* get_next_list ( struct index_scanner * iscn,
struct listinfo * info 
) [read]

Returns a listinfo data structure for the next phrase in the index

Definition at line 31 of file readindex.c.

References get_list_at().

Referenced by cluster_clusterSequences().

00032 {
00033    return get_list_at(iscn, info, -1);
00034 }

Here is the call graph for this function:

Here is the caller graph for this function:

struct lo_info* get_next_lo_info ( struct index_scanner * iscn,
struct lo_info * lo_info 
) [read]

Definition at line 48 of file readindex.c.

References index_scanner::offsets, and vbyte_read().

Referenced by get_next_biglist().

00049 {
00050    FILE *offsetfile = iscn->offsets;
00051 
00052    if (!lo_info)
00053       lo_info = malloc(sizeof(struct lo_info));
00054 
00055    vbyte_read(offsetfile, &(lo_info->size));
00056    if (!(vbyte_read(offsetfile, &(lo_info->offset))))
00057    {
00058       free(lo_info);
00059       return NULL;
00060    }
00061 
00062    return lo_info;
00063 }

Here is the call graph for this function:

Here is the caller graph for this function:

void info_free ( struct listinfo * info ) 

Deallocates the memory used by the data structure

Definition at line 151 of file readindex.c.

References listinfo::doc_numbers, and listinfo::phrase_frequency.

Referenced by get_list_at().

00152 {
00153    free(info->doc_numbers);
00154    free(info->phrase_frequency);
00155 
00156    free(info);
00157 }

Here is the caller graph for this function:

struct index_scanner* open_index ( char *  index_prefix  )  [read]

Returns a data structure that contains open buffered file pointers to the index file and all the vector files in preparation for sequential reading of postings vectors.

Definition at line 13 of file readindex.c.

References index_scanner::idx_file, and index_scanner::offsets.

Referenced by cluster_clusterSequences().

00014 {
00015    struct index_scanner *scanner;
00016    char fnamebuf[FILENAME_MAX + 1];
00017 
00018    fnamebuf[FILENAME_MAX] = '\0';
00019 
00020    scanner = (struct index_scanner *) malloc(sizeof(struct index_scanner));
00021 
00022    snprintf(fnamebuf, FILENAME_MAX, "%s.idx", index_prefix);
00023    scanner->idx_file = fopen(fnamebuf, "r");
00024 
00025    snprintf(fnamebuf, FILENAME_MAX, "%s.lo", index_prefix);
00026    scanner->offsets = fopen(fnamebuf, "r");
00027 
00028    return scanner;
00029 }

Here is the caller graph for this function:

struct index_scanner* reset_index ( struct index_scanner * scn )  [read]

Reset all the file pointers back to the beginning of the index

Definition at line 159 of file readindex.c.

References index_scanner::idx_file.

Referenced by cluster_clusterSequences().

00160 {
00161    fseek(scn->idx_file, 0, SEEK_SET);
00162 
00163    return scn;
00164 }

Here is the caller graph for this function:


Generated on Wed Dec 19 20:51:33 2007 for fsa-blast by  doxygen 1.5.2