#include <stdio.h>Include dependency graph for hashcounter.h:

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.
Data Structures | |
| struct | Hashcounter |
Defines | |
| #define | hashcounter_hash(counter, chunk, chunk_length, hashValue, position) |
Functions | |
| Hashcounter * | hashcounter_new (unsigned long size, unsigned long seed) |
| void | hashcounter_insert (Hashcounter *counter, uint4 hashValue) |
| int | hashcounter_multiple (Hashcounter *counter, uint4 hashValue) |
| int | hashcounter_single (Hashcounter *counter, uint4 hashValue) |
| unsigned long | hashcounter_insertions (Hashcounter *counter) |
| void | hashcounter_write (Hashcounter *counter, FILE *stream) |
| void | hashcounter_count (Hashcounter *counter, FILE *stream) |
| void | hashcounter_reset (Hashcounter *counter) |
| void | hashcounter_free (Hashcounter *counter) |
Because the only information that is required is whether a phrase appears multiple times (ie. the exact count is irrelevant), two bits per entry are sufficient. This allows for a large hashtable to be stored quite compactly, minimising the probability of collisions.
Definition in file hashcounter.h.
| #define hashcounter_hash | ( | counter, | |||
| chunk, | |||||
| chunk_length, | |||||
| hashValue, | |||||
| position | ) |
Value:
if (1) \ { \ hashValue = 5381; \ for (position = 0; position < chunk_length; position++) \ { \ hashValue ^= ((hashValue << 7) + chunk[position] + (hashValue >> 2)); \ } \ }
Definition at line 85 of file hashcounter.h.
Referenced by cluster_spexClusterSequences(), rsdb_slottedSpex(), rsdb_spexClusterSequences(), and rsdb_winnowingSpex().
| void hashcounter_count | ( | Hashcounter * | counter, | |
| FILE * | stream | |||
| ) |
Print out a count of fields with different values
Definition at line 136 of file hashcounter.c.
References Hashcounter::byte_size, Hashcounter::hashtable, and htlookup.
Referenced by cluster_spexClusterSequences(), rsdb_slottedSpex(), and rsdb_winnowingSpex().
00137 { 00138 uint64_t singles = 0, multiples = 0; 00139 uint64_t c = 0, d = 0; 00140 unsigned char cur; 00141 00142 for (c = 0; c < counter->byte_size; c++) 00143 { 00144 cur = counter->hashtable[c]; 00145 for (d = 0; d < 4; d++) 00146 { 00147 switch( htlookup[d][cur]) 00148 { 00149 case 1: 00150 singles++; 00151 case 0: 00152 break; 00153 00154 default: 00155 multiples++; 00156 } 00157 } 00158 } 00159 00160 fprintf(stream, "Hashcounter state:\nSingles: %"PRIu64"\nDuplicates: %"PRIu64"\n", singles, multiples); 00161 00162 return; 00163 }
Here is the caller graph for this function:

| void hashcounter_free | ( | Hashcounter * | counter | ) |
Clean up the hascounter when you're done.
Definition at line 174 of file hashcounter.c.
References Hashcounter::hashtable.
Referenced by cluster_spexClusterSequences(), rsdb_slottedSpex(), rsdb_spexClusterSequences(), and rsdb_winnowingSpex().
00175 { 00176 free(counter->hashtable); 00177 free(counter); 00178 00179 return; 00180 }
Here is the caller graph for this function:

| void hashcounter_insert | ( | Hashcounter * | counter, | |
| uint4 | hashValue | |||
| ) |
Increment the count for the given phrase within the hashcounter. The phrase is given as an array of strings, each containing a single word.
The parameter 'words' indicates the number of words in the phrase, while 'array_size' indicates the size of the array (which is sometimes bigger when hashing subphrases). The 'offset' parameter indicates the position in the array in which the phrase starts -- the array is circular, so when the end of the array is reached, the phrase is assumed to continue at the beginning of the array.
Definition at line 87 of file hashcounter.c.
References Hashcounter::fieldmask, Hashcounter::hashtable, htinsert, Hashcounter::indmask, and Hashcounter::insertions.
Referenced by cluster_spexClusterSequences(), rsdb_slottedSpex(), and rsdb_winnowingSpex().
00088 { 00089 unsigned long ind, field; 00090 unsigned char cur; 00091 00092 ind = (hashValue & counter->indmask) >> 2; 00093 field = hashValue & counter->fieldmask; 00094 00095 cur = counter->hashtable[ind]; 00096 counter->hashtable[ind] = htinsert[field][cur]; 00097 00098 counter->insertions++; 00099 00100 return; 00101 }
Here is the caller graph for this function:

| unsigned long hashcounter_insertions | ( | Hashcounter * | counter | ) |
returns the number of phrases that have been inserted into this hashcounter
Definition at line 121 of file hashcounter.c.
References Hashcounter::insertions.
00122 { 00123 return counter->insertions; 00124 }
| int hashcounter_multiple | ( | Hashcounter * | counter, | |
| uint4 | hashValue | |||
| ) |
Returns a nonzero value value if the phrase in question appears more than once in the hashcounter. See the comment for 'hashcounter_new' for notes on the meanings of the various parameters.
Definition at line 103 of file hashcounter.c.
References Hashcounter::hashtable, htlookup, and Hashcounter::indmask.
Referenced by cluster_spexClusterSequences(), rsdb_slottedSpex(), rsdb_spexClusterSequences(), and rsdb_winnowingSpex().
00104 { 00105 unsigned char cur; 00106 00107 cur = counter->hashtable[(hashValue & counter->indmask) >> 2]; 00108 return htlookup[hashValue & 3][cur] & 2; 00109 00110 }
Here is the caller graph for this function:

| Hashcounter* hashcounter_new | ( | unsigned long | size, | |
| unsigned long | seed | |||
| ) |
Initialise a new hashcounter of size size. Note that size is the size in bytes -- the hashtable will in fact contain 4 * bytes fields
Definition at line 56 of file hashcounter.c.
References Hashcounter::byte_size, Hashcounter::fieldmask, global_malloc(), hashcounter_reset(), Hashcounter::hashseed, Hashcounter::hashtable, Hashcounter::indmask, init_lookups(), Hashcounter::insertions, and Hashcounter::size.
Referenced by cluster_spexClusterSequences(), rsdb_slottedSpex(), and rsdb_winnowingSpex().
00057 { 00058 Hashcounter *new_counter; 00059 int c; 00060 00061 init_lookups(); 00062 new_counter = (Hashcounter *) global_malloc(sizeof(Hashcounter)); 00063 00064 for (new_counter->indmask = 0, c = 0; c < logsize; c++) 00065 { 00066 new_counter->indmask <<= 1; 00067 new_counter->indmask++; 00068 } 00069 new_counter->indmask <<= 2; 00070 new_counter->fieldmask = 3; 00071 00072 new_counter->byte_size = 1 << logsize; 00073 /* We can fit 4 entries in each byte */ 00074 new_counter->size = 4 * new_counter->byte_size; 00075 new_counter->hashseed = seed; 00076 new_counter->insertions = 0; 00077 00078 new_counter->hashtable = global_malloc(new_counter->byte_size); 00079 00080 printf("New hashcounter size=%.1f Mb\n", (float)new_counter->byte_size / 1024.0 / 1024.0); 00081 00082 hashcounter_reset(new_counter); 00083 00084 return new_counter; 00085 }
Here is the call graph for this function:

Here is the caller graph for this function:

| void hashcounter_reset | ( | Hashcounter * | counter | ) |
Clear the hashcounter
Definition at line 165 of file hashcounter.c.
References Hashcounter::byte_size, Hashcounter::hashtable, and Hashcounter::insertions.
Referenced by hashcounter_new().
00166 { 00167 00168 counter->insertions = 0; 00169 memset(counter->hashtable, 0, counter->byte_size); 00170 00171 return; 00172 }
Here is the caller graph for this function:

| int hashcounter_single | ( | Hashcounter * | counter, | |
| uint4 | hashValue | |||
| ) |
Definition at line 112 of file hashcounter.c.
References Hashcounter::hashtable, htlookup, and Hashcounter::indmask.
Referenced by cluster_spexClusterSequences(), and rsdb_slottedSpex().
00113 { 00114 unsigned char cur; 00115 00116 cur = counter->hashtable[(hashValue & counter->indmask) >> 2]; 00117 return htlookup[hashValue & 3][cur]; 00118 00119 }
Here is the caller graph for this function:

| void hashcounter_write | ( | Hashcounter * | counter, | |
| FILE * | stream | |||
| ) |
Write the hashtable out to stream 'stream'
Definition at line 126 of file hashcounter.c.
References Hashcounter::hashtable, and Hashcounter::size.
00127 { 00128 uint64_t c; 00129 00130 for (c = 0; c < counter->size / 4; c++) 00131 { 00132 fputc(counter->hashtable[c], stream); 00133 } 00134 }
1.5.2