Bloom Filters

Learning about Bloom filters to quickly check that items exist in a dataset….

hdcolin

Learning about using Bloom filters to quickly check that items exist in a dataset. The idea behind this concept is that you build the filter as a bit array, and populate it with bits set at a position based on two hashes of the data you want to query.

In the example below we iterate over a directory reading in file names. We allocate 15 bits into the bit array for each file we find. This gives us a good margin for error.

For each file name in the directory we hash it with two different hash functions. We then modulo the result by the total number of positions in our bit array, which gives us the bits we need to set.

To check if a file exists in our bit array we just hash the file name with the same hash functions modulo the result and check if the bits are set. If at least one is set, then it is likely that our entry exists.

This is useful as it allows is to create a filter that a query is passed through to ascertain if a full lookup is worth it. I.E if we need had a search query to a slow file-system, if the filter returns no bits set, then the file is not present and the system need not be burdened looking for a file that doesn’t exist.

Full code below, it’s a bit hacky, but it works in essence. If any one wants to suggest changes or submit a contribution email a patch file via the contact page.

https://git.howardcolin.co.uk/bloom/

include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <dirent.h> 

typedef struct BitArray {
    uint8_t *array;
    size_t total_bits;
} BitArray;

BitArray ba_init(size_t no_b) {
    BitArray bits = {0}; 
    bits.total_bits = no_b; 
    bits.array = calloc((no_b + 7) / 8, 1);
    return bits; 
}

void ba_destroy(BitArray *ba) {
    if(ba->array != NULL)
        free(ba->array);
}

int ba_insert(BitArray *ba, const unsigned long pos_a, const unsigned long pos_b) {
    if(pos_a >= ba->total_bits || pos_b >= ba->total_bits)
        return -1; 

    //set bit a
    ba->array[pos_a / 8] |= (1 << (pos_a % 8));
    //set bit b 
    ba->array[pos_b / 8] |= (1 << (pos_b % 8));

    return 0;
}

int ba_check(const BitArray *ba, const unsigned long pos_a, const unsigned long pos_b) {
    if(pos_a >= ba->total_bits || pos_b >= ba->total_bits)
        return -1;

    int a = ba->array[pos_a / 8] & (1 << (pos_a % 8));
    int b = ba->array[pos_b / 8] & (1 << (pos_b % 8));

    return (a && b) ? 1 : 0;
}

unsigned long djb2(const char *str) {
    unsigned long hash = 5381;
    int c; 
    while((c = *str++))
        hash = ((hash << 5) + hash) + c;
    return hash;
}

unsigned long fnv1a(const char *str) {
    unsigned long hash = 14695981039346656037UL; 
    while (*str) {
        hash ^= (unsigned char)*str++;
        hash *= 1099511628211UL;
    }
    return hash;
}

typedef struct FileHandler {
    struct dirent *entries; 
    DIR *dp;
} FileHandler; 

size_t fa_count(FileHandler *fh) {
    size_t count = 0;
    while((fh->entries = readdir(fh->dp))) {
        count++;
    }
    rewinddir(fh->dp);
    return count;
}

int main(int argc, char *argv[]) {
    if(argc != 2) {
        fprintf(stderr, "Usage: main --folderpath\n");
        return -1;
    }

    FileHandler fh_data = {0};
    fh_data.dp = opendir(argv[1]);
    if(fh_data.dp == NULL) {
        fprintf(stderr, "Error accessing file, please check path\n");
        return -1;
    }
   
    size_t ba_size = fa_count(&fh_data) * 15; 
    BitArray bits = ba_init(ba_size);
    if(bits.array == NULL) {
        fprintf(stderr, "Memory allocation error in ba_init()");
        closedir(fh_data.dp);
        return -1; 
    }
   
    // parse names and add to bit array.
    unsigned long a = 0; 
    unsigned long b = 0;
    while((fh_data.entries = readdir(fh_data.dp))) {
        printf("ADDING ENTRY: %s\n", fh_data.entries->d_name);
        a = djb2(fh_data.entries->d_name) % ba_size; 
        b = fnv1a(fh_data.entries->d_name)% ba_size;
   
        if(ba_insert(&bits, a, b) == -1) {
            fprintf(stderr, "ba_insert: bit array range error\n"); 
            closedir(fh_data.dp);
            ba_destroy(&bits);
            return -1;
        }
    }

    // construct simple test:
    printf("--------------- QUERY SAMPLE Documents -------------\n");
    unsigned long c = djb2("Documents") % ba_size;
    unsigned long d = fnv1a("Documents") % ba_size;
    int res = ba_check(&bits, c, d);
    if(res == 0) {
        printf("NOT FOUND\n");
    } else if(res == 1) {
        printf("MAYBE PRESENT\n");
    } else {
        printf("ERROR\n");
    }
   
    closedir(fh_data.dp);
    ba_destroy(&bits);
    return 0; 
}