From 6fbd1feb67fe384bf49e21cae42565b2d3c72bcf Mon Sep 17 00:00:00 2001 From: Neale Pickett Date: Tue, 20 Nov 2012 18:13:16 -0700 Subject: [PATCH] flesh out infobot helper --- Makefile | 2 +- cdb.c | 71 ++++++++++++++++++++++--- cdb.h | 7 ++- cdbm.c | 131 ---------------------------------------------- cdbmake.c | 121 ++++++++++++++++++++++++++++++++++++++++++ cdbmake.h | 25 +++++++++ infobot.c | 153 +++++++++++++++++++++++++++++++++++++++++++++++------- 7 files changed, 352 insertions(+), 158 deletions(-) delete mode 100644 cdbm.c create mode 100644 cdbmake.c create mode 100644 cdbmake.h diff --git a/Makefile b/Makefile index 360ef9e..78ff81b 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ TARGETS += infobot all: $(TARGETS) -infobot: infobot.o cdb.o +infobot: infobot.o cdb.o cdbmake.o .PHONY: clean clean: diff --git a/cdb.c b/cdb.c index e2cc7e9..bc3c845 100644 --- a/cdb.c +++ b/cdb.c @@ -2,6 +2,7 @@ #include #include #include "cdb.h" +#include "dump.h" /* * @@ -46,22 +47,80 @@ read_u32le(FILE *f) (d[3] << 24)); } +static uint32_t +read_buf(FILE *f, uint32_t fieldlen, char *buf, size_t buflen) +{ + + uint32_t size = min(buflen, fieldlen); + size_t r = fread(buf, 1, (size_t)size, f); + + // Slurp up the rest + for (fieldlen -= r; fieldlen > 0; fieldlen -= 1) { + getc(f); + } + + return size; +} + void cdb_init(struct cdb_ctx *ctx, FILE *f) { ctx->f = f; ctx->key = NULL; + ctx->hash_len = 1; +} + +int +cdb_dump(struct cdb_ctx *ctx, + char *key, size_t *keylen, + char *val, size_t *vallen) +{ + // Set hash_len to 0 to signal we're in position + if (ctx->hash_len != 0) { + // Find out where to stop reading + int i; + + ctx->hash_len = 0; + ctx->hash_pos = 0xffffffff; + for (i = 0; i < 256; i += 1) { + uint32_t p; + + fseek(ctx->f, i * 8, SEEK_SET); + p = read_u32le(ctx->f); + if (p < ctx->hash_pos) { + ctx->hash_pos = p; + } + } + fseek(ctx->f, 256 * 8, SEEK_SET); + } else { + long where = ftell(ctx->f); + + if (where >= ctx->hash_pos) { + return EOF; + } + } + + // Read the two buffers + { + uint32_t klen = read_u32le(ctx->f); + uint32_t vlen = read_u32le(ctx->f); + + *keylen = read_buf(ctx->f, klen, key, *keylen); + *vallen = read_buf(ctx->f, vlen, val, *vallen); + } + + return 0; } void -cdb_find(struct cdb_ctx *ctx, char *key, uint32_t keylen) +cdb_find(struct cdb_ctx *ctx, char *key, size_t keylen) { - ctx->hash_val = hash(key, keylen); - ctx->key = key; ctx->keylen = keylen; - /* Read pointer */ + ctx->hash_val = hash(key, keylen); + + // Read pointer fseek(ctx->f, (ctx->hash_val % 256) * 8, SEEK_SET); ctx->hash_pos = read_u32le(ctx->f); ctx->hash_len = read_u32le(ctx->f); @@ -69,7 +128,7 @@ cdb_find(struct cdb_ctx *ctx, char *key, uint32_t keylen) } uint32_t -cdb_next(struct cdb_ctx *ctx, char *buf, uint32_t buflen) +cdb_next(struct cdb_ctx *ctx, char *buf, size_t buflen) { uint32_t hashval; uint32_t entry_pos; @@ -109,7 +168,7 @@ cdb_next(struct cdb_ctx *ctx, char *buf, uint32_t buflen) } if (buf) { - return fread(buf, 1, min(dlen, buflen), ctx->f); + return read_buf(ctx->f, dlen, buf, buflen); } else { return dlen; } diff --git a/cdb.h b/cdb.h index 558e4b0..b5f5012 100644 --- a/cdb.h +++ b/cdb.h @@ -18,7 +18,10 @@ struct cdb_ctx { }; void cdb_init(struct cdb_ctx *ctx, FILE *f); -void cdb_find(struct cdb_ctx *ctx, char *key, uint32_t keylen); -uint32_t cdb_next(struct cdb_ctx *ctx, char *buf, uint32_t buflen); +int cdb_dump(struct cdb_ctx *ctx, + char *key, size_t *keylen, + char *val, size_t *vallen); +void cdb_find(struct cdb_ctx *ctx, char *key, size_t keylen); +uint32_t cdb_next(struct cdb_ctx *ctx, char *buf, size_t buflen); #endif diff --git a/cdbm.c b/cdbm.c deleted file mode 100644 index 59d563f..0000000 --- a/cdbm.c +++ /dev/null @@ -1,131 +0,0 @@ -#include -#include -#include -#include // XXX: remove if malloc() is gone - -static uint32_t -hash(char *s, size_t len) -{ - uint32_t h = 5381; - size_t i; - - for (i = 0; i < len; i += 1) { - h = ((h << 5) + h) ^ s[i]; - } - return h; -} - -void -write_u32le(FILE *f, uint32_t val) -{ - fputc((val >> 0) & 0xff, f); - fputc((val >> 8) & 0xff, f); - fputc((val >> 16) & 0xff, f); - fputc((val >> 24) & 0xff, f); -} - -struct record { - uint32_t hashval; - uint32_t offset; -}; - -int -main(int argc, char *argv[]) -{ - struct record *records[256] = {0}; - uint32_t nrecords[256] = {0}; - FILE *f = fopen("a.cdb", "wb"); - int idx; - - // Start writing records - fseek(f, 256*4*2, SEEK_SET); - - while (! feof(stdin)) { - int ret; - char key[4098]; - char val[4098]; - uint32_t keylen; - uint32_t vallen; - - // Read a record - ret = scanf("+%u,%u:", &keylen, &vallen); - if (ret == 0) { - break; - } - if ((keylen > sizeof(key)) || (vallen > sizeof(val))) { - fprintf(stderr, "error: my buffers are too puny (%d,%d)\n", keylen, vallen); - return 1; - } - fread(key, 1, keylen, stdin); - getchar(); - getchar(); - fread(val, 1, vallen, stdin); - getchar(); - - // Comupute hash of key - { - long where = ftell(f); - uint32_t hashval = hash(key, keylen); - uint32_t n; - - idx = hashval % 256; - n = nrecords[idx]; - nrecords[idx] += 1; - records[idx] = (struct record *)realloc(records[idx], nrecords[idx] * sizeof(struct record)); - if (NULL == records[idx]) { - perror("realloc records"); - return 1; - } - - records[idx][n].hashval = hashval; - records[idx][n].offset = (uint32_t)where; - } - - // Write it out - write_u32le(f, keylen); - write_u32le(f, vallen); - fwrite(key, 1, keylen, f); - fwrite(val, 1, vallen, f); - } - - // Write out tables - for (idx = 0; idx < 256; idx += 1) { - uint32_t r; - long offset; - uint32_t tlen = nrecords[idx] * 2; - uint32_t *buf; - - // Pointer - offset = ftell(f); - fseek(f, idx * 8, SEEK_SET); - write_u32le(f, (uint32_t)offset); - write_u32le(f, tlen); - fseek(f, offset, SEEK_SET); - - // Build table in memory - buf = (uint32_t *)calloc(tlen * 2, sizeof(uint32_t)); - if (! buf) { - perror("Allocating hash table"); - return 1; - } - for (r = 0; r < nrecords[idx]; r += 1) { - uint32_t slot = (records[idx][r].hashval / 256) % tlen; - - while (buf[slot * 2] > 0) { - slot = (slot + 1) % tlen; - } - buf[slot*2 + 0] = records[idx][r].hashval; - buf[slot*2 + 1] = records[idx][r].offset; - } - // Write it out - for (r = 0; r < tlen; r += 1) { - write_u32le(f, buf[r*2 + 0]); - write_u32le(f, buf[r*2 + 1]); - } - free(buf); - } - - fclose(f); - - return 0; -} diff --git a/cdbmake.c b/cdbmake.c new file mode 100644 index 0000000..b675c1a --- /dev/null +++ b/cdbmake.c @@ -0,0 +1,121 @@ +#include +#include +#include +#include // XXX: remove if malloc() is gone +#include "cdbmake.h" + +static uint32_t +hash(char *s, size_t len) +{ + uint32_t h = 5381; + size_t i; + + for (i = 0; i < len; i += 1) { + h = ((h << 5) + h) ^ s[i]; + } + return h; +} + +static void +write_u32le(FILE *f, uint32_t val) +{ + fputc((val >> 0) & 0xff, f); + fputc((val >> 8) & 0xff, f); + fputc((val >> 16) & 0xff, f); + fputc((val >> 24) & 0xff, f); +} + +void +cdbmake_init(struct cdbmake_ctx *ctx, FILE *f) +{ + int i; + + ctx->f = f; + for (i = 0; i < 256; i += 1) { + ctx->records[i] = NULL; + ctx->nrecords[i] = 0; + } + + fseek(f, 256 * 8, SEEK_SET); +} + +void +cdbmake_add(struct cdbmake_ctx *ctx, + char *key, size_t keylen, + char *val, size_t vallen) +{ + long where = ftell(ctx->f); + uint32_t hashval = hash(key, keylen); + int idx = hashval % 256; + uint32_t n = ctx->nrecords[idx]; + + ctx->nrecords[idx] += 1; + ctx->records[idx] = (struct cdbmake_record *)realloc(ctx->records[idx], + ctx->nrecords[idx] * sizeof(struct cdbmake_record)); + if (NULL == ctx->records[idx]) { + perror("realloc records"); + return; + } + ctx->records[idx][n].hashval = hashval; + ctx->records[idx][n].offset = (uint32_t)where; + + // Write it out + write_u32le(ctx->f, keylen); + write_u32le(ctx->f, vallen); + fwrite(key, 1, keylen, ctx->f); + fwrite(val, 1, vallen, ctx->f); +} + +void +cdbmake_finalize(struct cdbmake_ctx *ctx) +{ + int idx; + + // Write out tables + for (idx = 0; idx < 256; idx += 1) { + uint32_t r; + long offset; + uint32_t tlen = ctx->nrecords[idx] * 2; + uint32_t *buf; + + // Pointer + offset = ftell(ctx->f); + fseek(ctx->f, idx * 8, SEEK_SET); + write_u32le(ctx->f, (uint32_t)offset); + write_u32le(ctx->f, tlen); + fseek(ctx->f, offset, SEEK_SET); + + // Build table in memory + buf = (uint32_t *)calloc(tlen * 2, sizeof(uint32_t)); + if (! buf) { + perror("Allocating hash table"); + return; + } + for (r = 0; r < ctx->nrecords[idx]; r += 1) { + uint32_t slot = (ctx->records[idx][r].hashval / 256) % tlen; + + while (buf[slot * 2] > 0) { + slot = (slot + 1) % tlen; + } + buf[slot*2 + 0] = ctx->records[idx][r].hashval; + buf[slot*2 + 1] = ctx->records[idx][r].offset; + } + // Write it out + for (r = 0; r < tlen; r += 1) { + write_u32le(ctx->f, buf[r*2 + 0]); + write_u32le(ctx->f, buf[r*2 + 1]); + } + free(buf); + } + + fclose(ctx->f); + ctx->f = NULL; + + for (idx = 0; idx < 256; idx += 1) { + if (ctx->records[idx]) { + free(ctx->records[idx]); + } + ctx->records[idx] = NULL; + ctx->nrecords[idx] = 0; + } +} diff --git a/cdbmake.h b/cdbmake.h new file mode 100644 index 0000000..0c5a727 --- /dev/null +++ b/cdbmake.h @@ -0,0 +1,25 @@ +#ifndef __CDBMAKE_H__ +#define __CDBMAKE_H__ + +#include +#include +#include + +struct cdbmake_record { + uint32_t hashval; + uint32_t offset; +}; + +struct cdbmake_ctx { + FILE *f; + struct cdbmake_record *records[256]; + uint32_t nrecords[256]; +}; + +void cdbmake_init(struct cdbmake_ctx *ctx, FILE *f); +void cdbmake_add(struct cdbmake_ctx *ctx, + char *key, size_t keylen, + char *val, size_t vallen); +void cdbmake_finalize(struct cdbmake_ctx *ctx); + +#endif diff --git a/infobot.c b/infobot.c index 8288688..b34f739 100644 --- a/infobot.c +++ b/infobot.c @@ -2,10 +2,12 @@ #include #include #include +#include #include #include #include #include "cdb.h" +#include "cdbmake.h" /* Some things I use for debugging */ #ifdef NODUMP @@ -26,7 +28,7 @@ usage() { fprintf(stderr, "Usage: infobot factoids.cdb \"text\"\n"); - return 0; + return EX_USAGE; } size_t @@ -41,18 +43,12 @@ lowercase(char *text) return ret; } -int -infocmd(char *filename, char *text) -{ - return 0; -} - int -lookup(char *filename, char *text) +choose(char *filename, char *key) { struct cdb_ctx c; FILE *f = fopen(filename, "r"); - size_t textlen = lowercase(text); + size_t keylen = lowercase(key); uint32_t nresults; if (! f) { @@ -63,7 +59,7 @@ lookup(char *filename, char *text) cdb_init(&c, f); /* Count how many results there are */ - cdb_find(&c, text, textlen); + cdb_find(&c, key, keylen); for (nresults = 0; cdb_next(&c, NULL, 0); nresults += 1); if (nresults > 0) { @@ -75,11 +71,11 @@ lookup(char *filename, char *text) char val[8192]; uint32_t i; - cdb_find(&c, text, textlen); + cdb_find(&c, key, keylen); for (i = 0; i < which; i += 1) { cdb_next(&c, NULL, 0); } - vallen = cdb_next(&c, val, sizeof val); + vallen = cdb_next(&c, val, sizeof(val)); printf("%.*s\n", vallen, val); } @@ -88,16 +84,125 @@ lookup(char *filename, char *text) return 0; } +int +list(char *filename, char *key) +{ + struct cdb_ctx c; + size_t keylen = lowercase(key); + FILE *f = fopen(filename, "rb"); + + if (! f) { + perror("Opening database"); + return EX_NOINPUT; + } + + cdb_init(&c, f); + + cdb_find(&c, key, keylen); + for (;;) { + uint32_t vallen; + char val[8192]; + + vallen = cdb_next(&c, val, sizeof(val)); + if (vallen == 0) { + break; + } + printf("%.*s\n", vallen, val); + } + + fclose(f); + + return 0; +} + +int +add(char *filename, char *key, char *val) +{ + struct cdb_ctx inc; + struct cdbmake_ctx outc; + FILE *inf; + FILE *outf; + + inf = fopen(filename, "rb"); + if (! inf) { + perror("Opening database"); + return EX_NOINPUT; + } + + { + char fn[4096]; + + snprintf(fn, sizeof(fn), "%s.%d", filename, getpid()); + outf = fopen(fn, "wb"); + } + if (! outf) { + perror("Creating temporary database"); + return EX_CANTCREAT; + } + + cdb_init(&inc, inf); + cdbmake_init(&outc, outf); + + for (;;) { + char key[8192]; + char val[8192]; + size_t keylen = sizeof(key); + size_t vallen = sizeof(val); + + if (EOF == cdb_dump(&inc, key, &keylen, val, &vallen)) { + break; + } + cdbmake_add(&outc, key, keylen, val, vallen); + } + cdbmake_add(&outc, key, strlen(key), val, strlen(val)); + cdbmake_finalize(&outc); + + return 0; +} + + +enum action { + ACT_ONE, + ACT_ALL, + ACT_ADD, + ACT_DEL +}; + int main(int argc, char *argv[]) { char *filename; - char *text; + char *key; + char *val; + enum action act = ACT_ONE; - if (3 != argc) { - return usage(); + for (;;) { + int opt = getopt(argc, argv, "la:r:"); + + if (-1 == opt) { + break; + } + switch (opt) { + case 'l': + act = ACT_ALL; + break; + case 'a': + act = ACT_ADD; + val = optarg; + break; + case 'r': + act = ACT_DEL; + val = optarg; + break; + default: + return usage(argv[0]); + } + } + if (argc - optind != 2) { + return usage(argv[0]); } + // Seed PRNG with some crap { struct timeval tv; @@ -105,8 +210,20 @@ main(int argc, char *argv[]) srand((unsigned int)(tv.tv_sec * tv.tv_usec)); } - filename = argv[1]; - text = argv[2]; + filename = argv[optind]; + key = argv[optind + 1]; - return lookup(filename, text); + switch (act) { + case ACT_ONE: + return choose(filename, key); + case ACT_ALL: + return list(filename, key); + case ACT_ADD: + return add(filename, key, val); + default: + fprintf(stderr, "Not yet implemented, chump %s.\n", val); + break; + } + + return 0; }