summaryrefslogtreecommitdiff
path: root/dteutils.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'dteutils.cpp')
-rw-r--r--dteutils.cpp316
1 files changed, 288 insertions, 28 deletions
diff --git a/dteutils.cpp b/dteutils.cpp
index b08cfe0..2759dc5 100644
--- a/dteutils.cpp
+++ b/dteutils.cpp
@@ -22,10 +22,129 @@
#include <stdlib.h>
#include <string.h>
#include "fileutils.h"
+#include "generic.h"
+
+class thingtree {
+ public:
+ static void addstring(const char *, long);
+ static long look(const char *);
+ static void destroy();
+ private:
+ thingtree();
+ thingtree(char, bool, thingtree *, long);
+ ~thingtree();
+ char thischar;
+ bool terminal;
+ thingtree * father, * child, * brother;
+ long thingentry;
+ static thingtree * root;
+};
+
+thingtree * thingtree::root = 0;
+
+thingtree::thingtree(char gchar, bool gterm, thingtree * gfather, long gthingentry) : thischar(gchar), terminal(gterm), thingentry(gthingentry) {
+ if (!gfather) {
+ if (!root) {
+ root = new thingtree();
+ }
+ father = root;
+ } else {
+ father = gfather;
+ }
+ brother = father->child;
+ father->child = this;
+}
+
+thingtree::thingtree() : thischar(0), terminal(false), father(0), child(0), brother(0) { }
+
+long thingtree::look(const char * s) {
+ const char * p;
+ long currentthing = -1;
+ thingtree * ptr = root;
+
+ if (!ptr) {
+ printm(M_ERROR, "Error: thingtree not initialised\n");
+ exit(-1);
+ }
+
+ for (p = s; *p; p++) {
+// printm(M_INFO, "Looking for '%c'\n", *p);
+ for (ptr = ptr->child; ptr; ptr = ptr->brother) {
+// printm(M_INFO, "Looking in %p: %c\n", ptr, ptr->thischar);
+ if (ptr->thischar == *p) {
+ if (ptr->terminal) {
+ currentthing = ptr->thingentry;
+ }
+ break;
+ }
+ }
+ if (!ptr) {
+// printm(M_INFO, "Not found.\n");
+ break;
+ }
+ }
+
+ if (currentthing == -1) {
+ printm(M_ERROR, "Error, can't find any entry for string '%s'\n", s);
+ }
+
+ return currentthing;
+}
+
+void thingtree::addstring(const char * s, long thingentry) {
+ const char * p;
+ thingtree * ptr, * fptr;
+
+ if (!root) {
+ root = new thingtree();
+ }
+
+// printm(M_INFO, "Creating new thingtree entry: %li='%s'\n", thingentry, s);
+
+ ptr = root;
+ for (p = s; *p; p++) {
+ fptr = ptr;
+// printm(M_INFO, "Finding entry for '%c'\n", *p);
+ for (ptr = ptr->child; ptr; ptr = ptr->brother) {
+// printm(M_INFO, "Browsing childs: %p = %c\n", ptr, ptr->thischar);
+ if (ptr->thischar == *p)
+ break;
+ }
+
+ if (!ptr) {
+// printm(M_INFO, "Creating new branch for %c\n", *p);
+ ptr = new thingtree(*p, *(p + 1) == 0, fptr, thingentry);
+// printm(M_INFO, "Created branch %p\n", ptr);
+ } else {
+ if (*(p + 1) == 0) {
+ ptr->terminal = true;
+ ptr->thingentry = thingentry;
+ }
+ }
+ }
+}
+
+void thingtree::destroy() {
+ delete root;
+ root = 0;
+}
+
+thingtree::~thingtree() {
+ if (father->child == this)
+ father->child = brother;
+
+ while (child)
+ delete child;
+}
+
+char * things[256];
char * dte_text;
char dte_flags[256 * 256];
long dte_counters[256 * 256];
+long dte_entries[256];
+char alloweds[256];
+long dte_usage[256];
long dte_most;
long dte_counter;
@@ -33,12 +152,6 @@ long dte_text_size;
int dte_size;
long gain;
-#ifdef USEASM
-extern "C" {
- void build_dte(void);
- void dte_reset(void);
-}
-#else
void dte_reset(void) {
memset(dte_counters, 0, 0x40000);
dte_most = 0;
@@ -47,21 +160,72 @@ void dte_reset(void) {
void build_dte(void) {
int i;
- unsigned short t;
+ unsigned short t, t2;
+// unsigned short p = 0;
for (i = 0; i < dte_text_size; i++) {
t = *((unsigned short *) (dte_text + i));
- if (!dte_flags[t]) {
- if ((++dte_counters[t]) > dte_counter) {
+ t2 = *((unsigned short *) (dte_text + i + 1));
+/* if (t == p) {
+ p = 0;
+ continue;
+ }
+ p = t; */
+ if ((!dte_flags[t]) && (dte_flags[t2] != 3)) {
+ dte_counters[t]++;
+ if (dte_counters[t] > dte_counter) {
dte_most = t;
dte_counter = dte_counters[t];
}
+ } else if (dte_flags[t] == 3) {
+ i++;
}
}
}
-#endif
-void dte_compress(char alloweds[]) {
+void push_entry(long entry) {
+ int i;
+ char t[3], c1, c2;
+ t[2] = 0;
+
+ c1 = dte_most & 0xff;
+ c2 = dte_most >> 8;
+ t[0] = things[c1][0];
+ t[1] = things[c2][0];
+
+ for (i = 0; i < 256; i++) {
+ if (!dte_entries[i]) {
+ dte_entries[i] = entry;
+ things[i] = strdup(t);
+ thingtree::addstring(t, i);
+ break;
+ }
+ }
+}
+
+char * read_line(FILE * f, char * b) {
+ int r, pos = 0;
+
+ while (!feof(f)) {
+ if ((r = fgetc(f)) == EOF)
+ break;
+ if (r == 0x0d) {
+ if ((r = fgetc(f)) == EOF)
+ break;
+ }
+ if (r == 0x0a) {
+ b[pos] = 0;
+ return b;
+ }
+ b[pos++] = r;
+ }
+
+ b[pos] = 0;
+
+ return b;
+}
+
+void dte_compress() {
int i, j;
char c1, c2;
@@ -73,46 +237,142 @@ void dte_compress(char alloweds[]) {
gain = 0;
- fprintf(stderr, "Going for it: dte_size = %i\n", dte_size);
+ printm(M_STATUS, "Going for it: dte_size = %i\n", dte_size);
for (i = 0; i < dte_size; i++) {
dte_reset();
build_dte();
c1 = dte_most & 0xff;
c2 = dte_most >> 8;
- fprintf(stderr, "Entry #%i, most count: %li, couple = 0x%04x = (%c %c)\n", i, dte_counter, (int) dte_most, c1, c2);
- dte_flags[c1 + c2 * 256] = 1;
+ c1 = things[c1][0];
+ c2 = things[c2][0];
+ printm(M_INFO, "Entry #%i, most count: %li, couple = 0x%04x = (%c %c)\n", i, dte_counter, (int) dte_most, c1, c2);
+ dte_flags[dte_most] = 3;
gain += dte_counter;
+ push_entry(dte_most);
+ }
+
+ printm(M_STATUS, "Estimated gain: %li bytes, new file size = %li\n", gain, dte_text_size - gain);
+}
+
+void read_thingy(FILE * f) {
+ char line[256], * st, * thing;
+ long code;
+ int i;
+
+ st = line + 2;
+ thing = line + 5;
+ line[0] = '0';
+ line[1] = 'x';
+
+ dte_size = 0;
+
+ for (i = 0; i < 256; i++) {
+ dte_entries[i] = -1;
}
- fprintf(stderr, "Total gain: %li bytes\n", gain);
+ while (!feof(f)) {
+ if (strlen(read_line(f, st))) {
+ line[4] = 0;
+ sscanf(line, "%li", &code);
+ switch (strlen(thing)) {
+ case 0:
+ dte_size++;
+ dte_entries[code] = 0;
+ break;
+ case 1:
+ alloweds[code] = 1;
+ default:
+ things[code] = strdup(thing);
+ thingtree::addstring(thing, code);
+ }
+ }
+ }
}
+void read_thingy_file(FILE * f) {
+ char line[10240], * p, * c, trans[5];
+ long code;
+ int ptr = 0, i;
+
+ for (i = 0; i < 256; i++) {
+ dte_usage[i] = 0;
+ }
+
+ while (!feof(f)) {
+ if (!strlen(p = read_line(f, line)))
+ continue;
+ while (*p) {
+ if (*p == '<') {
+ if (!(c = strchr(p, '>'))) {
+ printm(M_ERROR, "Error in file: '<' not closed.\n");
+ exit(-1);
+ }
+ p++;
+ if ((c - p) == 2) {
+ *c = 0;
+ sprintf(trans, "0x%s", p);
+ sscanf(trans, "%li", &code);
+ dte_text[ptr++] = code;
+ printm(M_BARE, "0x%02x-", code);
+ } else {
+ printm(M_BARE, "Unknow %s-", p);
+ }
+ p = c + 1;
+ } else {
+ if ((code = thingtree::look(p)) == -1)
+ exit(-1);
+ dte_text[ptr++] = code;
+ p += strlen(things[code]);
+ printm(M_BARE, "%s-", things[code]);
+ dte_usage[code]++;
+ }
+ }
+ printm(M_BARE, "\n");
+ }
+
+ dte_text[ptr] = dte_text[ptr + 1] = dte_text[ptr + 2] = dte_text[ptr + 3] = 0;
+ dte_text_size = ptr;
+}
#ifdef DTEMAIN
int main(int argc, char ** argv) {
+ FILE * f, * t;
+ long old_size;
int i;
- FILE * f;
- char alloweds[256];
- dte_size = 128;
+ verbosity = M_INFO;
+ printm(M_STATUS, "Reading thingy table\n");
+ t = fopen(argv[2], "r");
+ read_thingy(t);
+ fclose(t);
+
f = fopen(argv[1], "r");
dte_text_size = filesize(f);
dte_text = (char *) calloc(dte_text_size + 4, 1);
- fprintf(stderr, "Reading file, size = %li\n", dte_text_size);
- fread(dte_text, 1, dte_text_size, f);
+ printm(M_STATUS, "Reading file, size = %li\n", dte_text_size);
+ read_thingy_file(f);
fclose(f);
- memset(alloweds, 0, 256);
- fprintf(stderr, "Building alloweds table\n");
- for (i = 0; i < dte_text_size; i++) {
- if ((dte_text[i] != '\n') && (dte_text[i] != '\r')) {
- alloweds[dte_text[i]] = 1;
- }
- }
+ printm(M_STATUS, "True size = %li\n", old_size = dte_text_size);
+
+ printm(M_STATUS, "Compressing file.\n");
+ dte_compress();
- dte_compress(alloweds);
+ printm(M_STATUS, "Rereading file.\n");
+ f = fopen(argv[1], "r");
+ dte_text_size = filesize(f);
+ dte_text = (char *) calloc(dte_text_size + 4, 1);
+ read_thingy_file(f);
+ fclose(f);
+
+ printm(M_STATUS, "True size = %li, real gain = %li\n", dte_text_size, old_size - dte_text_size);
+
+ printm(M_INFO, "DTE Usage:\n");
+ for (i = 0; i < 256; i++) {
+ printm(M_INFO, "Entry %i ('%s') used at %i\n", i, things[i], dte_usage[i]);
+ }
free(dte_text);
}