From 3abfcd994a7dad841f223bb64bd74fb59d13fef2 Mon Sep 17 00:00:00 2001
From: root <root>
Date: Thu, 21 Jun 2007 22:11:34 +0000
Subject: *** empty log message ***

---
 Changes |  4 ++++
 lzfP.h  | 10 +---------
 lzf_c.c | 30 +++++++++++++++++++++---------
 lzf_d.c | 18 ++++++++++++++----
 4 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/Changes b/Changes
index cbded04..ac650cd 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+2.1
+	- get rid of memcpy (the USE_MEMCPY option is gone).
+	- tentatively use rep movsb on x86 and x86_64 (gcc only) for a
+	  moderate speed improvement.
 
 2.0  Fri Feb 16 23:11:18 CET 2007
 	- replaced lzf demo by industrial-strength lzf utility with behaviour
diff --git a/lzfP.h b/lzfP.h
index 988ea14..ce775a2 100644
--- a/lzfP.h
+++ b/lzfP.h
@@ -85,14 +85,6 @@
 # define STRICT_ALIGN !(defined(__i386) || defined (__amd64))
 #endif
 
-/*
- * Use string functions to copy memory.
- * this is usually a loss, even with glibc's optimized memcpy
- */
-#ifndef USE_MEMCPY
-# define USE_MEMCPY 0
-#endif
-
 /*
  * You may choose to pre-set the hash table (might be faster on some
  * modern cpus and large (>>64k) blocks, and also makes compression
@@ -158,7 +150,7 @@ typedef const u8 *LZF_STATE[1 << (HLOG)];
 # endif
 #endif
 
-#if USE_MEMCPY || INIT_HTAB
+#if INIT_HTAB
 # ifdef __cplusplus
 #  include <cstring>
 # else
diff --git a/lzf_c.c b/lzf_c.c
index f65ad72..216a4d9 100644
--- a/lzf_c.c
+++ b/lzf_c.c
@@ -72,6 +72,13 @@
 #define        MAX_OFF        (1 << 13)
 #define        MAX_REF        ((1 << 8) + (1 << 3))
 
+#if (__i386 || __amd64) && __GNUC__ >= 3 /* x86/x86_64 with gcc-style inline asm only; otherwise callers use their byte loops */
+# define lzf_movsb(dst, src, len) /* copy len bytes src -> dst; all three arguments must be lvalues and are updated in place */ \
+   __asm__ __volatile__ ("rep movsb" /* on return dst/src point past the copied bytes and len is 0 (assumes DF is clear) */ \
+        : "=D" (dst), "=S" (src), "=c" (len) /* NOTE(review): len is 32 bit at the call sites, rep uses all of rcx on amd64 - confirm */ \
+        :  "0" (dst),  "1" (src),  "2" (len) : "memory") /* clobber: the asm touches buffers the operands do not describe; no trailing ';' */
+#endif
+
 /*
  * compressed format
  *
@@ -104,11 +111,10 @@ lzf_compress (const void *const in_data, unsigned int in_len,
            int lit = 0;
 
 #if INIT_HTAB
-# if USE_MEMCPY
-    memset (htab, 0, sizeof (htab));
-# else
-    for (hslot = htab; hslot < htab + HSIZE; hslot++)
-      *hslot++ = ip;
+  memset (htab, 0, sizeof (htab));
+# if 0
+  for (hslot = htab; hslot < htab + HSIZE; hslot++)
+    *hslot++ = ip;
 # endif
 #endif
 
@@ -214,10 +220,10 @@ lzf_compress (const void *const in_data, unsigned int in_len,
             return 0;
 
           *op++ = MAX_LIT - 1;
-#if USE_MEMCPY
-          memcpy (op, ip - MAX_LIT, MAX_LIT);
-          op += MAX_LIT;
-          lit = 0;
+
+#ifdef lzf_movsb
+          ip -= lit;
+          lzf_movsb (op, ip, lit);
 #else
           lit = -lit;
           do
@@ -233,11 +239,17 @@ lzf_compress (const void *const in_data, unsigned int in_len,
 	return 0;
 
       *op++ = lit - 1;
+#ifdef lzf_movsb
+      ip -= lit;
+      lzf_movsb (op, ip, lit);
+#else
       lit = -lit;
       do
 	*op++ = ip[lit];
       while (++lit);
+#endif
     }
 
   return op - (u8 *) out_data;
 }
+
diff --git a/lzf_d.c b/lzf_d.c
index 73a1a80..61df183 100644
--- a/lzf_d.c
+++ b/lzf_d.c
@@ -45,6 +45,13 @@
 # define SET_ERRNO(n) errno = (n)
 #endif
 
+#if (__i386 || __amd64) && __GNUC__ >= 3 /* x86/x86_64 with gcc-style inline asm only; otherwise callers use their byte loops */
+# define lzf_movsb(dst, src, len) /* copy len bytes src -> dst; all three arguments must be lvalues and are updated in place */ \
+   __asm__ __volatile__ ("rep movsb" /* on return dst/src point past the copied bytes and len is 0 (assumes DF is clear) */ \
+        : "=D" (dst), "=S" (src), "=c" (len) /* NOTE(review): len is 32 bit at the call sites, rep uses all of rcx on amd64 - confirm */ \
+        :  "0" (dst),  "1" (src),  "2" (len) : "memory") /* clobber: the asm touches buffers the operands do not describe; no trailing ';' */
+#endif
+
 unsigned int 
 lzf_decompress (const void *const in_data,  unsigned int in_len,
                 void             *out_data, unsigned int out_len)
@@ -76,10 +83,8 @@ lzf_decompress (const void *const in_data,  unsigned int in_len,
             }
 #endif
 
-#if USE_MEMCPY
-          memcpy (op, ip, ctrl);
-          op += ctrl;
-          ip += ctrl;
+#ifdef lzf_movsb
+          lzf_movsb (op, ip, ctrl);
 #else
           do
             *op++ = *ip++;
@@ -125,12 +130,17 @@ lzf_decompress (const void *const in_data,  unsigned int in_len,
               return 0;
             }
 
+#ifdef lzf_movsb
+          len += 2;
+          lzf_movsb (op, ref, len);
+#else
           *op++ = *ref++;
           *op++ = *ref++;
 
           do
             *op++ = *ref++;
           while (--len);
+#endif
         }
     }
   while (ip < in_end);
-- 
cgit v1.2.3