summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKevin Chabowski <kevin@kch42.de>2014-03-21 14:50:06 +0100
committerKevin Chabowski <kevin@kch42.de>2014-03-21 14:50:06 +0100
commit9f0a8395b31e30634296145219392770b2b0ffff (patch)
treecb90e415c035cac9a38cf719e21a51d69c6e60ba
downloadbuzhash-9f0a8395b31e30634296145219392770b2b0ffff.tar.gz
buzhash-9f0a8395b31e30634296145219392770b2b0ffff.tar.bz2
buzhash-9f0a8395b31e30634296145219392770b2b0ffff.zip
Initial commit.
Originally this was part of a larger, now discontinued, project. I chose to publish some parts of it that might be useful to others.
-rw-r--r--LICENSE4
-rw-r--r--README.markdown24
-rw-r--r--hash.go143
-rw-r--r--hash_test.go92
4 files changed, 263 insertions, 0 deletions
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..68d0481
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,4 @@
+ DO WHATEVER THE FUCK YOU WANT, PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. You just DO WHATEVER THE FUCK YOU WANT. \ No newline at end of file
diff --git a/README.markdown b/README.markdown
new file mode 100644
index 0000000..11d415f
--- /dev/null
+++ b/README.markdown
@@ -0,0 +1,24 @@
+# buzhash
+
+Package buzhash implements a buzhash algorithm using this defintion <http://en.wikipedia.org/wiki/Rolling_hash#Cyclic_polynomial>.
+
+## Rolling hash
+
+Buzhash is a rolling hash function, that means that the current hash sum is the sum of the last n consumed bytes.
+
+Example:
+
+* Message 1: <code>This is a stupid example text to demo<strong>nstrate buzhash.</strong></code>
+* Message 2: <code>Another text to demo<strong>nstrate buzhash.</strong></code>
+
+When hashing both messages with a buzhasher with n=16, both messages will have the same hash sum, since the last 16 characters (`nstrate buzhash.`) are equal.
+
+This can be useful, when searching for a data fragment in large files, without knowing the fragment (only its hash). This is used in binary diff tools, such as `rdiff` (although they use a different rolling hash function).
+
+## Installation
+
+`go get github.com/kch42/buzhash`
+
+## Documentation
+
+Either install the package and use a local godoc server or use [godoc.org](http://godoc.org/github.com/kch42/buzhash)
diff --git a/hash.go b/hash.go
new file mode 100644
index 0000000..dea5642
--- /dev/null
+++ b/hash.go
@@ -0,0 +1,143 @@
+// Package buzhash implements a buzhash algorithm using this defintion:
+// http://en.wikipedia.org/wiki/Rolling_hash#Cyclic_polynomial
+//
+// BuzHash is a rolling hash function, that means that the current sum will
+// always be the sum of the last n input bytes.
+package buzhash
+
+import (
+ "bytes"
+ "encoding/binary"
+)
+
+// 256 random uint32's to hash a single byte
+var bytehash = [256]uint32{
+ 0x12bd9527, 0xf4140cea, 0x987bd6e1, 0x79079850, 0xafbfd539, 0xd350ce0a,
+ 0x82973931, 0x9fc32b9c, 0x28003b88, 0xc30c13aa, 0x6b678c34, 0x5844ef1d,
+ 0xaa552c18, 0x4a77d3e8, 0xd1f62ea0, 0x6599417c, 0xfbe30e7a, 0xf9e2d5ee,
+ 0xa1fca42e, 0x41548969, 0x116d5b59, 0xaeda1e1a, 0xc5191c17, 0x54b9a3cb,
+ 0x727e492a, 0x5c432f91, 0x31a50bce, 0xc2696af6, 0x217c8020, 0x1262aefc,
+ 0xace75924, 0x9876a04f, 0xaf300bc2, 0x3ffce3f6, 0xd6680fb5, 0xd0b1ced8,
+ 0x6651f842, 0x736fadef, 0xbc2d3429, 0xb03d2904, 0x7e634ba4, 0xdfd87d8c,
+ 0x7988d63a, 0x4be4d933, 0x6a8d0382, 0x9e132d62, 0x3ee9c95f, 0xfec05b97,
+ 0x6907ad34, 0x8616cfcc, 0xa6aabf24, 0x8ad1c92e, 0x4f2affc0, 0xb87519db,
+ 0x6576eaf6, 0x15dbe00a, 0x63e1dd82, 0xa36b6a81, 0xeead99b3, 0xbc6a4309,
+ 0x3478d1a7, 0x2182bcc0, 0xdd50cfce, 0x7cb25580, 0x73075483, 0x503b7f42,
+ 0x4cd50d63, 0x3f4d94c9, 0x385fcbb7, 0x90daf16c, 0xece10b8e, 0x11c1cb04,
+ 0x816a899b, 0x69a29d06, 0xfb090b37, 0xf98ef13c, 0x07653435, 0x9f15dc42,
+ 0x3b43abdf, 0x1334283f, 0x93f3d9af, 0x0cbdfe71, 0xa788a614, 0x4f54d2f0,
+ 0xd4374fc7, 0x70557ce7, 0xf741fce8, 0xe4b6f661, 0xc630cb98, 0x387a6366,
+ 0x72f428fd, 0x539009db, 0xc53e3810, 0x1e1a52e5, 0x7d6816b0, 0x040f9b81,
+ 0x9c99c9fb, 0x9f3af3d2, 0x774d1061, 0xd5c840ea, 0x8e1480fe, 0x6ee4023c,
+ 0x2fbda535, 0xd88eff7a, 0xd8632a2a, 0x43c4e024, 0x3ef27971, 0xc72866fd,
+ 0xe35cc630, 0x46d96220, 0x437a8384, 0xe92caf0c, 0x6290a47e, 0xa7bb9238,
+ 0x0e1000f9, 0x49e76bdc, 0x3acfb4b8, 0x03582b8e, 0x6ea2de4e, 0x2ec1008d,
+ 0xfcc8df69, 0x91c2fe0a, 0xb471c7d9, 0x778be812, 0x70d29ad1, 0x76411cbf,
+ 0xc302e81c, 0x4e445194, 0x22e3aa72, 0xb65762e9, 0xa280db05, 0x827aa70e,
+ 0x4c531a9d, 0x7a60bf4a, 0x8fd95a44, 0x2289aef0, 0xcd50ddc4, 0x639aae69,
+ 0x5fe85ed6, 0x4ed724ff, 0x00f04f7d, 0x95a5fcb0, 0x88255d15, 0xa603d2c9,
+ 0xf6956a5b, 0x53ea7f3e, 0xb570f225, 0x2b3be203, 0xa181e40e, 0xc413cdce,
+ 0xa7cb1ebb, 0xcf258b1f, 0x516eb016, 0xca204586, 0xd1e69894, 0xe85a73d3,
+ 0x7db2d382, 0xae73b463, 0x3598d643, 0x5087c864, 0xd91f30b6, 0xe1d4d1e7,
+ 0x73b3b337, 0xceac1233, 0x8edf7845, 0xa69c45c9, 0xdb5db3ab, 0x28cfade8,
+ 0xebfa49e7, 0xcbc2a659, 0x59cce971, 0x959a01af, 0x8ee9aae7, 0xfb2f01c6,
+ 0x5a752836, 0x9ed12981, 0x618d05b6, 0x93ec12b3, 0x4590c779, 0xed1317a2,
+ 0x03fe5835, 0x7ad3c6f7, 0xd4aad5b5, 0x1a995ed7, 0x247bfaa4, 0x69c2c799,
+ 0x745fa405, 0xc5b9f239, 0xc3d9aebc, 0xa6f60e0b, 0xdf1e91d7, 0xab8e041c,
+ 0xee3188c6, 0x37377a9e, 0xc0e1a3bf, 0x19a5a9e4, 0x56cb9556, 0xc4d33d3f,
+ 0xfb1eb03e, 0xf9557057, 0x1be31d37, 0xd1fa65f1, 0xf518d714, 0x570ac722,
+ 0xf26cf66a, 0x24794d47, 0x8ba2e402, 0x3f5137e6, 0x35be1453, 0x43350478,
+ 0x9f05ee88, 0x364cf9cf, 0x39a23ee7, 0xa4db8d49, 0xc2ebb3d2, 0xc6fb99d5,
+ 0xe014dfb0, 0x7156d425, 0xe090a87a, 0x4cc12f78, 0x1b30f503, 0x06694a7a,
+ 0x68198cd1, 0x2f8345bd, 0x9d79198e, 0xd871943f, 0x22ef6cf4, 0xe81b1c15,
+ 0x067b61d8, 0xfc4ea4f5, 0xfe6dab57, 0x1bf744ba, 0xa70b6a25, 0xafe6e412,
+ 0xc6c1a05c, 0x8ffbe3ce, 0xc4270af1, 0xf3f36373, 0xc4507dd8, 0x5e6fd1e2,
+ 0x58cd9739, 0x47d3c5b5, 0xe1d5a343, 0x3d4dea4a, 0x893d91ae, 0xbb2a5e2a,
+ 0x0d57b800, 0x652a7cc9, 0x6a68ccfd, 0x62529f0b, 0xec5f36d6, 0x766cceda,
+ 0x96ca63ef, 0xa0499838, 0xd9030f59, 0x8185f4d2}
+
+// BuzHash implements the hash.Hash32 interface and also has a function to write a single byte.
+type BuzHash struct {
+ state uint32
+ buf []byte
+ n uint32
+ bshiftn uint
+ bshiftm uint
+ bufpos uint32
+ overflow bool
+}
+
+// NewBuzHash generates and initializes a new BuzHash object with block size n
+func NewBuzHash(n uint32) *BuzHash {
+ rv := new(BuzHash)
+ rv.n = n
+ rv.bshiftn = uint(n % 32)
+ rv.bshiftm = 32 - rv.bshiftn
+ rv.buf = make([]byte, n)
+ rv.Reset()
+ return rv
+}
+
+// HashByte updates the hash with a single byte and returns the resulting sum
+func (bh *BuzHash) HashByte(b byte) uint32 {
+ if bh.bufpos == bh.n {
+ bh.overflow = true
+ bh.bufpos = 0
+ }
+
+ state := bh.state
+
+ state = (state << 1) | (state >> 31)
+
+ if bh.overflow {
+ toshift := bytehash[bh.buf[bh.bufpos]]
+ state ^= (toshift << bh.bshiftn) | (toshift >> bh.bshiftm)
+ }
+
+ bh.buf[bh.bufpos] = b
+ bh.bufpos++
+
+ state ^= bytehash[b]
+
+ bh.state = state
+ return state
+}
+
+// Write updates the hash with the bytes from slice p
+func (bh *BuzHash) Write(p []byte) (int, error) {
+ for _, b := range p {
+ bh.HashByte(b)
+ }
+
+ return len(p), nil
+}
+
+// Sum appends the (little endian) checksum to b
+func (bh *BuzHash) Sum(b []byte) []byte {
+ var buf bytes.Buffer
+ binary.Write(&buf, binary.LittleEndian, bh.state)
+ hash := buf.Bytes()
+ for _, hb := range hash {
+ b = append(b, hb)
+ }
+
+ return b
+}
+
+func (bh *BuzHash) Reset() {
+ bh.state = 0
+ bh.bufpos = 0
+ bh.overflow = false
+}
+
+func (bh *BuzHash) Size() int {
+ return 4
+}
+
+func (bh *BuzHash) BlockSize() int {
+ return int(bh.n)
+}
+
+func (bh *BuzHash) Sum32() uint32 {
+ return bh.state
+}
diff --git a/hash_test.go b/hash_test.go
new file mode 100644
index 0000000..0f915c9
--- /dev/null
+++ b/hash_test.go
@@ -0,0 +1,92 @@
+package buzhash
+
+import (
+ "fmt"
+ "testing"
+)
+
+// Test the rolling hash property of the buzhash
+func TestRollingHash(t *testing.T) {
+ loremipsum1 := `Lorem ipsum dolor sit amet, consectetuer adipiscing elit.
+Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et
+magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis,
+ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis
+enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In
+enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis
+eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum
+semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu,
+consequat vitae, eleifend ac, enim. Aliquam lorem ante, dapibus in, viverra
+quis, feugiat a, tellus. Phasellus viverra nulla ut metus varius laoreet.
+Quisque rutrum. Aenean imperdiet. Etiam ultricies nisi vel augue. Curabitur
+ullamcorper ultricies nisi. Nam eget dui.
+
+Etiam rhoncus. Maecenas tempus, tellus eget condimentum rhoncus, sem quam semper
+libero, sit amet adipiscing sem neque sed ipsum. Nam quam nunc, blandit vel,
+luctus pulvinar, hendrerit id, lorem. Maecenas nec odio et ante tincidunt
+tempus. Donec vitae sapien ut libero venenatis faucibus. Nullam quis ante. Etiam
+sit amet orci eget eros faucibus tincidunt. Duis leo. Sed fringilla mauris sit
+amet nibh. Donec sodales sagittis magna. Sed consequat, leo eget bibendum
+sodales, augue velit cursus nunc, quis gravida magna mi a libero. Fusce
+vulputate eleifend sapien. Vestibulum purus quam, scelerisque ut, mollis sed,
+nonummy id, metus. Nullam accumsan lorem in dui. Cras ultricies mi eu turpis
+hendrerit fringilla. Vestibulum ante ipsum primis in faucibus orci luctus et
+ultrices posuere cubilia Curae; In ac dui quis mi consectetuer lacinia.`
+
+ loremipsum2 := `Nam pretium turpis et arcu. Duis arcu tortor, suscipit eget,
+imperdiet nec, imperdiet iaculis, ipsum. Sed aliquam ultrices mauris. Integer
+ante arcu, accumsan a, consectetuer eget, posuere ut, mauris. Praesent
+adipiscing. Phasellus ullamcorper ipsum rutrum nunc. Nunc nonummy metus.
+Vestibulum volutpat pretium libero. Cras id dui. Aenean ut eros et nisl sagittis
+vestibulum. Nullam nulla eros, ultricies sit amet, nonummy id, imperdiet
+feugiat, pede. Sed lectus. Donec mollis hendrerit risus. Phasellus nec sem in
+justo pellentesque facilisis. Etiam imperdiet imperdiet orci. Nunc nec neque.
+Phasellus leo dolor, tempus non, auctor et, hendrerit quis, nisi.
+
+Curabitur ligula sapien, tincidunt non, euismod vitae, posuere imperdiet, leo.
+Maecenas malesuada. Praesent congue erat at massa. Sed cursus turpis vitae
+tortor. Donec posuere vulputate arcu. Phasellus accumsan cursus velit.
+Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia
+Curae; Sed aliquam, nisi quis porttitor congue, elit erat euismod orci, ac
+placerat dolor lectus quis orci. Phasellus consectetuer vestibulum elit. Aenean
+tellus metus, bibendum sed, posuere ac, mattis non, nunc. Vestibulum fringilla
+pede sit amet augue. In turpis. Pellentesque posuere. Praesent turpis.`
+
+ phrase1 := "Aenean massa. Cum sociis natoque"
+ phrase2 := "Phasellus leo dolor, tempus non, auctor et, hendrerit quis, nisi"
+
+ hasher1 := NewBuzHash(32)
+ fmt.Fprint(hasher1, phrase1)
+ p1sum := hasher1.Sum32()
+
+ hasher1.Reset()
+ found := false
+ for idx, b := range []byte(loremipsum1) {
+ ssum := hasher1.HashByte(b)
+ if (ssum == p1sum) && (idx-32 == 91) {
+ found = true
+ break
+ }
+ }
+
+ if !found {
+ t.Errorf("Could not find '%s' by its checksum %08x.", phrase1, p1sum)
+ }
+
+ hasher2 := NewBuzHash(64)
+ fmt.Fprint(hasher2, phrase2)
+ p2sum := hasher2.Sum32()
+
+ hasher2.Reset()
+ found = false
+ for idx, b := range []byte(loremipsum2) {
+ ssum := hasher2.HashByte(b)
+ if (ssum == p2sum) && (idx-64 == 592) {
+ found = true
+ break
+ }
+ }
+
+ if !found {
+ t.Errorf("Could not find '%s' by its checksum %08x.", phrase2, p2sum)
+ }
+}