Documentation
¶
Overview ¶
Package minhashlsh implements Locality Sensitive Hashing using MinHash signatures
Index ¶
- type Minhash
- type MinhashLSH
- func NewMinhashLSH[T comparable](numHash int, threshold float64, initSize int) *MinhashLSH[T]
- func NewMinhashLSH16[T comparable](numHash int, threshold float64, initSize int) *MinhashLSH[T]
- func NewMinhashLSH32[T comparable](numHash int, threshold float64, initSize int) *MinhashLSH[T]
- func NewMinhashLSH64[T comparable](numHash int, threshold float64, initSize int) *MinhashLSH[T]
- func NewMinhashLSHWithDefaults[T comparable](initSize int) *MinhashLSH[T]
- type MinhashLSHMap
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Minhash ¶
type Minhash struct {
// contains filtered or unexported fields
}
Minhash represents a MinHash object
func NewMinhash ¶
NewMinhash initializes a MinHash object with a seed and the number of hash functions.
func NewMinhashWithDefaults ¶
func NewMinhashWithDefaults() *Minhash
NewMinhashWithDefaults initializes a MinHash object with default seed and number of hashes. Matches Python implementation: https://github.com/ekzhu/datasketch/blob/5512549871d29c55b3cd8e99a79fcbd14859b77d/datasketch/minhash.py#L69C9-L69C17
func (*Minhash) Merge ¶
Merge combines the signature of the other Minhash with this one, making this one carry the signature of the union.
type MinhashLSH ¶
type MinhashLSH[T comparable] struct {
// contains filtered or unexported fields
}
MinhashLSH represents a MinHash LSH implemented using LSH Forest (http://ilpubs.stanford.edu:8090/678/1/2005-14.pdf). It supports query-time setting of the MinHash LSH parameters L (number of bands) and K (number of hash functions per band).
func NewMinhashLSH ¶
func NewMinhashLSH[T comparable](numHash int, threshold float64, initSize int) *MinhashLSH[T]
NewMinhashLSH is the default constructor. It uses 32-bit hash values and pre-allocates the hash tables.
func NewMinhashLSH16 ¶
func NewMinhashLSH16[T comparable](numHash int, threshold float64, initSize int) *MinhashLSH[T]
NewMinhashLSH16 uses 16-bit hash values and pre-allocation of hash tables. MinHash signatures with 64- or 32-bit hash values will have their hash values trimmed.
func NewMinhashLSH32 ¶
func NewMinhashLSH32[T comparable](numHash int, threshold float64, initSize int) *MinhashLSH[T]
NewMinhashLSH32 uses 32-bit hash values and pre-allocation of hash tables. MinHash signatures with 64-bit hash values will have their hash values trimmed.
func NewMinhashLSH64 ¶
func NewMinhashLSH64[T comparable](numHash int, threshold float64, initSize int) *MinhashLSH[T]
NewMinhashLSH64 uses 64-bit hash values and pre-allocation of hash tables.
func NewMinhashLSHWithDefaults ¶
func NewMinhashLSHWithDefaults[T comparable](initSize int) *MinhashLSH[T]
NewMinhashLSHWithDefaults is the default constructor with default parameters
func (*MinhashLSH[T]) Add ¶
func (f *MinhashLSH[T]) Add(key T, sig []uint64)
Add inserts a key with its MinHash signature into the index. The key won't be searchable until Index() is called.
func (*MinhashLSH[T]) Index ¶
func (f *MinhashLSH[T]) Index()
Index makes all the keys added searchable.
func (*MinhashLSH[T]) Params ¶
func (f *MinhashLSH[T]) Params() (k, l int)
Params returns the LSH parameters k (number of hash functions per band) and l (number of bands).
func (*MinhashLSH[T]) Query ¶
func (f *MinhashLSH[T]) Query(sig []uint64) []T
Query returns candidate keys given the query signature.
type MinhashLSHMap ¶
type MinhashLSHMap[T comparable] struct {
// contains filtered or unexported fields
}
MinhashLSHMap is a map-backed Minhash LSH optimized for interleaved Add/Query workloads. Entries are immediately searchable after Add() with O(1) insert and O(1) lookup per band — no sorting or explicit Index() call required.
Example ¶
package main
import (
"fmt"
minhashlsh "github.com/stillmatic/minhash-lsh"
)
// newsItem is one article in the example corpus.
type newsItem struct {
	// URL is used as the item's key in the LSH index.
	// Description is the text that gets hashed into a MinHash signature.
	URL, Description string
}
// main indexes a handful of news items by URL in a map-backed MinHash LSH,
// then queries the index with each item's own signature and collects the
// URLs of every other item returned as a candidate match.
func main() {
	newsItems := []newsItem{
		{URL: "https://example.com/1", Description: "This is a test"},
		{URL: "https://example.com/2", Description: "This is another test"},
		{URL: "https://example.com/3", Description: "This is a test"},
	}

	// Keys are URLs, so the index is instantiated with the string type.
	// NOTE(review): the index is built with 88 hash functions while the
	// signatures come from NewMinhashWithDefaults — confirm the default
	// signature length is compatible with this setting.
	index := minhashlsh.NewMinhashLSHMapWithSize[string](88, 0.7, len(newsItems))

	// signatureOf builds a fresh MinHash for a description and returns its
	// signature; both the insert and the query phase need exactly this.
	signatureOf := func(description string) []uint64 {
		hasher := minhashlsh.NewMinhashWithDefaults()
		hasher.Push([]byte(description))
		return hasher.Signature()
	}

	for i := range newsItems {
		index.Add(newsItems[i].URL, signatureOf(newsItems[i].Description))
	}
	// The map backend needs no explicit index-building step before querying.

	// Collect the keys of entries that duplicate an earlier item.
	dupeKeys := map[string]struct{}{}
	for i := range newsItems {
		current := newsItems[i]
		if _, flagged := dupeKeys[current.URL]; flagged {
			// Already reported as a duplicate of an earlier item.
			continue
		}
		for _, candidate := range index.Query(signatureOf(current.Description)) {
			if candidate == current.URL {
				// An item always matches itself; that is not a duplicate.
				continue
			}
			dupeKeys[candidate] = struct{}{}
		}
	}

	// Expect exactly one duplicate key to remove.
	fmt.Println(dupeKeys)
}
Output:
func NewMinhashLSHMap ¶
func NewMinhashLSHMap[T comparable](numHash int, threshold float64) *MinhashLSHMap[T]
func NewMinhashLSHMapWithSize ¶
func NewMinhashLSHMapWithSize[T comparable](numHash int, threshold float64, initSize int) *MinhashLSHMap[T]
func (*MinhashLSHMap[T]) Add ¶
func (f *MinhashLSHMap[T]) Add(key T, sig []uint64)
func (*MinhashLSHMap[T]) Query ¶
func (f *MinhashLSHMap[T]) Query(sig []uint64) []T
Query returns candidate keys given the query signature.