Files
Deflated/internal/inflation/matcher.go
T
2026-05-03 16:43:53 +03:00

144 lines
4.0 KiB
Go

package inflation
import (
"strings"
"unicode"
)
// Matcher resolves raw receipt item names to canonical item names.
//
// Receipts spell the same product many ways: "1 gal whole milk" and
// "MILK WHL GAL" should both end up as "milk_whole_1gal". A Matcher owns
// the alias table used for that normalization; the resolution strategy
// itself lives in Match.
//
// The current approach is deliberately simple (alias lookups plus token
// overlap). It may later be replaced with embeddings or a fuzzy search
// library.
type Matcher struct {
	aliasMap map[string]string // lowercase alias -> canonical item name
}
// knownAliases is the built-in seed table of alias -> canonical item name.
// Keys are written in already-normalized form (lowercase, single spaces).
// In production these are meant to be loaded at startup from the aliases
// column of the canonical_items table instead.
var knownAliases = map[string]string{
	// Whole milk, one gallon.
	"whole milk gallon": "milk_whole_1gal",
	"1 gal whole milk":  "milk_whole_1gal",
	"milk whl gal":      "milk_whole_1gal",
	"whole milk 1gal":   "milk_whole_1gal",
	"milk whole":        "milk_whole_1gal",

	// Large eggs, one dozen.
	"large eggs 12ct":    "eggs_large_dozen",
	"eggs large dozen":   "eggs_large_dozen",
	"grade a large eggs": "eggs_large_dozen",
	"eggs lg 12":         "eggs_large_dozen",
	"large eggs":         "eggs_large_dozen",

	// White bread, one loaf.
	"white bread":      "bread_white_loaf",
	"sandwich bread":   "bread_white_loaf",
	"bread loaf":       "bread_white_loaf",
	"white bread loaf": "bread_white_loaf",

	// Ground beef, one pound.
	"ground beef lb":    "ground_beef_1lb",
	"80/20 ground beef": "ground_beef_1lb",
	"hamburger meat":    "ground_beef_1lb",
	"ground beef":       "ground_beef_1lb",

	// Olive oil, 16 oz.
	"olive oil 16oz":         "olive_oil_16oz",
	"extra virgin olive oil": "olive_oil_16oz",
	"evoo 16oz":              "olive_oil_16oz",
	"olive oil":              "olive_oil_16oz",

	// Salted butter, one pound.
	"butter salted pound":    "butter_salted_1lb",
	"salted butter 4 sticks": "butter_salted_1lb",
	"salted butter":          "butter_salted_1lb",

	// Chicken breast, one pound.
	"boneless chicken breast": "chicken_breast_1lb",
	"chicken breast lb":       "chicken_breast_1lb",
	"chicken breast":          "chicken_breast_1lb",

	// Orange juice, 52 oz.
	"oj 52oz":             "orange_juice_52oz",
	"orange juice carton": "orange_juice_52oz",
	"orange juice":        "orange_juice_52oz",
}
// NewMatcher returns a Matcher backed by the built-in knownAliases table.
// The table is shared rather than copied, so later changes to knownAliases
// are visible through the returned Matcher.
func NewMatcher() *Matcher {
	m := new(Matcher)
	m.aliasMap = knownAliases
	return m
}
// Match resolves a raw receipt string to a canonical item name.
//
// The input is normalized and then tried against the alias table in three
// stages, from strictest to loosest:
//
//  1. Exact match on the whole normalized string.
//  2. Phrase match: an alias occurs in the input as a run of whole words.
//     The longest such alias wins.
//  3. Token overlap: the alias sharing the most tokens with the input wins,
//     provided at least 2 tokens are shared and they cover more than half
//     of the alias's tokens. This catches "MILK WHOLE 1 GAL" matching
//     "whole milk gallon".
//
// Ties in stages 2 and 3 are broken deterministically, so the same input
// always produces the same result regardless of map iteration order.
//
// Match returns the empty string if no match is found — unmatched items are
// stored with canonical_name = NULL and can be reviewed/matched later.
func (m *Matcher) Match(raw string) string {
	normalized := normalize(raw)

	// 1. Exact alias match (fastest).
	if canonical, ok := m.aliasMap[normalized]; ok {
		return canonical
	}

	// 2. Phrase match. Both sides are padded with spaces so an alias only
	// matches on word boundaries: "unsalted butter" must not resolve via the
	// alias "salted butter". Every alias is examined (rather than returning
	// the first hit) because map iteration order is random; the longest alias
	// is the most specific, and equal lengths fall back to alphabetical order.
	padded := " " + normalized + " "
	phraseAlias, phraseMatch := "", ""
	for alias, canonical := range m.aliasMap {
		if alias == "" || !strings.Contains(padded, " "+alias+" ") {
			continue
		}
		if len(alias) > len(phraseAlias) ||
			(len(alias) == len(phraseAlias) && alias < phraseAlias) {
			phraseAlias, phraseMatch = alias, canonical
		}
	}
	if phraseAlias != "" {
		return phraseMatch
	}

	// 3. Token overlap — split both into words and count shared tokens.
	rawTokens := tokenize(normalized)
	var (
		bestScore int    // shared-token count of the current winner
		bestSize  int    // number of tokens in the winning alias
		bestAlias string // winning alias, used only as the final tie-break
		bestMatch string // canonical name of the winner
	)
	for alias, canonical := range m.aliasMap {
		aliasTokens := tokenize(alias)
		size := len(aliasTokens)
		score := tokenOverlap(rawTokens, aliasTokens)

		// Require at least 2 shared tokens covering more than 50% of the
		// alias (2*score > size is score/size > 0.5 without floats).
		if score < 2 || 2*score <= size {
			continue
		}

		// Prefer more shared tokens; on a tie prefer the shorter alias
		// (higher coverage); on a further tie prefer the alphabetically
		// smaller alias so the outcome never depends on iteration order.
		better := score > bestScore ||
			(score == bestScore && size < bestSize) ||
			(score == bestScore && size == bestSize && alias < bestAlias)
		if better {
			bestScore, bestSize = score, size
			bestAlias, bestMatch = alias, canonical
		}
	}
	return bestMatch
}
// normalize puts a raw string into the form used for alias keys: lowercase
// words separated by single spaces. Every rune that is not a letter, a digit,
// or '/' acts as a word separator; leading, trailing, and repeated separators
// collapse away. '/' is kept so tokens such as "80/20" stay intact.
func normalize(s string) string {
	isSeparator := func(r rune) bool {
		keep := unicode.IsLetter(r) || unicode.IsDigit(r) || r == '/'
		return !keep
	}
	words := strings.FieldsFunc(strings.ToLower(s), isSeparator)
	return strings.Join(words, " ")
}
// tokenize splits a normalized string on whitespace and returns its unique
// words in order of first appearance. Duplicates are dropped so that a
// repeated word (e.g. "milk milk") cannot be counted more than once when
// token sets are compared.
func tokenize(s string) []string {
	fields := strings.Fields(s)
	if len(fields) < 2 {
		// Zero or one word cannot contain a duplicate.
		return fields
	}

	seen := make(map[string]struct{}, len(fields))
	// Filter in place: fields was freshly allocated by strings.Fields, so
	// reusing its backing array is safe.
	unique := fields[:0]
	for _, word := range fields {
		if _, dup := seen[word]; dup {
			continue
		}
		seen[word] = struct{}{}
		unique = append(unique, word)
	}
	return unique
}
// tokenOverlap returns the number of elements of a that also occur in b.
// Each element of a is checked on its own, so a word repeated in a is
// counted once per repetition.
func tokenOverlap(a, b []string) int {
	inB := make(map[string]struct{}, len(b))
	for i := range b {
		inB[b[i]] = struct{}{}
	}

	shared := 0
	for i := range a {
		if _, ok := inB[a[i]]; ok {
			shared++
		}
	}
	return shared
}