144 lines
4.0 KiB
Go
144 lines
4.0 KiB
Go
package inflation
|
|
|
|
import (
|
|
"strings"
|
|
"unicode"
|
|
)
|
|
|
|
// Matcher resolves raw receipt item names to canonical item names.
//
// This is the "normalization" problem: "1 gal whole milk" and
// "MILK WHL GAL" should both resolve to "milk_whole_1gal".
//
// The strategy is deliberately simple: exact alias lookup, then alias
// containment, then token overlap. It can later be replaced with
// embeddings or a fuzzy search library.
type Matcher struct {
	// aliasMap maps a lowercase alias string to its canonical item name.
	aliasMap map[string]string
}
|
|
|
|
// knownAliases is the seed alias list, keyed by lowercase alias with the
// canonical item name as the value. In production, load these from the
// canonical_items table's aliases column at startup.
var knownAliases = map[string]string{
	// Milk
	"whole milk gallon": "milk_whole_1gal",
	"1 gal whole milk":  "milk_whole_1gal",
	"milk whl gal":      "milk_whole_1gal",
	"whole milk 1gal":   "milk_whole_1gal",
	"milk whole":        "milk_whole_1gal",

	// Eggs
	"large eggs 12ct":    "eggs_large_dozen",
	"eggs large dozen":   "eggs_large_dozen",
	"grade a large eggs": "eggs_large_dozen",
	"eggs lg 12":         "eggs_large_dozen",
	"large eggs":         "eggs_large_dozen",

	// Bread
	"white bread":      "bread_white_loaf",
	"sandwich bread":   "bread_white_loaf",
	"bread loaf":       "bread_white_loaf",
	"white bread loaf": "bread_white_loaf",

	// Ground beef
	"ground beef lb":    "ground_beef_1lb",
	"80/20 ground beef": "ground_beef_1lb",
	"hamburger meat":    "ground_beef_1lb",
	"ground beef":       "ground_beef_1lb",

	// Olive oil
	"olive oil 16oz":         "olive_oil_16oz",
	"extra virgin olive oil": "olive_oil_16oz",
	"evoo 16oz":              "olive_oil_16oz",
	"olive oil":              "olive_oil_16oz",

	// Butter
	"butter salted pound":    "butter_salted_1lb",
	"salted butter 4 sticks": "butter_salted_1lb",
	"salted butter":          "butter_salted_1lb",

	// Chicken
	"boneless chicken breast": "chicken_breast_1lb",
	"chicken breast lb":       "chicken_breast_1lb",
	"chicken breast":          "chicken_breast_1lb",

	// OJ
	"oj 52oz":             "orange_juice_52oz",
	"orange juice carton": "orange_juice_52oz",
	"orange juice":        "orange_juice_52oz",
}
|
|
|
|
func NewMatcher() *Matcher {
|
|
return &Matcher{aliasMap: knownAliases}
|
|
}
|
|
|
|
// Match tries to find a canonical name for a raw receipt string.
|
|
// Returns empty string if no match is found — unmatched items are stored
|
|
// with canonical_name = NULL and can be reviewed/matched later.
|
|
func (m *Matcher) Match(raw string) string {
|
|
normalized := normalize(raw)
|
|
|
|
// 1. Exact alias match (fastest)
|
|
if canonical, ok := m.aliasMap[normalized]; ok {
|
|
return canonical
|
|
}
|
|
|
|
// 2. Substring match — if any alias is contained in the raw text
|
|
for alias, canonical := range m.aliasMap {
|
|
if strings.Contains(normalized, alias) {
|
|
return canonical
|
|
}
|
|
}
|
|
|
|
// 3. Token overlap — split both into words and count shared tokens
|
|
// This catches "MILK WHOLE 1 GAL" matching "whole milk gallon"
|
|
rawTokens := tokenize(normalized)
|
|
bestScore := 0
|
|
bestMatch := ""
|
|
|
|
for alias, canonical := range m.aliasMap {
|
|
aliasTokens := tokenize(alias)
|
|
score := tokenOverlap(rawTokens, aliasTokens)
|
|
// Require matching at least 2 tokens and >50% of alias tokens
|
|
if score >= 2 && score > bestScore && float64(score)/float64(len(aliasTokens)) > 0.5 {
|
|
bestScore = score
|
|
bestMatch = canonical
|
|
}
|
|
}
|
|
|
|
return bestMatch
|
|
}
|
|
|
|
// normalize lowercases s, replaces every rune that is not a letter, a
// digit, or '/' with a space, and collapses runs of whitespace into
// single spaces with no leading or trailing space. The slash is kept so
// ratios such as "80/20" survive as one token.
func normalize(s string) string {
	cleaned := strings.Map(func(r rune) rune {
		if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '/' {
			return r
		}
		return ' '
	}, strings.ToLower(s))

	// Fields + Join trims the ends and squeezes repeated separators.
	return strings.Join(strings.Fields(cleaned), " ")
}
|
|
|
|
// tokenize splits a normalized string on whitespace and returns its
// unique words in order of first appearance. Dropping repeats keeps a
// word that occurs several times from being counted more than once when
// token overlap is scored.
func tokenize(s string) []string {
	fields := strings.Fields(s)
	seen := make(map[string]struct{}, len(fields))

	// Filter in place: fields is a fresh slice owned by this function.
	unique := fields[:0]
	for _, f := range fields {
		if _, dup := seen[f]; dup {
			continue
		}
		seen[f] = struct{}{}
		unique = append(unique, f)
	}
	return unique
}
|
|
|
|
// tokenOverlap returns the number of distinct words shared by a and b.
// A word repeated in either slice is counted once, so the result never
// exceeds the number of distinct words in b.
func tokenOverlap(a, b []string) int {
	remaining := make(map[string]struct{}, len(b))
	for _, t := range b {
		remaining[t] = struct{}{}
	}

	count := 0
	for _, t := range a {
		if _, ok := remaining[t]; ok {
			count++
			// Remove the word so a repeat in a cannot count again.
			delete(remaining, t)
		}
	}
	return count
}
|