initial boilerplate
This commit is contained in:
@@ -0,0 +1,143 @@
|
||||
package inflation
|
||||
|
||||
import (
	"sort"
	"strings"
	"unicode"
)
|
||||
|
||||
// Matcher maps raw receipt item names to canonical item names.
// This is the "normalization" problem — "1 gal whole milk" and
// "MILK WHL GAL" should both resolve to "milk_whole_1gal".
//
// Start simple: exact alias lookup + token overlap.
// Later: replace with embeddings or a fuzzy search library.
type Matcher struct {
	// aliasMap maps a lowercase, normalized alias string to its
	// canonical item name (e.g. "milk whl gal" -> "milk_whole_1gal").
	// Keys are expected to already be in normalize() output form so
	// that exact lookups succeed.
	aliasMap map[string]string
}
|
||||
|
||||
// knownAliases is the seed list. In production, load these from the
// canonical_items table's aliases column at startup.
//
// Invariant: every key must already be in normalized form (lowercase,
// punctuation stripped, single-spaced — i.e. what normalize() emits),
// otherwise the exact-match step in Match can never hit it.
var knownAliases = map[string]string{
	// Milk
	"whole milk gallon": "milk_whole_1gal",
	"1 gal whole milk":  "milk_whole_1gal",
	"milk whl gal":      "milk_whole_1gal",
	"whole milk 1gal":   "milk_whole_1gal",
	"milk whole":        "milk_whole_1gal",

	// Eggs
	"large eggs 12ct":    "eggs_large_dozen",
	"eggs large dozen":   "eggs_large_dozen",
	"grade a large eggs": "eggs_large_dozen",
	"eggs lg 12":         "eggs_large_dozen",
	"large eggs":         "eggs_large_dozen",

	// Bread
	"white bread":      "bread_white_loaf",
	"sandwich bread":   "bread_white_loaf",
	"bread loaf":       "bread_white_loaf",
	"white bread loaf": "bread_white_loaf",

	// Ground beef
	"ground beef lb":    "ground_beef_1lb",
	"80/20 ground beef": "ground_beef_1lb",
	"hamburger meat":    "ground_beef_1lb",
	"ground beef":       "ground_beef_1lb",

	// Olive oil
	"olive oil 16oz":         "olive_oil_16oz",
	"extra virgin olive oil": "olive_oil_16oz",
	"evoo 16oz":              "olive_oil_16oz",
	"olive oil":              "olive_oil_16oz",

	// Butter
	"butter salted pound":    "butter_salted_1lb",
	"salted butter 4 sticks": "butter_salted_1lb",
	"salted butter":          "butter_salted_1lb",

	// Chicken
	"boneless chicken breast": "chicken_breast_1lb",
	"chicken breast lb":       "chicken_breast_1lb",
	"chicken breast":          "chicken_breast_1lb",

	// OJ
	"oj 52oz":             "orange_juice_52oz",
	"orange juice carton": "orange_juice_52oz",
	"orange juice":        "orange_juice_52oz",
}
|
||||
|
||||
func NewMatcher() *Matcher {
|
||||
return &Matcher{aliasMap: knownAliases}
|
||||
}
|
||||
|
||||
// Match tries to find a canonical name for a raw receipt string.
|
||||
// Returns empty string if no match is found — unmatched items are stored
|
||||
// with canonical_name = NULL and can be reviewed/matched later.
|
||||
func (m *Matcher) Match(raw string) string {
|
||||
normalized := normalize(raw)
|
||||
|
||||
// 1. Exact alias match (fastest)
|
||||
if canonical, ok := m.aliasMap[normalized]; ok {
|
||||
return canonical
|
||||
}
|
||||
|
||||
// 2. Substring match — if any alias is contained in the raw text
|
||||
for alias, canonical := range m.aliasMap {
|
||||
if strings.Contains(normalized, alias) {
|
||||
return canonical
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Token overlap — split both into words and count shared tokens
|
||||
// This catches "MILK WHOLE 1 GAL" matching "whole milk gallon"
|
||||
rawTokens := tokenize(normalized)
|
||||
bestScore := 0
|
||||
bestMatch := ""
|
||||
|
||||
for alias, canonical := range m.aliasMap {
|
||||
aliasTokens := tokenize(alias)
|
||||
score := tokenOverlap(rawTokens, aliasTokens)
|
||||
// Require matching at least 2 tokens and >50% of alias tokens
|
||||
if score >= 2 && score > bestScore && float64(score)/float64(len(aliasTokens)) > 0.5 {
|
||||
bestScore = score
|
||||
bestMatch = canonical
|
||||
}
|
||||
}
|
||||
|
||||
return bestMatch
|
||||
}
|
||||
|
||||
// normalize lowercases the input, replaces every character that is not
// a letter, digit, or '/' with a space, and collapses runs of
// whitespace so the result is single-spaced with no leading/trailing
// space.
func normalize(s string) string {
	cleaned := strings.Map(func(r rune) rune {
		if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '/' {
			return r
		}
		return ' '
	}, strings.ToLower(s))
	return strings.Join(strings.Fields(cleaned), " ")
}
|
||||
|
||||
// tokenize splits a normalized string into its unique words, preserving
// first-occurrence order.
//
// The previous implementation returned all fields despite the "unique
// words" contract; duplicate tokens in the raw text then inflated
// tokenOverlap scores in Match (e.g. "milk milk milk" could clear the
// >=2-shared-tokens bar with only one distinct shared word).
func tokenize(s string) []string {
	words := strings.Fields(s)
	seen := make(map[string]bool, len(words))
	// Filter in place: words is freshly allocated, so reusing its
	// backing array is safe.
	out := words[:0]
	for _, w := range words {
		if !seen[w] {
			seen[w] = true
			out = append(out, w)
		}
	}
	return out
}
|
||||
|
||||
// tokenOverlap counts how many words from a appear in b. Each element
// of a is counted individually, so duplicates in a contribute multiple
// times.
func tokenOverlap(a, b []string) int {
	inB := make(map[string]struct{}, len(b))
	for _, tok := range b {
		inB[tok] = struct{}{}
	}

	shared := 0
	for _, tok := range a {
		if _, ok := inB[tok]; ok {
			shared++
		}
	}
	return shared
}
|
||||
Reference in New Issue
Block a user