package inflation import ( "strings" "unicode" ) // Matcher maps raw receipt item names to canonical item names. // This is the "normalization" problem — "1 gal whole milk" and // "MILK WHL GAL" should both resolve to "milk_whole_1gal". // // Start simple: exact alias lookup + token overlap. // Later: replace with embeddings or a fuzzy search library. type Matcher struct { // aliasMap maps lowercase alias string -> canonical name aliasMap map[string]string } // knownAliases is the seed list. In production, load these from the // canonical_items table's aliases column at startup. var knownAliases = map[string]string{ // Milk "whole milk gallon": "milk_whole_1gal", "1 gal whole milk": "milk_whole_1gal", "milk whl gal": "milk_whole_1gal", "whole milk 1gal": "milk_whole_1gal", "milk whole": "milk_whole_1gal", // Eggs "large eggs 12ct": "eggs_large_dozen", "eggs large dozen": "eggs_large_dozen", "grade a large eggs": "eggs_large_dozen", "eggs lg 12": "eggs_large_dozen", "large eggs": "eggs_large_dozen", // Bread "white bread": "bread_white_loaf", "sandwich bread": "bread_white_loaf", "bread loaf": "bread_white_loaf", "white bread loaf": "bread_white_loaf", // Ground beef "ground beef lb": "ground_beef_1lb", "80/20 ground beef": "ground_beef_1lb", "hamburger meat": "ground_beef_1lb", "ground beef": "ground_beef_1lb", // Olive oil "olive oil 16oz": "olive_oil_16oz", "extra virgin olive oil": "olive_oil_16oz", "evoo 16oz": "olive_oil_16oz", "olive oil": "olive_oil_16oz", // Butter "butter salted pound": "butter_salted_1lb", "salted butter 4 sticks": "butter_salted_1lb", "salted butter": "butter_salted_1lb", // Chicken "boneless chicken breast": "chicken_breast_1lb", "chicken breast lb": "chicken_breast_1lb", "chicken breast": "chicken_breast_1lb", // OJ "oj 52oz": "orange_juice_52oz", "orange juice carton": "orange_juice_52oz", "orange juice": "orange_juice_52oz", } func NewMatcher() *Matcher { return &Matcher{aliasMap: knownAliases} } // Match tries to find a canonical name for a raw receipt string. // Returns empty string if no match is found — unmatched items are stored // with canonical_name = NULL and can be reviewed/matched later. func (m *Matcher) Match(raw string) string { normalized := normalize(raw) // 1. Exact alias match (fastest) if canonical, ok := m.aliasMap[normalized]; ok { return canonical } // 2. Substring match — if any alias is contained in the raw text for alias, canonical := range m.aliasMap { if strings.Contains(normalized, alias) { return canonical } } // 3. Token overlap — split both into words and count shared tokens // This catches "MILK WHOLE 1 GAL" matching "whole milk gallon" rawTokens := tokenize(normalized) bestScore := 0 bestMatch := "" for alias, canonical := range m.aliasMap { aliasTokens := tokenize(alias) score := tokenOverlap(rawTokens, aliasTokens) // Require matching at least 2 tokens and >50% of alias tokens if score >= 2 && score > bestScore && float64(score)/float64(len(aliasTokens)) > 0.5 { bestScore = score bestMatch = canonical } } return bestMatch } // normalize lowercases and strips punctuation/extra whitespace. func normalize(s string) string { s = strings.ToLower(s) var b strings.Builder for _, r := range s { if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '/' { b.WriteRune(r) } else { b.WriteRune(' ') } } return strings.Join(strings.Fields(b.String()), " ") } // tokenize splits a normalized string into unique words. func tokenize(s string) []string { return strings.Fields(s) } // tokenOverlap counts how many words from a appear in b. func tokenOverlap(a, b []string) int { set := make(map[string]bool, len(b)) for _, t := range b { set[t] = true } count := 0 for _, t := range a { if set[t] { count++ } } return count }