initial boilerplate
This commit is contained in:
@@ -0,0 +1,143 @@
|
||||
package inflation
|
||||
|
||||
import (
	"sort"
	"strings"
	"unicode"
)
|
||||
|
||||
// Matcher maps raw receipt item names to canonical item names.
// This is the "normalization" problem — "1 gal whole milk" and
// "MILK WHL GAL" should both resolve to "milk_whole_1gal".
//
// Start simple: exact alias lookup + token overlap.
// Later: replace with embeddings or a fuzzy search library.
type Matcher struct {
	// aliasMap maps a lowercase, normalized alias string to its
	// canonical item name (e.g. "milk whl gal" -> "milk_whole_1gal").
	// Keys are expected to already be in normalize() output form so
	// that exact lookups succeed.
	aliasMap map[string]string
}
|
||||
|
||||
// knownAliases is the seed list. In production, load these from the
// canonical_items table's aliases column at startup.
//
// Invariant: every key must already be in normalized form (lowercase,
// punctuation stripped, single-spaced — i.e. what normalize() emits),
// otherwise the exact-match step in Match can never hit it.
var knownAliases = map[string]string{
	// Milk
	"whole milk gallon": "milk_whole_1gal",
	"1 gal whole milk":  "milk_whole_1gal",
	"milk whl gal":      "milk_whole_1gal",
	"whole milk 1gal":   "milk_whole_1gal",
	"milk whole":        "milk_whole_1gal",

	// Eggs
	"large eggs 12ct":    "eggs_large_dozen",
	"eggs large dozen":   "eggs_large_dozen",
	"grade a large eggs": "eggs_large_dozen",
	"eggs lg 12":         "eggs_large_dozen",
	"large eggs":         "eggs_large_dozen",

	// Bread
	"white bread":      "bread_white_loaf",
	"sandwich bread":   "bread_white_loaf",
	"bread loaf":       "bread_white_loaf",
	"white bread loaf": "bread_white_loaf",

	// Ground beef
	"ground beef lb":    "ground_beef_1lb",
	"80/20 ground beef": "ground_beef_1lb",
	"hamburger meat":    "ground_beef_1lb",
	"ground beef":       "ground_beef_1lb",

	// Olive oil
	"olive oil 16oz":         "olive_oil_16oz",
	"extra virgin olive oil": "olive_oil_16oz",
	"evoo 16oz":              "olive_oil_16oz",
	"olive oil":              "olive_oil_16oz",

	// Butter
	"butter salted pound":    "butter_salted_1lb",
	"salted butter 4 sticks": "butter_salted_1lb",
	"salted butter":          "butter_salted_1lb",

	// Chicken
	"boneless chicken breast": "chicken_breast_1lb",
	"chicken breast lb":       "chicken_breast_1lb",
	"chicken breast":          "chicken_breast_1lb",

	// OJ
	"oj 52oz":             "orange_juice_52oz",
	"orange juice carton": "orange_juice_52oz",
	"orange juice":        "orange_juice_52oz",
}
|
||||
|
||||
func NewMatcher() *Matcher {
|
||||
return &Matcher{aliasMap: knownAliases}
|
||||
}
|
||||
|
||||
// Match tries to find a canonical name for a raw receipt string.
|
||||
// Returns empty string if no match is found — unmatched items are stored
|
||||
// with canonical_name = NULL and can be reviewed/matched later.
|
||||
func (m *Matcher) Match(raw string) string {
|
||||
normalized := normalize(raw)
|
||||
|
||||
// 1. Exact alias match (fastest)
|
||||
if canonical, ok := m.aliasMap[normalized]; ok {
|
||||
return canonical
|
||||
}
|
||||
|
||||
// 2. Substring match — if any alias is contained in the raw text
|
||||
for alias, canonical := range m.aliasMap {
|
||||
if strings.Contains(normalized, alias) {
|
||||
return canonical
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Token overlap — split both into words and count shared tokens
|
||||
// This catches "MILK WHOLE 1 GAL" matching "whole milk gallon"
|
||||
rawTokens := tokenize(normalized)
|
||||
bestScore := 0
|
||||
bestMatch := ""
|
||||
|
||||
for alias, canonical := range m.aliasMap {
|
||||
aliasTokens := tokenize(alias)
|
||||
score := tokenOverlap(rawTokens, aliasTokens)
|
||||
// Require matching at least 2 tokens and >50% of alias tokens
|
||||
if score >= 2 && score > bestScore && float64(score)/float64(len(aliasTokens)) > 0.5 {
|
||||
bestScore = score
|
||||
bestMatch = canonical
|
||||
}
|
||||
}
|
||||
|
||||
return bestMatch
|
||||
}
|
||||
|
||||
// normalize lowercases the input, replaces every character that is not
// a letter, digit, or '/' with a space, and collapses runs of
// whitespace so the result is single-spaced with no leading/trailing
// space.
func normalize(s string) string {
	cleaned := strings.Map(func(r rune) rune {
		if unicode.IsLetter(r) || unicode.IsDigit(r) || r == '/' {
			return r
		}
		return ' '
	}, strings.ToLower(s))
	return strings.Join(strings.Fields(cleaned), " ")
}
|
||||
|
||||
// tokenize splits a normalized string into its unique words, preserving
// first-occurrence order.
//
// The previous implementation returned all fields despite the "unique
// words" contract; duplicate tokens in the raw text then inflated
// tokenOverlap scores in Match (e.g. "milk milk milk" could clear the
// >=2-shared-tokens bar with only one distinct shared word).
func tokenize(s string) []string {
	words := strings.Fields(s)
	seen := make(map[string]bool, len(words))
	// Filter in place: words is freshly allocated, so reusing its
	// backing array is safe.
	out := words[:0]
	for _, w := range words {
		if !seen[w] {
			seen[w] = true
			out = append(out, w)
		}
	}
	return out
}
|
||||
|
||||
// tokenOverlap counts how many words from a appear in b. Each element
// of a is counted individually, so duplicates in a contribute multiple
// times.
func tokenOverlap(a, b []string) int {
	inB := make(map[string]struct{}, len(b))
	for _, tok := range b {
		inB[tok] = struct{}{}
	}

	shared := 0
	for _, tok := range a {
		if _, ok := inB[tok]; ok {
			shared++
		}
	}
	return shared
}
|
||||
Reference in New Issue
Block a user