initial boilerplate

2026-05-03 16:43:53 +03:00
parent bea266e066
commit 2e63e0e95b
18 changed files with 1878 additions and 1 deletions
@@ -0,0 +1,99 @@
+// Package parser handles normalizing raw receipt item names into
+// canonical identifiers that can be compared across submissions.
+//
+// Phase 1: simple rule-based lookup (good enough to ship)
+// Phase 2: fuzzy matching + embeddings (future upgrade)
+package parser
+
+import (
+	"strings"
+)
+
+// entry is a (canonical ID, category) pair describing a normalized item.
+//
+// NOTE(review): nothing in the visible code references entry — knownItems
+// declares its own anonymous struct with the same two fields — so confirm it
+// is used elsewhere in the package before extending it.
+type entry struct {
+	canonical, category string // canonical item ID, then its category label
+}
+
+// knownItems is the canonical item dictionary. Normalize scans it from top to
+// bottom and returns the first rule that matches.
+//
+// Each rule lists lowercase substrings that must ALL appear in the lowercased
+// raw name (AND logic). Because the first match wins, a specific rule must be
+// listed before any broader rule whose keywords it contains: for example
+// "orange juice" before "orange", and "butter"+"unsalted" before "butter".
+// Matching is by plain substring, so a short keyword also hits longer words
+// that contain it (e.g. "egg" is a substring of "eggplant").
+//
+// Expand this as you see patterns in submissions.
+var knownItems = []struct {
+	keywords  []string // ALL must be present (AND logic)
+	canonical string   // canonical identifier returned for a match
+	category  string   // category returned alongside the canonical identifier
+}{
+	{[]string{"milk", "whole"}, "milk_whole_1gal", "dairy"},
+	{[]string{"milk", "2%"}, "milk_2pct_1gal", "dairy"},
+	{[]string{"milk", "skim"}, "milk_skim_1gal", "dairy"},
+	{[]string{"egg"}, "eggs_large_dozen", "dairy"},
+	// "unsalted" must be tested before the bare "butter" fallback.
+	{[]string{"butter", "unsalted"}, "butter_unsalted_1lb", "dairy"},
+	{[]string{"butter"}, "butter_salted_1lb", "dairy"},
+	{[]string{"cheddar"}, "cheese_cheddar_8oz", "dairy"},
+
+	{[]string{"bread", "white"}, "bread_white_loaf", "bakery"},
+	{[]string{"bread", "wheat"}, "bread_wheat_loaf", "bakery"},
+	{[]string{"bread", "sourdough"}, "bread_sourdough_loaf", "bakery"},
+
+	{[]string{"ground beef"}, "ground_beef_1lb", "meat"},
+	{[]string{"chicken breast"}, "chicken_breast_1lb", "meat"},
+	{[]string{"salmon"}, "salmon_fillet_1lb", "seafood"},
+
+	{[]string{"apple"}, "apples_bag", "produce"},
+	{[]string{"banana"}, "bananas_1lb", "produce"},
+	// "orange juice" contains "orange", so it has to be listed ahead of the
+	// bare "orange" rule; otherwise juice is normalized to a bag of oranges.
+	{[]string{"orange juice"}, "orange_juice_52oz", "beverages"},
+	{[]string{"orange"}, "oranges_bag", "produce"},
+	{[]string{"tomato"}, "tomatoes_1lb", "produce"},
+	{[]string{"potato"}, "potatoes_5lb", "produce"},
+	{[]string{"onion"}, "onions_3lb", "produce"},
+	{[]string{"garlic"}, "garlic_head", "produce"},
+	{[]string{"spinach"}, "spinach_5oz", "produce"},
+	{[]string{"broccoli"}, "broccoli_head", "produce"},
+
+	{[]string{"olive oil"}, "olive_oil_16oz", "pantry"},
+	{[]string{"vegetable oil"}, "vegetable_oil_48oz", "pantry"},
+	{[]string{"flour", "all-purpose"}, "flour_allpurpose_5lb", "pantry"},
+	// Both sugar rules map to the same canonical item; the bare "sugar" rule
+	// is the fallback for names that do not mention "white".
+	{[]string{"sugar", "white"}, "sugar_white_4lb", "pantry"},
+	{[]string{"sugar"}, "sugar_white_4lb", "pantry"},
+	{[]string{"salt"}, "salt_table_26oz", "pantry"},
+	{[]string{"rice"}, "rice_white_2lb", "pantry"},
+	{[]string{"pasta"}, "pasta_spaghetti_1lb", "pantry"},
+	{[]string{"coffee"}, "coffee_ground_12oz", "pantry"},
+	{[]string{"water", "gallon"}, "water_gallon", "beverages"},
+}
+
+// Normalize maps a raw receipt item name to a canonical identifier and its
+// category.
+//
+// The name is lowercased and knownItems is scanned in order; the first rule
+// whose keywords all occur in the lowercased name decides the result. The
+// returned pointers refer to fresh copies, never to the table entries.
+//
+// Returns (nil, nil) if no rule matches — the item is stored raw only.
+func Normalize(rawName string) (*string, *string) {
+	needle := strings.ToLower(rawName)
+
+	for i := range knownItems {
+		item := knownItems[i]
+		if !matchesAll(needle, item.keywords) {
+			continue
+		}
+		// Copy into locals so callers cannot mutate knownItems through
+		// the returned pointers.
+		canonical, category := item.canonical, item.category
+		return &canonical, &category
+	}
+	return nil, nil
+}
+
+// matchesAll reports whether every keyword occurs in s as a substring.
+//
+// The comparison is case-sensitive, so callers lowercase s beforehand when
+// the keywords are lowercase. An empty or nil keyword list matches any s.
+func matchesAll(s string, keywords []string) bool {
+	for i := range keywords {
+		if strings.Contains(s, keywords[i]) {
+			continue
+		}
+		// First missing keyword settles it; no need to check the rest.
+		return false
+	}
+	return true
+}
+
+// ── Future: fuzzy matching ─────────────────────────────────────────────────────
+//
+// When the rule list grows unwieldy, replace Normalize with an embeddings
+// approach: encode the raw name with a sentence transformer, find the nearest
+// canonical item by cosine similarity. The go-faiss library or a simple
+// Postgres pgvector extension both work well for this.
+//
+// For now, ship the rule-based version. Add to knownItems as you see misses
+// in production by querying: SELECT raw_name, COUNT(*) FROM line_items
+// WHERE canonical_name IS NULL GROUP BY raw_name ORDER BY count DESC;