// Package parser handles normalizing raw receipt item names into // canonical identifiers that can be compared across submissions. // // Phase 1: simple rule-based lookup (good enough to ship) // Phase 2: fuzzy matching + embeddings (future upgrade) package parser import ( "strings" ) // entry maps keywords (found in the raw name) to a canonical ID and category. type entry struct { canonical string category string } // knownItems is the canonical item dictionary. // Key: lowercase substring that must appear in the raw name. // Expand this as you see patterns in submissions. var knownItems = []struct { keywords []string // ALL must be present (AND logic) canonical string category string }{ {[]string{"milk", "whole"}, "milk_whole_1gal", "dairy"}, {[]string{"milk", "2%"}, "milk_2pct_1gal", "dairy"}, {[]string{"milk", "skim"}, "milk_skim_1gal", "dairy"}, {[]string{"egg"}, "eggs_large_dozen", "dairy"}, {[]string{"butter", "unsalted"}, "butter_unsalted_1lb", "dairy"}, {[]string{"butter"}, "butter_salted_1lb", "dairy"}, {[]string{"cheddar"}, "cheese_cheddar_8oz", "dairy"}, {[]string{"bread", "white"}, "bread_white_loaf", "bakery"}, {[]string{"bread", "wheat"}, "bread_wheat_loaf", "bakery"}, {[]string{"bread", "sourdough"}, "bread_sourdough_loaf", "bakery"}, {[]string{"ground beef"}, "ground_beef_1lb", "meat"}, {[]string{"chicken breast"}, "chicken_breast_1lb", "meat"}, {[]string{"salmon"}, "salmon_fillet_1lb", "seafood"}, {[]string{"apple"}, "apples_bag", "produce"}, {[]string{"banana"}, "bananas_1lb", "produce"}, {[]string{"orange"}, "oranges_bag", "produce"}, {[]string{"tomato"}, "tomatoes_1lb", "produce"}, {[]string{"potato"}, "potatoes_5lb", "produce"}, {[]string{"onion"}, "onions_3lb", "produce"}, {[]string{"garlic"}, "garlic_head", "produce"}, {[]string{"spinach"}, "spinach_5oz", "produce"}, {[]string{"broccoli"}, "broccoli_head", "produce"}, {[]string{"olive oil"}, "olive_oil_16oz", "pantry"}, {[]string{"vegetable oil"}, "vegetable_oil_48oz", "pantry"}, {[]string{"flour", "all-purpose"}, "flour_allpurpose_5lb", "pantry"}, {[]string{"sugar", "white"}, "sugar_white_4lb", "pantry"}, {[]string{"sugar"}, "sugar_white_4lb", "pantry"}, {[]string{"salt"}, "salt_table_26oz", "pantry"}, {[]string{"rice"}, "rice_white_2lb", "pantry"}, {[]string{"pasta"}, "pasta_spaghetti_1lb", "pantry"}, {[]string{"coffee"}, "coffee_ground_12oz", "pantry"}, {[]string{"orange juice"}, "orange_juice_52oz", "beverages"}, {[]string{"water", "gallon"}, "water_gallon", "beverages"}, } // Normalize attempts to map a raw item name to a canonical identifier. // Returns (nil, nil) if no match is found — the item is stored raw only. func Normalize(rawName string) (*string, *string) { lower := strings.ToLower(rawName) for _, rule := range knownItems { if matchesAll(lower, rule.keywords) { c := rule.canonical cat := rule.category return &c, &cat } } return nil, nil } // matchesAll returns true if s contains every keyword in the list. func matchesAll(s string, keywords []string) bool { for _, kw := range keywords { if !strings.Contains(s, kw) { return false } } return true } // ── Future: fuzzy matching ───────────────────────────────────────────────────── // // When the rule list grows unwieldy, replace Normalize with an embeddings // approach: encode the raw name with a sentence transformer, find the nearest // canonical item by cosine similarity. The go-faiss library or a simple // Postgres pgvector extension both work well for this. // // For now, ship the rule-based version. Add to knownItems as you see misses // in production by querying: SELECT raw_name, COUNT(*) FROM line_items // WHERE canonical_name IS NULL GROUP BY raw_name ORDER BY count DESC;