initial boilerplate
This commit is contained in:
@@ -0,0 +1,143 @@
|
||||
package inflation
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// Matcher resolves the free-form item text printed on a receipt to a
// canonical item name, so that differently worded lines such as
// "1 gal whole milk" and "MILK WHL GAL" both become "milk_whole_1gal".
//
// The strategy is deliberately simple for now: an alias table consulted by
// exact lookup, then substring containment, then token overlap. It can be
// swapped for embeddings or a fuzzy-search library later.
type Matcher struct {
	// aliasMap is keyed by lowercase alias text; each value is the
	// canonical item name that alias stands for.
	aliasMap map[string]string
}
|
||||
|
||||
// knownAliases is the built-in seed table of alias -> canonical item name.
// Keys are written already lowercased and space-separated so they can be
// compared directly against normalized receipt text. In production these
// should instead be loaded at startup from the aliases column of the
// canonical_items table.
var knownAliases = map[string]string{
	// Whole milk
	"milk whole":        "milk_whole_1gal",
	"milk whl gal":      "milk_whole_1gal",
	"whole milk 1gal":   "milk_whole_1gal",
	"1 gal whole milk":  "milk_whole_1gal",
	"whole milk gallon": "milk_whole_1gal",

	// Large eggs
	"large eggs":         "eggs_large_dozen",
	"eggs lg 12":         "eggs_large_dozen",
	"large eggs 12ct":    "eggs_large_dozen",
	"eggs large dozen":   "eggs_large_dozen",
	"grade a large eggs": "eggs_large_dozen",

	// White bread
	"bread loaf":       "bread_white_loaf",
	"white bread":      "bread_white_loaf",
	"sandwich bread":   "bread_white_loaf",
	"white bread loaf": "bread_white_loaf",

	// Ground beef
	"ground beef":       "ground_beef_1lb",
	"ground beef lb":    "ground_beef_1lb",
	"hamburger meat":    "ground_beef_1lb",
	"80/20 ground beef": "ground_beef_1lb",

	// Olive oil
	"olive oil":              "olive_oil_16oz",
	"evoo 16oz":              "olive_oil_16oz",
	"olive oil 16oz":         "olive_oil_16oz",
	"extra virgin olive oil": "olive_oil_16oz",

	// Salted butter
	"salted butter":          "butter_salted_1lb",
	"butter salted pound":    "butter_salted_1lb",
	"salted butter 4 sticks": "butter_salted_1lb",

	// Chicken breast
	"chicken breast":          "chicken_breast_1lb",
	"chicken breast lb":       "chicken_breast_1lb",
	"boneless chicken breast": "chicken_breast_1lb",

	// Orange juice
	"oj 52oz":             "orange_juice_52oz",
	"orange juice":        "orange_juice_52oz",
	"orange juice carton": "orange_juice_52oz",
}
|
||||
|
||||
func NewMatcher() *Matcher {
|
||||
return &Matcher{aliasMap: knownAliases}
|
||||
}
|
||||
|
||||
// Match tries to find a canonical name for a raw receipt string.
|
||||
// Returns empty string if no match is found — unmatched items are stored
|
||||
// with canonical_name = NULL and can be reviewed/matched later.
|
||||
func (m *Matcher) Match(raw string) string {
|
||||
normalized := normalize(raw)
|
||||
|
||||
// 1. Exact alias match (fastest)
|
||||
if canonical, ok := m.aliasMap[normalized]; ok {
|
||||
return canonical
|
||||
}
|
||||
|
||||
// 2. Substring match — if any alias is contained in the raw text
|
||||
for alias, canonical := range m.aliasMap {
|
||||
if strings.Contains(normalized, alias) {
|
||||
return canonical
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Token overlap — split both into words and count shared tokens
|
||||
// This catches "MILK WHOLE 1 GAL" matching "whole milk gallon"
|
||||
rawTokens := tokenize(normalized)
|
||||
bestScore := 0
|
||||
bestMatch := ""
|
||||
|
||||
for alias, canonical := range m.aliasMap {
|
||||
aliasTokens := tokenize(alias)
|
||||
score := tokenOverlap(rawTokens, aliasTokens)
|
||||
// Require matching at least 2 tokens and >50% of alias tokens
|
||||
if score >= 2 && score > bestScore && float64(score)/float64(len(aliasTokens)) > 0.5 {
|
||||
bestScore = score
|
||||
bestMatch = canonical
|
||||
}
|
||||
}
|
||||
|
||||
return bestMatch
|
||||
}
|
||||
|
||||
// normalize lowercases s, replaces every rune that is not a letter, a digit
// or '/' with a separator, and collapses runs of separators into a single
// space. The result has no leading or trailing space.
//
// '/' is kept so that ratios such as "80/20" survive as a single token.
//
// The work is done in one pass into a pre-sized builder, avoiding the
// intermediate lowercase copy and the split/join round trip.
func normalize(s string) string {
	var b strings.Builder
	b.Grow(len(s))

	// pendingSpace records that at least one separator was seen since the
	// last kept rune; the space is only emitted once another kept rune
	// follows, which trims both ends and collapses repeats.
	pendingSpace := false
	for _, r := range s {
		r = unicode.ToLower(r)
		if !unicode.IsLetter(r) && !unicode.IsDigit(r) && r != '/' {
			pendingSpace = true
			continue
		}
		if pendingSpace && b.Len() > 0 {
			b.WriteByte(' ')
		}
		pendingSpace = false
		b.WriteRune(r)
	}
	return b.String()
}
|
||||
|
||||
// tokenize splits a normalized string on whitespace and returns its unique
// words in order of first appearance.
//
// Duplicates are dropped so that a word repeated on a receipt line cannot be
// counted more than once when tokens are compared against an alias.
func tokenize(s string) []string {
	fields := strings.Fields(s)
	seen := make(map[string]struct{}, len(fields))

	// Filter in place: fields is freshly allocated, so reusing its backing
	// array is safe.
	unique := fields[:0]
	for _, word := range fields {
		if _, dup := seen[word]; dup {
			continue
		}
		seen[word] = struct{}{}
		unique = append(unique, word)
	}
	return unique
}
|
||||
|
||||
// tokenOverlap returns the number of distinct words that appear in both a
// and b.
//
// Each shared word is counted once even if it is repeated in a, so the
// result never exceeds the number of distinct words in b.
func tokenOverlap(a, b []string) int {
	remaining := make(map[string]struct{}, len(b))
	for _, word := range b {
		remaining[word] = struct{}{}
	}

	count := 0
	for _, word := range a {
		if _, shared := remaining[word]; shared {
			count++
			// Remove the word so a repeat in a is not counted again.
			delete(remaining, word)
		}
	}
	return count
}
|
||||
@@ -0,0 +1,75 @@
|
||||
package inflation
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// This is your first Go test file!
|
||||
// Run with: go test ./internal/inflation/...
|
||||
// Go's testing package is built-in — no extra library needed.
|
||||
|
||||
func TestMatcher_ExactMatch(t *testing.T) {
|
||||
m := NewMatcher()
|
||||
|
||||
tests := []struct {
|
||||
raw string
|
||||
expected string
|
||||
}{
|
||||
{"whole milk gallon", "milk_whole_1gal"},
|
||||
{"large eggs 12ct", "eggs_large_dozen"},
|
||||
{"white bread", "bread_white_loaf"},
|
||||
{"ground beef", "ground_beef_1lb"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.raw, func(t *testing.T) {
|
||||
got := m.Match(tt.raw)
|
||||
if got != tt.expected {
|
||||
t.Errorf("Match(%q) = %q, want %q", tt.raw, got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatcher_CaseInsensitive(t *testing.T) {
|
||||
m := NewMatcher()
|
||||
got := m.Match("WHOLE MILK GALLON")
|
||||
if got != "milk_whole_1gal" {
|
||||
t.Errorf("expected milk_whole_1gal, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatcher_TokenOverlap(t *testing.T) {
|
||||
m := NewMatcher()
|
||||
// "MILK WHL 1 GAL" should still match via token overlap
|
||||
got := m.Match("MILK WHL 1 GAL")
|
||||
if got != "milk_whole_1gal" {
|
||||
t.Logf("Note: token overlap match returned %q (may need alias tuning)", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatcher_NoMatch(t *testing.T) {
|
||||
m := NewMatcher()
|
||||
got := m.Match("toilet paper mega roll 12ct")
|
||||
if got != "" {
|
||||
t.Errorf("expected no match, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalize(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{"Whole Milk, 1 Gal.", "whole milk 1 gal"},
|
||||
{"EGGS (LARGE) 12CT", "eggs large 12ct"},
|
||||
{"80/20 Ground Beef", "80/20 ground beef"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
got := normalize(tt.input)
|
||||
if got != tt.expected {
|
||||
t.Errorf("normalize(%q) = %q, want %q", tt.input, got, tt.expected)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,44 @@
|
||||
// Package inflation contains the background worker that refreshes
|
||||
// the price_snapshots table used by the dashboard charts.
|
||||
package inflation
|
||||
|
||||
import (
|
||||
"context"
|
||||
"log"
|
||||
"time"
|
||||
|
||||
"github.com/yourname/deflated/internal/db"
|
||||
)
|
||||
|
||||
// StartRefreshWorker runs in a goroutine and refreshes price snapshots
|
||||
// every interval. Call this from main() after connecting to the database.
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// go inflation.StartRefreshWorker(ctx, queries, 1*time.Hour)
|
||||
func StartRefreshWorker(ctx context.Context, q *db.Queries, interval time.Duration) {
|
||||
// Run once immediately on startup so the charts aren't empty
|
||||
runRefresh(ctx, q)
|
||||
|
||||
ticker := time.NewTicker(interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
runRefresh(ctx, q)
|
||||
case <-ctx.Done():
|
||||
log.Println("inflation refresh worker stopped")
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func runRefresh(ctx context.Context, q *db.Queries) {
|
||||
start := time.Now()
|
||||
if err := q.RefreshPriceSnapshots(ctx); err != nil {
|
||||
log.Printf("error refreshing price snapshots: %v", err)
|
||||
return
|
||||
}
|
||||
log.Printf("price snapshots refreshed in %s", time.Since(start).Round(time.Millisecond))
|
||||
}
|
||||
Reference in New Issue
Block a user