commit edb31d2d31fbe1e5a84d9b70c3ba16b3127622b5
Author: Thiago Sposito <th.spo@pm.me>
Date:   Wed Feb 4 21:11:16 2026 -0300

    chore: init

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ee06ba6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,11 @@
+__pycache__/
+*.pyc
+*.pyo
+frames/*
+*.webm
+*.avi
+*.mp4
+*.mkv
+*.mov
+*.flv
+result.csv
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3d772d1
--- /dev/null
+++ b/README.md
@@ -0,0 +1,66 @@
+# Table extraction from scrolling video
+
+Extract a table from a screen-recorded video: sample frames, OCR (Portuguese), align to column/row bounds from an SVG template, then merge and deduplicate into one CSV.
+
+## Inputs
+
+| File | Role |
+|------|------|
+| `video-data.webm` | Source video (scroll-down table). |
+| `template.svg` | Annotation: rectangles (drawn under the frame image) give the column bounds; their height is split into 28 equal rows. |
+
+## Outputs
+
+| File / dir | Role |
+|------------|------|
+| `result.csv` | Final table: 6 columns (Concessionária, Código, Rodovia/UF, km inicial, km final, Extensão), one row per segment, deduplicated. |
+| `frames/` | Working: one PNG and one CSV per sampled frame (e.g. 0, 10, 20, …). |
+| `llm-fixed-frames/` | Optional: copy of frame CSVs after manual/LLM fixes; sew from here instead of `frames/` if used. |
+
+## Run (repeat the extraction)
+
+**Fully automated (extract → fix → sew → result):**
+
+```bash
+nix-shell -p python3 python3Packages.opencv4 python3Packages.numpy python3Packages.pytesseract tesseract ffmpeg --run "./run.sh"
+```
+
+Or with custom video / output dir:
+
+```bash
+./run.sh path/to/video.webm my_frames
+# result.csv is still written at project root
+```
+
+**If you do manual fixes:** copy `frames/*.csv` into `llm-fixed-frames/`, edit CSVs, then:
+
+```bash
+python3 sew_csvs.py llm-fixed-frames result.csv
+```
+
+## Scripts (keep)
+
+| Script | Role |
+|--------|------|
+| `extract_frames_and_tables.py` | Sample video every N frames → PNGs; OCR (por) + SVG column/row bounds → one CSV per frame. |
+| `fix_all_csvs.py` | Heuristic fixes on frame CSVs (strip whitespace, stray `E` → `-`, pipe → space, recompute extensão from the km columns). |
+| `sew_csvs.py` | Merge frame CSVs in order, remove boundary overlap, deduplicate full rows, write result. |
+| `svg_columns.py` | Parse column/row rectangles from template.svg. |
+| `assign_cells.py` | Map word boxes to (col, row); merge cell text (column 1 joined without spaces, other columns joined with spaces). |
+| `clean_csv_heuristics.py` | Per-row cleanup and extensão correction (used by fix_all_csvs). |
+| `row_eq.py` | Row equality for sewing. |
+
+## Not kept (removed)
+
+- `extract_table_frames.py` – superseded by `extract_frames_and_tables.py`.
+- `extract_every_n_frames.py` – logic folded into `extract_frames_and_tables.py`.
+- `sewn.csv` – superseded by `result.csv` (sew now writes result.csv and dedupes).
+
+## Clean re-run
+
+To start from scratch:
+
+- Delete or clear `frames/` and optionally `llm-fixed-frames/`.
+- Run `./run.sh` (or the manual-fix flow above).
+
+`result.csv` is overwritten each run.
diff --git a/assign_cells.py b/assign_cells.py
new file mode 100644
index 0000000..b57ca74
--- /dev/null
+++ b/assign_cells.py
@@ -0,0 +1,38 @@
+"""Assign word box to column/row by position; merge cell text with heuristics."""
+import re
+
+
+def assign_cell(cx, cy, col_bounds, row_bounds):
+    col_idx = None
+    for i, (left, right) in enumerate(col_bounds):
+        if left <= cx <= right:
+            col_idx = i
+            break
+    if col_idx is None:
+        return None, None
+    row_idx = None
+    for i, (y0, y1) in enumerate(row_bounds):
+        if y0 <= cy <= y1:
+            row_idx = i
+            break
+    if row_idx is None:
+        return col_idx, None
+    return col_idx, row_idx
+
+
+def merge_cell_words(words, col_idx):
+    if col_idx == 1:
+        return "".join(w.strip() for w in words).strip()
+    if col_idx in (3, 4, 5):
+        s = " ".join(w.strip() for w in words).strip()
+        s = s.replace(",", ".")
+        return s
+    return " ".join(w.strip() for w in words).strip()
+
+
+def normalize_float(s):
+    s = (s or "").strip().replace(",", ".")
+    if not s:
+        return ""
+    m = re.match(r"^[\d.]+", s)
+    return m.group(0) if m else s
diff --git a/clean_csv_heuristics.py b/clean_csv_heuristics.py
new file mode 100644
index 0000000..a292a1a
--- /dev/null
+++ b/clean_csv_heuristics.py
@@ -0,0 +1,45 @@
+"""Apply heuristic fixes to a single CSV row (strip, E->-, pipe->space, extensão)."""
+import re
+
+
+def strip_cell(c):
+    return (c or "").strip().replace("\r", "").replace("\n", "")
+
+
+def fix_cell_value(value, col_idx):
+    s = strip_cell(value)
+    if col_idx == 0 and "|" in s:
+        s = s.replace("|", " ")
+    if col_idx == 5 and s in ("E", "e", "EX", "ES", "EN"):
+        s = "-"
+    if col_idx == 5 and s and re.match(r"^\d+$", s):
+        s = s + ".0"
+    return s
+
+
+def fix_row(row, num_cols=6):
+    return [fix_cell_value(row[i] if i < len(row) else "", i) for i in range(num_cols)]
+
+
+def fix_extensao_from_km(row):
+    if len(row) < 6:
+        return row
+    try:
+        km_ini = float(str(row[3]).replace(",", ".").strip() or "0")
+        km_fin = float(str(row[4]).replace(",", ".").strip() or "0")
+        ext = str(row[5]).replace(",", ".").strip()
+        expected = round(km_fin - km_ini, 2)
+        if not ext or ext == "-" or ext in ("E", "e"):
+            row = list(row)
+            row[5] = f"{expected:.2f}" if km_fin or km_ini else "-"
+            return row
+        current = float(ext) if re.match(r"^[\d.]+$", ext) else None
+        if current is not None and abs(current - expected) < 0.02:
+            return row
+        if current is not None:
+            row = list(row)
+            row[5] = f"{expected:.2f}"
+            return row
+    except (ValueError, TypeError):
+        pass
+    return row
diff --git a/extract_frames_and_tables.py b/extract_frames_and_tables.py
new file mode 100644
index 0000000..1345ef3
--- /dev/null
+++ b/extract_frames_and_tables.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""Extract every Nth frame as PNG; OCR with SVG column/row bounds; save one CSV per frame."""
+import csv
+import os
+import sys
+
+import cv2
+import pytesseract
+
+from assign_cells import assign_cell, merge_cell_words
+from svg_columns import column_bounds, parse_column_rects, row_bounds
+
+
+def extract_frames_every_n(video_path, out_dir, n=10):
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise RuntimeError(f"Cannot open video: {video_path}")
+    os.makedirs(out_dir, exist_ok=True)
+    frame_num = 0
+    saved = []
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
+        if frame_num % n == 0:
+            path = os.path.join(out_dir, f"{frame_num}.png")
+            cv2.imwrite(path, frame)
+            saved.append((frame_num, path))
+        frame_num += 1
+    cap.release()
+    return saved
+
+
+def ocr_with_positions(image, lang="por"):
+    data = pytesseract.image_to_data(image, lang=lang, output_type=pytesseract.Output.DICT)
+    out = []
+    for i in range(len(data["text"])):
+        t = (data["text"][i] or "").strip()
+        if not t:
+            continue
+        out.append({
+            "text": t,
+            "x": data["left"][i],
+            "y": data["top"][i],
+            "w": data["width"][i],
+            "h": data["height"][i],
+        })
+    return out
+
+
+def build_table_from_boxes(boxes, col_bounds, row_bounds, num_cols=6, num_rows=28):
+    cells = {}
+    for b in boxes:
+        cx = b["x"] + b["w"] / 2
+        cy = b["y"] + b["h"] / 2
+        ci, ri = assign_cell(cx, cy, col_bounds, row_bounds)
+        if ci is not None and ri is not None:
+            key = (ri, ci)
+            cells.setdefault(key, []).append(b["text"])
+    table = [[""] * num_cols for _ in range(num_rows)]
+    for (ri, ci), words in cells.items():
+        if 0 <= ri < num_rows and 0 <= ci < num_cols:
+            table[ri][ci] = merge_cell_words(words, ci)
+    return table
+
+
+def write_csv(path, table):
+    with open(path, "w", newline="", encoding="utf-8") as f:
+        csv.writer(f).writerows(table)
+
+
+def run(out_dir, video_path, svg_path, every_n=10, num_rows=28):
+    cols = parse_column_rects(svg_path)
+    col_bounds = column_bounds(cols)
+    table_y = cols[0]["y"]
+    table_h = cols[0]["h"]
+    row_b = row_bounds(num_rows, table_y, table_h)
+    num_cols = len(col_bounds)
+
+    frames = extract_frames_every_n(video_path, out_dir, every_n)
+    for frame_num, png_path in frames:
+        img = cv2.imread(png_path)
+        if img is None:
+            continue
+        boxes = ocr_with_positions(img, lang="por")
+        table = build_table_from_boxes(boxes, col_bounds, row_b, num_cols, num_rows)
+        csv_path = os.path.join(out_dir, f"{frame_num}.csv")
+        write_csv(csv_path, table)
+    return len(frames)
+
+
+def main():
+    out_dir = sys.argv[1] if len(sys.argv) > 1 else "frames"
+    video_path = sys.argv[2] if len(sys.argv) > 2 else "video-data.webm"
+    svg_path = sys.argv[3] if len(sys.argv) > 3 else "template.svg"
+    every_n = int(sys.argv[4]) if len(sys.argv) > 4 else 10
+    n = run(out_dir, video_path, svg_path, every_n=every_n)
+    print(f"Extracted {n} frames and CSVs to {out_dir}/")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/fix_all_csvs.py b/fix_all_csvs.py
new file mode 100644
index 0000000..f6c5eac
--- /dev/null
+++ b/fix_all_csvs.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+"""Read each frame CSV, apply heuristic fixes, overwrite in place."""
+import csv
+import os
+import sys
+
+from clean_csv_heuristics import fix_extensao_from_km, fix_row
+
+
+def read_csv(path):
+    with open(path, newline="", encoding="utf-8") as f:
+        return list(csv.reader(f))
+
+
+def write_csv(path, rows):
+    with open(path, "w", newline="", encoding="utf-8") as f:
+        csv.writer(f).writerows(rows)
+
+
+def fix_csv(path, num_cols=6):
+    rows = read_csv(path)
+    fixed = []
+    for row in rows:
+        if not row:
+            fixed.append(row)
+            continue
+        r = fix_row(row, num_cols)
+        r = fix_extensao_from_km(r)
+        fixed.append(r)
+    write_csv(path, fixed)
+    return len(fixed)
+
+
+def fix_dir(frames_dir, num_cols=6):
+    count = 0
+    for name in sorted(os.listdir(frames_dir)):
+        if not name.endswith(".csv"):
+            continue
+        path = os.path.join(frames_dir, name)
+        try:
+            n = fix_csv(path, num_cols)
+            count += 1
+        except Exception as e:
+            print(f"{name}: {e}", file=sys.stderr)
+    return count
+
+
+def main():
+    frames_dir = sys.argv[1] if len(sys.argv) > 1 else "frames"
+    n = fix_dir(frames_dir)
+    print(f"Fixed {n} CSVs in {frames_dir}/")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 0000000..82ffa63
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,22 @@
+{
+  description = "Table extraction from scrolling video";
+
+  inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";  # follows the nixos-unstable branch
+
+  outputs = { self, nixpkgs }:
+    let
+      pkgs = nixpkgs.legacyPackages.x86_64-linux;  # package set for the only system defined below
+    in
+    {
+      devShells.x86_64-linux.default = pkgs.mkShell {  # dev shell, defined for x86_64-linux only
+        packages = with pkgs; [
+          python3
+          python3Packages.opencv4  # imported as cv2 by extract_frames_and_tables.py
+          python3Packages.numpy
+          python3Packages.pytesseract  # imported by extract_frames_and_tables.py
+          tesseract  # presumably the OCR binary pytesseract calls — verify "por" language data is included
+          ffmpeg  # not invoked by any script in this commit — TODO confirm it is still needed
+        ];
+      };
+    };
+}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b8613b3
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,14 @@
+contourpy==1.3.3
+cycler==0.12.1
+fonttools==4.60.1
+kiwisolver==1.4.8
+matplotlib==3.10.5
+numpy==2.3.4
+opencv @ file:///build/source/modules/python/package/dist/opencv-4.12.0-py3-none-any.whl#sha256=7634c6fe19d12c2ba6825ae2dcaee4d84c6fc1a2d31e7e81ff295ac805747a9a
+opencv-python==4.12.0
+packaging==25.0
+pillow==12.0.0
+pyparsing==3.2.3
+python-dateutil==2.9.0.post0
+six==1.17.0
+tkinter==3.13.9
diff --git a/row_eq.py b/row_eq.py
new file mode 100644
index 0000000..639b8a2
--- /dev/null
+++ b/row_eq.py
@@ -0,0 +1,10 @@
+"""Row equality for sew: same content (strip, compare)."""
+
+
+def row_equal(a, b, num_cols=6):
+    if len(a) < num_cols or len(b) < num_cols:
+        return False
+    for i in range(num_cols):
+        if (a[i] if i < len(a) else "").strip() != (b[i] if i < len(b) else "").strip():
+            return False
+    return True
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..7a792a4
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Full pipeline: extract frames + OCR → heuristic fixes → sew + dedupe → result.csv
+# Usage: ./run.sh [video] [output_dir]
+# Default: video=video-data.webm, output_dir=frames, result=result.csv
+set -e
+VIDEO="${1:-video-data.webm}"
+OUT_DIR="${2:-frames}"
+RESULT="result.csv"
+
+echo "1. Extract frames (every 10) + OCR with template.svg → ${OUT_DIR}/"
+python3 extract_frames_and_tables.py "$OUT_DIR" "$VIDEO" template.svg 10
+
+echo "2. Heuristic fixes on ${OUT_DIR}/*.csv"
+python3 fix_all_csvs.py "$OUT_DIR"
+
+echo "3. Sew CSVs + dedupe → ${RESULT}"
+python3 sew_csvs.py "$OUT_DIR" "$RESULT"
+
+echo "Done. Output: ${RESULT}"
diff --git a/sew_csvs.py b/sew_csvs.py
new file mode 100644
index 0000000..de70a7b
--- /dev/null
+++ b/sew_csvs.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+"""Sew all frame CSVs in order; deduplicate overlap at boundaries (scroll down)."""
+import csv
+import os
+import re
+import sys
+
+from row_eq import row_equal
+
+
+def frame_number_from_name(name):
+    m = re.match(r"^(\d+)\.csv$", name)
+    return int(m.group(1)) if m else None
+
+
+def list_frame_csvs(frames_dir):
+    names = []
+    for name in os.listdir(frames_dir):
+        n = frame_number_from_name(name)
+        if n is not None:
+            names.append((n, name))
+    return [name for _, name in sorted(names)]
+
+
+def read_csv(path):
+    with open(path, newline="", encoding="utf-8") as f:
+        return list(csv.reader(f))
+
+
+def write_csv(path, rows):
+    with open(path, "w", newline="", encoding="utf-8") as f:
+        csv.writer(f).writerows(rows)
+
+
+def overlap_length(acc_rows, next_rows, num_cols=6):
+    max_k = min(len(acc_rows), len(next_rows))
+    for k in range(max_k, 0, -1):
+        if k > len(acc_rows) or k > len(next_rows):
+            continue
+        match = True
+        for i in range(k):
+            if not row_equal(acc_rows[-k + i], next_rows[i], num_cols):
+                match = False
+                break
+        if match:
+            return k
+    return 0
+
+
+def dedupe_rows(rows, num_cols=6):
+    seen = set()
+    out = []
+    for row in rows:
+        key = tuple((row[i] if i < len(row) else "").strip() for i in range(num_cols))
+        if key in seen:
+            continue
+        seen.add(key)
+        out.append(row)
+    return out
+
+
+def sew(frames_dir, out_path, num_cols=6, dedupe=True):
+    csv_names = list_frame_csvs(frames_dir)
+    if not csv_names:
+        return 0
+    acc = read_csv(os.path.join(frames_dir, csv_names[0]))
+    for name in csv_names[1:]:
+        next_rows = read_csv(os.path.join(frames_dir, name))
+        k = overlap_length(acc, next_rows, num_cols)
+        acc.extend(next_rows[k:])
+    if dedupe:
+        acc = dedupe_rows(acc, num_cols)
+    write_csv(out_path, acc)
+    return len(acc)
+
+
+def main():
+    frames_dir = sys.argv[1] if len(sys.argv) > 1 else "frames"
+    out_path = sys.argv[2] if len(sys.argv) > 2 else "result.csv"
+    n = sew(frames_dir, out_path)
+    print(f"Wrote {n} rows to {out_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/svg_columns.py b/svg_columns.py
new file mode 100644
index 0000000..5b9387e
--- /dev/null
+++ b/svg_columns.py
@@ -0,0 +1,25 @@
+"""Parse column rectangles from template.svg."""
+import re
+
+
+def parse_column_rects(svg_path):
+    with open(svg_path) as f:
+        content = f.read()
+    blocks = re.findall(r"<rect[^>]+>", content)
+    cols = []
+    for block in blocks:
+        x = float(re.search(r'x="([^"]+)"', block).group(1))
+        y = float(re.search(r'y="([^"]+)"', block).group(1))
+        w = float(re.search(r'width="([^"]+)"', block).group(1))
+        h = float(re.search(r'height="([^"]+)"', block).group(1))
+        cols.append({"x": x, "y": y, "w": w, "h": h})
+    return cols
+
+
+def column_bounds(cols):
+    return [(c["x"], c["x"] + c["w"]) for c in cols]
+
+
+def row_bounds(num_rows, table_y, table_height):
+    step = table_height / num_rows
+    return [(table_y + i * step, table_y + (i + 1) * step) for i in range(num_rows)]
diff --git a/template.svg b/template.svg
new file mode 100644
index 0000000..4786ed0
--- /dev/null
+++ b/template.svg
@@ -0,0 +1,39 @@
+<svg width="1179" height="801" version="1.1" viewBox="0 0 1179 801" xmlns="http://www.w3.org/2000/svg">
+ <g fill="#d21d1d" fill-opacity=".549" stroke="#000" stroke-linecap="round" stroke-linejoin="round">
+  <rect x="11" y="52.5" width="313" height="740" stroke-width=".379" style="paint-order:markers stroke fill"/>
+  <rect x="334" y="52.5" width="182" height="740" stroke-width=".378" style="paint-order:markers stroke fill"/>
+  <rect x="576" y="52.5" width="122" height="740" stroke-width=".379" style="paint-order:markers stroke fill"/>
+  <rect x="727" y="52.5" width="76.6" height="740" stroke-width=".379" style="paint-order:markers stroke fill"/>
+  <rect x="816" y="52.5" width="81.5" height="740" stroke-width=".378" style="paint-order:markers stroke fill"/>
+  <rect x="921" y="52.5" width="69.1" height="740" stroke-width=".378" style="paint-order:markers stroke fill"/>
+ </g>
+ <g fill="#00f" fill-opacity=".549" stroke="#000" stroke-linecap="round" stroke-linejoin="round" stroke-width=".378">
+  <path d="m12.7 52.9h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 80.4h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 108h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 135h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 163h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 190h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 218h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 245h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 273h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 300h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 328h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 355h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 382h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 410h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 437h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 465h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 492h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 520h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 547h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 575h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 602h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 630h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 657h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 684h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 712h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m12.7 739h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+  <path d="m11.8 767h972v27.5h-972z" style="paint-order:markers stroke fill"/>
+ </g>
+</svg>