commit edb31d2d31fbe1e5a84d9b70c3ba16b3127622b5
Author: Thiago Sposito
Date: Wed Feb 4 21:11:16 2026 -0300
chore: init
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ee06ba6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,11 @@
+__pycache__/
+*.pyc
+*.pyo
+frames/*
+*.webm
+*.avi
+*.mp4
+*.mkv
+*.mov
+*.flv
+result.csv
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3d772d1
--- /dev/null
+++ b/README.md
@@ -0,0 +1,66 @@
+# Table extraction from scrolling video
+
+Extract a table from a screen-recorded video: sample frames, OCR them (Portuguese), assign each word to a cell using the column/row bounds from an SVG template, then merge the per-frame tables and deduplicate them into one CSV.
+
+## Inputs
+
+| File | Role |
+|------|------|
+| `video-data.webm` | Source video (scroll-down table). |
+| `template.svg` | Annotation: the rectangles under the image give the column bounds; the height of the first rectangle is split evenly into 28 rows. |
+
+## Outputs
+
+| File / dir | Role |
+|------------|------|
+| `result.csv` | Final table: 6 columns (Concessionária, Código, Rodovia/UF, km inicial, km final, Extensão), one row per segment, deduplicated. |
+| `frames/` | Working: one PNG and one CSV per sampled frame (e.g. 0, 10, 20, …). |
+| `llm-fixed-frames/` | Optional: copy of frame CSVs after manual/LLM fixes; sew from here instead of `frames/` if used. |
+
+## Run (repeat the same job)
+
+**Fully automated (extract → fix → sew → result):**
+
+```bash
+nix-shell -p python3 python3Packages.opencv4 python3Packages.numpy python3Packages.pytesseract tesseract ffmpeg --run "./run.sh"
+```
+
+Or with custom video / output dir:
+
+```bash
+./run.sh path/to/video.webm my_frames
+# result.csv is still written to the current directory (normally the project root)
+```
+
+**If you do manual fixes:** copy `frames/*.csv` into `llm-fixed-frames/`, edit CSVs, then:
+
+```bash
+python3 sew_csvs.py llm-fixed-frames result.csv
+```
+
+## Scripts (keep)
+
+| Script | Role |
+|--------|------|
+| `extract_frames_and_tables.py` | Sample video every N frames → PNGs; OCR (por) + SVG column/row bounds → one CSV per frame. |
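+| `fix_all_csvs.py` | Heuristic fixes on frame CSVs, in place: strip cells; pipe character → space in Concessionária; misread `E` → `-` in Extensão; Extensão filled in or corrected from km final − km inicial. |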
+| `sew_csvs.py` | Merge frame CSVs in order, remove boundary overlap, deduplicate full rows, write result. |
+| `svg_columns.py` | Parse column/row rectangles from template.svg. |
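+| `assign_cells.py` | Map word boxes to (col, row); merge the words of a cell (column 1, Código, joined without spaces; other columns joined with a space; decimal comma → dot in the km and Extensão columns). |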
+| `clean_csv_heuristics.py` | Per-row cleanup and extensão correction (used by fix_all_csvs). |
+| `row_eq.py` | Row equality for sewing. |
+
+## Not kept (removed)
+
+- `extract_table_frames.py` – superseded by `extract_frames_and_tables.py`.
+- `extract_every_n_frames.py` – logic folded into `extract_frames_and_tables.py`.
+- `sewn.csv` – superseded by `result.csv` (sew now writes result.csv and dedupes).
+
+## Clean re-run
+
+To start from scratch:
+
+- Delete or clear `frames/` and optionally `llm-fixed-frames/`.
+- Run `./run.sh` (or the manual-fix flow above).
+
+`result.csv` is overwritten each run.
diff --git a/assign_cells.py b/assign_cells.py
new file mode 100644
index 0000000..b57ca74
--- /dev/null
+++ b/assign_cells.py
@@ -0,0 +1,53 @@
+"""Assign word box to column/row by position; merge cell text with heuristics."""
+import re
+
+# Column 1 (Código): OCR fragments are glued back together without spaces.
+_NO_SPACE_COLS = frozenset({1})
+# Columns 3-5 (km inicial, km final, Extensão) are numeric: decimal comma -> dot.
+_NUMERIC_COLS = frozenset({3, 4, 5})
+
+
+def _find_interval(value, bounds):
+    """Return the index of the first (lo, hi) pair with lo <= value <= hi, else None."""
+    for i, (lo, hi) in enumerate(bounds):
+        if lo <= value <= hi:
+            return i
+    return None
+
+
+def assign_cell(cx, cy, col_bounds, row_bounds):
+    """Map the point (cx, cy) to a (col_idx, row_idx) pair.
+
+    Bounds are inclusive at both ends; on a shared edge the first interval wins.
+    Returns (None, None) if cx lies in no column and (col_idx, None) if cy lies
+    in no row.
+    """
+    col_idx = _find_interval(cx, col_bounds)
+    if col_idx is None:
+        # Without a column the row is not looked up at all.
+        return None, None
+    return col_idx, _find_interval(cy, row_bounds)
+
+
+def merge_cell_words(words, col_idx):
+    """Join the OCR words of one cell using the convention of column col_idx."""
+    parts = [w.strip() for w in words]
+    if col_idx in _NO_SPACE_COLS:
+        return "".join(parts).strip()
+    text = " ".join(parts).strip()
+    if col_idx in _NUMERIC_COLS:
+        text = text.replace(",", ".")
+    return text
+
+
+def normalize_float(s):
+    """Return the leading run of digits and dots in s, reading "," as ".".
+
+    None or blank input gives ""; text that does not start with a digit or dot
+    is returned stripped, with commas already replaced.
+    """
+    s = (s or "").strip().replace(",", ".")
+    if not s:
+        return ""
+    m = re.match(r"^[\d.]+", s)
+    return m.group(0) if m else s
diff --git a/clean_csv_heuristics.py b/clean_csv_heuristics.py
new file mode 100644
index 0000000..a292a1a
--- /dev/null
+++ b/clean_csv_heuristics.py
@@ -0,0 +1,80 @@
+"""Apply heuristic fixes to a single CSV row (strip, E->-, pipe->space, extensão)."""
+import re
+
+# OCR misreadings of the "-" placeholder seen in the extensão column (index 5).
+_DASH_MISREADS = frozenset({"E", "e", "EX", "ES", "EN"})
+
+
+def strip_cell(c):
+    """Return c stripped and without CR/LF characters; None becomes ""."""
+    return (c or "").strip().replace("\r", "").replace("\n", "")
+
+
+def fix_cell_value(value, col_idx):
+    """Clean one cell according to its column index.
+
+    Column 0: "|" becomes a space. Column 5: a known misreading of "-" becomes
+    "-", and an all-digit value gets ".0" appended. Other columns are only
+    stripped.
+    """
+    s = strip_cell(value)
+    if col_idx == 0:
+        return s.replace("|", " ")
+    if col_idx == 5:
+        if s in _DASH_MISREADS:
+            return "-"
+        if re.match(r"^\d+$", s):
+            return s + ".0"
+    return s
+
+
+def fix_row(row, num_cols=6):
+    """Return a new row of exactly num_cols cleaned cells (short rows are padded with "")."""
+    return [fix_cell_value(row[i] if i < len(row) else "", i) for i in range(num_cols)]
+
+
+def _km_value(cell):
+    """Parse a km cell as float, reading "," as "."; a blank cell counts as 0."""
+    return float(str(cell).replace(",", ".").strip() or "0")
+
+
+def _with_extensao(row, value):
+    """Return a copy of row (as a list) whose column 5 is value."""
+    fixed = list(row)
+    fixed[5] = value
+    return fixed
+
+
+def fix_extensao_from_km(row):
+    """Make column 5 (extensão) agree with km final (col 4) minus km inicial (col 3).
+
+    A missing extensão ("", "-", "E", "e") is filled with the difference, or with
+    "-" when both km values are zero or blank. A numeric extensão that is off by
+    0.02 or more is replaced by the difference. Rows with fewer than 6 cells,
+    fully blank rows and rows whose km or extensão cells do not parse are
+    returned unchanged. The input row is never mutated.
+    """
+    if len(row) < 6:
+        return row
+    if not any(str(c).strip() for c in row):
+        # A blank table line must stay blank instead of gaining a lone "-".
+        return row
+    try:
+        km_ini = _km_value(row[3])
+        km_fin = _km_value(row[4])
+    except (ValueError, TypeError):
+        return row
+    expected = round(km_fin - km_ini, 2)
+    ext = str(row[5]).replace(",", ".").strip()
+    if ext in ("", "-", "E", "e"):
+        return _with_extensao(row, f"{expected:.2f}" if km_fin or km_ini else "-")
+    if not re.match(r"^[\d.]+$", ext):
+        return row
+    try:
+        current = float(ext)
+    except ValueError:
+        # e.g. "1.2.3" passes the character check but is not a number.
+        return row
+    if abs(current - expected) < 0.02:
+        return row
+    return _with_extensao(row, f"{expected:.2f}")
diff --git a/extract_frames_and_tables.py b/extract_frames_and_tables.py
new file mode 100644
index 0000000..1345ef3
--- /dev/null
+++ b/extract_frames_and_tables.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+"""Extract every Nth frame as PNG; OCR with SVG column/row bounds; save one CSV per frame."""
+import csv
+import os
+import sys
+
+import cv2
+import pytesseract
+
+from assign_cells import assign_cell, merge_cell_words
+from svg_columns import column_bounds, parse_column_rects, row_bounds
+
+
+def extract_frames_every_n(video_path, out_dir, n=10):
+    """Save every n-th frame of the video as "<frame_num>.png" in out_dir.
+
+    Returns a list of (frame_num, png_path) for the frames that were written.
+    Raises ValueError if n < 1 and RuntimeError if the video cannot be opened.
+    """
+    if n < 1:
+        raise ValueError(f"n must be >= 1, got {n}")
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise RuntimeError(f"Cannot open video: {video_path}")
+    saved = []
+    try:
+        os.makedirs(out_dir, exist_ok=True)
+        frame_num = 0
+        while True:
+            ret, frame = cap.read()
+            if not ret:
+                break
+            if frame_num % n == 0:
+                path = os.path.join(out_dir, f"{frame_num}.png")
+                # imwrite signals failure through its return value.
+                if cv2.imwrite(path, frame):
+                    saved.append((frame_num, path))
+                else:
+                    print(f"Cannot write frame: {path}", file=sys.stderr)
+            frame_num += 1
+    finally:
+        # Release the capture even if reading or writing raises.
+        cap.release()
+    return saved
+
+
+def ocr_with_positions(image, lang="por"):
+    """OCR the image and return its non-blank words with bounding boxes.
+
+    Each item is a dict with keys "text", "x", "y", "w", "h", taken from the
+    "text", "left", "top", "width" and "height" lists of image_to_data.
+    """
+    data = pytesseract.image_to_data(image, lang=lang, output_type=pytesseract.Output.DICT)
+    columns = zip(data["text"], data["left"], data["top"], data["width"], data["height"])
+    out = []
+    for text, x, y, w, h in columns:
+        text = (text or "").strip()
+        if text:
+            out.append({"text": text, "x": x, "y": y, "w": w, "h": h})
+    return out
+
+
+def build_table_from_boxes(boxes, col_bounds, row_bounds, num_cols=6, num_rows=28):
+    """Place each word box in the cell containing its centre.
+
+    Returns num_rows lists of num_cols strings; cells without words are "".
+    Boxes outside every column/row, or beyond the table size, are dropped.
+    """
+    cells = {}
+    for b in boxes:
+        cx = b["x"] + b["w"] / 2
+        cy = b["y"] + b["h"] / 2
+        ci, ri = assign_cell(cx, cy, col_bounds, row_bounds)
+        if ci is None or ri is None:
+            continue
+        if ri < num_rows and ci < num_cols:
+            cells.setdefault((ri, ci), []).append(b["text"])
+    table = [[""] * num_cols for _ in range(num_rows)]
+    for (ri, ci), words in cells.items():
+        table[ri][ci] = merge_cell_words(words, ci)
+    return table
+
+
+def write_csv(path, table):
+    """Write the table rows to path as UTF-8 CSV."""
+    with open(path, "w", newline="", encoding="utf-8") as f:
+        csv.writer(f).writerows(table)
+
+
+def run(out_dir, video_path, svg_path, every_n=10, num_rows=28):
+    """Extract frames, OCR each one and write "<frame_num>.csv" next to its PNG.
+
+    Returns the number of frames extracted. Raises ValueError if the template
+    contains no rectangles.
+    """
+    cols = parse_column_rects(svg_path)
+    if not cols:
+        raise ValueError(f"No column rectangles found in {svg_path}")
+    col_bounds = column_bounds(cols)
+    # The first rectangle's y/height define the vertical extent of the table.
+    row_b = row_bounds(num_rows, cols[0]["y"], cols[0]["h"])
+    num_cols = len(col_bounds)
+
+    frames = extract_frames_every_n(video_path, out_dir, every_n)
+    for frame_num, png_path in frames:
+        img = cv2.imread(png_path)
+        if img is None:
+            continue
+        # OpenCV loads BGR, while pytesseract treats arrays as RGB.
+        rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        boxes = ocr_with_positions(rgb, lang="por")
+        table = build_table_from_boxes(boxes, col_bounds, row_b, num_cols, num_rows)
+        write_csv(os.path.join(out_dir, f"{frame_num}.csv"), table)
+    return len(frames)
+
+
+def main():
+    """CLI: [out_dir] [video_path] [svg_path] [every_n]; all arguments optional."""
+    out_dir = sys.argv[1] if len(sys.argv) > 1 else "frames"
+    video_path = sys.argv[2] if len(sys.argv) > 2 else "video-data.webm"
+    svg_path = sys.argv[3] if len(sys.argv) > 3 else "template.svg"
+    every_n = int(sys.argv[4]) if len(sys.argv) > 4 else 10
+    n = run(out_dir, video_path, svg_path, every_n=every_n)
+    print(f"Extracted {n} frames and CSVs to {out_dir}/")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/fix_all_csvs.py b/fix_all_csvs.py
new file mode 100644
index 0000000..f6c5eac
--- /dev/null
+++ b/fix_all_csvs.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+"""Read each frame CSV, apply heuristic fixes, overwrite in place."""
+import csv
+import os
+import sys
+
+from clean_csv_heuristics import fix_extensao_from_km, fix_row
+
+
+def read_csv(path):
+    """Return the rows of the UTF-8 CSV at path as lists of strings."""
+    with open(path, newline="", encoding="utf-8") as f:
+        return list(csv.reader(f))
+
+
+def write_csv(path, rows):
+    """Write rows to path as UTF-8 CSV, replacing any existing file."""
+    with open(path, "w", newline="", encoding="utf-8") as f:
+        csv.writer(f).writerows(rows)
+
+
+def fix_csv(path, num_cols=6):
+    """Rewrite one CSV with every non-empty row fixed; return the row count.
+
+    Empty rows (no cells at all) are kept as they are.
+    """
+    fixed = [
+        fix_extensao_from_km(fix_row(row, num_cols)) if row else row
+        for row in read_csv(path)
+    ]
+    write_csv(path, fixed)
+    return len(fixed)
+
+
+def fix_dir(frames_dir, num_cols=6):
+    """Fix every "*.csv" in frames_dir; return how many files were fixed.
+
+    A file that cannot be read, decoded, parsed or written is reported on
+    stderr and skipped, so one bad file does not stop the batch.
+    """
+    count = 0
+    for name in sorted(os.listdir(frames_dir)):
+        if not name.endswith(".csv"):
+            continue
+        try:
+            fix_csv(os.path.join(frames_dir, name), num_cols)
+        except (OSError, UnicodeError, csv.Error) as e:
+            print(f"{name}: {e}", file=sys.stderr)
+        else:
+            count += 1
+    return count
+
+
+def main():
+    """CLI: [frames_dir], default "frames"."""
+    frames_dir = sys.argv[1] if len(sys.argv) > 1 else "frames"
+    n = fix_dir(frames_dir)
+    print(f"Fixed {n} CSVs in {frames_dir}/")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 0000000..82ffa63
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,31 @@
+{
+  description = "Table extraction from scrolling video";
+
+  inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
+
+  outputs = { self, nixpkgs }:
+    let
+      # Platforms for which a dev shell is exposed; each is evaluated only on demand.
+      systems = [ "x86_64-linux" "aarch64-linux" "x86_64-darwin" "aarch64-darwin" ];
+      forAllSystems = nixpkgs.lib.genAttrs systems;
+    in
+    {
+      # devShells.<system>.default, entered with `nix develop`.
+      devShells = forAllSystems (system:
+        let
+          pkgs = nixpkgs.legacyPackages.${system};
+        in
+        {
+          default = pkgs.mkShell {
+            packages = with pkgs; [
+              python3
+              python3Packages.opencv4
+              python3Packages.numpy
+              python3Packages.pytesseract
+              tesseract
+              ffmpeg
+            ];
+          };
+        });
+    };
+}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b8613b3
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,14 @@
+contourpy==1.3.3
+cycler==0.12.1
+fonttools==4.60.1
+kiwisolver==1.4.8
+matplotlib==3.10.5
+numpy==2.3.4
+opencv @ file:///build/source/modules/python/package/dist/opencv-4.12.0-py3-none-any.whl#sha256=7634c6fe19d12c2ba6825ae2dcaee4d84c6fc1a2d31e7e81ff295ac805747a9a
+opencv-python==4.12.0
+packaging==25.0
+pillow==12.0.0
+pyparsing==3.2.3
+python-dateutil==2.9.0.post0
+six==1.17.0
+tkinter==3.13.9
diff --git a/row_eq.py b/row_eq.py
new file mode 100644
index 0000000..639b8a2
--- /dev/null
+++ b/row_eq.py
@@ -0,0 +1,12 @@
+"""Row equality for sew: same content (strip, compare)."""
+
+
+def row_equal(a, b, num_cols=6):
+    """Return True if the first num_cols cells of a and b are equal after stripping.
+
+    A row with fewer than num_cols cells never compares equal.
+    """
+    if len(a) < num_cols or len(b) < num_cols:
+        return False
+    # Both rows are long enough here, so plain indexing is safe.
+    return all(a[i].strip() == b[i].strip() for i in range(num_cols))
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..7a792a4
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Full pipeline: extract frames + OCR → heuristic fixes → sew + dedupe → result.csv
+# Usage: ./run.sh [video] [output_dir]
+# Default: video=video-data.webm, output_dir=frames, result=result.csv
+# The Python scripts and template.svg are taken from this script's directory;
+# video, output_dir and result.csv are resolved against the current directory.
+set -euo pipefail
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
+VIDEO="${1:-video-data.webm}"
+OUT_DIR="${2:-frames}"
+RESULT="result.csv"
+
+echo "1. Extract frames (every 10) + OCR with template.svg → ${OUT_DIR}/"
+python3 "${SCRIPT_DIR}/extract_frames_and_tables.py" "$OUT_DIR" "$VIDEO" "${SCRIPT_DIR}/template.svg" 10
+
+echo "2. Heuristic fixes on ${OUT_DIR}/*.csv"
+python3 "${SCRIPT_DIR}/fix_all_csvs.py" "$OUT_DIR"
+
+echo "3. Sew CSVs + dedupe → ${RESULT}"
+python3 "${SCRIPT_DIR}/sew_csvs.py" "$OUT_DIR" "$RESULT"
+
+echo "Done. Output: ${RESULT}"
diff --git a/sew_csvs.py b/sew_csvs.py
new file mode 100644
index 0000000..de70a7b
--- /dev/null
+++ b/sew_csvs.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+"""Sew all frame CSVs in order; deduplicate overlap at boundaries (scroll down)."""
+import csv
+import os
+import re
+import sys
+
+from row_eq import row_equal
+
+
+def frame_number_from_name(name):
+    """Return N as an int for a file name of the form "N.csv", else None."""
+    m = re.match(r"^(\d+)\.csv$", name)
+    return int(m.group(1)) if m else None
+
+
+def list_frame_csvs(frames_dir):
+    """Return the "N.csv" file names in frames_dir ordered by N (numeric order)."""
+    numbered = []
+    for name in os.listdir(frames_dir):
+        n = frame_number_from_name(name)
+        if n is not None:
+            numbered.append((n, name))
+    return [name for _, name in sorted(numbered)]
+
+
+def read_csv(path):
+    """Return the rows of the UTF-8 CSV at path as lists of strings."""
+    with open(path, newline="", encoding="utf-8") as f:
+        return list(csv.reader(f))
+
+
+def write_csv(path, rows):
+    """Write rows to path as UTF-8 CSV, replacing any existing file."""
+    with open(path, "w", newline="", encoding="utf-8") as f:
+        csv.writer(f).writerows(rows)
+
+
+def overlap_length(acc_rows, next_rows, num_cols=6):
+    """Return how many leading rows of next_rows repeat the tail of acc_rows.
+
+    This is the largest k such that the last k rows of acc_rows equal the first
+    k rows of next_rows under row_equal, or 0 if there is none.
+    """
+    for k in range(min(len(acc_rows), len(next_rows)), 0, -1):
+        tail = acc_rows[-k:]
+        if all(row_equal(a, b, num_cols) for a, b in zip(tail, next_rows)):
+            return k
+    return 0
+
+
+def dedupe_rows(rows, num_cols=6):
+    """Drop rows whose first num_cols stripped cells repeat an earlier row.
+
+    Order is preserved; short rows are compared as if padded with "".
+    """
+    seen = set()
+    out = []
+    for row in rows:
+        key = tuple((row[i] if i < len(row) else "").strip() for i in range(num_cols))
+        if key in seen:
+            continue
+        seen.add(key)
+        out.append(row)
+    return out
+
+
+def sew(frames_dir, out_path, num_cols=6, dedupe=True):
+    """Concatenate the frame CSVs in frame order and write them to out_path.
+
+    Leading rows of a frame that repeat the tail of the rows gathered so far are
+    skipped; with dedupe=True repeated rows are then removed from the whole
+    result. Returns the number of rows written, or 0 without touching out_path
+    when frames_dir holds no frame CSV.
+    """
+    csv_names = list_frame_csvs(frames_dir)
+    if not csv_names:
+        return 0
+    acc = []
+    for name in csv_names:
+        next_rows = read_csv(os.path.join(frames_dir, name))
+        # With acc still empty the overlap is 0, so the first frame is taken whole.
+        acc.extend(next_rows[overlap_length(acc, next_rows, num_cols):])
+    if dedupe:
+        acc = dedupe_rows(acc, num_cols)
+    write_csv(out_path, acc)
+    return len(acc)
+
+
+def main():
+    """CLI: [frames_dir] [out_path]; defaults "frames" and "result.csv"."""
+    frames_dir = sys.argv[1] if len(sys.argv) > 1 else "frames"
+    out_path = sys.argv[2] if len(sys.argv) > 2 else "result.csv"
+    if not list_frame_csvs(frames_dir):
+        # sew() would leave a stale out_path in place; fail loudly instead.
+        sys.exit(f"No frame CSVs found in {frames_dir}/")
+    n = sew(frames_dir, out_path)
+    print(f"Wrote {n} rows to {out_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/svg_columns.py b/svg_columns.py
new file mode 100644
index 0000000..5b9387e
--- /dev/null
+++ b/svg_columns.py
@@ -0,0 +1,41 @@
+"""Parse column rectangles from template.svg."""
+import re
+
+# One opening <rect ...> tag, attributes included.
+_RECT_TAG = re.compile(r"<rect\s[^>]*>")
+# Output key -> SVG attribute holding its value.
+_RECT_FIELDS = (("x", "x"), ("y", "y"), ("w", "width"), ("h", "height"))
+
+
+def _rect_attr(tag, name):
+    """Return attribute name of the tag as float; ValueError if absent or not numeric."""
+    # The look-behind keeps x from matching inside rx, width inside stroke-width, etc.
+    m = re.search(rf'(?<![\w:-]){name}="([^"]+)"', tag)
+    if m is None:
+        raise ValueError(f"<rect> without {name} attribute: {tag}")
+    return float(m.group(1))
+
+
+def parse_column_rects(svg_path):
+    """Return one {"x", "y", "w", "h"} dict of floats per <rect> tag, in file order.
+
+    Raises ValueError if a rectangle lacks one of x, y, width, height or the
+    value is not a plain number.
+    """
+    with open(svg_path, encoding="utf-8") as f:
+        content = f.read()
+    return [
+        {key: _rect_attr(tag, attr) for key, attr in _RECT_FIELDS}
+        for tag in _RECT_TAG.findall(content)
+    ]
+
+
+def column_bounds(cols):
+    """Return (left, right) = (x, x + w) for each rectangle."""
+    return [(c["x"], c["x"] + c["w"]) for c in cols]
+
+
+def row_bounds(num_rows, table_y, table_height):
+    """Split [table_y, table_y + table_height] into num_rows equal (top, bottom) bands."""
+    step = table_height / num_rows
+    return [(table_y + i * step, table_y + (i + 1) * step) for i in range(num_rows)]
diff --git a/template.svg b/template.svg
new file mode 100644
index 0000000..4786ed0
--- /dev/null
+++ b/template.svg
@@ -0,0 +1,39 @@
+