commit edb31d2d31fbe1e5a84d9b70c3ba16b3127622b5 Author: Thiago Sposito Date: Wed Feb 4 21:11:16 2026 -0300 chore: init diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ee06ba6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +__pycache__/ +*.pyc +*.pyo +frames/* +*.webm +*.avi +*.mp4 +*.mkv +*.mov +*.flv +result.csv \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..3d772d1 --- /dev/null +++ b/README.md @@ -0,0 +1,66 @@ +# Table extraction from scrolling video + +Extract a table from a screen-recorded video: sample frames, OCR (Portuguese), align to column/row bounds from an SVG template, then merge and deduplicate into one CSV. + +## Inputs + +| File | Role | +|------|------| +| `video-data.webm` | Source video (scroll-down table). | +| `template.svg` | Annotation: rectangles under the image = column bounds; 28 rows. | + +## Outputs + +| File / dir | Role | +|------------|------| +| `result.csv` | Final table: 6 columns (Concessionária, Código, Rodovia/UF, km inicial, km final, Extensão), one row per segment, deduplicated. | +| `frames/` | Working: one PNG and one CSV per sampled frame (e.g. 0, 10, 20, …). | +| `llm-fixed-frames/` | Optional: copy of frame CSVs after manual/LLM fixes; sew from here instead of `frames/` if used. 
| + +## Run (same chore again) + +**Fully automated (extract → fix → sew → result):** + +```bash +nix-shell -p python3 python3Packages.opencv4 python3Packages.numpy python3Packages.pytesseract tesseract ffmpeg --run "./run.sh" +``` + +Or with custom video / output dir: + +```bash +./run.sh path/to/video.webm my_frames +# result.csv is still written at project root +``` + +**If you do manual fixes:** copy `frames/*.csv` into `llm-fixed-frames/`, edit CSVs, then: + +```bash +python3 sew_csvs.py llm-fixed-frames result.csv +``` + +## Scripts (keep) + +| Script | Role | +|--------|------| +| `extract_frames_and_tables.py` | Sample video every N frames → PNGs; OCR (por) + SVG column/row bounds → one CSV per frame. | +| `fix_all_csvs.py` | Heuristic fixes on frame CSVs (strip, E→-, pipe→space, extensão from km). | +| `sew_csvs.py` | Merge frame CSVs in order, remove boundary overlap, deduplicate full rows, write result. | +| `svg_columns.py` | Parse column/row rectangles from template.svg. | +| `assign_cells.py` | Map word boxes to (col, row); merge cell text (col1 no space, others space). | +| `clean_csv_heuristics.py` | Per-row cleanup and extensão correction (used by fix_all_csvs). | +| `row_eq.py` | Row equality for sewing. | + +## Not kept (removed) + +- `extract_table_frames.py` – superseded by `extract_frames_and_tables.py`. +- `extract_every_n_frames.py` – logic folded into `extract_frames_and_tables.py`. +- `sewn.csv` – superseded by `result.csv` (sew now writes result.csv and dedupes). + +## Clean re-run + +To start from scratch: + +- Delete or clear `frames/` and optionally `llm-fixed-frames/`. +- Run `./run.sh` (or the manual-fix flow above). + +`result.csv` is overwritten each run. 
def assign_cell(cx, cy, col_bounds, row_bounds):
    """Locate the table cell that contains the point ``(cx, cy)``.

    Args:
        cx: Horizontal coordinate of the point.
        cy: Vertical coordinate of the point.
        col_bounds: Iterable of ``(left, right)`` pairs, one per column.
        row_bounds: Iterable of ``(y0, y1)`` pairs, one per row.

    Returns:
        ``(col_idx, row_idx)``. Both bounds are inclusive and the first
        matching interval wins. ``(None, None)`` is returned when no column
        contains ``cx`` (rows are not inspected in that case), and
        ``(col_idx, None)`` when a column matches but no row contains ``cy``.
    """
    column = next(
        (idx for idx, (lo, hi) in enumerate(col_bounds) if lo <= cx <= hi),
        None,
    )
    if column is None:
        # Without a column the row lookup is skipped entirely.
        return None, None
    row = next(
        (idx for idx, (top, bottom) in enumerate(row_bounds) if top <= cy <= bottom),
        None,
    )
    return column, row
def extract_frames_every_n(video_path, out_dir, n=10):
    """Save every ``n``-th frame of a video as ``<frame_number>.png``.

    Args:
        video_path: Path of the video, handed to ``cv2.VideoCapture``.
        out_dir: Directory that receives the PNG files; created if missing.
        n: Sampling interval in frames; frames 0, n, 2n, ... are written.

    Returns:
        List of ``(frame_number, png_path)`` tuples in reading order.

    Raises:
        ValueError: If ``n`` is smaller than 1.
        RuntimeError: If the video cannot be opened.
    """
    # Reject a zero interval up front instead of failing later with a bare
    # ZeroDivisionError from the modulo below.
    if n < 1:
        raise ValueError(f"n must be a positive frame interval, got {n!r}")

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Cannot open video: {video_path}")
        os.makedirs(out_dir, exist_ok=True)
        saved = []
        frame_num = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                # No more frames could be read.
                break
            if frame_num % n == 0:
                path = os.path.join(out_dir, f"{frame_num}.png")
                # The result of imwrite is not checked, as before; the path
                # is recorded regardless.
                cv2.imwrite(path, frame)
                saved.append((frame_num, path))
            frame_num += 1
        return saved
    finally:
        # Release the capture even when reading or writing raised.
        cap.release()
def run(out_dir, video_path, svg_path, every_n=10, num_rows=28):
    """Sample frames from a video, OCR each one and write one CSV per frame.

    Column bounds come from the rectangles parsed out of ``svg_path``. Row
    bounds are derived from the ``y``/``h`` of the first rectangle only,
    split into ``num_rows`` rows.

    Args:
        out_dir: Directory receiving ``<frame>.png`` and ``<frame>.csv``.
        video_path: Video to sample.
        svg_path: SVG template holding the column rectangles.
        every_n: Sampling interval in frames.
        num_rows: Number of table rows per frame.

    Returns:
        Number of sampled frames. Frames whose PNG could not be read back
        are skipped for OCR but still counted.

    Raises:
        ValueError: If the template contains no column rectangles.
    """
    cols = parse_column_rects(svg_path)
    if not cols:
        # Fail with a clear message instead of an IndexError on cols[0].
        raise ValueError(f"No column rectangles found in template: {svg_path}")
    col_bounds = column_bounds(cols)
    table_y = cols[0]["y"]
    table_h = cols[0]["h"]
    row_b = row_bounds(num_rows, table_y, table_h)
    num_cols = len(col_bounds)

    frames = extract_frames_every_n(video_path, out_dir, every_n)
    for frame_num, png_path in frames:
        img = cv2.imread(png_path)
        if img is None:
            # PNG missing or unreadable: no CSV is written for this frame.
            continue
        boxes = ocr_with_positions(img, lang="por")
        table = build_table_from_boxes(boxes, col_bounds, row_b, num_cols, num_rows)
        csv_path = os.path.join(out_dir, f"{frame_num}.csv")
        write_csv(csv_path, table)
    return len(frames)
def fix_dir(frames_dir, num_cols=6):
    """Run ``fix_csv`` over every ``*.csv`` entry of a directory.

    Entries are visited in sorted name order. A failure on one file is
    reported on stderr as ``<name>: <error>`` and does not stop the loop.

    Args:
        frames_dir: Directory to scan.
        num_cols: Column count forwarded to ``fix_csv``.

    Returns:
        Number of files for which ``fix_csv`` completed without raising.
    """
    csv_names = [
        entry for entry in sorted(os.listdir(frames_dir)) if entry.endswith(".csv")
    ]
    fixed_files = 0
    for csv_name in csv_names:
        target = os.path.join(frames_dir, csv_name)
        try:
            fix_csv(target, num_cols)
        except Exception as err:
            # Best effort: report the file and carry on with the next one.
            print(f"{csv_name}: {err}", file=sys.stderr)
        else:
            fixed_files += 1
    return fixed_files
def row_equal(a, b, num_cols=6):
    """Tell whether two rows carry the same text in their first ``num_cols`` cells.

    Cells are compared after stripping surrounding whitespace; cells beyond
    ``num_cols`` are ignored.

    Args:
        a: First row (sequence of strings).
        b: Second row (sequence of strings).
        num_cols: Number of leading cells to compare.

    Returns:
        ``False`` if either row has fewer than ``num_cols`` cells or any
        compared pair differs, ``True`` otherwise.
    """
    too_short = len(a) < num_cols or len(b) < num_cols
    if too_short:
        return False
    # Both rows are long enough, so plain indexing is safe here.
    return all(a[i].strip() == b[i].strip() for i in range(num_cols))
def overlap_length(acc_rows, next_rows, num_cols=6):
    """Return how many trailing rows of ``acc_rows`` repeat at the start of ``next_rows``.

    Candidate lengths are tried from the longest possible down to 1, so the
    largest overlap wins. Rows are compared with ``row_equal`` on their
    first ``num_cols`` cells.

    Args:
        acc_rows: Rows accumulated so far.
        next_rows: Rows of the following frame.
        num_cols: Number of leading cells compared per row.

    Returns:
        The largest ``k`` such that the last ``k`` rows of ``acc_rows`` equal
        the first ``k`` rows of ``next_rows``; 0 when there is no overlap or
        either list is empty.
    """
    longest = min(len(acc_rows), len(next_rows))
    for k in range(longest, 0, -1):
        # acc_rows[i - k] walks the last k accumulated rows in order.
        if all(
            row_equal(acc_rows[i - k], next_rows[i], num_cols) for i in range(k)
        ):
            return k
    return 0
def _rect_attr(tag, name):
    """Return the quoted value of attribute ``name`` in ``tag``, or ``None``."""
    # The lookbehind stops ``x`` from matching inside ``rx`` and ``width``
    # from matching inside ``stroke-width``.
    found = re.search(rf'(?<![\w:-]){re.escape(name)}\s*=\s*"([^"]+)"', tag)
    return found.group(1) if found else None


def parse_column_rects(svg_path):
    """Read the ``<rect>`` elements of an SVG file as column rectangles.

    Args:
        svg_path: Path of the SVG template.

    Returns:
        One ``{"x", "y", "w", "h"}`` dict of floats per ``<rect>`` tag, in
        document order. ``x`` and ``y`` fall back to 0.0 when the attribute
        is absent.

    Raises:
        ValueError: If a rectangle lacks ``width`` or ``height``, or an
            attribute value is not a plain number.
    """
    with open(svg_path, encoding="utf-8") as f:
        content = f.read()

    cols = []
    # ``[^>]*`` also spans newlines, so multi-line tags are matched too.
    for tag in re.findall(r"<rect\b[^>]*>", content):
        width = _rect_attr(tag, "width")
        height = _rect_attr(tag, "height")
        if width is None or height is None:
            raise ValueError(f"<rect> without width/height in {svg_path}: {tag}")
        cols.append({
            "x": float(_rect_attr(tag, "x") or 0),
            "y": float(_rect_attr(tag, "y") or 0),
            "w": float(width),
            "h": float(height),
        })
    return cols