feat: initial commit for post-ocr extraction pipeline

2026-02-12 14:20:33 +08:00
commit fe5a346fdd
7 changed files with 196 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,10 @@
+data/input/*
+!data/input/.gitkeep
+data/output/*
+!data/output/.gitkeep
+__pycache__/
+*.pyc
+.DS_Store
+.venv/
+venv/
+
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -0,0 +1,18 @@
+# Post-OCR Data Extraction Project
+
+## 项目愿景
+实现工厂环境下信封背面信息的自动化提取与结构化录入。
+
+## 技术栈
+- **OCR**: PaddleOCR (本地部署)
+- **数据处理**: Python, Pandas
+- **解析逻辑**: 正则表达式 + 语义校验
+
+## 目录结构
+- `data/input/`: 原始图片存放处
+- `data/output/`: 结果 Excel 及处理日志
+- `src/`: 源代码
+
+## 开发规范
+1. 错误处理：所有 OCR 失败或解析不完全的记录必须记录在 `data/output/error_log.csv` 中。
+2. 验证：在保存前进行邮编（6位）和电话（正则）的合法性校验。
--- a/data/input/.gitkeep
+++ b/data/input/.gitkeep
--- a/data/output/.gitkeep
+++ b/data/output/.gitkeep
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+paddleocr
+paddlepaddle
+pandas
+openpyxl
+pydantic
+tqdm
--- a/src/main.py
+++ b/src/main.py
@@ -0,0 +1,75 @@
+import os
+import glob
+import pandas as pd
+from tqdm import tqdm
+from paddleocr import PaddleOCR
+from processor import extract_info, save_to_excel
+
+# Disable the network check to speed up startup. NOTE(review): this runs after `from paddleocr import PaddleOCR` - confirm the flag is not read at import time.
+os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
+
+
+def main():
+    # 初始化 PaddleOCR
+    ocr = PaddleOCR(use_textline_orientation=True, lang="ch")
+
+    input_dir = "data/input"
+    output_dir = "data/output"
+    output_excel = os.path.join(output_dir, "result.xlsx")
+    error_log = os.path.join(output_dir, "error_log.csv")
+
+    # 支持常见的图片格式
+    extensions = ("*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff")
+    image_paths = []
+    for ext in extensions:
+        image_paths.extend(glob.glob(os.path.join(input_dir, ext)))
+
+    if not image_paths:
+        print(f"错误: 在 {input_dir} 文件夹中未找到任何图片文件。")
+        return
+
+    all_records = []
+    errors = []
+
+    print(f"检测到 {len(image_paths)} 个待处理信封。开始提取...")
+
+    for img_path in tqdm(image_paths):
+        try:
+            # 1. 执行 OCR 识别 (使用 predict 替代 deprecated 的 ocr 方法)
+            result = ocr.predict(img_path)
+
+            # 2. 提取文字行 (适配 Paddlex OCRResult 结构)
+            ocr_texts = []
+            if result:
+                for res in result:
+                    # 获取识别出的文本列表
+                    if hasattr(res, "rec_texts"):
+                        ocr_texts.extend(res.rec_texts)
+                    elif isinstance(res, dict) and "rec_texts" in res:
+                        ocr_texts.extend(res["rec_texts"])
+
+            # 3. 结构化解析
+            if ocr_texts:
+                record = extract_info(ocr_texts)
+                all_records.append(record)
+            else:
+                errors.append(
+                    {"file": os.path.basename(img_path), "error": "未识别到任何文字"}
+                )
+
+        except Exception as e:
+            errors.append({"file": os.path.basename(img_path), "error": str(e)})
+
+    # 4. 保存最终结果到 Excel
+    if all_records:
+        save_to_excel(all_records, output_excel)
+        print(f"\n[成功] 已提取 {len(all_records)} 条数据，保存至: {output_excel}")
+
+    # 5. 记录失败项
+    if errors:
+        pd.DataFrame(errors).to_csv(error_log, index=False)
+        print(f"[警告] 有 {len(errors)} 张图片处理失败，详情请查看: {error_log}")
+
+
+if __name__ == "__main__":
+    main()
--- a/src/processor.py
+++ b/src/processor.py
@@ -0,0 +1,87 @@
+import re
+import pandas as pd
+from typing import List, Dict, Any
+from pydantic import BaseModel, Field
+
+
+class EnvelopeRecord(BaseModel):  # schema of one parsed envelope record; not referenced by the visible code
+    编号: str = ""  # serial number
+    邮编: str = ""  # postal code
+    地址: str = ""  # address
+    联系人_单位名: str = Field(default="", alias="联系人/单位名")  # contact / organisation name; alias equals the dict key and Excel column used in this module
+    电话: str = ""  # phone number
+
+
+def clean_text(text: str) -> str:
+    """清理OCR识别出的杂质字符"""
+    return text.strip().replace(" ", "")
+
+
+def extract_info(ocr_results: List[str]) -> Dict[str, str]:
+    """
+    从OCR结果列表中提取结构化信息。
+    """
+    data = {"编号": "", "邮编": "", "地址": "", "联系人/单位名": "", "电话": ""}
+
+    full_content = " ".join(ocr_results)
+
+    # 1. 提取邮编 (6位数字)
+    zip_match = re.search(r"\b(\d{6})\b", full_content)
+    if zip_match:
+        data["邮编"] = zip_match.group(1)
+
+    # 2. 提取电话 (11位手机号或带区号固话)
+    phone_match = re.search(r"(1[3-9]\d{9}|0\d{2,3}-\d{7,8})", full_content)
+    if phone_match:
+        data["电话"] = phone_match.group(0)
+
+    # 3. 提取联系人 (通常在电话前面，或者是独立的短行)
+    # 遍历每一行寻找包含电话的行
+    for line in ocr_results:
+        if data["电话"] and data["电话"] in line:
+            # 移除电话部分，剩下的可能是姓名
+            name_part = line.replace(data["电话"], "").strip()
+            # 进一步清洗姓名（移除符号）
+            name_part = re.sub(r"[^\w\u4e00-\u9fa5]", "", name_part)
+            if name_part:
+                data["联系人/单位名"] = name_part
+            break
+
+    # 如果还没找到联系人，尝试找不含数字的短行
+    if not data["联系人/单位名"]:
+        for line in ocr_results:
+            clean_line = re.sub(r"[^\w\u4e00-\u9fa5]", "", line)
+            if 2 <= len(clean_line) <= 10 and not re.search(r"\d", clean_line):
+                data["联系人/单位名"] = clean_line
+                break
+
+    # 4. 提取地址
+    address_match = re.search(
+        r"([^,，。\s]*(?:省|市|区|县|乡|镇|路|街|村|组|号)[^,，。\s]*)", full_content
+    )
+    if address_match:
+        data["地址"] = address_match.group(1)
+    else:
+        # 兜底：寻找较长的包含地名特征的行
+        for line in ocr_results:
+            if any(k in line for k in ["省", "市", "区", "县", "乡", "镇", "村"]):
+                data["地址"] = line.strip()
+                break
+
+    # 5. 提取编号 (长数字串)
+    # 排除邮编和电话后的最长数字串
+    long_numbers = re.findall(r"\b\d{10,20}\b", full_content)
+    for num in long_numbers:
+        if num != data["电话"]:
+            data["编号"] = num
+            break
+
+    return data
+
+
+def save_to_excel(records: List[Dict[str, Any]], output_path: str):
+    df = pd.DataFrame(records)
+    # 调整列顺序
+    cols = ["编号", "邮编", "地址", "联系人/单位名", "电话"]
+    df = df.reindex(columns=cols)
+    df.to_excel(output_path, index=False)