From fe5a346fddb0ab62b27de4a323645d8f3bfb10b0 Mon Sep 17 00:00:00 2001
From: empty
Date: Thu, 12 Feb 2026 14:20:33 +0800
Subject: [PATCH] feat: initial commit for post-ocr extraction pipeline

---
Reviewer notes (this area is ignored when the patch is applied):
- src/processor.py: the postal-code and serial-number patterns use digit
  lookarounds instead of \b, so digits glued to CJK text are matched.
- src/main.py: PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK is now set before
  paddleocr is imported.
- Code comments and docstrings were translated to English; runtime strings
  are unchanged.
- The blob ids on the src/*.py "index" lines predate these edits; regenerate
  the patch with git format-patch after applying if exact ids matter.

 .gitignore           | 10 +++++
 CLAUDE.md            | 18 +++++++++
 data/input/.gitkeep  |  0
 data/output/.gitkeep |  0
 requirements.txt     |  6 +++
 src/main.py          | 75 ++++++++++++++++++++++++++++++++++++++
 src/processor.py     | 87 ++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 196 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 CLAUDE.md
 create mode 100644 data/input/.gitkeep
 create mode 100644 data/output/.gitkeep
 create mode 100644 requirements.txt
 create mode 100644 src/main.py
 create mode 100644 src/processor.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..26e1e18
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+data/input/*
+!data/input/.gitkeep
+data/output/*
+!data/output/.gitkeep
+__pycache__/
+*.pyc
+.DS_Store
+.venv/
+venv/
+
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..b2cefe8
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,18 @@
+# Post-OCR Data Extraction Project
+
+## 项目愿景
+实现工厂环境下信封背面信息的自动化提取与结构化录入。
+
+## 技术栈
+- **OCR**: PaddleOCR (本地部署)
+- **数据处理**: Python, Pandas
+- **解析逻辑**: 正则表达式 + 语义校验
+
+## 目录结构
+- `data/input/`: 原始图片存放处
+- `data/output/`: 结果 Excel 及处理日志
+- `src/`: 源代码
+
+## 开发规范
+1. 错误处理:所有 OCR 失败或解析不完全的记录必须记录在 `data/output/error_log.csv` 中。
+2. 验证:在保存前进行邮编(6位)和电话(正则)的合法性校验。
diff --git a/data/input/.gitkeep b/data/input/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/data/output/.gitkeep b/data/output/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..28c8beb
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+paddleocr
+paddlepaddle
+pandas
+openpyxl
+pydantic
+tqdm
diff --git a/src/main.py b/src/main.py
new file mode 100644
index 0000000..283b4c5
--- /dev/null
+++ b/src/main.py
@@ -0,0 +1,75 @@
+import os
+import glob
+
+# Skip the online model-source check for a faster start-up. Set this before
+# importing paddleocr; the flag appears to be read at import time (verify).
+os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
+
+import pandas as pd
+from tqdm import tqdm
+from paddleocr import PaddleOCR
+from processor import extract_info, save_to_excel
+
+
+def main():
+    """Run OCR on every image in data/input and write the results to data/output."""
+    ocr = PaddleOCR(use_textline_orientation=True, lang="ch")
+
+    input_dir = "data/input"
+    output_dir = "data/output"
+    output_excel = os.path.join(output_dir, "result.xlsx")
+    error_log = os.path.join(output_dir, "error_log.csv")
+
+    # Common image formats that are picked up from the input directory
+    extensions = ("*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff")
+    image_paths = []
+    for ext in extensions:
+        image_paths.extend(glob.glob(os.path.join(input_dir, ext)))
+
+    if not image_paths:
+        print(f"错误: 在 {input_dir} 文件夹中未找到任何图片文件。")
+        return
+
+    all_records = []
+    errors = []
+
+    print(f"检测到 {len(image_paths)} 个待处理信封。开始提取...")
+
+    for img_path in tqdm(image_paths):
+        try:
+            # 1. Run OCR (predict replaces the deprecated ocr method)
+            result = ocr.predict(img_path)
+
+            # 2. Collect recognised text lines (attribute- or dict-style results)
+            ocr_texts = []
+            if result:
+                for res in result:
+                    if hasattr(res, "rec_texts"):
+                        ocr_texts.extend(res.rec_texts)
+                    elif isinstance(res, dict) and "rec_texts" in res:
+                        ocr_texts.extend(res["rec_texts"])
+
+            # 3. Structured parsing; an image with no text is logged as an error
+            if ocr_texts:
+                record = extract_info(ocr_texts)
+                all_records.append(record)
+            else:
+                errors.append(
+                    {"file": os.path.basename(img_path), "error": "未识别到任何文字"}
+                )
+        except Exception as e:
+            errors.append({"file": os.path.basename(img_path), "error": str(e)})
+
+    # 4. Save the extracted records to Excel
+    if all_records:
+        save_to_excel(all_records, output_excel)
+        print(f"\n[成功] 已提取 {len(all_records)} 条数据,保存至: {output_excel}")
+
+    # 5. Write the failed items to the error log
+    if errors:
+        pd.DataFrame(errors).to_csv(error_log, index=False)
+        print(f"[警告] 有 {len(errors)} 张图片处理失败,详情请查看: {error_log}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/processor.py b/src/processor.py
new file mode 100644
index 0000000..599af96
--- /dev/null
+++ b/src/processor.py
@@ -0,0 +1,87 @@
+import re
+import pandas as pd
+from typing import List, Dict, Any
+from pydantic import BaseModel, Field
+
+
+class EnvelopeRecord(BaseModel):
+    编号: str = ""
+    邮编: str = ""
+    地址: str = ""
+    联系人_单位名: str = Field(default="", alias="联系人/单位名")
+    电话: str = ""
+
+
+def clean_text(text: str) -> str:
+    """Strip surrounding whitespace and remove ASCII spaces from OCR text."""
+    return text.strip().replace(" ", "")
+
+
+def extract_info(ocr_results: List[str]) -> Dict[str, str]:
+    """Extract structured envelope fields from a list of OCR text lines.
+
+    Returns a dict with all five keys; fields that are not found stay "".
+    """
+    data = {"编号": "", "邮编": "", "地址": "", "联系人/单位名": "", "电话": ""}
+
+    full_content = " ".join(ocr_results)
+
+    # 1. Postal code: exactly six digits. Digit lookarounds are used instead of
+    #    \b because CJK characters count as \w, so "邮编100000" has no boundary.
+    zip_match = re.search(r"(?<!\d)(\d{6})(?!\d)", full_content)
+    if zip_match:
+        data["邮编"] = zip_match.group(1)
+
+    # 2. Phone: 11-digit mobile number or landline with area code
+    phone_match = re.search(r"(1[3-9]\d{9}|0\d{2,3}-\d{7,8})", full_content)
+    if phone_match:
+        data["电话"] = phone_match.group(0)
+
+    # 3. Contact: usually shares a line with the phone, so scan for that line
+    for line in ocr_results:
+        if data["电话"] and data["电话"] in line:
+            # Drop the phone number; what remains may be the name
+            name_part = line.replace(data["电话"], "").strip()
+            # Strip punctuation and other symbols from the candidate name
+            name_part = re.sub(r"[^\w\u4e00-\u9fa5]", "", name_part)
+            if name_part:
+                data["联系人/单位名"] = name_part
+                break
+
+    # Fallback: take the first short line (2-10 chars) that contains no digits
+    if not data["联系人/单位名"]:
+        for line in ocr_results:
+            clean_line = re.sub(r"[^\w\u4e00-\u9fa5]", "", line)
+            if 2 <= len(clean_line) <= 10 and not re.search(r"\d", clean_line):
+                data["联系人/单位名"] = clean_line
+                break
+
+    # 4. Address: first token containing an administrative/street marker
+    address_match = re.search(
+        r"([^,,。\s]*(?:省|市|区|县|乡|镇|路|街|村|组|号)[^,,。\s]*)", full_content
+    )
+    if address_match:
+        data["地址"] = address_match.group(1)
+    else:
+        # Fallback: first line that contains a place-name marker
+        for line in ocr_results:
+            if any(k in line for k in ["省", "市", "区", "县", "乡", "镇", "村"]):
+                data["地址"] = line.strip()
+                break
+
+    # 5. Serial number: first 10-20 digit run that is not the phone number
+    long_numbers = re.findall(r"(?<!\d)\d{10,20}(?!\d)", full_content)
+    for num in long_numbers:
+        if num != data["电话"]:
+            data["编号"] = num
+            break
+
+    return data
+
+
+def save_to_excel(records: List[Dict[str, Any]], output_path: str):
+    """Write the records to an Excel file with a fixed column order."""
+    df = pd.DataFrame(records)
+    cols = ["编号", "邮编", "地址", "联系人/单位名", "电话"]
+    df = df.reindex(columns=cols)
+    df.to_excel(output_path, index=False)