feat: initial commit for post-ocr extraction pipeline
This commit is contained in:
10
.gitignore
vendored
Normal file
10
.gitignore
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
data/input/*
|
||||
!data/input/.gitkeep
|
||||
data/output/*
|
||||
!data/output/.gitkeep
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.DS_Store
|
||||
.venv/
|
||||
venv/
|
||||
|
||||
18
CLAUDE.md
Normal file
18
CLAUDE.md
Normal file
@@ -0,0 +1,18 @@
|
||||
# Post-OCR Data Extraction Project
|
||||
|
||||
## 项目愿景
|
||||
实现工厂环境下信封背面信息的自动化提取与结构化录入。
|
||||
|
||||
## 技术栈
|
||||
- **OCR**: PaddleOCR (本地部署)
|
||||
- **数据处理**: Python, Pandas
|
||||
- **解析逻辑**: 正则表达式 + 语义校验
|
||||
|
||||
## 目录结构
|
||||
- `data/input/`: 原始图片存放处
|
||||
- `data/output/`: 结果 Excel 及处理日志
|
||||
- `src/`: 源代码
|
||||
|
||||
## 开发规范
|
||||
1. 错误处理:所有 OCR 失败或解析不完全的记录必须记录在 `data/output/error_log.csv` 中。
|
||||
2. 验证:在保存前进行邮编(6位)和电话(正则)的合法性校验。
|
||||
0
data/input/.gitkeep
Normal file
0
data/input/.gitkeep
Normal file
0
data/output/.gitkeep
Normal file
0
data/output/.gitkeep
Normal file
6
requirements.txt
Normal file
6
requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
paddleocr
|
||||
paddlepaddle
|
||||
pandas
|
||||
openpyxl
|
||||
pydantic
|
||||
tqdm
|
||||
75
src/main.py
Normal file
75
src/main.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import os
|
||||
import glob
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from paddleocr import PaddleOCR
|
||||
from processor import extract_info, save_to_excel
|
||||
|
||||
# Disable PaddleX's online model-source check so OCR startup does not wait on
# the network (speeds up initialization in offline factory environments).
os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"
|
||||
|
||||
|
||||
def _collect_image_paths(input_dir):
    """Return a sorted, de-duplicated list of image files under *input_dir*.

    Sorting makes the processing order deterministic across runs; the set
    guards against the same path being collected twice.
    """
    extensions = ("*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff")
    found = []
    for ext in extensions:
        found.extend(glob.glob(os.path.join(input_dir, ext)))
    return sorted(set(found))


def _extract_texts(result):
    """Pull the recognized text lines out of a PaddleOCR ``predict()`` result.

    Handles both the object form (``rec_texts`` attribute) and the dict form
    (``"rec_texts"`` key) of the PaddleX OCRResult structure.
    """
    texts = []
    if result:
        for res in result:
            if hasattr(res, "rec_texts"):
                texts.extend(res.rec_texts)
            elif isinstance(res, dict) and "rec_texts" in res:
                texts.extend(res["rec_texts"])
    return texts


def main():
    """Run the envelope OCR pipeline.

    Reads images from ``data/input``, extracts structured fields per image,
    writes successful records to ``data/output/result.xlsx`` and failures to
    ``data/output/error_log.csv`` (per the project's error-handling rule).
    """
    # Initialize PaddleOCR with text-line orientation detection for Chinese.
    ocr = PaddleOCR(use_textline_orientation=True, lang="ch")

    input_dir = "data/input"
    output_dir = "data/output"
    output_excel = os.path.join(output_dir, "result.xlsx")
    error_log = os.path.join(output_dir, "error_log.csv")

    # Robustness fix: ensure the output directory exists before any write —
    # the repo ships .gitkeep files, but a runtime deployment may not.
    os.makedirs(output_dir, exist_ok=True)

    image_paths = _collect_image_paths(input_dir)
    if not image_paths:
        print(f"错误: 在 {input_dir} 文件夹中未找到任何图片文件。")
        return

    all_records = []
    errors = []

    print(f"检测到 {len(image_paths)} 个待处理信封。开始提取...")

    for img_path in tqdm(image_paths):
        try:
            # predict() replaces the deprecated ocr() entry point.
            result = ocr.predict(img_path)
            ocr_texts = _extract_texts(result)
            if ocr_texts:
                all_records.append(extract_info(ocr_texts))
            else:
                errors.append(
                    {"file": os.path.basename(img_path), "error": "未识别到任何文字"}
                )
        except Exception as e:
            # Best-effort batch: record the failure and keep processing.
            errors.append({"file": os.path.basename(img_path), "error": str(e)})

    # Persist successful extractions to Excel.
    if all_records:
        save_to_excel(all_records, output_excel)
        print(f"\n[成功] 已提取 {len(all_records)} 条数据,保存至: {output_excel}")

    # Persist failures so they can be retried or reviewed manually.
    if errors:
        pd.DataFrame(errors).to_csv(error_log, index=False)
        print(f"[警告] 有 {len(errors)} 张图片处理失败,详情请查看: {error_log}")
|
||||
|
||||
|
||||
# Script entry point: run the pipeline only when executed directly.
if __name__ == "__main__":
    main()
|
||||
87
src/processor.py
Normal file
87
src/processor.py
Normal file
@@ -0,0 +1,87 @@
|
||||
import re
|
||||
import pandas as pd
|
||||
from typing import List, Dict, Any
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class EnvelopeRecord(BaseModel):
    """Schema for one structured envelope record.

    Field names are the Chinese column headers used in the output Excel.
    NOTE(review): extract_info() currently builds and returns a plain dict
    rather than instantiating this model — confirm whether pydantic
    validation was intended to be applied before saving.
    """

    编号: str = ""  # tracking / serial number (long digit string)
    邮编: str = ""  # 6-digit postal code
    地址: str = ""  # address line
    # "/" is not a valid identifier character, so the dict key
    # "联系人/单位名" is mapped onto this field via an alias.
    联系人_单位名: str = Field(default="", alias="联系人/单位名")
    电话: str = ""  # mobile or area-code landline number
|
||||
|
||||
|
||||
def clean_text(text: str) -> str:
    """Normalize an OCR line: trim surrounding whitespace, drop inner spaces."""
    trimmed = text.strip()
    return "".join(trimmed.split(" "))
|
||||
|
||||
|
||||
def extract_info(ocr_results: List[str]) -> Dict[str, str]:
    """Extract structured envelope fields from OCR'd text lines.

    Args:
        ocr_results: Recognized text lines for a single envelope image.

    Returns:
        Dict with keys 编号 (tracking number), 邮编 (postal code),
        地址 (address), 联系人/单位名 (contact/organization), 电话 (phone).
        Fields that cannot be located are returned as empty strings.
    """
    data = {"编号": "", "邮编": "", "地址": "", "联系人/单位名": "", "电话": ""}

    full_content = " ".join(ocr_results)

    # 1. Postal code: exactly 6 digits.  Digit lookarounds replace \b:
    # CJK characters count as word characters in Python's re, so \b never
    # matches between Chinese text and a digit (e.g. "邮编100080" had no
    # boundary and the old \b(\d{6})\b pattern silently failed).
    zip_match = re.search(r"(?<!\d)(\d{6})(?!\d)", full_content)
    if zip_match:
        data["邮编"] = zip_match.group(1)

    # 2. Phone: 11-digit mobile or area-code landline.  Lookarounds also stop
    # a mobile-looking digit run *inside* a longer tracking number from being
    # misreported as the phone number.
    phone_match = re.search(
        r"(?<!\d)(1[3-9]\d{9}|0\d{2,3}-\d{7,8})(?!\d)", full_content
    )
    if phone_match:
        data["电话"] = phone_match.group(1)

    # 3. Contact: usually shares a line with the phone number.
    for line in ocr_results:
        if data["电话"] and data["电话"] in line:
            # Whatever remains after removing the phone is likely the name.
            name_part = line.replace(data["电话"], "").strip()
            # Strip punctuation/symbols; keep word characters and CJK.
            name_part = re.sub(r"[^\w\u4e00-\u9fa5]", "", name_part)
            if name_part:
                data["联系人/单位名"] = name_part
                break

    # Fallback: a short digit-free line is likely a personal/company name.
    if not data["联系人/单位名"]:
        for line in ocr_results:
            clean_line = re.sub(r"[^\w\u4e00-\u9fa5]", "", line)
            if 2 <= len(clean_line) <= 10 and not re.search(r"\d", clean_line):
                data["联系人/单位名"] = clean_line
                break

    # 4. Address: a run of non-punctuation characters containing a
    # place-name suffix (province/city/district/road/... keyword).
    address_match = re.search(
        r"([^,,。\s]*(?:省|市|区|县|乡|镇|路|街|村|组|号)[^,,。\s]*)", full_content
    )
    if address_match:
        data["地址"] = address_match.group(1)
    else:
        # Fallback: first line containing a place-name keyword.
        for line in ocr_results:
            if any(k in line for k in ["省", "市", "区", "县", "乡", "镇", "村"]):
                data["地址"] = line.strip()
                break

    # 5. Tracking number: first standalone 10-20 digit run that is not the
    # phone number (lookarounds again, for the same CJK \b pitfall).
    long_numbers = re.findall(r"(?<!\d)\d{10,20}(?!\d)", full_content)
    for num in long_numbers:
        if num != data["电话"]:
            data["编号"] = num
            break

    return data
|
||||
|
||||
|
||||
def save_to_excel(records: List[Dict[str, Any]], output_path: str):
    """Write extracted records to an Excel workbook at *output_path*.

    Columns are emitted in a fixed order; records missing a key get NaN.
    """
    column_order = ["编号", "邮编", "地址", "联系人/单位名", "电话"]
    frame = pd.DataFrame(records).reindex(columns=column_order)
    frame.to_excel(output_path, index=False)
|
||||
Reference in New Issue
Block a user