92 lines
2.5 KiB
Python
92 lines
2.5 KiB
Python
import requests
|
|
from bs4 import BeautifulSoup
|
|
import json
|
|
|
|
# Page to scrape: the SEO guide in the WeChat Mini Program developer docs.
url = "https://developers.weixin.qq.com/miniprogram/dev/framework/search/seo.html"

# HTTP headers sent with every request; the User-Agent is a desktop
# Chrome 121 / Windows 10 browser string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/121.0.0.0 Safari/537.36"
    ),
}
|
|
|
|
|
|
def fetch_html(url: str) -> str:
    """Download *url* and return the response body as decoded text.

    The request is sent with the module-level ``headers`` and a
    10-second timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``). Other ``requests``
            exceptions (connection errors, timeouts) propagate unchanged.
    """
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    # Decode using the encoding detected from the body itself instead of
    # the one derived from the HTTP headers.
    resp.encoding = resp.apparent_encoding
    return resp.text
|
|
|
|
|
|
# Tags whose text already contains the text of everything nested in them.
_CONTAINER_TAGS = ("table", "ul", "ol")
_HEADING_TAGS = ("h1", "h2", "h3", "h4")


def _is_nested_in_container(el, root):
    """Return True if *el* lies inside a table or list located below *root*."""
    for ancestor in el.parents:
        if ancestor is root:
            return False
        if ancestor.name in _CONTAINER_TAGS:
            return True
    return False


def _detect_code_lang(pre):
    """Return the language label of a ``<pre>`` element, or ``"text"``.

    Explicit ``lang`` / ``data-lang`` attributes win; otherwise a
    ``language-xxx`` / ``lang-xxx`` class on the ``<pre>`` or its first
    ``<code>`` child is used.
    """
    explicit = pre.get("lang") or pre.get("data-lang")
    if explicit:
        return explicit

    candidates = [pre]
    code_tag = pre.find("code")
    if code_tag is not None:
        candidates.append(code_tag)

    for tag in candidates:
        classes = tag.get("class") or []
        if isinstance(classes, str):
            classes = classes.split()
        for cls in classes:
            for prefix in ("language-", "lang-"):
                if cls.startswith(prefix) and len(cls) > len(prefix):
                    return cls[len(prefix):]
    return "text"


def extract_structured_content(html: str) -> list:
    """Convert an HTML document into a flat list of content records.

    The main content container is located first (falling back to
    ``<body>``, then to the whole document), and its descendants are
    walked in document order. Each recognised element yields one dict:

    * ``{"type": "heading", "level": 1-4, "content": str}`` for h1-h4
    * ``{"type": "paragraph", "content": str}`` for non-empty ``<p>``
    * ``{"type": "code", "lang": str, "content": str}`` for ``<pre>``
    * ``{"type": "table", "rows": [[str, ...], ...]}`` for ``<table>``
    * ``{"type": "list", "ordered": bool, "items": [str, ...]}`` for
      non-empty ``<ul>`` / ``<ol>``

    Paragraphs, lists and tables nested inside a table or list are not
    emitted separately, because their text is already part of the
    enclosing record.

    Args:
        html: Markup of the page.

    Returns:
        The list of records in document order.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Candidate containers for the article body.
    main = soup.select_one("#docContent, .content, #page-content, .page-content")
    if main is None:
        # Last-resort fallback. html.parser does not synthesise <body>,
        # so fragments without one fall back to the whole document.
        main = soup.body if soup.body is not None else soup

    data = []

    for el in main.descendants:
        name = el.name  # None for text nodes and comments

        if name in _HEADING_TAGS:
            data.append({
                "type": "heading",
                "level": int(name[-1]),
                "content": el.get_text(strip=True),
            })

        elif name == "pre":
            # No separator: <pre> text already carries its own line
            # breaks, and highlighted code is split into many <span>s
            # that must not each end up on a separate line.
            code = el.get_text()
            if code:
                data.append({
                    "type": "code",
                    "lang": _detect_code_lang(el),
                    "content": code,
                })

        elif name in ("p", "table", "ul", "ol") and _is_nested_in_container(el, main):
            # Already represented in the text of the enclosing table/list.
            continue

        elif name == "p":
            txt = el.get_text(" ", strip=True)
            if txt:
                data.append({
                    "type": "paragraph",
                    "content": txt,
                })

        elif name == "table":
            rows = []
            for tr in el.find_all("tr"):
                # Skip rows that belong to a table nested in this one.
                if tr.find_parent("table") is not el:
                    continue
                cols = [
                    cell.get_text(" ", strip=True)
                    for cell in tr.find_all(["th", "td"])
                    if cell.find_parent("tr") is tr
                ]
                rows.append(cols)

            data.append({
                "type": "table",
                "rows": rows,
            })

        elif name in ("ul", "ol"):
            # Only items whose nearest list ancestor is this list; items
            # of nested lists are part of their parent item's text.
            items = [
                li.get_text(" ", strip=True)
                for li in el.find_all("li")
                if li.find_parent(["ul", "ol"]) is el
            ]
            if items:
                data.append({
                    "type": "list",
                    "ordered": name == "ol",
                    "items": items,
                })

    return data
|
|
|
|
|
|
if __name__ == "__main__":
    # Download the page and convert it into structured records.
    html = fetch_html(url)
    structured = extract_structured_content(html)

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open("wechat_dev_seo_structured.json", "w", encoding="utf-8") as f:
        json.dump(structured, f, ensure_ascii=False, indent=2)

    print("结构化内容已写入 wechat_dev_seo_structured.json")
|