feat: 添加 LLM Content Extractor 浏览器扩展

- 支持框选区域提取网页内容
- 支持整页内容提取
- 输出格式:Markdown/JSON/XML
- 自动复制到剪贴板
This commit is contained in:
empty
2025-12-03 16:44:03 +08:00
commit b1d9f2c518
13 changed files with 2067 additions and 0 deletions

View File

@@ -0,0 +1,94 @@
# LLM Content Extractor
一个 Chrome 浏览器扩展,用于截取网页内容并转换为大模型友好的格式。
## 功能特性
- 🎯 **区域框选提取** - 拖拽鼠标框选想要提取的区域
- 📄 **整页提取** - 一键提取整个页面内容
- 📝 **多种输出格式** - 支持 Markdown、JSON、XML
- 📋 **自动复制** - 提取后自动复制到剪贴板
- 💾 **历史记录** - 可随时复制上次提取的内容
## 支持提取的内容类型
- 标题 (h1-h6)
- 段落
- 代码块(保留语言标识)
- 有序/无序列表
- 表格
- 图片(保留 src 和 alt)
- 链接(保留文本和 href)
## 安装方法
1. 打开 Chrome 浏览器,访问 `chrome://extensions/`
2. 开启右上角的 **开发者模式**
3. 点击 **加载已解压的扩展程序**
4. 选择 `browser-extension` 文件夹
## 使用方法
1. 点击浏览器工具栏中的扩展图标
2. 选择输出格式(Markdown/JSON/XML)
3. 点击 **框选区域提取** 或 **提取整页内容**
4. 如果是框选模式,拖拽鼠标选择区域
5. 提取完成后内容自动复制到剪贴板
## 快捷操作
- **ESC** - 取消框选模式
## 输出示例
### Markdown 格式
```markdown
# 标题
这是一段文字内容。
- 列表项 1
- 列表项 2
| 表头1 | 表头2 |
| --- | --- |
| 数据1 | 数据2 |
```
### JSON 格式
```json
[
{
"type": "heading",
"level": 1,
"content": "标题"
},
{
"type": "paragraph",
"content": "这是一段文字内容。"
}
]
```
## 注意事项
- 首次使用需要刷新页面才能生效
- 某些页面可能因安全策略限制而无法使用
- 图标文件需要自行添加(16x16, 48x48, 128x128 PNG)
## 开发
```bash
# 项目结构
browser-extension/
├── manifest.json # 扩展配置
├── popup.html # 弹出窗口
├── popup.js # 弹出窗口逻辑
├── content.js # 内容脚本
├── content.css # 内容脚本样式
└── icons/ # 图标文件夹
```
## License
MIT

View File

@@ -0,0 +1,76 @@
/* Full-viewport dimming layer with a crosshair cursor.
   Its z-index is the lowest of the extension's layers, so the selection
   rectangle (999999) and the hint/notification (1000000) paint above it. */
.llm-extractor-overlay {
position: fixed;
top: 0;
left: 0;
width: 100vw;
height: 100vh;
background: rgba(0, 0, 0, 0.3);
cursor: crosshair;
z-index: 999998;
}
/* Dashed rubber-band rectangle, positioned in viewport coordinates.
   pointer-events: none keeps the rectangle itself from becoming the target
   of mouse events. */
.llm-extractor-selection {
position: fixed;
border: 2px dashed #667eea;
background: rgba(102, 126, 234, 0.1);
z-index: 999999;
pointer-events: none;
}
/* Instruction banner pinned near the top of the viewport.
   left: 50% plus translateX(-50%) centres it horizontally; it enters with
   the slideDown animation. */
.llm-extractor-hint {
position: fixed;
top: 20px;
left: 50%;
transform: translateX(-50%);
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 12px 24px;
border-radius: 8px;
font-size: 14px;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
box-shadow: 0 4px 20px rgba(0, 0, 0, 0.2);
z-index: 1000000;
animation: slideDown 0.3s ease-out;
}
/* Toast shown in the bottom-right corner; it enters with the slideUp
   animation. */
.llm-extractor-notification {
position: fixed;
bottom: 20px;
right: 20px;
background: #333;
color: white;
padding: 12px 20px;
border-radius: 8px;
font-size: 14px;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
box-shadow: 0 4px 20px rgba(0, 0, 0, 0.2);
z-index: 1000000;
animation: slideUp 0.3s ease-out;
}
/* Exit state: fades the toast to transparent over 0.3s. */
.llm-extractor-notification.fade-out {
opacity: 0;
transition: opacity 0.3s ease-out;
}
/* Entry animation for the hint banner: slides down 20px while fading in.
   translateX(-50%) is repeated in both frames because the animated
   transform replaces the element's own centring transform. */
@keyframes slideDown {
from {
transform: translateX(-50%) translateY(-20px);
opacity: 0;
}
to {
transform: translateX(-50%) translateY(0);
opacity: 1;
}
}
/* Entry animation for the notification toast: slides up 20px while
   fading in. */
@keyframes slideUp {
from {
transform: translateY(20px);
opacity: 0;
}
to {
transform: translateY(0);
opacity: 1;
}
}

View File

@@ -0,0 +1,478 @@
// Shared state for the drag-to-select workflow.
let isSelecting = false; // true while selection mode is active
let selectionBox = null; // rubber-band rectangle element during a drag
let startX; // viewport X of the mousedown that started the drag
let startY; // viewport Y of the mousedown that started the drag
let overlay = null; // dimming layer element while selection mode is active
let currentFormat = 'markdown'; // output format requested by the popup
// Handle commands sent by the popup.
//
// Returning `true` from the listener tells the runtime that `sendResponse`
// will be called asynchronously; it is therefore returned only for the
// full-page extraction, whose reply waits for the clipboard write. Every
// other path answers synchronously (or not at all) and returns `false` so the
// message channel is released immediately.
chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
  switch (request.action) {
    case 'startSelection': {
      currentFormat = request.format || 'markdown';
      startSelectionMode();
      sendResponse({ success: true });
      return false;
    }
    case 'extractFullPage': {
      currentFormat = request.format || 'markdown';
      const formatted = formatContent(extractContent(document.body), currentFormat);
      saveToStorage(formatted);
      // Reply only once the copy has settled so the popup does not claim
      // success for a write that failed. A resolved value of `false` is
      // treated as failure; any other resolved value counts as success.
      Promise.resolve(copyToClipboard(formatted)).then(
        (copied) => sendResponse({ success: copied !== false }),
        (err) => sendResponse({ success: false, error: String(err) }),
      );
      return true;
    }
    default:
      // Not a message for this script.
      return false;
  }
});
/**
 * Enter selection mode: show the dimming overlay and the hint banner, and
 * start listening for the drag (mousedown) and cancel (keydown) events.
 *
 * If selection mode is already active, the previous session is torn down
 * first. Otherwise a second request would stack another overlay and hint on
 * the page and overwrite the only reference to the first overlay.
 */
function startSelectionMode() {
  if (isSelecting || overlay) {
    cleanup();
  }
  isSelecting = true;

  // Dimming layer.
  overlay = document.createElement('div');
  overlay.className = 'llm-extractor-overlay';
  document.body.appendChild(overlay);

  // Instruction banner; it is located again by class name during cleanup.
  const hint = document.createElement('div');
  hint.className = 'llm-extractor-hint';
  hint.textContent = '拖拽鼠标框选要提取的区域ESC 取消';
  document.body.appendChild(hint);

  document.addEventListener('mousedown', onMouseDown);
  document.addEventListener('keydown', onKeyDown);
}
/**
 * Begin a drag: remember the anchor point and create the rubber-band box.
 *
 * @param {MouseEvent} e - The mousedown event.
 */
function onMouseDown(e) {
  if (!isSelecting) return;
  // Only the primary button starts a selection.
  if (e.button !== 0) return;
  // Suppress the browser's default action for the press (e.g. starting a
  // text selection) while dragging.
  e.preventDefault();

  // A previous drag may have ended without a mouseup reaching this document;
  // drop its box so it is not left behind on the page.
  if (selectionBox) {
    selectionBox.remove();
  }

  startX = e.clientX;
  startY = e.clientY;

  selectionBox = document.createElement('div');
  selectionBox.className = 'llm-extractor-selection';
  selectionBox.style.left = `${startX}px`;
  selectionBox.style.top = `${startY}px`;
  document.body.appendChild(selectionBox);

  document.addEventListener('mousemove', onMouseMove);
  document.addEventListener('mouseup', onMouseUp);
}
/**
 * Resize the rubber-band box so it spans from the drag anchor to the pointer.
 *
 * @param {MouseEvent} e - The mousemove event.
 */
function onMouseMove(e) {
  if (!selectionBox) return;

  const { clientX: pointerX, clientY: pointerY } = e;
  const boxStyle = selectionBox.style;

  // The anchor may be on either side of the pointer, so the origin is the
  // smaller coordinate and the size is the absolute distance.
  boxStyle.left = `${Math.min(startX, pointerX)}px`;
  boxStyle.top = `${Math.min(startY, pointerY)}px`;
  boxStyle.width = `${Math.abs(pointerX - startX)}px`;
  boxStyle.height = `${Math.abs(pointerY - startY)}px`;
}
/**
 * Finish a drag: extract whatever lies under the selected rectangle, copy it
 * to the clipboard, store it, and tell the user.
 *
 * @param {MouseEvent} e - The mouseup event (unused).
 */
function onMouseUp(e) {
  if (!selectionBox) return;

  // Snapshot the geometry before the box is removed.
  const rect = selectionBox.getBoundingClientRect();

  // Tear the selection UI down BEFORE querying the page:
  //  - the overlay, hint and box would otherwise still be in the document
  //    while elements are being matched against the rectangle;
  //  - if extraction throws, the page is not left covered by the overlay.
  // cleanup() also detaches the mousemove/mouseup listeners.
  cleanup();

  const elements = getElementsInRect(rect);
  if (elements.length === 0) {
    showNotification('❌ 未选中任何内容');
    return;
  }

  const formatted = formatContent(extractFromElements(elements), currentFormat);
  copyToClipboard(formatted);
  saveToStorage(formatted);
  showNotification('✅ 内容已提取并复制到剪贴板');
}
/**
 * Cancel selection mode when Escape is pressed; other keys are ignored.
 *
 * @param {KeyboardEvent} e - The keydown event.
 */
function onKeyDown(e) {
  const pressedEscape = e.key === 'Escape';
  if (!pressedEscape) return;
  cleanup();
}
/**
 * Leave selection mode: remove the rubber-band box, the overlay and the hint
 * from the page, and detach every selection-related event listener.
 */
function cleanup() {
  isSelecting = false;

  if (selectionBox) {
    selectionBox.remove();
    selectionBox = null;
  }

  if (overlay) {
    overlay.remove();
    overlay = null;
  }

  // No reference to the hint is kept, so it is looked up by class name.
  document.querySelector('.llm-extractor-hint')?.remove();

  const boundHandlers = [
    ['mousedown', onMouseDown],
    ['mousemove', onMouseMove],
    ['mouseup', onMouseUp],
    ['keydown', onKeyDown],
  ];
  for (const [eventType, handler] of boundHandlers) {
    document.removeEventListener(eventType, handler);
  }
}
/**
 * Find the smallest element that contains everything the rectangle touches.
 *
 * Elements are matched when their bounding box overlaps `rect` and they are
 * visible. Three kinds of match are then ignored:
 *  - the extension's own UI (class names starting with `llm-extractor-`),
 *    e.g. the full-viewport overlay, which overlaps every rectangle;
 *  - elements with a zero-size box;
 *  - matched elements that contain another matched element. A container's
 *    box generally encloses its children, so keeping containers would pull
 *    the common ancestor up to a top-level wrapper regardless of the
 *    selection.
 *
 * @param {{left: number, top: number, right: number, bottom: number}} rect
 *   Selection rectangle in viewport coordinates.
 * @returns {Element[]} A one-element array holding the common ancestor of the
 *   deepest matches, or an empty array when nothing matched.
 */
function getElementsInRect(rect) {
  const isExtractorUi = (el) =>
    typeof el.className === 'string' &&
    el.className.split(/\s+/).some((name) => name.startsWith('llm-extractor-'));

  const matched = new Set();
  document.body.querySelectorAll('*').forEach((el) => {
    if (isExtractorUi(el)) return;
    const elRect = el.getBoundingClientRect();
    if (elRect.width === 0 && elRect.height === 0) return;
    if (isRectOverlap(rect, elRect) && isVisibleElement(el)) {
      matched.add(el);
    }
  });

  if (matched.size === 0) return [];

  // Collect every ancestor of a matched element. Once an ancestor is already
  // recorded, the rest of that chain was recorded by an earlier walk.
  const containers = new Set();
  for (const el of matched) {
    for (let parent = el.parentElement; parent; parent = parent.parentElement) {
      if (containers.has(parent)) break;
      containers.add(parent);
    }
  }

  const deepest = [...matched].filter((el) => !containers.has(el));
  return [findCommonAncestor(deepest)];
}
/**
 * Report whether two axis-aligned rectangles intersect. Rectangles that
 * merely share an edge or a corner count as overlapping.
 *
 * @param {{left: number, top: number, right: number, bottom: number}} rect1
 * @param {{left: number, top: number, right: number, bottom: number}} rect2
 * @returns {boolean} False only when one rectangle lies strictly to one side
 *   of the other.
 */
function isRectOverlap(rect1, rect2) {
  if (rect1.right < rect2.left) return false; // rect1 entirely left of rect2
  if (rect1.left > rect2.right) return false; // rect1 entirely right of rect2
  if (rect1.bottom < rect2.top) return false; // rect1 entirely above rect2
  if (rect1.top > rect2.bottom) return false; // rect1 entirely below rect2
  return true;
}
/**
 * Check an element's own computed style for the three common ways of hiding
 * it: `display: none`, `visibility: hidden`, or an opacity of exactly '0'.
 *
 * @param {Element} el - Element to inspect.
 * @returns {boolean} True when none of those three apply.
 */
function isVisibleElement(el) {
  const { display, visibility, opacity } = window.getComputedStyle(el);
  if (display === 'none') return false;
  if (visibility === 'hidden') return false;
  return opacity !== '0';
}
/**
 * Fold a list of elements into their shared ancestor by combining them
 * pairwise with `findAncestor`.
 *
 * @param {Element[]} elements - Elements to combine.
 * @returns {Element|undefined} The sole element when there is only one, the
 *   pairwise-folded ancestor otherwise, or `undefined` for an empty list.
 */
function findCommonAncestor(elements) {
  const [first, ...others] = elements;
  return others.reduce((shared, el) => findAncestor(shared, el), first);
}
/**
 * Find the nearest element that is an ancestor-or-self of both arguments.
 *
 * @param {Element} el1 - First element.
 * @param {Element} el2 - Second element.
 * @returns {Element} The first node on `el2`'s parent chain that also lies on
 *   `el1`'s parent chain, or `document.body` when the chains never meet.
 */
function findAncestor(el1, el2) {
  // Record el1 and all of its ancestors for constant-time membership checks.
  const lineage = new Set();
  for (let current = el1; current; current = current.parentElement) {
    lineage.add(current);
  }

  // Walk up from el2; the first node already in the lineage is the answer.
  for (let current = el2; current; current = current.parentElement) {
    if (lineage.has(current)) return current;
  }

  return document.body;
}
/**
 * Extract content from each element in turn and concatenate the results in
 * order.
 *
 * @param {Element[]} elements - Roots to extract from.
 * @returns {object[]} Every content item produced by `extractContent`.
 */
function extractFromElements(elements) {
  const collected = [];
  elements.forEach((element) => {
    for (const item of extractContent(element)) {
      collected.push(item);
    }
  });
  return collected;
}
/**
 * Walk the element tree under `root` and turn recognised elements into a flat
 * list of content items, in document order.
 *
 * Item shapes:
 *  - `{ type: 'heading', level, content }` for h1-h6
 *  - `{ type: 'paragraph', content }` for non-empty <p>
 *  - `{ type: 'code', language, content }` for <pre>, or <code> outside <pre>
 *  - `{ type: 'list', ordered, items }` for <ul>/<ol> (direct <li> children)
 *  - `{ type: 'table', rows }` for <table> (one string array per <tr>)
 *  - `{ type: 'image', src, alt }` for <img>
 *  - `{ type: 'link', text, href }` for <a> with text
 *
 * A recognised element is emitted as a whole and its descendants are not
 * visited again; any other element is only descended into. script, style,
 * noscript, iframe and svg subtrees are skipped. When nothing was recognised,
 * the root's trimmed text is returned as a single paragraph.
 *
 * @param {Element} root - Subtree to extract from.
 * @returns {object[]} The extracted items (possibly empty).
 */
function extractContent(root) {
  const SKIPPED_TAGS = ['script', 'style', 'noscript', 'iframe', 'svg'];
  const HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'];
  const content = [];

  // Read a language hint from a `language-xxx` / `lang-xxx` class token or a
  // `data-lang` attribute. Characters such as `+`, `#`, `.` and `-` are kept
  // so identifiers like "c++" or "objective-c" are not truncated.
  function languageOf(node) {
    if (!node) return '';
    const classNames = typeof node.className === 'string' ? node.className : '';
    return (
      classNames.match(/(?:^|\s)(?:language|lang)-([\w+#.-]+)/)?.[1] ||
      node.getAttribute('data-lang') ||
      ''
    );
  }

  function processElement(el) {
    const tagName = el.tagName?.toLowerCase();

    if (SKIPPED_TAGS.includes(tagName)) {
      return;
    }

    if (HEADING_TAGS.includes(tagName)) {
      content.push({
        type: 'heading',
        level: Number.parseInt(tagName[1], 10),
        content: el.textContent.trim(),
      });
      return;
    }

    if (tagName === 'p') {
      const text = el.textContent.trim();
      if (text) {
        content.push({
          type: 'paragraph',
          content: text,
        });
        return;
      }
      // A paragraph without text may still wrap an image; fall through to the
      // child walk at the bottom instead of discarding it.
    }

    if (tagName === 'pre' || tagName === 'code') {
      // A <code> nested in a <pre> is covered by the <pre> itself.
      if (tagName === 'pre' || !el.closest('pre')) {
        // The usual markup is <pre><code class="language-x">, so when the
        // <pre> carries no hint, look at its inner <code>.
        const innerCode = tagName === 'pre' ? el.querySelector('code') : null;
        content.push({
          type: 'code',
          language: languageOf(el) || languageOf(innerCode),
          content: el.textContent,
        });
      }
      return;
    }

    if (tagName === 'ul' || tagName === 'ol') {
      const items = Array.from(el.querySelectorAll(':scope > li'))
        .map((li) => li.textContent.trim())
        .filter(Boolean);
      if (items.length) {
        content.push({
          type: 'list',
          ordered: tagName === 'ol',
          items,
        });
      }
      return;
    }

    if (tagName === 'table') {
      const rows = Array.from(el.querySelectorAll('tr')).map((tr) =>
        Array.from(tr.querySelectorAll('th, td')).map((cell) => cell.textContent.trim()),
      );
      if (rows.length) {
        content.push({
          type: 'table',
          rows,
        });
      }
      return;
    }

    if (tagName === 'img') {
      content.push({
        type: 'image',
        src: el.src,
        alt: el.alt || '',
      });
      return;
    }

    if (tagName === 'a') {
      const text = el.textContent.trim();
      if (text) {
        content.push({
          type: 'link',
          text,
          href: el.href,
        });
      }
      return;
    }

    // Generic container: descend into its element children.
    Array.from(el.children).forEach((child) => processElement(child));
  }

  processElement(root);

  // Nothing structured was found: fall back to the root's plain text.
  if (content.length === 0) {
    const text = root.textContent.trim();
    if (text) {
      content.push({
        type: 'paragraph',
        content: text,
      });
    }
  }

  return content;
}
/**
 * Serialise extracted content in the requested output format.
 *
 * @param {object[]} content - Items produced by the extractor.
 * @param {string} format - 'json' or 'xml'; 'markdown' and any other value
 *   produce Markdown.
 * @returns {string} The serialised text.
 */
function formatContent(content, format) {
  if (format === 'json') {
    return JSON.stringify(content, null, 2);
  }
  if (format === 'xml') {
    return toXML(content);
  }
  // 'markdown' and every unrecognised value share the Markdown output.
  return toMarkdown(content);
}
/**
 * Render extracted content items as Markdown. Items are separated by a blank
 * line; unknown item types render as an empty string.
 *
 * Two kinds of source text would otherwise corrupt the output and are
 * handled explicitly:
 *  - table cells containing `|` or line breaks (the pipe is backslash-escaped
 *    and line breaks are collapsed to a space, since a table row must stay on
 *    one line);
 *  - code containing backtick runs (the fence is made one backtick longer
 *    than the longest run so the code cannot close it early).
 *
 * @param {object[]} content - Items produced by the extractor.
 * @returns {string} Markdown text.
 */
function toMarkdown(content) {
  const escapeCell = (cell) =>
    String(cell).replace(/\|/g, '\\|').replace(/\s*\n\s*/g, ' ');

  const renderRow = (row) => `| ${row.map(escapeCell).join(' | ')} |`;

  const fenceFor = (code) => {
    let longestRun = 0;
    for (const run of code.match(/`+/g) ?? []) {
      longestRun = Math.max(longestRun, run.length);
    }
    return '`'.repeat(Math.max(3, longestRun + 1));
  };

  return content
    .map((item) => {
      switch (item.type) {
        case 'heading':
          return `${'#'.repeat(item.level)} ${item.content}\n`;
        case 'paragraph':
          return `${item.content}\n`;
        case 'code': {
          const code = String(item.content);
          const fence = fenceFor(code);
          return `${fence}${item.language || ''}\n${code}\n${fence}\n`;
        }
        case 'list': {
          const lines = item.items.map((text, i) => {
            const prefix = item.ordered ? `${i + 1}. ` : '- ';
            return prefix + text;
          });
          return `${lines.join('\n')}\n`;
        }
        case 'table': {
          if (item.rows.length === 0) return '';
          const [headerRow, ...bodyRows] = item.rows;
          const header = renderRow(headerRow);
          const separator = `| ${headerRow.map(() => '---').join(' | ')} |`;
          const body = bodyRows.map(renderRow).join('\n');
          // `body` is empty for a header-only table; drop it rather than
          // emitting a trailing blank row.
          return `${[header, separator, body].filter(Boolean).join('\n')}\n`;
        }
        case 'image':
          return `![${item.alt}](${item.src})\n`;
        case 'link':
          return `[${item.text}](${item.href})\n`;
        default:
          return '';
      }
    })
    .join('\n');
}
/**
 * Render extracted content items as an XML document with a `<document>` root.
 * Unknown item types are skipped.
 *
 * All text and every attribute value taken from the page is passed through
 * `escapeXML`. That includes the code block's `language`, which can come from
 * a `data-lang` attribute and could otherwise break out of the attribute
 * quotes.
 *
 * @param {object[]} content - Items produced by the extractor.
 * @returns {string} The XML text (no trailing newline).
 */
function toXML(content) {
  const lines = ['<?xml version="1.0" encoding="UTF-8"?>', '<document>'];

  content.forEach((item) => {
    switch (item.type) {
      case 'heading':
        lines.push(` <heading level="${item.level}">${escapeXML(item.content)}</heading>`);
        break;
      case 'paragraph':
        lines.push(` <paragraph>${escapeXML(item.content)}</paragraph>`);
        break;
      case 'code':
        lines.push(
          ` <code language="${escapeXML(item.language || '')}">${escapeXML(item.content)}</code>`,
        );
        break;
      case 'list':
        lines.push(` <list ordered="${item.ordered}">`);
        item.items.forEach((text) => {
          lines.push(` <item>${escapeXML(text)}</item>`);
        });
        lines.push(' </list>');
        break;
      case 'table':
        lines.push(' <table>');
        item.rows.forEach((row, i) => {
          lines.push(` <row index="${i}">`);
          row.forEach((cell, j) => {
            lines.push(` <cell index="${j}">${escapeXML(cell)}</cell>`);
          });
          lines.push(' </row>');
        });
        lines.push(' </table>');
        break;
      case 'image':
        lines.push(` <image src="${escapeXML(item.src)}" alt="${escapeXML(item.alt)}"/>`);
        break;
      case 'link':
        lines.push(` <link href="${escapeXML(item.href)}">${escapeXML(item.text)}</link>`);
        break;
    }
  });

  lines.push('</document>');
  return lines.join('\n');
}
/**
 * Make a value safe to embed in XML text or a quoted attribute.
 *
 * - `null` / `undefined` become the empty string; other non-strings are
 *   converted with `String()`.
 * - Code points outside the XML 1.0 `Char` production (most C0 controls,
 *   lone surrogates, U+FFFE and U+FFFF) are removed: they cannot appear in a
 *   well-formed document even as character references.
 * - `&`, `<`, `>`, `"` and `'` are replaced by their predefined entities.
 *
 * @param {*} str - Value to escape.
 * @returns {string} The escaped text.
 */
function escapeXML(str) {
  if (str == null) return '';

  const ENTITIES = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&apos;',
  };

  return String(str)
    .replace(/[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\u{10000}-\u{10FFFF}]/gu, '')
    .replace(/[&<>"']/g, (ch) => ENTITIES[ch]);
}
/**
 * Copy text to the system clipboard.
 *
 * The asynchronous Clipboard API is tried first. If it is unavailable or
 * rejects, the function falls back to selecting the text in a temporary
 * off-screen <textarea> and issuing the legacy `copy` command.
 *
 * The temporary element is always removed, even when the fallback throws,
 * and the outcome is reported instead of being silently dropped.
 *
 * @param {string} text - Text to copy.
 * @returns {Promise<boolean>} Resolves to true when either method reported
 *   success, false otherwise. Never rejects because of a copy failure.
 */
async function copyToClipboard(text) {
  try {
    await navigator.clipboard.writeText(text);
    return true;
  } catch (err) {
    // Fallback path.
    const textarea = document.createElement('textarea');
    textarea.value = text;
    textarea.style.position = 'fixed';
    textarea.style.opacity = '0';
    document.body.appendChild(textarea);
    try {
      textarea.select();
      // execCommand signals failure through its boolean return value.
      return Boolean(document.execCommand('copy'));
    } catch (fallbackErr) {
      console.warn('LLM Content Extractor: clipboard copy failed', fallbackErr);
      return false;
    } finally {
      document.body.removeChild(textarea);
    }
  }
}
/**
 * Remember the latest extraction under the `lastExtraction` key of the
 * extension's local storage.
 *
 * @param {string} text - The formatted extraction result.
 */
function saveToStorage(text) {
  const record = { lastExtraction: text };
  chrome.storage.local.set(record);
}
/**
 * Show a transient toast on the page. After 2000 ms the `fade-out` class is
 * added, and 300 ms later the element is removed.
 *
 * @param {string} message - Text to display.
 */
function showNotification(message) {
  const toast = document.createElement('div');
  toast.className = 'llm-extractor-notification';
  toast.textContent = message;
  document.body.appendChild(toast);

  const removeToast = () => toast.remove();
  const beginFadeOut = () => {
    toast.classList.add('fade-out');
    setTimeout(removeToast, 300);
  };

  setTimeout(beginFadeOut, 2000);
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 257 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 79 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 115 B

View File

@@ -0,0 +1,32 @@
{
"manifest_version": 3,
"name": "LLM Content Extractor",
"version": "1.0.0",
"description": "截图并提取网页内容,转换为大模型友好格式",
"permissions": [
"activeTab",
"scripting",
"clipboardWrite",
"storage"
],
"action": {
"default_popup": "popup.html",
"default_icon": {
"16": "icons/icon16.png",
"48": "icons/icon48.png",
"128": "icons/icon128.png"
}
},
"content_scripts": [
{
"matches": ["<all_urls>"],
"js": ["content.js"],
"css": ["content.css"]
}
],
"icons": {
"16": "icons/icon16.png",
"48": "icons/icon48.png",
"128": "icons/icon128.png"
}
}

View File

@@ -0,0 +1,139 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>LLM Content Extractor</title>
<style>
/* Popup stylesheet: a fixed-width gradient page holding one white card. */
/* Reset default spacing and use border-box sizing everywhere. */
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
width: 320px;
padding: 16px;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
min-height: 200px;
}
.container {
background: white;
border-radius: 12px;
padding: 20px;
box-shadow: 0 4px 20px rgba(0,0,0,0.15);
}
h1 {
font-size: 16px;
color: #333;
margin-bottom: 16px;
display: flex;
align-items: center;
gap: 8px;
}
/* Decorative icon in front of the title. */
h1::before {
content: "✨";
}
/* Shared button layout; the colour variants follow. */
.btn {
width: 100%;
padding: 12px 16px;
border: none;
border-radius: 8px;
font-size: 14px;
font-weight: 500;
cursor: pointer;
transition: all 0.2s;
display: flex;
align-items: center;
justify-content: center;
gap: 8px;
margin-bottom: 10px;
}
/* NOTE(review): in this popup's markup the last .btn is followed by the
   status <div> and the tip <p>, so no .btn is the last child of its parent
   and this rule does not currently match anything. */
.btn:last-child {
margin-bottom: 0;
}
.btn-primary {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
}
.btn-primary:hover {
transform: translateY(-1px);
box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4);
}
.btn-secondary {
background: #f5f5f5;
color: #333;
}
.btn-secondary:hover {
background: #ebebeb;
}
.format-select {
width: 100%;
padding: 10px 12px;
border: 1px solid #e0e0e0;
border-radius: 8px;
font-size: 14px;
margin-bottom: 12px;
background: white;
}
/* Status line: hidden by default, shown only while it also carries the
   .success or .error modifier. */
.status {
margin-top: 12px;
padding: 10px;
border-radius: 6px;
font-size: 13px;
display: none;
}
.status.success {
display: block;
background: #e8f5e9;
color: #2e7d32;
}
.status.error {
display: block;
background: #ffebee;
color: #c62828;
}
.divider {
height: 1px;
background: #e0e0e0;
margin: 16px 0;
}
.tip {
font-size: 12px;
color: #888;
text-align: center;
}
</style>
</head>
<body>
  <div class="container">
    <h1>LLM Content Extractor</h1>
    <!-- The select has no visible label, so it is named for assistive
         technology with aria-label. -->
    <select id="formatSelect" class="format-select" aria-label="输出格式">
      <option value="markdown">Markdown 格式</option>
      <option value="json">JSON 结构化</option>
      <option value="xml">XML 格式</option>
    </select>
    <!-- Emoji are decorative and hidden from the accessibility tree; the
         button text carries the meaning. -->
    <button id="selectBtn" type="button" class="btn btn-primary">
      <span aria-hidden="true">🎯</span> 框选区域提取
    </button>
    <button id="fullPageBtn" type="button" class="btn btn-secondary">
      <span aria-hidden="true">📄</span> 提取整页内容
    </button>
    <div class="divider"></div>
    <button id="copyLastBtn" type="button" class="btn btn-secondary">
      <span aria-hidden="true">📋</span> 复制上次结果
    </button>
    <!-- Live region: result messages written here by popup.js are announced
         without moving focus. -->
    <div id="status" class="status" role="status" aria-live="polite"></div>
    <p class="tip">提取后内容自动复制到剪贴板</p>
  </div>
  <script src="popup.js"></script>
</body>
</html>

View File

@@ -0,0 +1,72 @@
// Popup controller: wires the three buttons and the format selector once the
// popup document has loaded.
document.addEventListener('DOMContentLoaded', () => {
  const selectBtn = document.getElementById('selectBtn');
  const fullPageBtn = document.getElementById('fullPageBtn');
  const copyLastBtn = document.getElementById('copyLastBtn');
  const formatSelect = document.getElementById('formatSelect');
  const status = document.getElementById('status');

  let statusTimer = null;

  /**
   * Show a status line for three seconds.
   *
   * Any pending hide timer is cancelled first; otherwise the timer of an
   * earlier message would hide a newer one too soon.
   *
   * @param {string} message - Text to show.
   * @param {'success'|'error'} type - Modifier class controlling the colours.
   */
  function showStatus(message, type) {
    status.textContent = message;
    status.className = `status ${type}`;
    clearTimeout(statusTimer);
    statusTimer = setTimeout(() => {
      status.className = 'status';
    }, 3000);
  }

  /**
   * Send an action, together with the selected format, to the content script
   * of the active tab.
   *
   * @param {string} action - Action name understood by the content script.
   * @returns {Promise<*>} The content script's response. Rejects when there
   *   is no active tab or the message cannot be delivered.
   */
  async function sendToActiveTab(action) {
    const [tab] = await chrome.tabs.query({ active: true, currentWindow: true });
    return chrome.tabs.sendMessage(tab.id, {
      action,
      format: formatSelect.value,
    });
  }

  // Region selection: hand over to the page and close the popup. If the
  // message cannot be delivered, keep the popup open and show the same
  // error message the full-page button uses.
  selectBtn.addEventListener('click', async () => {
    try {
      await sendToActiveTab('startSelection');
      window.close();
    } catch (err) {
      showStatus('❌ 请刷新页面后重试', 'error');
    }
  });

  // Full-page extraction.
  fullPageBtn.addEventListener('click', async () => {
    try {
      const response = await sendToActiveTab('extractFullPage');
      if (response && response.success) {
        showStatus('✅ 内容已复制到剪贴板', 'success');
      } else {
        showStatus('❌ 提取失败', 'error');
      }
    } catch (err) {
      showStatus('❌ 请刷新页面后重试', 'error');
    }
  });

  // Copy the previous result again.
  copyLastBtn.addEventListener('click', async () => {
    try {
      const result = await chrome.storage.local.get('lastExtraction');
      if (!result.lastExtraction) {
        showStatus('❌ 暂无历史记录', 'error');
        return;
      }
      await navigator.clipboard.writeText(result.lastExtraction);
      showStatus('✅ 已复制上次结果', 'success');
    } catch (err) {
      // Storage read or clipboard write failed; report it instead of leaving
      // an unhandled rejection and no feedback.
      showStatus('❌ 复制失败', 'error');
    }
  });

  // Restore the previously chosen format.
  chrome.storage.local.get('format', (result) => {
    if (result.format) {
      formatSelect.value = result.format;
    }
  });

  // Persist the format whenever it changes.
  formatSelect.addEventListener('change', () => {
    chrome.storage.local.set({ format: formatSelect.value });
  });
});