feat: 多页面深度探索 + 折叠菜单支持

新增功能: - 多页面探索: 使用队列管理待探索页面，跟踪深度 - 折叠菜单: 点击后重新扫描发现新元素，自动展开并探索 - DOM 快速发现: 毫秒级元素发现（替代 AI 调用） - 预计算坐标: DOM 发现时计算坐标，点击时直接使用探索效果: - 访问页面: 1 → 4 - 发现元素: 17 → 30 - 检测菜单展开: ✅ - 检测页面跳转: ✅ - 危险操作保护: ✅
2025-12-28 21:15:50 +08:00
parent b126ce2d49
commit d6793cd758
2 changed files with 246 additions and 74 deletions
--- a/src/agent/explorer.py
+++ b/src/agent/explorer.py
@@ -35,7 +35,7 @@ class FeatureExplorer:
    
    def explore(self, config: Dict = None) -> Dict[str, Any]:
        """
-        执行主动探索
+        执行多页面深度探索
        
        Returns:
            探索结果报告
@@ -46,74 +46,230 @@ class FeatureExplorer:
        start_url = self.browser.page.url
        self.visited_urls.add(start_url)
        click_count = 0
+        current_depth = 0
        
        print(f"🔍 开始探索: {start_url}")
-        print(f"   配置: 最大点击={self.config['max_clicks']}")
+        print(f"   配置: 最大深度={self.config['max_depth']}, 最大点击={self.config['max_clicks']}")
        
-        # 首次发现页面元素
-        print(f"   正在分析页面元素...")
-        all_elements = self._discover_elements()
-        print(f"   发现 {len(all_elements)} 个可交互元素")
+        # 使用队列管理待探索的页面和元素
+        page_queue = [(start_url, 0)]  # (url, depth)
+        explored_pages = set()
        
-        if not all_elements:
-            print("   ⚠️ 没有发现可交互元素")
-            return self._generate_report(start_url, 0)
-        
-        # 过滤和排序
-        elements = self._filter_and_sort(all_elements)
-        print(f"   过滤后 {len(elements)} 个待探索元素")
-        
-        # 探索循环 - 逐个探索发现的元素
-        for element in elements:
-            if click_count >= self.config["max_clicks"]:
-                print(f"   达到最大点击数 {self.config['max_clicks']}")
-                break
+        while page_queue and click_count < self.config["max_clicks"]:
+            current_url, current_depth = page_queue.pop(0)
            
-            click_count += 1
-            print(f"\n   [{click_count}/{min(len(elements), self.config['max_clicks'])}] 探索: {element.get('name', '未知')}")
+            if current_url in explored_pages:
+                continue
            
-            # 执行探索
-            self._explore_element(element, click_count)
+            if current_depth > self.config["max_depth"]:
+                print(f"   ⏩ 跳过深度 {current_depth} 页面: {current_url}")
+                continue
+            
+            explored_pages.add(current_url)
+            
+            # 导航到目标页面（如果不是当前页面）
+            if self.browser.page.url != current_url:
+                print(f"\n📄 导航到: {current_url}")
+                try:
+                    self.browser.goto(current_url)
+                    self.browser.wait(1000)
+                except:
+                    print(f"   ⚠️ 导航失败")
+                    continue
+            
+            print(f"\n{'='*50}")
+            print(f"📍 深度 {current_depth}: 探索页面")
+            print(f"   URL: {current_url[:60]}...")
+            print(f"{'='*50}")
+            
+            # 发现当前页面元素
+            print(f"   正在分析页面元素...")
+            all_elements = self._discover_elements()
+            print(f"   发现 {len(all_elements)} 个可交互元素")
+            
+            if not all_elements:
+                print("   ⚠️ 没有发现可交互元素")
+                continue
+            
+            # 过滤和排序
+            elements = self._filter_and_sort(all_elements)
+            print(f"   过滤后 {len(elements)} 个待探索元素")
+            
+            # 在当前页面探索元素
+            element_index = 0
+            while element_index < len(elements) and click_count < self.config["max_clicks"]:
+                element = elements[element_index]
+                element_index += 1
+                
+                click_count += 1
+                name = element.get('name', '未知')
+                print(f"\n   [{click_count}] 探索: {name}")
+                
+                # 记录操作前的状态
+                before_url = self.browser.page.url
+                before_count = len(elements)
+                
+                # 执行探索
+                self._explore_element(element, click_count)
+                
+                # 检查是否发生页面跳转
+                self.browser.wait(300)
+                after_url = self.browser.page.url
+                
+                if before_url != after_url:
+                    print(f"      🔀 页面跳转: {after_url[:50]}...")
+                    
+                    # 添加新页面到队列
+                    if after_url not in explored_pages and current_depth < self.config["max_depth"]:
+                        page_queue.append((after_url, current_depth + 1))
+                        self.visited_urls.add(after_url)
+                        
+                        # 更新站点地图
+                        if before_url not in self.site_map:
+                            self.site_map[before_url] = []
+                        if after_url not in self.site_map[before_url]:
+                            self.site_map[before_url].append(after_url)
+                    
+                    # 返回原页面继续探索
+                    try:
+                        self.browser.goto(before_url)
+                        self.browser.wait(500)
+                    except:
+                        break
+                else:
+                    # 没有跳转，检查是否有新元素出现（如折叠菜单展开）
+                    new_elements = self._discover_elements()
+                    new_filtered = self._filter_and_sort(new_elements)
+                    
+                    # 找出新出现的元素
+                    existing_names = {e.get("name") for e in elements}
+                    new_items = [e for e in new_filtered if e.get("name") not in existing_names]
+                    
+                    if new_items:
+                        print(f"      📋 发现 {len(new_items)} 个新元素（菜单展开）")
+                        # 将新元素插入到当前位置之后
+                        elements = elements[:element_index] + new_items + elements[element_index:]
        
-        print(f"\n✅ 探索完成: {click_count} 次点击")
+        print(f"\n✅ 探索完成:")
+        print(f"   - 点击次数: {click_count}")
+        print(f"   - 访问页面: {len(self.visited_urls)}")
+        print(f"   - 发现元素: {len(self.discovered_elements)}")
        
        # 生成报告
        return self._generate_report(start_url, click_count)
    
-    def _discover_elements(self) -> List[Dict]:
-        """让 AI 发现页面上所有可交互元素"""
+    def _discover_elements(self, use_ai: bool = False) -> List[Dict]:
+        """发现页面上所有可交互元素"""
+        # 默认使用 DOM 快速发现，可选使用 AI
+        if use_ai:
+            return self._discover_elements_ai()
+        else:
+            return self._discover_elements_dom()
+    
+    def _discover_elements_dom(self) -> List[Dict]:
+        """使用 DOM 快速发现可交互元素（毫秒级）"""
+        current_url = self.browser.page.url
+        
+        try:
+            result = self.browser.page.evaluate('''
+                () => {
+                    const elements = [];
+                    const seen = new Set();
+                    
+                    // 查找所有可交互元素
+                    const selectors = [
+                        'a[href]',           // 链接
+                        'button',            // 按钮
+                        '[role="button"]',   // 角色按钮
+                        '[role="menuitem"]', // 菜单项
+                        '[role="tab"]',      // 标签页
+                        '[role="link"]',     // 角色链接
+                        '.nav-item, .menu-item', // 导航项
+                        '[onclick]',         // 点击事件
+                        'input[type="submit"]', // 提交按钮
+                    ];
+                    
+                    for (const selector of selectors) {
+                        document.querySelectorAll(selector).forEach(el => {
+                            const text = el.textContent?.trim().substring(0, 50) || '';
+                            const key = text + el.tagName;
+                            
+                            if (!text || seen.has(key)) return;
+                            if (text.length < 2 || text.length > 50) return;
+                            
+                            const rect = el.getBoundingClientRect();
+                            if (rect.width <= 0 || rect.height <= 0) return;
+                            if (rect.top < 0 || rect.left < 0) return;
+                            
+                            seen.add(key);
+                            
+                            // 推断类型
+                            let type = 'link';
+                            if (el.tagName === 'BUTTON' || el.getAttribute('role') === 'button') type = 'button';
+                            if (el.closest('nav') || el.classList.contains('nav-item')) type = 'navigation';
+                            if (el.getAttribute('role') === 'menuitem') type = 'menu';
+                            if (el.getAttribute('role') === 'tab') type = 'tab';
+                            
+                            elements.push({
+                                name: text,
+                                type: type,
+                                tagName: el.tagName,
+                                priority: type === 'navigation' ? 8 : 5,
+                                x: Math.round(rect.left + rect.width / 2),
+                                y: Math.round(rect.top + rect.height / 2)
+                            });
+                        });
+                    }
+                    
+                    // 额外查找 cursor:pointer 元素
+                    document.querySelectorAll('*').forEach(el => {
+                        if (window.getComputedStyle(el).cursor === 'pointer') {
+                            const text = Array.from(el.childNodes)
+                                .filter(n => n.nodeType === 3)
+                                .map(n => n.textContent.trim())
+                                .join('').substring(0, 50);
+                            
+                            if (!text || text.length < 2 || seen.has(text + el.tagName)) return;
+                            
+                            const rect = el.getBoundingClientRect();
+                            if (rect.width <= 0 || rect.height <= 0 || rect.width > 500) return;
+                            if (rect.top < 0 || rect.left < 0) return;
+                            
+                            seen.add(text + el.tagName);
+                            elements.push({
+                                name: text,
+                                type: 'link',
+                                tagName: el.tagName,
+                                priority: 4,
+                                x: Math.round(rect.left + rect.width / 2),
+                                y: Math.round(rect.top + rect.height / 2)
+                            });
+                        }
+                    });
+                    
+                    return elements;
+                }
+            ''')
+            
+            # 添加元数据
+            for el in result:
+                el["source_url"] = current_url
+                el["discovered_at"] = datetime.now().isoformat()
+            
+            return result
+            
+        except Exception as e:
+            logger.warning(f"DOM 发现失败: {e}")
+            return []
+    
+    def _discover_elements_ai(self) -> List[Dict]:
+        """使用 AI 发现页面元素（较慢但更智能）"""
        img = self.browser.screenshot_base64()
        current_url = self.browser.page.url
        
-        prompt = """分析当前页面截图，识别所有可交互的 UI 元素。
-
-**请识别以下类型的元素**:
-1. 导航菜单项
-2. 侧边栏链接
-3. 操作按钮
-4. 表单输入框
-5. 下拉菜单
-6. 标签页/Tab
-7. 可点击的卡片或列表项
-
-**返回格式** (只返回 JSON):
-```json
-{
-    "elements": [
-        {
-            "name": "元素名称/文字",
-            "type": "navigation|button|form|menu|tab|link|card",
-            "description": "功能描述",
-            "priority": 1-10,
-            "is_dangerous": false
-        }
-    ],
-    "page_title": "页面标题",
-    "page_type": "dashboard|list|form|detail|login|other"
-}
-```
-
-优先级说明: 10=核心功能, 5=普通功能, 1=次要功能"""
+        prompt = """分析截图，识别可交互元素。返回 JSON:
+{"elements": [{"name": "文字", "type": "navigation|button|link", "priority": 1-10}]}
+只返回 JSON。"""

        response = self.analyzer.model.analyze(img, prompt)
        
@@ -123,15 +279,13 @@ class FeatureExplorer:
                result = json.loads(match.group())
                elements = result.get("elements", [])
                
-                # 添加元数据
                for el in elements:
                    el["source_url"] = current_url
                    el["discovered_at"] = datetime.now().isoformat()
                
-                logger.info(f"发现 {len(elements)} 个可交互元素")
                return elements
        except Exception as e:
-            logger.warning(f"解析元素失败: {e}")
+            logger.warning(f"AI 解析失败: {e}")
        
        return []
    
@@ -200,9 +354,15 @@ class FeatureExplorer:
            self.action_log.append(action_record)
            return
        
-        # 执行点击 - 使用 DOM 选择器代替 AI 定位（更快）
+        # 执行点击 - 优先使用 DOM 发现时预计算的坐标
        try:
-            coords = self._find_element_by_name(name)
+            # 优先使用元素自带的坐标（DOM 发现时已计算）
+            if "x" in element and "y" in element:
+                coords = (element["x"], element["y"])
+            else:
+                # 回退到名称查找
+                coords = self._find_element_by_name(name)
+            
            if coords:
                print(f"      → 点击 ({coords[0]}, {coords[1]})")
                self.browser.click_at(coords[0], coords[1])
--- a/src/browser/controller.py
+++ b/src/browser/controller.py
@@ -41,21 +41,33 @@ class BrowserController:
        if self._page:
            self._page.click(selector)

-    def click_at(self, x: int, y: int) -> None:
-        """Click at specific coordinates using JavaScript for better compatibility"""
+    def click_at(self, x: int, y: int) -> bool:
+        """Click at specific coordinates using JavaScript for better compatibility
+        
+        Returns:
+            bool: True if element was found and clicked
+        """
        if self._page:
-            # 使用 elementFromPoint 找到坐标处的元素，然后触发点击
-            self._page.evaluate(f"""
-                (coords) => {{
-                    const el = document.elementFromPoint(coords.x, coords.y);
-                    if (el) {{
-                        el.click();
-                        if (el.tagName === 'INPUT' || el.tagName === 'TEXTAREA') {{
-                            el.focus();
-                        }}
-                    }}
-                }}
-            """, {"x": x, "y": y})
+            try:
+                result = self._page.evaluate("""
+                    (coords) => {
+                        const el = document.elementFromPoint(coords.x, coords.y);
+                        if (el && typeof el.click === 'function') {
+                            el.click();
+                            if (el.tagName === 'INPUT' || el.tagName === 'TEXTAREA') {
+                                el.focus();
+                            }
+                            return { clicked: true, tagName: el.tagName };
+                        }
+                        return { clicked: false };
+                    }
+                """, {"x": x, "y": y})
+                return result.get("clicked", False)
+            except Exception as e:
+                # 回退到鼠标点击
+                self._page.mouse.click(x, y)
+                return True
+        return False

    def type_text(self, selector: str, text: str) -> None:
        """Type text into element"""