From c15139f5eebc4ef25e74630cc7e266668aa5d616 Mon Sep 17 00:00:00 2001 From: wmymz Date: Fri, 7 Nov 2025 13:20:18 +0800 Subject: [PATCH] feat: add multi-encoding support, .gitignore integration, and WebSocket optimization - Add multi-encoding file support (UTF-8, GBK, GB2312, Latin-1) - Add .gitignore integration for automatic pattern exclusion - Optimize WebSocket reconnection with exponential backoff - Change tool response format from list to dict for better compatibility - Add bilingual README (Chinese and English) - Update installation instructions - Add project description to pyproject.toml --- .gitignore | 8 +- README.md | 349 +++++++++++++++++---------- README_EN.md | 351 ++++++++++++++++++++++++++++ pyproject.toml | 5 +- src/acemcp/index/manager.py | 88 ++++++- src/acemcp/server.py | 13 +- src/acemcp/tools/search_context.py | 12 +- src/acemcp/web/app.py | 7 +- src/acemcp/web/templates/index.html | 54 ++++- uv.lock | 13 +- 10 files changed, 747 insertions(+), 153 deletions(-) create mode 100644 README_EN.md diff --git a/.gitignore b/.gitignore index 3de9dae..0a18b2e 100644 --- a/.gitignore +++ b/.gitignore @@ -8,9 +8,7 @@ wheels/ # Virtual environments .venv +venv -# Acemcp specific -.acemcp_index/ -.acemcp_index_dev/ -.secrets.toml -settings.local.toml +# ai +.cunzhi-memory/ diff --git a/README.md b/README.md index 30f4830..9767b2e 100644 --- a/README.md +++ b/README.md @@ -1,23 +1,44 @@ +简体中文 | [English](./README_EN.md) + # Acemcp -MCP server for codebase indexing and semantic search. +代码库索引和语义搜索的 MCP 服务器。 Acemcp MCP server -## Installation +## 安装 + +### 作为工具安装(推荐) ```bash -uv add mcp httpx fastapi "uvicorn[standard]" toml websockets -uv sync +# 安装到系统 +uv tool install acemcp + +# 或临时运行(无需安装) +uvx acemcp ``` -## Configuration +### 开发安装 -The configuration file is automatically created at `~/.acemcp/settings.toml` on first run with default values. 
+```bash +# 克隆仓库 +git clone https://github.com/qy527145/acemcp.git +cd acemcp -Edit `~/.acemcp/settings.toml` to configure: +# 安装依赖 +uv sync + +# 运行 +uv run acemcp +``` + +## 配置 + +配置文件会在首次运行时自动创建在 `~/.acemcp/settings.toml`,包含默认值。 + +编辑 `~/.acemcp/settings.toml` 进行配置: ```toml BATCH_SIZE = 10 MAX_LINES_PER_BLOB = 800 @@ -27,24 +48,24 @@ TEXT_EXTENSIONS = [".py", ".js", ".ts", ...] EXCLUDE_PATTERNS = [".venv", "node_modules", ".git", "__pycache__", "*.pyc", ...] ``` -**Configuration options:** -- `BATCH_SIZE`: Number of files to upload per batch (default: 10) -- `MAX_LINES_PER_BLOB`: Maximum lines per blob before splitting large files (default: 800) -- `BASE_URL`: API endpoint URL -- `TOKEN`: Authentication token -- `TEXT_EXTENSIONS`: List of file extensions to index -- `EXCLUDE_PATTERNS`: List of patterns to exclude from indexing (supports wildcards like `*.pyc`) +**配置选项:** +- `BATCH_SIZE`: 每批上传的文件数量(默认:10) +- `MAX_LINES_PER_BLOB`: 大文件分割前的最大行数(默认:800) +- `BASE_URL`: API 端点 URL +- `TOKEN`: 认证令牌 +- `TEXT_EXTENSIONS`: 要索引的文件扩展名列表 +- `EXCLUDE_PATTERNS`: 要排除的模式列表(支持通配符如 `*.pyc`) -You can also configure via: -- **Command line arguments** (highest priority): `--base-url`, `--token` -- **Web management interface** (updates user config file) -- **Environment variables** with `ACEMCP_` prefix +您还可以通过以下方式配置: +- **命令行参数**(最高优先级):`--base-url`、`--token` +- **Web 管理界面**(更新用户配置文件) +- **环境变量**(使用 `ACEMCP_` 前缀) -## MCP Configuration +## MCP 配置 -Add the following to your MCP client configuration (e.g., Claude Desktop): +将以下内容添加到您的 MCP 客户端配置中(例如 Claude Desktop): -### Basic Configuration +### 基础配置 ```json { @@ -60,14 +81,14 @@ Add the following to your MCP client configuration (e.g., Claude Desktop): ``` -**Available command line arguments:** -- `--base-url`: Override BASE_URL configuration -- `--token`: Override TOKEN configuration -- `--web-port`: Enable web management interface on specified port (e.g., 8080) +**可用的命令行参数:** +- `--base-url`: 覆盖 BASE_URL 配置 +- `--token`: 覆盖 TOKEN 配置 +- 
`--web-port`: 在指定端口启用 Web 管理界面(例如 8080) -### Configuration with Web Management Interface +### 启用 Web 管理界面的配置 -To enable the web management interface, add the `--web-port` argument: +要启用 Web 管理界面,添加 `--web-port` 参数: ```json { @@ -84,111 +105,118 @@ To enable the web management interface, add the `--web-port` argument: } ``` -Then access the management interface at `http://localhost:8888` +然后访问管理界面:`http://localhost:8888` -**Web Management Features:** -- **Configuration Management**: View and edit server configuration (BASE_URL, TOKEN, BATCH_SIZE, MAX_LINES_PER_BLOB, TEXT_EXTENSIONS) -- **Real-time Logs**: Monitor server logs in real-time via WebSocket connection -- **Tool Debugger**: Test and debug MCP tools directly from the web interface - - Test `index_code` tool with any project path - - Test `search_context` tool with project path and query - - View formatted results and error messages +**Web 管理功能:** +- **配置管理**:查看和编辑服务器配置(BASE_URL、TOKEN、BATCH_SIZE、MAX_LINES_PER_BLOB、TEXT_EXTENSIONS) +- **实时日志**:通过 WebSocket 连接实时监控服务器日志,具有智能重连功能 + - 指数退避重连策略(1秒 → 1.5秒 → 2.25秒 ... 最大 30秒) + - 最多 10 次重连尝试,防止无限循环 + - 网络故障时自动重连 + - 减少日志噪音(WebSocket 连接记录在 DEBUG 级别) +- **工具调试器**:直接从 Web 界面测试和调试 MCP 工具 + - 测试 `search_context` 工具,输入项目路径和查询 + - 查看格式化的结果和错误消息 -## Tools +## 工具 ### search_context -Search for relevant code context based on a query. This tool **automatically performs incremental indexing** before searching, ensuring results are always up-to-date. It performs **semantic search** across your codebase and returns formatted text snippets showing where relevant code is located. 
+基于查询搜索相关的代码上下文。此工具在搜索前**自动执行增量索引**,确保结果始终是最新的。它在您的代码库中执行**语义搜索**,并返回格式化的文本片段,显示相关代码的位置。 -**Key Features:** -- **Automatic Incremental Indexing**: Before each search, the tool automatically indexes only new or modified files, skipping unchanged files for efficiency -- **No Manual Indexing Required**: You don't need to manually index your project - just search and the tool handles indexing automatically -- **Always Up-to-Date**: Search results reflect the current state of your codebase +**核心特性:** +- **自动增量索引**:每次搜索前,工具自动仅索引新文件或修改过的文件,跳过未更改的文件以提高效率 +- **无需手动索引**:您无需手动索引项目 - 只需搜索,工具会自动处理索引 +- **始终保持最新**:搜索结果反映代码库的当前状态 +- **多编码支持**:自动检测和处理多种文件编码(UTF-8、GBK、GB2312、Latin-1) +- **.gitignore 集成**:索引项目时自动遵守 `.gitignore` 模式 -**Parameters:** -- `project_root_path` (string): Absolute path to the project root directory - - **IMPORTANT**: Use forward slashes (`/`) as path separators, even on Windows - - Windows example: `C:/Users/username/projects/myproject` - - Linux/Mac example: `/home/username/projects/myproject` -- `query` (string): Natural language search query to find relevant code context - - Use descriptive keywords related to what you're looking for - - The tool performs semantic matching, not just keyword search - - Returns code snippets with file paths and line numbers +**参数:** +- `project_root_path`(字符串):项目根目录的绝对路径 + - **重要**:即使在 Windows 上也使用正斜杠(`/`)作为路径分隔符 + - Windows 示例:`C:/Users/username/projects/myproject` + - Linux/Mac 示例:`/home/username/projects/myproject` +- `query`(字符串):用于查找相关代码上下文的自然语言搜索查询 + - 使用与您要查找的内容相关的描述性关键词 + - 工具执行语义匹配,而不仅仅是关键词搜索 + - 返回带有文件路径和行号的代码片段 -**What it returns:** -- Formatted text snippets from files that match your query -- File paths and line numbers for each snippet -- Context around the relevant code sections -- Multiple results ranked by relevance +**返回内容:** +- 与您的查询匹配的文件中的格式化文本片段 +- 每个片段的文件路径和行号 +- 相关代码部分周围的上下文 +- 按相关性排序的多个结果 -**Query Examples:** +**查询示例:** -1. **Finding configuration code:** +1. 
**查找配置代码:** ```json { "project_root_path": "C:/Users/username/projects/myproject", - "query": "查找所有调用 get_model 的地方" + "query": "日志配置 设置 初始化 logger" } ``` - Returns: Code related to logging setup, logger initialization, and configuration + 返回:与日志设置、logger 初始化和配置相关的代码 -2. **Finding authentication logic:** +2. **查找认证逻辑:** ```json { "project_root_path": "C:/Users/username/projects/myproject", - "query": "user authentication login password validation" + "query": "用户认证 登录 密码验证" } ``` - Returns: Authentication handlers, login functions, password validation code + 返回:认证处理器、登录函数、密码验证代码 -3. **Finding database code:** +3. **查找数据库代码:** ```json { "project_root_path": "C:/Users/username/projects/myproject", - "query": "database connection pool initialization" + "query": "数据库连接池 初始化" } ``` - Returns: Database connection setup, pool configuration, initialization code + 返回:数据库连接设置、连接池配置、初始化代码 -4. **Finding error handling:** +4. **查找错误处理:** ```json { "project_root_path": "C:/Users/username/projects/myproject", - "query": "error handling exception try catch" + "query": "错误处理 异常 try catch" } ``` - Returns: Error handling patterns, exception handlers, try-catch blocks + 返回:错误处理模式、异常处理器、try-catch 块 -5. **Finding API endpoints:** +5. 
**查找 API 端点:** ```json { "project_root_path": "C:/Users/username/projects/myproject", - "query": "API endpoint routes HTTP handlers" + "query": "API 端点 路由 HTTP 处理器" } ``` - Returns: API route definitions, HTTP handlers, endpoint implementations + 返回:API 路由定义、HTTP 处理器、端点实现 -**Tips for better results:** -- Use multiple related keywords (e.g., "logging configuration setup" instead of just "logging") -- Include technical terms specific to what you're looking for -- Describe the functionality rather than exact variable names -- Try different phrasings if the first query doesn't return what you need +**获得更好结果的技巧:** +- 使用多个相关关键词(例如,"日志配置设置"而不仅仅是"日志") +- 包含您要查找的特定技术术语 +- 描述功能而不是确切的变量名 +- 如果第一次查询没有返回您需要的内容,尝试不同的措辞 -**Indexing Features:** -- **Incremental Indexing**: Only new or modified files are uploaded, unchanged files are skipped -- **Hash-based Deduplication**: Files are identified by SHA-256 hash of path + content -- **Automatic Retry**: Network requests are automatically retried up to 3 times with exponential backoff (1s, 2s, 4s) -- **Batch Resilience**: If a batch upload fails after retries, the tool continues with the next batch -- **File Splitting**: Large files are automatically split into multiple blobs (default: 800 lines per blob) -- **Exclude Patterns**: Automatically skips virtual environments, node_modules, .git, build artifacts, etc. 
+**索引特性:** +- **增量索引**:仅上传新文件或修改过的文件,跳过未更改的文件 +- **基于哈希的去重**:通过路径 + 内容的 SHA-256 哈希识别文件 +- **自动重试**:网络请求自动重试最多 3 次,采用指数退避(1秒、2秒、4秒) +- **批次弹性**:如果批次上传在重试后失败,工具会继续处理下一批次 +- **文件分割**:大文件自动分割为多个块(默认:每块 800 行) +- **排除模式**:自动跳过虚拟环境、node_modules、.git、构建产物等 +- **多编码支持**:自动检测文件编码(UTF-8、GBK、GB2312、Latin-1),并在失败时回退到 UTF-8 错误处理 +- **.gitignore 集成**:自动从项目根目录加载并遵守 `.gitignore` 模式,与配置的排除模式结合使用 -**Search Features:** -- **Automatic Retry**: Search requests are automatically retried up to 3 times with exponential backoff (2s, 4s, 8s) -- **Graceful Degradation**: Returns a clear error message if the search fails after all retries -- **Timeout Handling**: Uses a 60-second timeout to handle long-running searches -- **Empty Result Handling**: Returns a helpful message if no relevant code is found +**搜索特性:** +- **自动重试**:搜索请求自动重试最多 3 次,采用指数退避(2秒、4秒、8秒) +- **优雅降级**:如果所有重试后搜索失败,返回清晰的错误消息 +- **超时处理**:使用 60 秒超时来处理长时间运行的搜索 +- **空结果处理**:如果未找到相关代码,返回有用的消息 -**Default Exclude Patterns:** +**默认排除模式:** ``` .venv, venv, .env, env, node_modules, .git, .svn, .hg, __pycache__, .pytest_cache, .mypy_cache, .tox, .eggs, *.egg-info, dist, build, @@ -196,54 +224,127 @@ Search for relevant code context based on a query. This tool **automatically per pip-log.txt, pip-delete-this-directory.txt, .coverage, htmlcov, .gradle, target, bin, obj ``` -Patterns support wildcards (`*`, `?`) and match against directory/file names or paths. +模式支持通配符(`*`、`?`),并匹配目录/文件名或路径。 -## Usage +**注意:** 如果项目根目录存在 `.gitignore` 文件,其模式将自动加载并与配置的排除模式结合使用。`.gitignore` 模式遵循 Git 的标准 wildmatch 语法。 -1. Start the MCP server (automatically started by MCP client) -2. Use `search_context` to search for code context - - The tool automatically indexes your project before searching - - Incremental indexing ensures only new/modified files are uploaded - - No manual indexing step required! 
+## 高级特性 -## Data Storage +### 多编码文件支持 -- **Configuration**: `~/.acemcp/settings.toml` -- **Indexed projects**: `~/.acemcp/data/projects.json` (fixed location) -- **Log files**: `~/.acemcp/log/acemcp.log` (with automatic rotation) -- Projects are identified by their absolute path (normalized with forward slashes) +Acemcp 自动检测和处理不同字符编码的文件,适用于国际化项目: -## Logging +- **自动检测**:按顺序尝试多种编码:UTF-8 → GBK → GB2312 → Latin-1 +- **回退处理**:如果所有编码都失败,使用 UTF-8 错误处理以防止崩溃 +- **日志记录**:记录每个文件成功使用的编码(DEBUG 级别) +- **无需配置**:开箱即用,支持大多数常见编码 -The application automatically logs to `~/.acemcp/log/acemcp.log` with the following features: +这对以下情况特别有用: +- 混合编码文件的项目(例如,UTF-8 源代码 + GBK 文档) +- 使用非 UTF-8 编码的遗留代码库 +- 具有不同语言文件的国际团队 -- **Console output**: INFO level and above (colored output) -- **File output**: DEBUG level and above (detailed format with module, function, and line number) -- **Automatic rotation**: Log files are rotated when they reach 5MB -- **Retention**: Maximum of 10 log files are kept -- **Compression**: Rotated log files are automatically compressed to `.zip` format -- **Thread-safe**: Logging is thread-safe for concurrent operations +### .gitignore 集成 -**Log format:** +Acemcp 自动遵守您项目的 `.gitignore` 文件: + +- **自动加载**:如果存在,从项目根目录读取 `.gitignore` +- **标准语法**:支持 Git 的标准 wildmatch 模式 +- **组合过滤**:与配置的 `EXCLUDE_PATTERNS` 一起工作 +- **目录处理**:正确处理带有尾部斜杠的目录模式 +- **无需配置**:只需在项目根目录放置 `.gitignore` + +**`.gitignore` 模式示例:** +```gitignore +# 依赖 +node_modules/ +vendor/ + +# 构建输出 +dist/ +build/ +*.pyc + +# IDE 文件 +.vscode/ +.idea/ + +# 环境文件 +.env +.env.local +``` + +所有这些模式在索引期间都会自动遵守,并与默认排除模式结合使用。 + +## 使用方法 + +1. 启动 MCP 服务器(由 MCP 客户端自动启动) +2. 使用 `search_context` 搜索代码上下文 + - 工具在搜索前自动索引您的项目 + - 增量索引确保仅上传新文件/修改过的文件 + - 无需手动索引步骤! 
+ - 无论编码如何,文件都会自动处理 + - 自动遵守 `.gitignore` 模式 + +## 数据存储 + +- **配置**:`~/.acemcp/settings.toml` +- **已索引项目**:`~/.acemcp/data/projects.json`(固定位置) +- **日志文件**:`~/.acemcp/log/acemcp.log`(自动轮转) +- 项目通过其绝对路径识别(使用正斜杠规范化) + +## 日志记录 + +应用程序自动记录日志到 `~/.acemcp/log/acemcp.log`,具有以下特性: + +- **控制台输出**:INFO 级别及以上(彩色输出) +- **文件输出**:DEBUG 级别及以上(详细格式,包含模块、函数和行号) +- **自动轮转**:日志文件达到 5MB 时自动轮转 +- **保留策略**:最多保留 10 个日志文件 +- **压缩**:轮转的日志文件自动压缩为 `.zip` 格式 +- **线程安全**:日志记录对并发操作是线程安全的 + +**日志格式:** ``` 2025-11-06 13:51:25 | INFO | acemcp.server:main:103 - Starting acemcp MCP server... ``` -The log files are automatically created on first run and require no manual configuration. +日志文件在首次运行时自动创建,无需手动配置。 -## Web Management Interface +## Web 管理界面 -The web management interface provides: -- **Real-time server status** monitoring -- **Live log streaming** via WebSocket -- **Configuration viewing** (current settings) -- **Project statistics** (number of indexed projects) +Web 管理界面提供: +- **实时服务器状态**监控 +- **实时日志流**通过 WebSocket +- **配置查看**(当前设置) +- **项目统计**(已索引项目数量) -To enable the web interface, use the `--web-port` argument when starting the server. 
+要启用 Web 界面,在启动服务器时使用 `--web-port` 参数。 -**Features:** -- Real-time log display with auto-scroll -- Server status and metrics -- Configuration overview -- Responsive design with Tailwind CSS -- No build step required (uses CDN resources) \ No newline at end of file +**功能:** +- 带自动滚动的实时日志显示 +- 服务器状态和指标 +- 配置概览 +- 使用 Tailwind CSS 的响应式设计 +- 无需构建步骤(使用 CDN 资源) +- 具有指数退避的智能 WebSocket 重连 + +## 最近更新 + +### 版本 0.1.4(最新) + +**新特性:** +- ✨ **多编码支持**:自动检测和处理多种文件编码(UTF-8、GBK、GB2312、Latin-1) +- ✨ **.gitignore 集成**:自动从项目根目录加载并遵守 `.gitignore` 模式 +- ✨ **改进的工具响应格式**:从基于列表的格式改为基于字典的格式,以提高客户端兼容性 + +**改进:** +- 🔧 **WebSocket 优化**:具有指数退避的智能重连(1秒 → 最大 30秒) +- 🔧 **减少日志噪音**:WebSocket 连接现在记录在 DEBUG 级别而不是 INFO +- 🔧 **连接稳定性**:最多 10 次重连尝试,防止无限循环 +- 🔧 **更好的错误处理**:对无法用任何编码解码的文件进行优雅回退 + +**错误修复:** +- 🐛 修复了频繁的 WebSocket 连接/断开循环 +- 🐛 修复了读取非 UTF-8 编码文件时的编码错误 +- 🐛 改进了对带有目录匹配的 .gitignore 模式的处理 \ No newline at end of file diff --git a/README_EN.md b/README_EN.md new file mode 100644 index 0000000..0b26f61 --- /dev/null +++ b/README_EN.md @@ -0,0 +1,351 @@ +[简体中文](./README.md) | English + +# Acemcp + +MCP server for codebase indexing and semantic search. + + + Acemcp MCP server + + +## Installation + +### Install as Tool (Recommended) + +```bash +# Install to system +uv tool install acemcp + +# Or run temporarily (no installation required) +uvx acemcp +``` + +### Development Installation + +```bash +# Clone repository +git clone https://github.com/qy527145/acemcp.git +cd acemcp + +# Install dependencies +uv sync + +# Run +uv run acemcp +``` + +## Configuration + +The configuration file is automatically created at `~/.acemcp/settings.toml` on first run with default values. + +Edit `~/.acemcp/settings.toml` to configure: +```toml +BATCH_SIZE = 10 +MAX_LINES_PER_BLOB = 800 +BASE_URL = "https://your-api-endpoint.com" +TOKEN = "your-bearer-token-here" +TEXT_EXTENSIONS = [".py", ".js", ".ts", ...] +EXCLUDE_PATTERNS = [".venv", "node_modules", ".git", "__pycache__", "*.pyc", ...] 
+``` + +**Configuration options:** +- `BATCH_SIZE`: Number of files to upload per batch (default: 10) +- `MAX_LINES_PER_BLOB`: Maximum lines per blob before splitting large files (default: 800) +- `BASE_URL`: API endpoint URL +- `TOKEN`: Authentication token +- `TEXT_EXTENSIONS`: List of file extensions to index +- `EXCLUDE_PATTERNS`: List of patterns to exclude from indexing (supports wildcards like `*.pyc`) + +You can also configure via: +- **Command line arguments** (highest priority): `--base-url`, `--token` +- **Web management interface** (updates user config file) +- **Environment variables** with `ACEMCP_` prefix + +## MCP Configuration + +Add the following to your MCP client configuration (e.g., Claude Desktop): + +### Basic Configuration + +```json +{ + "mcpServers": { + "acemcp": { + "command": "uvx", + "args": [ + "acemcp" + ] + } + } +} +``` + + +**Available command line arguments:** +- `--base-url`: Override BASE_URL configuration +- `--token`: Override TOKEN configuration +- `--web-port`: Enable web management interface on specified port (e.g., 8080) + +### Configuration with Web Management Interface + +To enable the web management interface, add the `--web-port` argument: + +```json +{ + "mcpServers": { + "acemcp": { + "command": "uvx", + "args": [ + "acemcp", + "--web-port", + "8888" + ] + } + } +} +``` + +Then access the management interface at `http://localhost:8888` + +**Web Management Features:** +- **Configuration Management**: View and edit server configuration (BASE_URL, TOKEN, BATCH_SIZE, MAX_LINES_PER_BLOB, TEXT_EXTENSIONS) +- **Real-time Logs**: Monitor server logs in real-time via WebSocket connection with intelligent reconnection + - Exponential backoff reconnection strategy (1s → 1.5s → 2.25s ... 
max 30s) + - Maximum 10 reconnection attempts to prevent infinite loops + - Automatic reconnection on network failures + - Reduced log noise (WebSocket connections logged at DEBUG level) +- **Tool Debugger**: Test and debug MCP tools directly from the web interface + - Test `search_context` tool with project path and query + - View formatted results and error messages + +## Tools + +### search_context + +Search for relevant code context based on a query. This tool **automatically performs incremental indexing** before searching, ensuring results are always up-to-date. It performs **semantic search** across your codebase and returns formatted text snippets showing where relevant code is located. + +**Key Features:** +- **Automatic Incremental Indexing**: Before each search, the tool automatically indexes only new or modified files, skipping unchanged files for efficiency +- **No Manual Indexing Required**: You don't need to manually index your project - just search and the tool handles indexing automatically +- **Always Up-to-Date**: Search results reflect the current state of your codebase +- **Multi-Encoding Support**: Automatically detects and handles multiple file encodings (UTF-8, GBK, GB2312, Latin-1) +- **.gitignore Integration**: Automatically respects `.gitignore` patterns when indexing projects + +**Parameters:** +- `project_root_path` (string): Absolute path to the project root directory + - **IMPORTANT**: Use forward slashes (`/`) as path separators, even on Windows + - Windows example: `C:/Users/username/projects/myproject` + - Linux/Mac example: `/home/username/projects/myproject` +- `query` (string): Natural language search query to find relevant code context + - Use descriptive keywords related to what you're looking for + - The tool performs semantic matching, not just keyword search + - Returns code snippets with file paths and line numbers + +**What it returns:** +- Formatted text snippets from files that match your query +- File paths and line 
numbers for each snippet +- Context around the relevant code sections +- Multiple results ranked by relevance + +**Query Examples:** + +1. **Finding configuration code:** + ```json + { + "project_root_path": "C:/Users/username/projects/myproject", + "query": "logging configuration setup initialization logger" + } + ``` + Returns: Code related to logging setup, logger initialization, and configuration + +2. **Finding authentication logic:** + ```json + { + "project_root_path": "C:/Users/username/projects/myproject", + "query": "user authentication login password validation" + } + ``` + Returns: Authentication handlers, login functions, password validation code + +3. **Finding database code:** + ```json + { + "project_root_path": "C:/Users/username/projects/myproject", + "query": "database connection pool initialization" + } + ``` + Returns: Database connection setup, pool configuration, initialization code + +4. **Finding error handling:** + ```json + { + "project_root_path": "C:/Users/username/projects/myproject", + "query": "error handling exception try catch" + } + ``` + Returns: Error handling patterns, exception handlers, try-catch blocks + +5. 
**Finding API endpoints:** + ```json + { + "project_root_path": "C:/Users/username/projects/myproject", + "query": "API endpoint routes HTTP handlers" + } + ``` + Returns: API route definitions, HTTP handlers, endpoint implementations + +**Tips for better results:** +- Use multiple related keywords (e.g., "logging configuration setup" instead of just "logging") +- Include technical terms specific to what you're looking for +- Describe the functionality rather than exact variable names +- Try different phrasings if the first query doesn't return what you need + +**Indexing Features:** +- **Incremental Indexing**: Only new or modified files are uploaded, unchanged files are skipped +- **Hash-based Deduplication**: Files are identified by SHA-256 hash of path + content +- **Automatic Retry**: Network requests are automatically retried up to 3 times with exponential backoff (1s, 2s, 4s) +- **Batch Resilience**: If a batch upload fails after retries, the tool continues with the next batch +- **File Splitting**: Large files are automatically split into multiple blobs (default: 800 lines per blob) +- **Exclude Patterns**: Automatically skips virtual environments, node_modules, .git, build artifacts, etc. 
+- **Multi-Encoding Support**: Automatically detects file encoding (UTF-8, GBK, GB2312, Latin-1) with fallback to UTF-8 with error handling +- **.gitignore Integration**: Automatically loads and respects `.gitignore` patterns from project root, combined with configured exclude patterns + +**Search Features:** +- **Automatic Retry**: Search requests are automatically retried up to 3 times with exponential backoff (2s, 4s, 8s) +- **Graceful Degradation**: Returns a clear error message if the search fails after all retries +- **Timeout Handling**: Uses a 60-second timeout to handle long-running searches +- **Empty Result Handling**: Returns a helpful message if no relevant code is found + +**Default Exclude Patterns:** +``` +.venv, venv, .env, env, node_modules, .git, .svn, .hg, __pycache__, +.pytest_cache, .mypy_cache, .tox, .eggs, *.egg-info, dist, build, +.idea, .vscode, .DS_Store, *.pyc, *.pyo, *.pyd, .Python, +pip-log.txt, pip-delete-this-directory.txt, .coverage, htmlcov, +.gradle, target, bin, obj +``` +Patterns support wildcards (`*`, `?`) and match against directory/file names or paths. + +**Note:** If a `.gitignore` file exists in the project root, its patterns will be automatically loaded and combined with the configured exclude patterns. The `.gitignore` patterns follow Git's standard wildmatch syntax. 
+ +## Advanced Features + +### Multi-Encoding File Support + +Acemcp automatically detects and handles files with different character encodings, making it suitable for international projects: + +- **Automatic Detection**: Tries multiple encodings in order: UTF-8 → GBK → GB2312 → Latin-1 +- **Fallback Handling**: If all encodings fail, uses UTF-8 with error handling to prevent crashes +- **Logging**: Records which encoding was successfully used for each file (DEBUG level) +- **No Configuration Required**: Works out of the box for most common encodings + +This is particularly useful for: +- Projects with mixed encoding files (e.g., UTF-8 source code + GBK documentation) +- Legacy codebases using non-UTF-8 encodings +- International teams with files in different languages + +### .gitignore Integration + +Acemcp automatically respects your project's `.gitignore` file: + +- **Automatic Loading**: Reads `.gitignore` from project root if it exists +- **Standard Syntax**: Supports Git's standard wildmatch patterns +- **Combined Filtering**: Works alongside configured `EXCLUDE_PATTERNS` +- **Directory Handling**: Properly handles directory patterns with trailing slashes +- **No Configuration Required**: Just place a `.gitignore` in your project root + +**Example `.gitignore` patterns:** +```gitignore +# Dependencies +node_modules/ +vendor/ + +# Build outputs +dist/ +build/ +*.pyc + +# IDE files +.vscode/ +.idea/ + +# Environment files +.env +.env.local +``` + +All these patterns will be automatically respected during indexing, in addition to the default exclude patterns. + +## Usage + +1. Start the MCP server (automatically started by MCP client) +2. Use `search_context` to search for code context + - The tool automatically indexes your project before searching + - Incremental indexing ensures only new/modified files are uploaded + - No manual indexing step required! 
+ - Files are automatically handled regardless of encoding + - `.gitignore` patterns are automatically respected + +## Data Storage + +- **Configuration**: `~/.acemcp/settings.toml` +- **Indexed projects**: `~/.acemcp/data/projects.json` (fixed location) +- **Log files**: `~/.acemcp/log/acemcp.log` (with automatic rotation) +- Projects are identified by their absolute path (normalized with forward slashes) + +## Logging + +The application automatically logs to `~/.acemcp/log/acemcp.log` with the following features: + +- **Console output**: INFO level and above (colored output) +- **File output**: DEBUG level and above (detailed format with module, function, and line number) +- **Automatic rotation**: Log files are rotated when they reach 5MB +- **Retention**: Maximum of 10 log files are kept +- **Compression**: Rotated log files are automatically compressed to `.zip` format +- **Thread-safe**: Logging is thread-safe for concurrent operations + +**Log format:** +``` +2025-11-06 13:51:25 | INFO | acemcp.server:main:103 - Starting acemcp MCP server... +``` + +The log files are automatically created on first run and require no manual configuration. + +## Web Management Interface + +The web management interface provides: +- **Real-time server status** monitoring +- **Live log streaming** via WebSocket +- **Configuration viewing** (current settings) +- **Project statistics** (number of indexed projects) + +To enable the web interface, use the `--web-port` argument when starting the server. 
+ +**Features:** +- Real-time log display with auto-scroll +- Server status and metrics +- Configuration overview +- Responsive design with Tailwind CSS +- No build step required (uses CDN resources) +- Intelligent WebSocket reconnection with exponential backoff + +## Recent Updates + +### Version 0.1.4 (Latest) + +**New Features:** +- ✨ **Multi-Encoding Support**: Automatic detection and handling of multiple file encodings (UTF-8, GBK, GB2312, Latin-1) +- ✨ **.gitignore Integration**: Automatic loading and respect of `.gitignore` patterns from project root +- ✨ **Improved Tool Response Format**: Changed from list-based to dictionary-based response format for better client compatibility + +**Improvements:** +- 🔧 **WebSocket Optimization**: Intelligent reconnection with exponential backoff (1s → 30s max) +- 🔧 **Reduced Log Noise**: WebSocket connections now logged at DEBUG level instead of INFO +- 🔧 **Connection Stability**: Maximum 10 reconnection attempts to prevent infinite loops +- 🔧 **Better Error Handling**: Graceful fallback for files that can't be decoded with any encoding + +**Bug Fixes:** +- 🐛 Fixed frequent WebSocket connection/disconnection cycles +- 🐛 Fixed encoding errors when reading files with non-UTF-8 encodings +- 🐛 Improved handling of .gitignore patterns with directory matching + diff --git a/pyproject.toml b/pyproject.toml index 4abbd25..086440c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "acemcp" -version = "0.1.3" -description = "Add your description here" +version = "0.1.4" +description = "MCP server for codebase indexing and semantic search with multi-encoding support and .gitignore integration" readme = "README.md" authors = [ { name = "wmymz", email = "wmymz@icloud.com" } @@ -16,6 +16,7 @@ dependencies = [ "uvicorn[standard]>=0.34.0", "toml>=0.10.2", "websockets>=14.1", + "pathspec>=0.12.1", ] [project.scripts] diff --git a/src/acemcp/index/manager.py b/src/acemcp/index/manager.py index 
1b6f964..70824a4 100644 --- a/src/acemcp/index/manager.py +++ b/src/acemcp/index/manager.py @@ -8,9 +8,46 @@ import os from pathlib import Path import httpx +import pathspec from loguru import logger +def read_file_with_encoding(file_path: Path) -> str: + """Read file content with automatic encoding detection. + + Tries multiple encodings in order: utf-8, gbk, gb2312, latin-1. + + Args: + file_path: Path to the file to read + + Returns: + File content as string + + Raises: + Exception: If file cannot be read with any supported encoding + """ + encodings = ["utf-8", "gbk", "gb2312", "latin-1"] + + for encoding in encodings: + try: + with file_path.open("r", encoding=encoding) as f: + content = f.read() + logger.debug(f"Successfully read {file_path} with encoding: {encoding}") + return content + except (UnicodeDecodeError, LookupError): + continue + + # If all encodings fail, try with errors='ignore' + try: + with file_path.open("r", encoding="utf-8", errors="ignore") as f: + content = f.read() + logger.warning(f"Read {file_path} with utf-8 and errors='ignore' (some characters may be lost)") + return content + except Exception as e: + logger.error(f"Failed to read {file_path} with any encoding: {e}") + raise + + def calculate_blob_name(path: str, content: str) -> str: """Calculate blob_name (blob id) using SHA-256 hash. @@ -64,6 +101,30 @@ class IndexManager: """ return str(Path(path).resolve()).replace("\\", "/") + def _load_gitignore(self, root_path: Path) -> pathspec.PathSpec | None: + """Load and parse .gitignore file from project root. 
+ + Args: + root_path: Root path of the project + + Returns: + PathSpec object if .gitignore exists, None otherwise + """ + gitignore_path = root_path / ".gitignore" + if not gitignore_path.exists(): + logger.debug(f"No .gitignore found at {gitignore_path}") + return None + + try: + with gitignore_path.open("r", encoding="utf-8") as f: + patterns = f.read().splitlines() + spec = pathspec.PathSpec.from_lines("gitwildmatch", patterns) + logger.info(f"Loaded .gitignore with {len(patterns)} patterns from {gitignore_path}") + return spec + except Exception as e: + logger.warning(f"Failed to load .gitignore from {gitignore_path}: {e}") + return None + async def _retry_request(self, func, max_retries: int = 3, retry_delay: float = 1.0, *args, **kwargs): """Retry an async function with exponential backoff. @@ -100,12 +161,13 @@ class IndexManager: raise last_exception - def _should_exclude(self, path: Path, root_path: Path) -> bool: - """Check if a path should be excluded based on exclude patterns. + def _should_exclude(self, path: Path, root_path: Path, gitignore_spec: pathspec.PathSpec | None = None) -> bool: + """Check if a path should be excluded based on exclude patterns and .gitignore. 
Args: path: Path to check root_path: Root path of the project + gitignore_spec: PathSpec object from .gitignore (optional) Returns: True if path should be excluded, False otherwise @@ -115,6 +177,18 @@ class IndexManager: path_str = str(relative_path) path_parts = relative_path.parts + # Check .gitignore patterns first + if gitignore_spec is not None: + # Use forward slashes for gitignore matching + path_str_forward = path_str.replace("\\", "/") + # Add trailing slash for directories + if path.is_dir(): + path_str_forward += "/" + if gitignore_spec.match_file(path_str_forward): + logger.debug(f"Excluded by .gitignore: {path_str_forward}") + return True + + # Check exclude_patterns for pattern in self.exclude_patterns: # Check if pattern matches any part of the path for part in path_parts: @@ -214,20 +288,23 @@ class IndexManager: msg = f"Project root path does not exist: {project_root_path}" raise FileNotFoundError(msg) + # Load .gitignore if exists + gitignore_spec = self._load_gitignore(root_path) + for dirpath, dirnames, filenames in os.walk(root_path): current_dir = Path(dirpath) # Filter out excluded directories to prevent os.walk from descending into them dirnames[:] = [ d for d in dirnames - if not self._should_exclude(current_dir / d, root_path) + if not self._should_exclude(current_dir / d, root_path, gitignore_spec) ] for filename in filenames: file_path = current_dir / filename # Check if file should be excluded - if self._should_exclude(file_path, root_path): + if self._should_exclude(file_path, root_path, gitignore_spec): excluded_count += 1 logger.debug(f"Excluded file: {file_path.relative_to(root_path)}") continue @@ -237,8 +314,7 @@ class IndexManager: try: relative_path = file_path.relative_to(root_path) - with file_path.open("r", encoding="utf-8") as f: - content = f.read() + content = read_file_with_encoding(file_path) # Split file if necessary file_blobs = self._split_file_content(str(relative_path), content) diff --git a/src/acemcp/server.py 
b/src/acemcp/server.py index b57f78c..e252669 100644 --- a/src/acemcp/server.py +++ b/src/acemcp/server.py @@ -47,7 +47,7 @@ async def list_tools() -> list[Tool]: @app.call_tool() -async def call_tool(name: str, arguments: dict) -> list[dict]: +async def call_tool(name: str, arguments: dict) -> dict: """Handle tool calls. Args: @@ -62,7 +62,7 @@ async def call_tool(name: str, arguments: dict) -> list[dict]: if name == "search_context": return await search_context_tool(arguments) - return [{"type": "text", "text": f"Unknown tool: {name}"}] + return {"type": "text", "text": f"Unknown tool: {name}"} async def run_web_server(port: int) -> None: @@ -72,7 +72,14 @@ async def run_web_server(port: int) -> None: port: Port to run the web server on """ web_app = create_app() - config_uvicorn = uvicorn.Config(web_app, host="0.0.0.0", port=port, log_level="info") + # Set log_level to "warning" to reduce WebSocket connection noise + config_uvicorn = uvicorn.Config( + web_app, + host="0.0.0.0", + port=port, + log_level="warning", + access_log=False # Disable access log to reduce noise + ) server = uvicorn.Server(config_uvicorn) await server.serve() diff --git a/src/acemcp/tools/search_context.py b/src/acemcp/tools/search_context.py index ab0d3e8..f203232 100644 --- a/src/acemcp/tools/search_context.py +++ b/src/acemcp/tools/search_context.py @@ -8,7 +8,7 @@ from acemcp.config import get_config from acemcp.index import IndexManager -async def search_context_tool(arguments: dict[str, Any]) -> list[dict[str, Any]]: +async def search_context_tool(arguments: dict[str, Any]) -> dict[str, Any]: """Search for code context based on query. 
Args: @@ -17,17 +17,17 @@ async def search_context_tool(arguments: dict[str, Any]) -> list[dict[str, Any]] - query: Search query string Returns: - List containing search results + Dictionary containing search results """ try: project_root_path = arguments.get("project_root_path") query = arguments.get("query") if not project_root_path: - return [{"type": "text", "text": "Error: project_root_path is required"}] + return {"type": "text", "text": "Error: project_root_path is required"} if not query: - return [{"type": "text", "text": "Error: query is required"}] + return {"type": "text", "text": "Error: query is required"} logger.info(f"Tool invoked: search_context for project {project_root_path} with query: {query}") @@ -43,9 +43,9 @@ async def search_context_tool(arguments: dict[str, Any]) -> list[dict[str, Any]] ) result = await index_manager.search_context(project_root_path, query) - return [{"type": "text", "text": result}] + return {"type": "text", "text": result} except Exception as e: logger.exception("Error in search_context_tool") - return [{"type": "text", "text": f"Error: {e!s}"}] + return {"type": "text", "text": f"Error: {e!s}"} diff --git a/src/acemcp/web/app.py b/src/acemcp/web/app.py index 6eca1da..5dca2ae 100644 --- a/src/acemcp/web/app.py +++ b/src/acemcp/web/app.py @@ -172,15 +172,16 @@ def create_app() -> FastAPI: await websocket.accept() queue: asyncio.Queue = asyncio.Queue() log_broadcaster.add_client(queue) + logger.debug("WebSocket client connected") try: while True: log_message = await queue.get() await websocket.send_text(log_message) except WebSocketDisconnect: - pass - except Exception: - logger.exception("WebSocket error") + logger.debug("WebSocket client disconnected normally") + except Exception as e: + logger.warning(f"WebSocket error: {e}") finally: log_broadcaster.remove_client(queue) diff --git a/src/acemcp/web/templates/index.html b/src/acemcp/web/templates/index.html index 742e1e0..67b0f73 100644 --- 
a/src/acemcp/web/templates/index.html +++ b/src/acemcp/web/templates/index.html @@ -281,6 +281,11 @@ wsConnected: false, ws: null, logIdCounter: 0, + wsReconnectAttempts: 0, + wsMaxReconnectAttempts: 10, + wsReconnectDelay: 1000, // Initial delay: 1 second + wsReconnectTimer: null, + wsManualClose: false, // Tool debugger state selectedTool: 'search_context', toolArgs: { @@ -384,12 +389,25 @@ }, connectWebSocket() { + // Clear any pending reconnect timer + if (this.wsReconnectTimer) { + clearTimeout(this.wsReconnectTimer); + this.wsReconnectTimer = null; + } + // Close existing connection if any if (this.ws && (this.ws.readyState === WebSocket.OPEN || this.ws.readyState === WebSocket.CONNECTING)) { console.log('Closing existing WebSocket connection'); + this.wsManualClose = true; this.ws.close(); } + // Check if max reconnect attempts reached + if (this.wsReconnectAttempts >= this.wsMaxReconnectAttempts) { + console.warn('Max WebSocket reconnect attempts reached. Please refresh the page.'); + return; + } + const protocol = window.location.protocol === 'https:' ? 
'wss:' : 'ws:'; const wsUrl = `${protocol}//${window.location.host}/ws/logs`; @@ -397,6 +415,9 @@ this.ws.onopen = () => { this.wsConnected = true; + this.wsReconnectAttempts = 0; // Reset on successful connection + this.wsReconnectDelay = 1000; // Reset delay + this.wsManualClose = false; console.log('WebSocket connected'); }; @@ -416,10 +437,24 @@ }); }; - this.ws.onclose = () => { + this.ws.onclose = (event) => { this.wsConnected = false; - console.log('WebSocket disconnected, reconnecting...'); - setTimeout(() => this.connectWebSocket(), 3000); + + // Don't reconnect if manually closed or normal closure + if (this.wsManualClose || event.code === 1000) { + console.log('WebSocket closed normally'); + return; + } + + // Increment reconnect attempts + this.wsReconnectAttempts++; + + // Calculate exponential backoff delay (max 30 seconds) + const delay = Math.min(this.wsReconnectDelay * Math.pow(1.5, this.wsReconnectAttempts - 1), 30000); + + console.log(`WebSocket disconnected (attempt ${this.wsReconnectAttempts}/${this.wsMaxReconnectAttempts}), reconnecting in ${delay}ms...`); + + this.wsReconnectTimer = setTimeout(() => this.connectWebSocket(), delay); }; this.ws.onerror = (error) => { @@ -427,6 +462,19 @@ }; }, + disconnectWebSocket() { + this.wsManualClose = true; + if (this.wsReconnectTimer) { + clearTimeout(this.wsReconnectTimer); + this.wsReconnectTimer = null; + } + if (this.ws) { + this.ws.close(); + this.ws = null; + } + this.wsConnected = false; + }, + clearLogs() { this.logs = []; }, diff --git a/uv.lock b/uv.lock index 5f236bd..45e130e 100644 --- a/uv.lock +++ b/uv.lock @@ -4,7 +4,7 @@ requires-python = ">=3.10" [[package]] name = "acemcp" -version = "0.1.3" +version = "0.1.4" source = { editable = "." 
} dependencies = [ { name = "dynaconf" }, @@ -12,6 +12,7 @@ dependencies = [ { name = "httpx" }, { name = "loguru" }, { name = "mcp" }, + { name = "pathspec" }, { name = "toml" }, { name = "uvicorn", extra = ["standard"] }, { name = "websockets" }, @@ -31,6 +32,7 @@ requires-dist = [ { name = "httpx", specifier = ">=0.28.1" }, { name = "loguru", specifier = ">=0.7.3" }, { name = "mcp", specifier = ">=1.1.2" }, + { name = "pathspec", specifier = ">=0.12.1" }, { name = "toml", specifier = ">=0.10.2" }, { name = "uvicorn", extras = ["standard"], specifier = ">=0.34.0" }, { name = "websockets", specifier = ">=14.1" }, @@ -504,6 +506,15 @@ wheels = [ { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, ] +[[package]] +name = "pathspec" +version = "0.12.1" +source = { registry = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/" } +sdist = { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/ca/bc/f35b8446f4531a7cb215605d100cd88b7ac6f44ab3fc94870c120ab3adbf/pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712", size = 51043, upload-time = "2023-12-10T22:30:45Z" } +wheels = [ + { url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/cc/20/ff623b09d963f88bfde16306a54e12ee5ea43e9b597108672ff3a408aad6/pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08", size = 31191, upload-time = "2023-12-10T22:30:43.14Z" }, +] + [[package]] name = "pip" version = "25.3"