#!/usr/bin/env python3
"""
Phase 6: Aggregate findings from all agents and generate final JSON report.

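Reads all findings/*.json files, deduplicates and normalises them, validates
the result against pentest_json_spec.md, and writes report_<timestamp>.json and
report_<report_id>.json into the workspace (plus an optional --output path).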

Usage:
    python generate_report.py <workspace_dir> --target-url <url> [--tech-stack ...]
"""

import argparse
import json
import os
import re
import sys
import random
import string
from datetime import datetime, timezone
from pathlib import Path

# ---------------------------------------------------------------------------
# Constants — synced with pentest_json_spec.md
# ---------------------------------------------------------------------------

# Allowed values for a vulnerability's `severity`; normalise_vuln() coerces
# anything else to "info" and validate_report() reports values outside this set.
VALID_SEVERITIES = ("critical", "high", "medium", "low", "info")
# Allowed values for `confidence`; normalise_vuln() coerces anything else to
# "potential".
VALID_CONFIDENCES = ("confirmed", "likely", "potential")
# Allowed values for `type`; normalise_vuln() maps known synonyms into this set
# and falls back to "unknown" for everything else.
VALID_TYPES = (
    "sqli", "xss_stored", "xss_reflected", "xss_dom", "idor", "ssrf", "rce",
    "lfi", "rfi", "xxe", "ssti", "csrf", "open_redirect", "auth_bypass",
    "broken_access_control", "information_disclosure", "insecure_deserialization",
    "unknown",
)

# type → type_zh normalisation table: a given type always maps to the same
# Chinese label, and labels are deliberately coarse-grained.
# NOTE: normalise_vuln() looks this table up after coercing `type` into
# VALID_TYPES, so keys outside VALID_TYPES are not reached from that call site.
TYPE_ZH_MAP = {
    # Injection
    "sqli": "SQL注入",
    "nosqli": "SQL注入",
    "xss_stored": "XSS注入",
    "xss_reflected": "XSS注入",
    "xss_dom": "XSS注入",
    "ssrf": "SSRF",
    "xxe": "XXE注入",
    "ssti": "模板注入",
    "rce": "远程代码执行",
    "crlf_injection": "CRLF注入",
    "xslt_injection": "注入攻击",
    "el_injection": "注入攻击",
    "jndi_injection": "注入攻击",
    "command_injection": "命令注入",
    "prototype_pollution": "原型污染",
    "type_juggling": "类型混淆",
    "insecure_deserialization": "反序列化漏洞",
    # Access control
    "idor": "越权访问",
    "broken_access_control": "访问控制缺陷",
    "auth_bypass": "认证绕过",
    "csrf": "CSRF",
    # Information disclosure
    "information_disclosure": "信息泄露",
    "open_redirect": "开放重定向",
    # File handling
    "lfi": "文件包含",
    "rfi": "文件包含",
    "dir_traversal": "目录穿越",
    "file_upload": "文件上传漏洞",
    # HTTP protocol
    "request_smuggling": "HTTP走私",
    "waf_bypass": "WAF绕过",
    # Business logic
    "workflow_bypass": "业务逻辑漏洞",
    "race_condition": "竞争条件",
    "pricing_manipulation": "业务逻辑漏洞",
    "coupon_abuse": "业务逻辑漏洞",
    "subscription_hijack": "业务逻辑漏洞",
    # Other
    "clickjacking": "点击劫持",
    "webshell": "Webshell",
    "weak_password": "弱口令",
    "brute_force": "暴力破解",
    "git_exposure": "信息泄露",
    "directory_listing": "信息泄露",
    "cookie_security": "Cookie安全问题",
    "url_redirect": "开放重定向",
    "unknown": "未分类",
}

# Keys that validate_report() requires to be present and non-null on every
# entry of report["vulnerabilities"].
REQUIRED_VULN_FIELDS = [
    "vuln_id", "title", "type", "severity", "confidence",
    "authenticated", "target_url", "description",
]

# type → RepairSuggestions lookup table: generic remediation advice per vulnerability type
REPAIR_SUGGESTIONS_MAP = {
    "sqli": "1. 使用参数化查询（预编译语句）替代字符串拼接SQL；2. 实施输入验证和白名单过滤，拒绝异常字符；3. 对数据库账户实施最小权限原则，避免使用root/dba账户连接数据库；4. 部署WAF拦截常见SQL注入payload；5. 开启数据库慢查询日志监控异常请求。",
    "nosqli": "1. 使用参数化查询或ODM/ORM框架处理NoSQL查询；2. 对用户输入进行严格类型检查，拒绝JSON对象作为查询参数；3. 避免使用$where等JavaScript执行操作符；4. 实施输入验证，确保查询字段类型与预期一致。",
    "xss_stored": "1. 对用户输入进行输出编码（HTML实体编码），在渲染时转义特殊字符；2. 实施Content-Security-Policy(CSP)头限制脚本执行来源；3. 对富文本输入使用安全的HTML过滤器（如DOMPurify）；4. 对存储的评论内容实施标签白名单机制。",
    "xss_reflected": "1. 对所有用户输入参数进行输出编码/转义处理；2. 设置正确的Content-Type响应头防止MIME嗅探；3. 启用X-Content-Type-Options: nosniff安全头；4. 部署CSP头限制脚本执行来源；5. 避免将用户输入直接嵌入HTML响应中。",
    "xss_dom": "1. 避免使用innerHTML、outerHTML、document.write等危险DOM API，改用textContent或createElement；2. 对用户输入到DOM sink的数据进行编码转义；3. 审查JavaScript代码中location.hash、location.search等source点到sink的数据流；4. 部署CSP限制内联脚本执行。",
    "idor": "1. 实施基于角色的访问控制(RBAC)，在API层校验请求的资源所有权；2. 使用不可预测的资源标识符（如UUID）替代递增数字ID；3. 对每个API请求进行权限校验，确保当前用户有权访问请求的资源；4. 实施多层级授权检查（对象级和功能级）。",
    "ssrf": "1. 对用户提供的URL进行严格验证，限制允许的协议(http/https)和域名白名单；2. 禁止访问内网IP地址段(10.x/172.16.x/192.168.x/127.x)和云元数据地址(169.254.169.254)；3. 使用URL解析库而非正则表达式进行验证；4. 服务端发起请求时使用独立的网络命名空间。",
    "rce": "1. 避免使用任何系统命令执行函数（exec/system/popen等）处理用户输入；2. 如必须执行系统命令，使用参数化命令替代shell拼接；3. 实施严格的输入白名单验证；4. 使用沙箱或容器隔离命令执行环境；5. 以最低权限运行应用程序进程。",
    "lfi": "1. 对用户传入的文件路径参数实施严格的白名单验证；2. 使用固定的文件映射表而非直接使用用户输入的路径；3. 禁止路径穿越字符(../、..\\)；4. 使用chroot或虚拟文件系统限制文件访问范围；5. 配置Web服务器禁止访问敏感系统文件。",
    "rfi": "1. 禁用PHP的allow_url_include和allow_url_fopen配置项；2. 不对用户输入进行文件包含操作；3. 实施白名单机制限制可包含的文件路径；4. 使用Web应用防火墙拦截外部URL的文件包含请求。",
    "xxe": "1. 禁用XML解析器的DTD处理功能（DocumentBuilderFactory.setFeature(DISALLOW_DOCTYPE_DECL, true)）；2. 避免使用用户可控的XML数据；3. 使用JSON替代XML作为数据交换格式；4. 配置XML解析器拒绝外部实体引用。",
    "ssti": "1. 使用模板引擎的自动转义功能（autoescape=true）；2. 不对用户输入进行服务端模板渲染；3. 使用沙箱化的模板环境，限制可访问的函数和对象；4. 避免在模板表达式中拼接用户输入。",
    "csrf": "1. 确保所有状态修改请求都携带有效的CSRF Token；2. 验证CSRF Token在服务端的唯一性和有效性；3. 检查Cookie的SameSite标志位设置为Strict或Lax；4. 对关键操作添加二次验证（如密码确认、短信验证码）；5. 验证请求的Referer/Origin头。",
    "open_redirect": "1. 对重定向URL参数实施白名单验证，仅允许信任的域名；2. 使用间接引用（映射ID）替代直接URL重定向；3. 禁止协议相对URL(//)和绝对URL跳转到外部域名；4. 在跳转前向用户展示即将前往的目标地址并需确认。",
    "auth_bypass": "1. 对所有敏感接口实施认证检查，确保未授权请求被拦截；2. 避免依赖客户端逻辑进行权限控制；3. 使用统一的认证中间件/过滤器确保所有路由受保护；4. 定期审查新增API端点的安全配置；5. 实施基于角色的访问控制(RBAC)。",
    "broken_access_control": "1. 实施最小权限原则，默认拒绝所有未明确授权的请求；2. 在API/路由层实施统一的权限校验中间件；3. 对敏感操作实施多因素认证(MFA)；4. 确保会话Cookie设置HttpOnly、Secure、SameSite标志；5. 登录后重新生成会话标识符防止会话固定攻击。",
    "information_disclosure": "1. 审查所有API响应，移除不应暴露的敏感字段（密码哈希、内部ID、密钥等）；2. 实施DTO模式，将内部数据模型与外部API响应解耦；3. 配置Web服务器隐藏版本信息（Server头、X-Powered-By头）；4. 移除生产环境中的调试端点（如/actuator/*、/debug/*）；5. 确保错误页面不泄露堆栈跟踪和代码路径。",
    "insecure_deserialization": "1. 避免反序列化来自不可信来源的数据；2. 实施类型白名单，仅允许预期类的反序列化；3. 使用安全的序列化框架，配置gadget chain过滤；4. 对序列化数据进行签名/加密防止篡改；5. 部署RASP运行时应用自保护监控反序列化行为。",
    "crlf_injection": "1. 对用户输入进行CR/LF字符过滤或编码；2. 避免将用户输入直接设置到HTTP响应头中；3. 使用Web框架提供的安全API设置响应头；4. 部署WAF拦截含%0d%0a的请求参数。",
    "xslt_injection": "1. 禁用XSLT处理器中的扩展函数（如php:function()）；2. 不对用户输入进行XSLT样式表转换；3. 使用预定义的安全样式表而非动态加载；4. 配置XSLT处理器限制文件系统访问。",
    "el_injection": "1. 避免将用户输入传入表达式引擎求值；2. 禁用Spring EL等表达式语言的安全相关类访问；3. 使用参数绑定替代表达式拼接；4. 实施表达式白名单验证，仅允许预期的表达式模式。",
    "jndi_injection": "1. 升级日志框架到修复版本（Log4j 2.17.1+）；2. 禁用JNDI Lookup或设置log4j2.formatMsgNoLookups=true；3. 部署WAF拦截含${jndi:}的请求；4. 限制应用的网络出站连接，阻止对恶意LDAP/RMI服务器的访问。",
    "prototype_pollution": "1. 使用Object.create(null)创建无原型对象；2. 对用户输入的JSON进行__proto__和constructor字段过滤；3. 升级Node.js和依赖库到修复版本；4. 使用Object.freeze(Object.prototype)冻结原型链。",
    "type_juggling": "1. 使用严格比较操作符(===/!==)替代松散比较(==/!=)；2. 对用户输入进行严格类型转换和验证；3. 避免将用户输入直接与敏感值比较；4. 对密码哈希比较使用hash_equals等时间安全函数。",
    "request_smuggling": "1. 确保前端代理和后端服务器使用一致的HTTP头解析逻辑；2. 禁用Transfer-Encoding头或统一处理策略；3. 升级Web服务器到修复HTTP走私漏洞的版本；4. 使用HTTP/2协议避免HTTP/1.1走私问题；5. 配置WAF检测走私特征。",
    "workflow_bypass": "1. 在服务端实施完整的状态机校验，确保每个状态转换合法；2. 对业务流程的每个步骤进行前置条件验证；3. 使用工作流引擎管理复杂业务流程；4. 对状态变更实施审计日志记录。",
    "race_condition": "1. 使用数据库行级锁（SELECT ... FOR UPDATE）保护并发操作；2. 使用Redis分布式锁或数据库乐观锁防止重复操作；3. 实施幂等性设计，确保重复请求产生相同结果；4. 对关键业务操作实施频率限制。",
    "pricing_manipulation": "1. 在服务端计算价格，不信任客户端提交的金额/折扣字段；2. 对价格相关字段实施服务端二次校验；3. 使用数据库精度类型（DECIMAL）而非浮点数存储金额；4. 对异常价格变动实施告警监控。",
    "coupon_abuse": "1. 对优惠券实施一次性使用限制（使用后标记已使用）；2. 将优惠券绑定到特定用户账户防止共享；3. 设置优惠券的有效期和使用次数上限；4. 实施频率限制防止自动化批量使用。",
    "subscription_hijack": "1. 对订阅变更操作实施严格的所有权验证；2. 降级订阅后应立即移除高权限功能访问；3. 退款后同步取消相关订阅权益；4. 对支付方式变更实施二次身份验证。",
    "waf_bypass": "1. 升级WAF规则和特征库到最新版本；2. 配置多层WAF（边缘WAF + 应用层WAF）；3. 启用WAF的异常检测和机器学习模式；4. 定期审查WAF拦截日志分析绕过尝试。",
    "clickjacking": "1. 设置X-Frame-Options: DENY或SAMEORIGIN响应头；2. 配置Content-Security-Policy的frame-ancestors指令；3. 对关键操作使用JavaScript的frame-busting代码；4. 对嵌入iframe的场景实施Origin校验。",
    "webshell": "1. 严格限制文件上传功能，仅允许安全的文件类型（图片、文档）；2. 对上传文件进行内容验证（魔数检测、图片二次渲染）；3. 将上传文件存储在非执行目录，禁止上传目录执行脚本；4. 部署Webshell检测工具定期扫描上传目录。",
    "weak_password": "1. 强制实施密码复杂度策略（最小8位、含大小写字母+数字+特殊字符）；2. 对新注册/修改密码检查是否使用弱口令字典；3. 实施账户锁定策略（连续5次失败锁定15分钟）；4. 建议使用密码泄露查询API（Have I Been Pwned）阻止使用已泄露密码。",
    "brute_force": "1. 实施登录频率限制（如每分钟最多5次尝试）；2. 部署验证码机制（CAPTCHA/reCAPTCHA）增加自动化攻击成本；3. 实施渐进式延迟响应，每次失败增加响应时间；4. 配置账户锁定策略；5. 监控并告警异常登录模式。",
    "git_exposure": "1. 立即删除生产环境中的.git目录；2. 在Web服务器配置中禁止访问隐藏文件/目录（.git、.svn、.env等）；3. 部署.gitjumbo等工具检测Git暴露；4. 使用.gitignore排除敏感文件。",
    "directory_listing": "1. 在Web服务器配置中禁用目录浏览（Nginx: autoindex off; Apache: Options -Indexes）；2. 在应用目录下放置空的index.html文件；3. 审查所有公开目录确保不包含敏感文件。",
    "cookie_security": "1. 为所有Cookie设置Secure标志，确保仅通过HTTPS传输；2. 设置HttpOnly标志防止JavaScript读取敏感Cookie；3. 设置SameSite=Strict或Lax标志防止CSRF攻击；4. 设置合理的Cookie过期时间。",
    "file_upload": "1. 实施严格的文件类型白名单验证（扩展名+MIME类型+魔数三重校验）；2. 禁止上传可执行文件（.php/.jsp/.aspx/.exe等）；3. 重命名上传文件为随机名称防止路径猜测；4. 将上传文件存储到独立的非Web执行目录或对象存储。",
    "dir_traversal": "1. 对用户传入的文件路径参数进行规范化处理后校验；2. 禁止路径中包含../或..\\序列；3. 使用白名单限制可访问的目录范围；4. 使用安全的文件API（如Java的Path.normalize()）处理路径。",
    "url_redirect": "1. 对重定向目标URL实施域名白名单验证；2. 使用映射ID替代直接URL跳转；3. 跳转前向用户展示目标地址需确认；4. 禁止协议相对URL跳转到外部域名。",
    "command_injection": "1. 避免使用系统命令执行函数处理用户输入；2. 使用参数化命令替代shell命令拼接；3. 实施严格的输入白名单验证；4. 使用容器隔离命令执行环境。",
    "unknown": "1. 针对该漏洞类型进行专项分析和修复；2. 参考OWASP Top 10和对应技术栈的安全最佳实践；3. 建议进行代码安全审计确认具体风险；4. 根据实际漏洞利用场景制定针对性修复方案。",
}

# Dotted paths that validate_report() resolves with _nested_get() on every
# http_interactions entry; a path resolving to None is reported as missing.
REQUIRED_HTTP_FIELDS = [
    "seq", "label",
    "request.method", "request.url",
    "response.status_code",
]

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def generate_report_id() -> str:
    """Return a report identifier of the form ``VUL-YYYY-XXXXX``.

    ``YYYY`` is the current UTC year, so it always agrees with the UTC
    ``generated_at`` timestamp written into the report. ``XXXXX`` is five
    random characters drawn from ``A-Z0-9``.

    The suffix is random, not checked against existing reports: collisions
    are unlikely but possible, and the value must not be treated as a secret.
    """
    alphabet = string.ascii_uppercase + string.digits
    suffix = "".join(random.choices(alphabet, k=5))
    year = datetime.now(timezone.utc).year
    return f"VUL-{year}-{suffix}"


def _nested_get(obj: dict, dotted_key: str):
    """Retrieve a nested value from a dict using dot notation."""
    parts = dotted_key.split('.')
    cur = obj
    for p in parts:
        if not isinstance(cur, dict):
            return None
        cur = cur.get(p)
        if cur is None:
            return None
    return cur


# ---------------------------------------------------------------------------
# Loading
# ---------------------------------------------------------------------------


def load_findings(findings_dir: Path) -> list:
    """Load every ``*.json`` file in ``findings_dir`` and merge their findings.

    Each file is expected to be a JSON object with a ``findings`` list and an
    optional ``agent`` name (the file stem is used when absent). Every finding
    is tagged in place with ``_source_agent``.

    Malformed input never aborts the run: unreadable or undecodable files,
    files whose top level is not an object, and ``findings`` values that are
    not lists are reported and skipped; non-dict entries inside a findings
    list are dropped with a warning.

    Returns the merged list of finding dicts (empty if the directory is
    missing), in sorted file-name order.
    """
    all_findings = []

    if not findings_dir.exists():
        print(f"[!] 未找到 findings 目录: {findings_dir}")
        return all_findings

    for f in sorted(findings_dir.glob("*.json")):
        # Keep the try body to the I/O + parse step only. ValueError covers
        # both json.JSONDecodeError and UnicodeDecodeError.
        try:
            with open(f, 'r', encoding='utf-8') as fh:
                data = json.load(fh)
        except (ValueError, OSError) as e:
            print(f"  [!] 读取失败 {f.name}: {e}")
            continue

        if not isinstance(data, dict):
            print(f"  [!] 格式无效 {f.name}: 顶层应为 JSON 对象")
            continue

        raw_findings = data.get("findings", [])
        if raw_findings is None:
            raw_findings = []
        if not isinstance(raw_findings, list):
            print(f"  [!] 格式无效 {f.name}: findings 应为数组")
            continue

        agent_name = data.get("agent", f.stem)
        findings = [item for item in raw_findings if isinstance(item, dict)]
        for finding in findings:
            finding["_source_agent"] = agent_name
        all_findings.extend(findings)
        print(f"  [+] {f.name}: {len(findings)} 个发现")

        skipped = len(raw_findings) - len(findings)
        if skipped:
            print(f"  [!] {f.name}: 已跳过 {skipped} 个非对象条目")

    return all_findings


def load_json(path: Path) -> dict:
    """Load a JSON object from ``path``, returning ``{}`` on any failure.

    "Failure" covers a missing or unreadable file, bytes that are not valid
    UTF-8, invalid JSON, and valid JSON whose top level is not an object, so
    callers can always use ``.get`` on the result.
    """
    try:
        with open(path, 'r', encoding='utf-8') as fh:
            data = json.load(fh)
    except (ValueError, OSError):
        # ValueError: JSONDecodeError / UnicodeDecodeError.
        # OSError: missing file, permissions, etc.
        return {}
    return data if isinstance(data, dict) else {}


# ---------------------------------------------------------------------------
# Deduplication
# ---------------------------------------------------------------------------


def deduplicate_findings(findings: list) -> list:
    """Collapse findings that describe the same issue on the same URL.

    Two findings are duplicates when they share ``type`` and the same
    ``target_url`` once its query string and trailing slashes are removed.
    Among duplicates the one with the highest score is kept, where
    score = confidence rank (confirmed=3, likely=2, potential=1, other=0)
    + number of ``http_interactions``. On a tie the first one seen wins.

    A null/missing ``target_url`` or ``http_interactions`` is treated as
    empty, and entries that are not dicts are ignored. The result keeps the
    first-seen order of each (type, url) key.
    """
    confidence_rank = {"confirmed": 3, "likely": 2, "potential": 1}

    def score(finding: dict) -> int:
        rank = confidence_rank.get(finding.get("confidence", "potential"), 0)
        interactions = finding.get("http_interactions")
        count = len(interactions) if isinstance(interactions, (list, tuple)) else 0
        return rank + count

    seen: dict[tuple, dict] = {}

    for f in findings:
        if not isinstance(f, dict):
            continue

        vuln_type = f.get("type", "unknown")
        # `target_url` may be present but null; treat that like an empty URL.
        target_url = f.get("target_url") or ""
        norm_url = str(target_url).split('?')[0].rstrip('/')
        key = (vuln_type, norm_url)

        existing = seen.get(key)
        if existing is None or score(f) > score(existing):
            seen[key] = f

    return list(seen.values())


# ---------------------------------------------------------------------------
# Normalisation
# ---------------------------------------------------------------------------


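# --- RepairSuggestions context keywords mapping ---
# Each entry is (keywords, suggestion). _build_repair_suggestions() checks the
# entries BEFORE the generic type mapping and returns the suggestion of the
# first entry for which ANY keyword occurs (case-insensitively, as a substring)
# in the vuln's title, target_url or description.
# NOTE: entries are checked in order; more specific patterns must come first.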
REPAIR_SUGGESTIONS_CONTEXT_MAP = [
    # 0: API / password hash exposure (api-console users API)
    (["password", "密码哈希", "api.console", "users"],
     "1. 从API响应中彻底移除密码哈希字段（spec.password），密码哈希不应在任何API接口中返回；2. 实施字段级过滤，根据调用方角色动态控制返回字段（普通用户不可见权限信息）；3. 引入DTO模式，将内部数据模型与API响应解耦，仅返回前端必需的最小字段集；4. 参考OWASP API Security Top 10中的API4:2023（过度数据暴露）进行加固；5. 对敏感API实施速率限制，防止大规模数据爬取。"),
    # 1: Session fixation
    (["会话固定", "登录前后值不变", "XSRF-TOKEN", "SESSION"],
     "1. 在用户登录成功后立即使旧会话失效并重新生成SESSION和XSRF-TOKEN（Spring Security中配置session-fixation-protection=\"newSession\"）；2. 确保新会话仅迁移最小必要属性（如角色信息）；3. 为CSRF Token设置合理的过期时间（如30分钟）；4. 实施多设备登录管理，限制同一账户的并发会话数；5. 确保Cookie设置Secure、HttpOnly、SameSite标志。"),
    # 2: robots.txt exposure
    (["robots.txt", "Disallow", "Sitemap"],
     "1. 从robots.txt中移除/console的Disallow指令（攻击者无需遵守robots.txt，暴露路径反而提供信息）；2. 将Sitemap URL中的localhost修正为实际的公网域名和HTTPS协议；3. 确保application.yml中的externalUrl配置项设置为正确的公网地址；4. 对敏感路径添加认证要求，而非依赖robots.txt隐藏。"),
    # 3: Actuator health endpoint (unauthorized access)
    (["/actuator/health", "健康状态", "liveness", "readiness", "未授权可访问"],
     "1. 配置management.endpoint.health.show-details=when-authorized要求认证后才显示详细信息；2. 如果/actuator/health用于外部负载均衡器健康检查，建议仅返回最简响应（200 OK），移除groups字段；3. 使用Spring Security对所有/actuator/*端点添加认证要求；4. 配置management.server.port为独立端口仅对内网开放。"),
    # 4: Actuator globalinfo (authenticated config exposure)
    (["globalinfo", "siteTitle", "externalUrl", "postSlugGenerationStrategy", "时区"],
     "1. 将application.yml中的externalUrl从localhost修正为实际公网地址，避免密码重置等邮件链接指向localhost；2. 评估globalinfo端点的必要性，如非必需则禁用（management.endpoint.globalinfo.enabled=false）；3. 限制该端点的访问权限，仅允许管理员角色访问；4. 审查端点返回的配置信息，移除不必要公开的设置项。"),
    # 5: Comment plugin config / captcha disabled
    (["评论", "captcha", "验证码功能未启用", "commentwidget"],
     "1. 在管理后台启用评论验证码功能（建议配置CAPTCHA或reCAPTCHA v2/v3）；2. 限制评论插件配置API的访问权限，避免低权限用户获取安全配置信息；3. 实施评论频率限制（如：同一IP每分钟最多3条评论），防止自动化垃圾评论；4. 对评论内容实施严格的HTML标签白名单过滤（仅允许<b>、<i>、<a>等安全标签）。"),
    # 6: Security headers missing
    (["安全响应头", "Strict-Transport-Security", "Content-Security-Policy", "HSTS", "CSP"],
     "1. 优先部署HTTPS：配置SSL/TLS证书，添加Strict-Transport-Security: max-age=31536000; includeSubDomains强制HTTPS；2. 添加Content-Security-Policy头，建议初始策略为default-src 'self'；script-src 'self' 'unsafe-inline'，后续逐步收紧；3. 添加Permissions-Policy头限制不必要的浏览器功能；4. 如果使用Nginx反向代理，可在server块中添加add_header指令。"),
    # 7: Open redirect
    (["开放重定向", "redirect", "跳转到"],
     "1. 对重定向URL参数实施白名单验证，仅允许信任的域名；2. 使用间接引用（映射ID）替代直接URL重定向；3. 禁止协议相对URL(//)和绝对URL跳转到外部域名；4. 在跳转前向用户展示即将前往的目标地址并需确认。"),
    # 8: XSS
    (["XSS", "跨站脚本", "脚本注入"],
     "1. 对用户输入进行输出编码（HTML实体编码），在渲染时转义特殊字符；2. 实施Content-Security-Policy(CSP)头限制脚本执行来源；3. 对富文本输入使用安全的HTML过滤器（如DOMPurify）；4. 避免使用innerHTML等危险DOM API，改用textContent。"),
    # 9: SQL injection
    (["SQL注入", "SQLi", "sql injection"],
     "1. 使用参数化查询（预编译语句）替代字符串拼接SQL；2. 实施输入验证和白名单过滤，拒绝异常字符；3. 对数据库账户实施最小权限原则；4. 部署WAF拦截常见SQL注入payload；5. 开启数据库慢查询日志监控异常请求。"),
    # 10: File upload / traversal
    (["文件上传", "文件包含", "路径穿越", "穿越"],
     "1. 实施严格的文件类型白名单验证（扩展名+MIME类型+魔数三重校验）；2. 禁止上传可执行文件；3. 重命名上传文件为随机名称防止路径猜测；4. 对文件路径参数实施白名单验证，禁止路径中包含../或..\\序列。"),
    # 11: SSRF
    (["SSRF", "服务端请求伪造", "内网"],
     "1. 对用户提供的URL进行严格验证，限制允许的协议(http/https)和域名白名单；2. 禁止访问内网IP地址段和云元数据地址；3. 使用URL解析库而非正则表达式进行验证；4. 服务端发起请求时使用独立的网络命名空间。"),
]


def _build_repair_suggestions(vuln: dict) -> str:
    """Pick remediation advice for ``vuln``.

    The title, target_url and description are lower-cased and joined into one
    search string. The first REPAIR_SUGGESTIONS_CONTEXT_MAP entry that has any
    keyword occurring in that string (case-insensitive substring match) wins.
    When no entry matches, the advice is looked up by ``type`` in
    REPAIR_SUGGESTIONS_MAP, falling back to its "unknown" entry.
    """
    haystack = " ".join(
        (vuln.get(field) or "").lower()
        for field in ("title", "target_url", "description")
    )

    # Context-specific advice takes precedence over the per-type table.
    for keywords, suggestion in REPAIR_SUGGESTIONS_CONTEXT_MAP:
        for keyword in keywords:
            if keyword.lower() in haystack:
                return suggestion

    # Generic per-type advice.
    vuln_type = vuln.get("type", "unknown")
    if vuln_type in REPAIR_SUGGESTIONS_MAP:
        return REPAIR_SUGGESTIONS_MAP[vuln_type]
    return REPAIR_SUGGESTIONS_MAP.get("unknown", "")


def _is_chinese_text(text: str) -> bool:
    """Check if text already contains Chinese characters.

    Returns True if text has at least one CJK character AND Chinese chars
    make up a meaningful portion (scales threshold for short text).
    A text like 'SQL注入 /api/users' (3 CJK in 20 chars = 15%) returns True.
    A text like 'SQL Injection in /api/users' (0 CJK) returns False.
    """
    if not text:
        return True
    cjk_count = sum(1 for ch in text if '一' <= ch <= '鿿')
    if cjk_count == 0:
        return False
    total_chars = len(text.strip())
    if total_chars == 0:
        return True
    # For short text with any Chinese content, consider it Chinese
    if cjk_count >= 2:
        return True
    # Single CJK char in very long English text is suspicious, require 10%
    return cjk_count / total_chars >= 0.10


def _translate_to_chinese(text: str) -> str:
    """Translate vulnerability title/description to Chinese.

    Uses a built-in dictionary for common vulnerability terms with
    case-insensitive matching. For full sentences that can't be translated,
    returns with a translation note so the orchestrator can handle it via LLM.
    """
    if not text or _is_chinese_text(text):
        return text

    # Common vulnerability term translations (all keys lowercase for matching)
    # Note: longer terms must come first (sorted by -len) so "cross-site scripting"
    # matches before "xss" to avoid double translation like "XSS注入注入".
    term_map = {
        "cross-site scripting": "XSS注入",
        "stored xss": "存储型XSS",
        "reflected xss": "反射型XSS",
        "dom-based xss": "DOM型XSS",
        "sql injection": "SQL注入",
        "server-side request forgery": "SSRF",
        "remote code execution": "远程代码执行",
        "rce": "远程代码执行",
        "local file inclusion": "本地文件包含",
        "remote file inclusion": "远程文件包含",
        "path traversal": "目录穿越",
        "directory traversal": "目录穿越",
        "cross-site request forgery": "CSRF",
        "authentication bypass": "认证绕过",
        "insecure direct object reference": "越权访问",
        "idor": "越权访问",
        "broken access control": "访问控制缺陷",
        "information disclosure": "信息泄露",
        "open redirect": "开放重定向",
        "server-side template injection": "模板注入",
        "ssti": "模板注入",
        "xml external entity": "XXE注入",
        "command injection": "命令注入",
        "insecure deserialization": "反序列化漏洞",
        "crlf injection": "CRLF注入",
        "http request smuggling": "HTTP走私",
        "http response splitting": "HTTP响应分割",
        "file upload": "文件上传漏洞",
        "webshell": "Webshell",
        "weak password": "弱口令",
        "brute force": "暴力破解",
        "race condition": "竞争条件",
        "prototype pollution": "原型污染",
        "type juggling": "类型混淆",
        "clickjacking": "点击劫持",
        "cookie security": "Cookie安全问题",
        "waf bypass": "WAF绕过",
        "dns rebinding": "DNS重绑定",
        "subdomain takeover": "子域名接管",
        "cache deception": "缓存欺骗",
        "business logic flaw": "业务逻辑漏洞",
        "parameter tampering": "参数篡改",
        "mass assignment": "批量赋值",
        "graphql": "GraphQL",
        "nosql injection": "NoSQL注入",
        "jndi injection": "JNDI注入",
        "expression language injection": "表达式注入",
        "xslt injection": "XSLT注入",
        "request smuggling": "HTTP走私",
        "unauthorized access": "未授权访问",
        "data exposure": "数据泄露",
        "vulnerability": "漏洞",
        "parameter": "参数",
        "endpoint": "接口",
        "response": "响应",
        "request": "请求",
        "unauthenticated": "未认证",
        "without authentication": "无需认证",
        "allows attacker": "允许攻击者",
        "allows an attacker": "允许攻击者",
        "could allow": "可能导致",
        "could be exploited": "可被利用",
        "arbitrary": "任意",
        "unrestricted": "无限制",
        "missing": "缺失",
        "bypass": "绕过",
        "xxe": "XXE注入",
        "xss": "XSS注入",
        "csrf": "CSRF",
    }

    # Build single-pass alternation regex sorted by length descending (longest
    # first) so "cross-site scripting" matches before "xss" and replacement
    # text is never re-matched.
    sorted_terms = sorted(term_map.keys(), key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(t) for t in sorted_terms), re.IGNORECASE)
    result = pattern.sub(lambda m: term_map[m.group(0).lower()], text)

    if result != text:
        return result

    # No terms matched — return with translation note
    return f"[待翻译] {text}"


def _normalise_http_interaction(raw: dict, position: int) -> dict:
    """Reshape one raw http_interactions entry into the report schema.

    ``position`` is the 1-based position of the entry in the original list and
    supplies the default ``seq`` / ``label``. A null or non-dict ``request`` /
    ``response`` is treated as empty.
    """
    req = raw.get("request")
    if not isinstance(req, dict):
        req = {}
    resp = raw.get("response")
    if not isinstance(resp, dict):
        resp = {}

    req_headers = req.get("headers")
    resp_headers = resp.get("headers")
    status = resp.get("status_code")
    # bool is a subclass of int; a True/False status code is not a number.
    status_is_number = isinstance(status, (int, float)) and not isinstance(status, bool)

    return {
        "seq": raw.get("seq", position),
        "label": raw.get("label", f"step {position}"),
        "request": {
            "method": str(req.get("method") or "GET").upper(),
            "url": req.get("url", ""),
            "headers": req_headers if isinstance(req_headers, dict) else {},
            "body": req.get("body", None),
        },
        "response": {
            "status_code": status if status_is_number else 0,
            "headers": resp_headers if isinstance(resp_headers, dict) else {},
            # Both `body` and `body_excerpt` are accepted as the field name.
            "body": resp.get("body") or resp.get("body_excerpt"),
        },
    }


def normalise_vuln(vuln: dict, index: int) -> dict:
    """Normalise a single vulnerability entry to match pentest_json_spec.md.

    ``vuln`` is updated in place and a copy without ``_``-prefixed internal
    keys is returned. Steps:

    * ``vuln_id`` is set to ``VUL-<index as 3 digits>``.
    * ``type`` is mapped into VALID_TYPES (known synonyms first, otherwise
      "unknown") and ``type_zh`` is filled from TYPE_ZH_MAP.
    * Non-Chinese ``title`` / ``description`` are passed through
      _translate_to_chinese(); an untranslatable title becomes
      "<type_zh> - <target_url>", an untranslatable description is wrapped in
      a "[待翻译原文: ...]" marker.
    * ``RepairSuggestions`` is filled when empty. The per-type fallback uses
      the type as originally reported when REPAIR_SUGGESTIONS_MAP has an entry
      for it, otherwise the normalised type, so advice for specific types is
      not lost when the type is coerced.
    * ``severity``, ``confidence`` and ``authenticated`` are coerced to valid
      values ("info", "potential", False), and ``description`` is guaranteed
      to exist.
    * ``http_interactions`` is rebuilt in the schema shape; a null/non-list
      value becomes ``[]`` and non-dict entries are dropped.
    """
    # --- vuln_id ----------------------------------------------------------
    vuln["vuln_id"] = f"VUL-{index:03d}"

    # --- type -------------------------------------------------------------
    # Remember what the agent reported; it is used for the advice lookup below.
    original_type = vuln.get("type")
    if not isinstance(original_type, str):
        original_type = ""
    if original_type not in VALID_TYPES:
        # Attempt mapping from common synonyms
        synonym_map = {
            "cmdi": "rce", "command_injection": "rce",
            "file_upload": "broken_access_control",
            "dir_traversal": "lfi",
            "url_redirect": "open_redirect",
            "nosqli": "sqli",
            "weak_password": "auth_bypass",
            "brute_force": "auth_bypass",
            "webshell": "rce",
            "git_exposure": "information_disclosure",
            "directory_listing": "information_disclosure",
            "cookie_security": "information_disclosure",
        }
        vuln["type"] = synonym_map.get(original_type, "unknown")

    # --- type_zh (auto-populate from the normalised type) -----------------
    normalised_type = vuln["type"]
    vuln["type_zh"] = TYPE_ZH_MAP.get(normalised_type, normalised_type)

    # --- title Chinese check & translation --------------------------------
    title = vuln.get("title", "")
    if title and isinstance(title, str) and not _is_chinese_text(title):
        vuln["title"] = _translate_to_chinese(title)
        if vuln["title"].startswith("[待翻译]"):
            # Fallback: use type_zh as title prefix. `or` also covers a
            # target_url that is present but null.
            target = vuln.get("target_url") or "未知目标"
            vuln["title"] = f"{vuln['type_zh']} - {target}"

    # --- description Chinese check & translation --------------------------
    description = vuln.get("description", "")
    if description and isinstance(description, str) and not _is_chinese_text(description):
        vuln["description"] = _translate_to_chinese(description)
        if vuln["description"].startswith("[待翻译]"):
            vuln["description"] = f"[待翻译原文: {description}]"

    # --- RepairSuggestions (auto-populate with context-aware logic) -------
    if not vuln.get("RepairSuggestions"):
        if original_type in REPAIR_SUGGESTIONS_MAP:
            suggestion_type = original_type
        else:
            suggestion_type = normalised_type
        # Pass a shallow copy so the reported `type` itself stays normalised.
        vuln["RepairSuggestions"] = _build_repair_suggestions(
            {**vuln, "type": suggestion_type}
        )

    # --- severity ---------------------------------------------------------
    if vuln.get("severity") not in VALID_SEVERITIES:
        vuln["severity"] = "info"

    # --- confidence -------------------------------------------------------
    if vuln.get("confidence") not in VALID_CONFIDENCES:
        vuln["confidence"] = "potential"

    # --- authenticated ----------------------------------------------------
    if not isinstance(vuln.get("authenticated"), bool):
        vuln["authenticated"] = False

    # --- description (ensure exists) --------------------------------------
    if not vuln.get("description"):
        vuln["description"] = ""

    # --- http_interactions normalisation ----------------------------------
    interactions = vuln.get("http_interactions")
    if not isinstance(interactions, list):
        interactions = []
    vuln["http_interactions"] = [
        _normalise_http_interaction(hi, ix)
        for ix, hi in enumerate(interactions, 1)
        if isinstance(hi, dict)
    ]

    # --- strip internal fields --------------------------------------------
    return {k: v for k, v in vuln.items() if not k.startswith('_')}


# ---------------------------------------------------------------------------
# Validation
# ---------------------------------------------------------------------------


def validate_report(report: dict) -> list[str]:
    """
    Validate report against pentest_json_spec.md.
    Returns a list of error messages (empty = valid).
    """
    errors = []

    # --- report_meta ------------------------------------------------------
    meta = report.get("report_meta", {})
    for field in ("report_id", "generated_at"):
        if not meta.get(field):
            errors.append(f"report_meta.{field} 缺失")
    scope = meta.get("scope", {})
    if not scope.get("target_url"):
        errors.append("report_meta.scope.target_url 缺失")

    # --- summary ----------------------------------------------------------
    summary = report.get("summary", {})
    for level in ("total", "critical", "high", "medium", "low", "info"):
        if level not in summary:
            errors.append(f"summary.{level} 缺失")

    calc_total = sum(summary.get(k, 0) for k in ("critical", "high", "medium", "low", "info"))
    if summary.get("total") != calc_total:
        errors.append(
            f"summary.total ({summary.get('total')}) != "
            f"critical+high+medium+low+info ({calc_total})"
        )

    # --- vulnerabilities --------------------------------------------------
    vulns = report.get("vulnerabilities", [])
    if len(vulns) != summary.get("total", -1):
        errors.append(
            f"vulnerabilities 数量 ({len(vulns)}) != summary.total ({summary.get('total')})"
        )

    seen_ids = set()
    for i, v in enumerate(vulns):
        prefix = f"vulnerabilities[{i}]"

        # required fields
        for field in REQUIRED_VULN_FIELDS:
            if field not in v or v[field] is None:
                errors.append(f"{prefix}.{field} 缺失或为 null")

        # enum checks
        if v.get("severity") not in VALID_SEVERITIES:
            errors.append(f"{prefix}.severity='{v.get('severity')}' 不在 {VALID_SEVERITIES}")
        if v.get("confidence") not in VALID_CONFIDENCES:
            errors.append(f"{prefix}.confidence='{v.get('confidence')}' 不在 {VALID_CONFIDENCES}")
        if v.get("type") not in VALID_TYPES:
            errors.append(f"{prefix}.type='{v.get('type')}' 不在 {VALID_TYPES}")
        if not isinstance(v.get("authenticated"), bool):
            errors.append(f"{prefix}.authenticated 应为 boolean")

        # unique vuln_id
        vid = v.get("vuln_id")
        if vid in seen_ids:
            errors.append(f"{prefix}.vuln_id='{vid}' 重复")
        seen_ids.add(vid)

        # http_interactions
        for j, hi in enumerate(v.get("http_interactions", [])):
            hi_prefix = f"{prefix}.http_interactions[{j}]"
            for field in REQUIRED_HTTP_FIELDS:
                if _nested_get(hi, field) is None:
                    errors.append(f"{hi_prefix}.{field} 缺失")
            if not isinstance(hi.get("response", {}).get("status_code"), (int, float)):
                errors.append(f"{hi_prefix}.response.status_code 应为 number")

    return errors


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


def main():
    """CLI entry point: aggregate agent findings into the final JSON report.

    Steps: parse arguments, require a findings file for each of the six
    agents, load / deduplicate / sort / normalise the findings, build the
    report, validate it and write it to ``report_<timestamp>.json`` and
    ``report_<report_id>.json`` in the workspace (plus ``--output`` if given).

    Exit status: 1 when agent files are missing or validation reported
    problems (the report is still written in the latter case); 2 (via
    ``parser.error``) when the test-accounts input cannot be read or is not
    valid JSON.
    """
    parser = argparse.ArgumentParser(description="Generate final JSON report from agent findings")
    parser.add_argument("workspace", help="Path to vibe-pentest workspace directory")
    parser.add_argument("--target-url", required=True, help="Target URL of the pentest")
    parser.add_argument("--tech-stack", nargs='*', default=[], help="Identified tech stack")
    parser.add_argument("--test-accounts", default="[]", help="Test accounts JSON array")
    parser.add_argument("--test-accounts-file", help="Path to a JSON file containing the test accounts array")
    parser.add_argument("--report-id", default=None, help="Custom report ID")
    parser.add_argument("--output", help="Optional fixed JSON output path, for example workspace/report.json")
    args = parser.parse_args()

    # ---- Parse test accounts up front so bad input fails fast -------------
    accounts_source = "--test-accounts"
    accounts_text = args.test_accounts
    if args.test_accounts_file:
        accounts_source = "--test-accounts-file"
        try:
            accounts_text = Path(args.test_accounts_file).read_text(encoding="utf-8")
        except (OSError, ValueError) as exc:
            # ValueError covers UnicodeDecodeError for non-UTF-8 files.
            parser.error(f"cannot read --test-accounts-file: {exc}")
    try:
        test_accounts = json.loads(accounts_text)
    except ValueError as exc:
        parser.error(f"{accounts_source} is not valid JSON: {exc}")

    workspace = Path(args.workspace)
    findings_dir = workspace / "findings"

    # ---- Enforce: all 6 agent findings files must exist --------------------
    # Accept both underscore (injection_agent.json) and dash (injection-agent.json) naming
    required_agents = [
        "injection",
        "auth",
        "file",
        "api",
        "business",
        "misc",
    ]

    def agent_output_exists(name: str) -> bool:
        return any(
            (findings_dir / f"{name}{sep}agent.json").exists()
            for sep in ("_", "-")
        )

    missing_agents = [name for name in required_agents if not agent_output_exists(name)]
    if missing_agents:
        print(f"[!] 多 Agent 架构校验失败：缺少 {len(missing_agents)} 个 Agent 的输出文件")
        for name in missing_agents:
            print(f"    - findings/{name}_agent.json (或 {name}-agent.json)")
        print("\n[*] Vibe Pentest 必须以多 Agent 并行方式执行，缺少任一 Agent 的报告不完整。")
        print(f"[*] 请确认以下 {len(required_agents)} 个 Agent 均已成功执行并输出结果：")
        for name in required_agents:
            status = "缺失" if name in missing_agents else "存在"
            print(f"    [{status}] {name}_agent.json")
        sys.exit(1)
    print(f"[+] 多 Agent 架构校验通过：{len(required_agents)} 个 Agent 输出文件全部存在")

    print(f"[*] 聚合发现数据: {findings_dir}")

    # ---- Load findings ----------------------------------------------------
    all_findings = load_findings(findings_dir)
    print(f"[*] 总发现数: {len(all_findings)}")

    # ---- Deduplicate ------------------------------------------------------
    unique_findings = deduplicate_findings(all_findings)
    print(f"[+] 去重后: {len(unique_findings)}")

    # ---- Sort by severity (primary) then confidence (secondary) -----------
    # critical=0 … info=4; confirmed=0 … potential=2; unrecognised values sort
    # last. Ensures VUL-001 is the most critical+confirmed finding.
    severity_order = {"critical": 0, "high": 1, "medium": 2, "low": 3, "info": 4}
    confidence_order = {"confirmed": 0, "likely": 1, "potential": 2}
    unique_findings.sort(key=lambda f: (
        severity_order.get(f.get("severity", "info"), 5),
        confidence_order.get(f.get("confidence", "potential"), 3),
    ))

    # ---- Normalise & assign VUL-xxx IDs ------------------------------------
    unique_findings = [
        normalise_vuln(finding, position)
        for position, finding in enumerate(unique_findings, 1)
    ]

    # ---- Compute summary --------------------------------------------------
    summary = {"total": 0, "critical": 0, "high": 0, "medium": 0, "low": 0, "info": 0}
    for finding in unique_findings:
        sev = finding.get("severity", "info")
        if sev in summary:
            summary[sev] += 1
    summary["total"] = sum(summary[k] for k in ("critical", "high", "medium", "low", "info"))

    # ---- Enrich with external context -------------------------------------
    fingerprint = load_json(workspace / "fingerprint.json")
    session = load_json(workspace / "session.json")

    tech_stack = args.tech_stack
    if not tech_stack and isinstance(fingerprint, dict):
        tech_stack = fingerprint.get("tech_stack", [])

    # Fall back to the accounts recorded in session.json when none were given.
    # Comparing the parsed value (not the raw text) means an accounts file
    # containing "[]" plus whitespace still triggers the fallback.
    if test_accounts == [] and isinstance(session, dict):
        test_accounts = session.get("test_accounts") or []

    # ---- Build report -----------------------------------------------------
    report = {
        "report_meta": {
            "report_id": args.report_id or generate_report_id(),
            "generated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "tester": "vibe-pentest-agent-v1",
            "scope": {
                "target_url": args.target_url,
                "tech_stack": tech_stack,
            },
            "test_accounts": test_accounts,
        },
        "summary": summary,
        "vulnerabilities": unique_findings,
    }

    # ---- Validate ---------------------------------------------------------
    print("\n[*] 格式验证中...")
    validation_errors = validate_report(report)
    if validation_errors:
        print(f"[!] 发现 {len(validation_errors)} 个格式问题:")
        for err in validation_errors:
            print(f"    - {err}")
        print("\n[*] 报告仍会输出，但请修复上述问题后重新生成。")
    else:
        print("[+] 格式验证通过")

    # ---- Write report -----------------------------------------------------
    # Generate timestamp from report's generated_at field (sanitized for filenames)
    report_id = report["report_meta"]["report_id"]
    generated_at = report["report_meta"]["generated_at"]
    # Convert ISO 8601 to filename-safe: 2026-05-19T08:50:00Z → 20260519T085000Z
    ts_safe = generated_at.replace(":", "").replace("-", "")
    report_file_ts = workspace / f"report_{ts_safe}.json"
    report_file_id = workspace / f"report_{report_id}.json"

    # Serialise once; every destination receives identical content.
    serialised = json.dumps(report, indent=2, ensure_ascii=False)
    for report_file in (report_file_ts, report_file_id):
        report_file.write_text(serialised, encoding='utf-8')
    if args.output:
        output_file = Path(args.output)
        output_file.parent.mkdir(parents=True, exist_ok=True)
        output_file.write_text(serialised, encoding='utf-8')

    print(f"\n[+] 报告已生成: {report_file_ts}")
    print(f"[+] 报告已存档: {report_file_id}")
    if args.output:
        print(f"[+] 报告已写入: {args.output}")
    print("\n[*] 汇总:")
    print(f"    总计: {summary['total']}")
    print(f"    严重: {summary['critical']}")
    print(f"    高危: {summary['high']}")
    print(f"    中危: {summary['medium']}")
    print(f"    低危: {summary['low']}")
    print(f"    信息: {summary['info']}")

    # Exit non-zero if validation failed
    if validation_errors:
        sys.exit(1)


if __name__ == "__main__":
    main()
