#!/usr/bin/env python3
"""
Phase 5.6: Evidence verification for vulnerability findings.

Reads all findings/*.json, performs automated evidence quality checks,
and flags findings that need manual review by the orchestrator.

Checks performed:
  1. Structural: empty http_interactions, missing essential fields
  2. Status code: 404 or 5xx returned for a claimed vulnerability
  3. Type-specific: response body must contain expected evidence patterns
  4. Body quality: empty body, generic error pages, insufficient evidence

Output: workspace/verification.json with a verdict for each finding, plus a
human-readable workspace/verification_summary.txt when findings were loaded.

Usage:
    python verify_findings.py <workspace_dir> [--strict]
"""

import argparse
import json
import sys
from pathlib import Path

# ---- Type-specific evidence patterns ------------------------------------
# Maps a finding's "type" value to the substring rules that
# check_evidence_type applies to the concatenated response bodies:
#   "must_contain_one": at least one entry has to appear
#                       (an empty list means "no requirement")
#   "must_not_contain": every entry that does appear is reported as an issue
# Matching is a case-insensitive substring search, so short entries such as
# "49", "env" or "STAT" also match inside longer words (e.g. "status").
VULN_TYPE_EVIDENCE = {
    "sqli": {
        "must_contain_one": [
            "SQL syntax", "mysql_fetch", "mysql error", "ORA-", "PostgreSQL",
            "SQLite", "Microsoft OLE DB", "ODBC Driver", "JDBC", "SQLSTATE",
            "You have an error in your SQL", "check the manual",
            "Unknown column", "Unclosed quotation mark", "syntax error",
            "Warning: mysql", "Fatal error: Uncaught mysqli_sql_exception",
            "PDOException", "SQLException", "DB2 SQL Error",
        ],
        "must_not_contain": [],
    },
    "xss_stored": {
        "must_contain_one": [
            "<script>alert", "onerror=alert", "onclick=alert",
            "javascript:alert", "<img src=x onerror", "<svg onload",
        ],
        "must_not_contain": ["&lt;script&gt;", "&lt;img"],
    },
    "xss_reflected": {
        "must_contain_one": [
            "<script>alert", "onerror=alert", "onclick=alert",
            "javascript:alert", "<img src=x onerror", "<svg onload",
        ],
        "must_not_contain": ["&lt;script&gt;", "&lt;img"],
    },
    "xss_dom": {
        "must_contain_one": [],
        "must_not_contain": [],
    },
    "ssrf": {
        "must_contain_one": [
            "ami-id", "instance-id", "security-credentials",  # AWS
            "computeMetadata", "service-accounts",  # GCP
            "azure", "ovf-env.xml",  # Azure
            "aliyun", "instance/",  # Alibaba
            "kubernetes", "serviceaccount",  # K8s
            "redis_version", "# Server",  # Redis
            "docker", "containers",  # Docker
            "elasticsearch", "cluster_name",  # ES
            "STAT", "VERSION", "CLIENT LIST",  # Memcached
        ],
        "must_not_contain": [],
    },
    "rce": {
        "must_contain_one": [
            "uid=", "gid=", "root:", "bin:",  # /etc/passwd
            "Linux", "Darwin", "Windows",  # uname
            "total", "drwx",  # ls output
            "PID", "CMD",  # ps output
            "eth0", "lo:",  # ifconfig
        ],
        "must_not_contain": [],
    },
    "lfi": {
        "must_contain_one": [
            "root:", "daemon:", "bin:", "nobody:",  # /etc/passwd
            "<?php", "[extensions]", "DB_NAME",  # config files
            "BEGIN RSA PRIVATE KEY",  # SSH key
            "shadow", "passwd",
        ],
        "must_not_contain": [],
    },
    "xxe": {
        "must_contain_one": [
            "root:", "daemon:",  # file read
            "ENTITY", "DOCTYPE",  # DTD
        ],
        "must_not_contain": [],
    },
    "ssti": {
        "must_contain_one": ["49", "777", "7*7"],
        "must_not_contain": [],
    },
    "idor": {
        "must_contain_one": [],
        "must_not_contain": [],
    },
    "auth_bypass": {
        "must_contain_one": [],
        "must_not_contain": ["login", "signin", "unauthorized", "401", "403"],
    },
    "information_disclosure": {
        "must_contain_one": [
            "password", "secret", "api_key", "token", "private_key",
            "connectionString", "jdbc:", "mysql://", "Driver={",
            ".git", "HEAD", "refs/heads",
            "WEB-INF", "web.xml", "application.properties",
            "actuator", "health", "env", "mappings",
            "accessKey", "SecretKey", "AKIA",
        ],
        "must_not_contain": [],
    },
    "open_redirect": {
        "must_contain_one": [],
        "must_not_contain": [],
    },
    "csrf": {
        "must_contain_one": [],
        "must_not_contain": [],
    },
    "file_upload": {
        "must_contain_one": [],
        "must_not_contain": [],
    },
    "directory_listing": {
        "must_contain_one": [
            "Index of", "Directory Listing", "Parent Directory",
            "[DIR]", "&lt;dir&gt;", "To Parent Directory",
        ],
        "must_not_contain": [],
    },
    "request_smuggling": {
        "must_contain_one": [],
        "must_not_contain": [],
    },
    "prototype_pollution": {
        "must_contain_one": [],
        "must_not_contain": [],
    },
    "insecure_deserialization": {
        "must_contain_one": [],
        "must_not_contain": [],
    },
    "weak_password": {
        "must_contain_one": [],
        "must_not_contain": ["login", "signin", "密码错误", "incorrect", "invalid"],
    },
    "brute_force": {
        "must_contain_one": [],
        "must_not_contain": ["login", "signin", "密码错误", "incorrect", "invalid"],
    },
}

# ---- Error page patterns (responses that suggest NOT a real vuln) --------
# Substrings that check_body_quality searches for, case-insensitively, in
# each response body. Only the first matching entry per interaction is
# reported, so list order decides which pattern shows up in the issue text.
ERROR_PAGE_PATTERNS = [
    "<title>404", "<title>403", "<title>500",
    "页面不存在", "Page Not Found", "Not Found</h1>",
    "nginx</title>", "Apache Tomcat", "IIS 7.5",
    "Stack trace:", "Exception in thread",
    "未找到", "找不到", "无法找到",
    "Bad Request</h1>", "Method Not Allowed",
    "Service Unavailable", "Service Temporarily Unavailable",
]


def load_all_findings(findings_dir: Path) -> list:
    """Load every finding from ``findings_dir/*.json`` into one flat list.

    Each file is expected to hold a JSON object with an optional ``"agent"``
    string and a ``"findings"`` list of objects. Every returned finding dict
    is annotated in place with ``_source_file`` (the file name) and
    ``_source_agent`` (the file's ``"agent"`` value, or the file stem when
    absent).

    Files that cannot be read, decoded or parsed, and files or entries with
    an unexpected shape, are reported on stderr and skipped so that one bad
    file does not abort the whole run.

    Args:
        findings_dir: Directory containing the per-agent JSON files.

    Returns:
        The annotated finding dicts, in sorted-file then in-file order.
        Empty when the directory does not exist.
    """
    all_findings = []
    if not findings_dir.exists():
        return all_findings

    for fp in sorted(findings_dir.glob("*.json")):
        # Keep the try body limited to the I/O and parsing that can fail.
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        try:
            with open(fp, 'r', encoding='utf-8') as fh:
                data = json.load(fh)
        except (ValueError, OSError) as e:
            print(f"  [!] 读取失败 {fp.name}: {e}", file=sys.stderr)
            continue

        if not isinstance(data, dict):
            print(f"  [!] 读取失败 {fp.name}: 顶层结构不是 JSON 对象", file=sys.stderr)
            continue

        agent = data.get("agent", fp.stem)
        # An explicit `"findings": null` is treated like a missing key.
        findings = data.get("findings") or []
        if not isinstance(findings, list):
            print(f"  [!] 读取失败 {fp.name}: findings 字段不是列表", file=sys.stderr)
            continue

        for finding in findings:
            if not isinstance(finding, dict):
                print(f"  [!] 跳过 {fp.name} 中非对象的 finding 条目", file=sys.stderr)
                continue
            finding["_source_file"] = fp.name
            finding["_source_agent"] = agent
            all_findings.append(finding)

    return all_findings


def _status_as_int(value) -> int:
    """Coerce a recorded status code to int; 0 when missing or unparseable."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return 0


def check_structural(finding: dict) -> list[str]:
    """Check the structural integrity of a finding's HTTP evidence.

    Flags, per interaction: a 200 response with an empty body, a 404
    response, and any response with status >= 500. A finding with no
    ``http_interactions`` at all yields a single issue.

    Malformed evidence is tolerated rather than raising: a missing or null
    ``response`` is treated as an empty one, a status code stored as a
    string (e.g. ``"404"``) is coerced to int, and an interaction that is
    not a JSON object is reported as its own issue.

    Args:
        finding: One finding dict; only ``http_interactions`` is read.

    Returns:
        Human-readable issue descriptions, empty when nothing was flagged.
    """
    issues = []
    # `or []` also covers an explicit `"http_interactions": null`.
    interactions = finding.get("http_interactions") or []

    if not interactions:
        issues.append("无 http_interactions 证据")
        return issues

    for i, hi in enumerate(interactions):
        if not isinstance(hi, dict):
            issues.append(f"seq={i + 1}: http_interaction 条目不是对象，无法检查")
            continue

        # Fall back to the 1-based position when the entry has no "seq".
        seq = hi.get("seq", i + 1)
        resp = hi.get("response")
        if not isinstance(resp, dict):
            resp = {}
        status = _status_as_int(resp.get("status_code"))
        body = resp.get("body") or ""

        # Empty response body
        if status == 200 and not body:
            issues.append(f"seq={seq}: 200 但响应体为空")

        # 404 on claimed vulnerability
        if status == 404:
            issues.append(f"seq={seq}: 返回 404，可能端点不存在")

        # 5xx on claimed vulnerability (might be real but needs review)
        if status >= 500:
            issues.append(f"seq={seq}: 返回 {status}，可能触发异常但需人工确认是否为漏洞证据")

    return issues


def check_evidence_type(vuln_type: str, interactions: list) -> list[str]:
    """Check that the response evidence matches the claimed vulnerability type.

    Looks up the rules for ``vuln_type`` in ``VULN_TYPE_EVIDENCE`` and
    applies them to the lowercased concatenation of all response bodies:

    * ``must_contain_one``: an issue is raised when none of the patterns
      appears anywhere in the bodies.
    * ``must_not_contain``: an issue lists every pattern that does appear.

    Matching is a case-insensitive substring search. The type lookup is
    also case-insensitive and ignores surrounding whitespace, so "SQLi"
    uses the "sqli" rules instead of silently skipping the check.

    Args:
        vuln_type: The finding's claimed type.
        interactions: The finding's ``http_interactions`` list; ``None``,
            non-dict entries and null responses are treated as having no body.

    Returns:
        Issue descriptions; empty when the type has no rules or all pass.
    """
    type_key = vuln_type.strip().lower() if isinstance(vuln_type, str) else vuln_type
    evidence_rules = VULN_TYPE_EVIDENCE.get(type_key)
    if not evidence_rules:
        return []

    issues = []

    bodies = []
    for hi in interactions or []:
        resp = hi.get("response") if isinstance(hi, dict) else None
        body = resp.get("body") if isinstance(resp, dict) else None
        # str() keeps a non-string body (e.g. a parsed JSON object)
        # searchable instead of raising on .lower().
        bodies.append(str(body or "").lower())
    # Space-separate the bodies so a pattern cannot match across two responses.
    all_body = "".join(" " + body for body in bodies)

    # Check must_contain_one
    must_contain = evidence_rules.get("must_contain_one", [])
    if must_contain and not any(p.lower() in all_body for p in must_contain):
        issues.append(
            f"类型 {vuln_type} 要求响应中至少包含以下一条特征，但均未找到: "
            f"{must_contain[:5]}..."
        )

    # Check must_not_contain
    must_not = evidence_rules.get("must_not_contain", [])
    found_bad = [p for p in must_not if p.lower() in all_body]
    if found_bad:
        issues.append(
            f"响应中含不应出现的特征(表示payload被转义/防护): {found_bad}"
        )

    return issues


def check_body_quality(interactions: list) -> list[str]:
    """Check whether each response body looks like real evidence or an error page.

    For every interaction this flags:

    * the first ``ERROR_PAGE_PATTERNS`` entry found in the body
      (case-insensitive substring match), and
    * a body shorter than 30 characters on a 200 response.

    Args:
        interactions: The finding's ``http_interactions`` list. ``None`` and
            non-dict entries are skipped; a missing or null ``response`` is
            treated as an empty one.

    Returns:
        Issue descriptions, empty when nothing was flagged.
    """
    issues = []

    for hi in interactions or []:
        if not isinstance(hi, dict):
            continue
        resp = hi.get("response")
        if not isinstance(resp, dict):
            resp = {}
        # str() keeps a non-string body (e.g. a parsed JSON object) checkable.
        body = str(resp.get("body") or "").lower()
        # Prefer "seq", then "label", as the identifier shown in the issue text.
        seq = hi.get("seq", hi.get("label", "?"))

        # Report only the first matching error-page pattern per interaction.
        hit = next((p for p in ERROR_PAGE_PATTERNS if p.lower() in body), None)
        if hit is not None:
            issues.append(f"seq={seq}: 响应体含错误页面特征 '{hit}' — 可能非漏洞证据")

        # Body too short for a meaningful vulnerability response
        if len(body) < 30 and resp.get("status_code") == 200:
            issues.append(f"seq={seq}: 响应体过短({len(body)}字符)，证据不充分")

    return issues


def verify_one_finding(finding: dict) -> dict:
    """Run all evidence checks on one finding and derive a verdict.

    Verdict rules, evaluated in order:

    1. ``reject`` when the finding has no HTTP interactions.
    2. ``reject`` when there are at least two issues and one mentions 404.
    3. ``needs_review`` when there is at least one issue.
    4. ``verified`` otherwise.

    Args:
        finding: A finding dict, optionally carrying the ``_source_agent``
            and ``_source_file`` annotations.

    Returns:
        A dict with ``vuln_id``, ``title``, ``type``, ``severity``,
        ``confidence``, ``source_agent``, ``source_file``, ``verdict``,
        ``issues`` and ``reason``.
    """
    vuln_id = finding.get("vuln_id", "?")
    vuln_type = finding.get("type", "unknown")
    # A null title becomes "" so consumers can safely slice/format it.
    title = finding.get("title") or ""
    severity = finding.get("severity", "info")
    confidence = finding.get("confidence", "potential")
    # `or []` also covers an explicit `"http_interactions": null`, which
    # would otherwise be passed on as None and break the checks below.
    interactions = finding.get("http_interactions") or []

    all_issues = []
    all_issues.extend(check_structural(finding))
    all_issues.extend(check_evidence_type(vuln_type, interactions))
    all_issues.extend(check_body_quality(interactions))

    # Determine verdict
    if not interactions:
        verdict = "reject"
        reason = "无 http_interactions 证据"
    # NOTE(review): this is a substring test on the issue text, so it also
    # fires for a soft-404 page ('<title>404') or a seq label containing
    # "404" — confirm that is intended.
    elif len(all_issues) >= 2 and any("404" in iss for iss in all_issues):
        verdict = "reject"
        reason = "证据指向404/不存在的端点"
    elif all_issues:
        verdict = "needs_review"
        reason = f"存在 {len(all_issues)} 个证据质量问题"
    else:
        verdict = "verified"
        reason = "自动检查通过"

    return {
        "vuln_id": vuln_id,
        "title": title,
        "type": vuln_type,
        "severity": severity,
        "confidence": confidence,
        "source_agent": finding.get("_source_agent", "?"),
        "source_file": finding.get("_source_file", "?"),
        "verdict": verdict,
        "issues": all_issues,
        "reason": reason,
    }


def _write_verification_report(workspace: Path, verification: dict) -> Path:
    """Write ``verification`` to ``workspace/verification.json``; return the path."""
    output_path = workspace / "verification.json"
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(verification, f, indent=2, ensure_ascii=False)
    return output_path


def _finding_headline(result: dict) -> str:
    """Format ``[id] type — title`` with the title truncated to 60 characters."""
    # str(... or "") guards against a null or non-string title.
    title = str(result.get("title") or "")
    return f"[{result['vuln_id']}] {result['type']} — {title[:60]}"


def _build_summary_lines(verification: dict, needs_review: list, rejected: list) -> list:
    """Build the lines of the human-readable verification_summary.txt."""
    lines = [
        "证据复查摘要",
        "=" * 50,
        f"总计: {verification['total']}",
        f"通过: {verification['verified']}",
        f"待审: {verification['needs_review']}",
        f"剔除: {verification['rejected']}",
        "",
    ]

    if needs_review:
        lines.append("--- 待人工审查 ---")
        for r in needs_review:
            lines.append(f"  {_finding_headline(r)}")
            lines.extend(f"    ! {iss}" for iss in r["issues"])

    if rejected:
        lines.append("\n--- 建议剔除 ---")
        lines.extend(f"  {_finding_headline(r)}" for r in rejected)

    return lines


def main():
    """Command-line entry point: verify all findings in a workspace.

    Loads ``<workspace>/findings/*.json``, verifies each finding, prints a
    summary and writes ``verification.json``. When findings were loaded it
    also writes ``verification_summary.txt``.

    Exits with status 1 when the findings directory does not exist, or,
    with ``--strict``, when any finding was rejected or needs review.
    """
    parser = argparse.ArgumentParser(
        description="Phase 5.6: Verify vulnerability finding evidence"
    )
    parser.add_argument("workspace", help="Path to vibe-pentest workspace directory")
    parser.add_argument(
        "--strict",
        action="store_true",
        help="Exit with status 1 if any finding is rejected or needs review",
    )
    args = parser.parse_args()

    workspace = Path(args.workspace)
    findings_dir = workspace / "findings"

    if not findings_dir.exists():
        print(f"[!] findings 目录不存在: {findings_dir}")
        sys.exit(1)

    # Load all findings
    all_findings = load_all_findings(findings_dir)
    print(f"[*] 加载 {len(all_findings)} 个漏洞发现")

    if not all_findings:
        # Still emit an (empty) report so downstream steps find the file.
        print("[*] 无发现，无需复查")
        output_path = _write_verification_report(workspace, {
            "total": 0,
            "verified": 0,
            "needs_review": 0,
            "rejected": 0,
            "findings": [],
        })
        print(f"[+] 复查报告已保存: {output_path}")
        return

    # Verify each finding
    results = [verify_one_finding(finding) for finding in all_findings]

    # Summary
    verified = [r for r in results if r["verdict"] == "verified"]
    needs_review = [r for r in results if r["verdict"] == "needs_review"]
    rejected = [r for r in results if r["verdict"] == "reject"]

    verification = {
        "total": len(results),
        "verified": len(verified),
        "needs_review": len(needs_review),
        "rejected": len(rejected),
        "findings": results,
    }

    # Print summary
    print(f"\n{'=' * 55}")
    print("  证据复查结果:")
    print(f"    总计:       {verification['total']}")
    print(f"    通过(verified):    {verification['verified']}")
    print(f"    待审(needs_review): {verification['needs_review']}")
    print(f"    剔除(rejected):    {verification['rejected']}")
    print(f"{'=' * 55}")

    if needs_review:
        print("\n  [!] 待人工审查的漏洞:")
        for r in needs_review:
            print(f"    {_finding_headline(r)}")
            for iss in r["issues"]:
                print(f"        ! {iss}")

    if rejected:
        print("\n  [x] 建议剔除的漏洞:")
        for r in rejected:
            print(f"    {_finding_headline(r)}")
            print(f"        原因: {r['reason']}")

    # Write verification report
    output_path = _write_verification_report(workspace, verification)
    print(f"\n[+] 复查报告已保存: {output_path}")

    # Also write a human-readable summary
    summary_path = workspace / "verification_summary.txt"
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("\n".join(_build_summary_lines(verification, needs_review, rejected)))

    # Exit code: non-zero only in strict mode, if any rejected or needs_review
    if args.strict and (rejected or needs_review):
        sys.exit(1)


# Script entry point: run the verification only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
