suanli-juzhen/01_扫描模块/scripts/import_scan_results.py

#!/usr/bin/env python3
"""
扫描结果全量导入 MongoDB
========================
将端口扫描结果 + 用户链信息 + 登录分析 合并写入 KR.分布式矩阵IP_已扫描

功能：
1. 读取扫描结果 JSON
2. 从 KR.分布式矩阵IP 查找每个 IP 关联的所有用户记录
3. 构建完整文档（用户链 + 扫描 + 登录分析）
4. 写入 KR.分布式矩阵IP_已扫描
5. 同时导入木蚂蚁的扫描结果

表结构设计（字段说明）：
- ip:                    IP地址
- source_db:             数据来源库
- source_col:            数据来源表
- scan_time:             扫描时间
- port_count:            开放端口数
- open_ports:            端口详情 {port: {service, banner, open}}
- ssh_open:              SSH是否开放
- ssh_port:              SSH端口号（22/2222）
- ssh_banner:            SSH banner
- rdp_open:              RDP是否开放
- vnc_open:              VNC是否开放
- telnet_open:           Telnet是否开放
- http_open:             HTTP是否开放
- https_open:            HTTPS是否开放
- baota_open:            宝塔面板是否开放
- server_types:          服务器类型列表
- os_guess:              操作系统推测
- ssh_version:           SSH版本
- remote_methods:        可用远程登录方式
- ssh_difficulty:        SSH登录难度(1-5)
- ssh_difficulty_stars:  难度星级
- ssh_difficulty_label:  难度说明
- ssh_notes:             SSH分析备注
- deploy_score:          部署适合度评分(0-100)
- deploy_ready:          是否适合部署
- deploy_notes:          部署说明
- login_suggestions:     登录建议（SSH/RDP/VNC/Web）
- user_count:            该IP关联用户数
- users:                 关联用户列表
- primary_user:          首要用户（最近活跃）
"""

import pymongo
import json
import os
import sys
import argparse
from datetime import datetime
from collections import defaultdict

MONGO_URI = 'mongodb://admin:admin123@localhost:27017/?authSource=admin'

# SSH 难易度标签
DIFFICULTY_LABELS = {
    1: "极易(默认密码/老设备/Telnet)",
    2: "较易(弱密码/嵌入式/VNC)",
    3: "中等(标准服务器)",
    4: "较难(新版本SSH/密钥优先)",
    5: "极难(仅密钥认证/防火墙)",
}

# 常见默认凭证（按设备类型）
DEFAULT_CREDENTIALS = {
    "Linux/BSD": [
        {"username": "root", "password": "root", "note": "Linux默认"},
        {"username": "root", "password": "admin", "note": "常见弱密码"},
        {"username": "root", "password": "123456", "note": "常见弱密码"},
        {"username": "root", "password": "password", "note": "常见弱密码"},
        {"username": "admin", "password": "admin", "note": "管理员默认"},
        {"username": "ubuntu", "password": "ubuntu", "note": "Ubuntu默认"},
    ],
    "Ubuntu Linux": [
        {"username": "ubuntu", "password": "ubuntu", "note": "Ubuntu默认"},
        {"username": "root", "password": "root", "note": "root默认"},
        {"username": "root", "password": "admin123", "note": "常见密码"},
    ],
    "Debian Linux": [
        {"username": "root", "password": "root", "note": "Debian默认"},
        {"username": "root", "password": "admin", "note": "常见密码"},
    ],
    "CentOS/RHEL": [
        {"username": "root", "password": "root", "note": "CentOS默认"},
        {"username": "centos", "password": "centos", "note": "CentOS用户"},
        {"username": "root", "password": "admin123", "note": "常见密码"},
    ],
    "Windows": [
        {"username": "Administrator", "password": "admin123", "note": "Windows默认"},
        {"username": "Administrator", "password": "123456", "note": "常见密码"},
        {"username": "admin", "password": "admin", "note": "常见密码"},
    ],
    "嵌入式/路由器": [
        {"username": "admin", "password": "admin", "note": "路由器默认"},
        {"username": "root", "password": "admin", "note": "嵌入式默认"},
        {"username": "root", "password": "root", "note": "默认密码"},
    ],
    "MikroTik路由器": [
        {"username": "admin", "password": "", "note": "MikroTik默认空密码"},
        {"username": "admin", "password": "admin", "note": "常见密码"},
    ],
    "Unknown": [
        {"username": "root", "password": "root", "note": "通用默认"},
        {"username": "admin", "password": "admin", "note": "通用默认"},
        {"username": "root", "password": "123456", "note": "常见弱密码"},
    ],
}


def build_login_suggestions(ip, scan_data, os_guess, users):
    """构建完整的登录建议"""
    open_ports = scan_data.get("open_ports", {})
    suggestions = {}

    # SSH 登录建议
    ssh_port = None
    if "22" in open_ports or 22 in open_ports:
        ssh_port = 22
    elif "2222" in open_ports or 2222 in open_ports:
        ssh_port = 2222

    if ssh_port:
        creds = list(DEFAULT_CREDENTIALS.get(os_guess, DEFAULT_CREDENTIALS["Unknown"]))

        # 从关联用户中提取可能的登录凭证
        for user in users[:5]:  # 最多取5个用户
            username = user.get("username", "")
            pw_hash = user.get("password", "")
            if username and len(username) <= 32:
                creds.append({
                    "username": username,
                    "password": f"(hash:{pw_hash[:16]}...)" if pw_hash else "",
                    "note": f"数据库用户-{user.get('source_col', '')}"
                })

        suggestions["ssh"] = {
            "port": ssh_port,
            "command": f"ssh root@{ip} -p {ssh_port}",
            "command_with_password": f"sshpass -p 'PASSWORD' ssh -o StrictHostKeyChecking=no root@{ip} -p {ssh_port}",
            "try_credentials": creds,
            "auth_type_guess": "password" if os_guess in ("嵌入式/路由器", "MikroTik路由器") else "password/key",
        }

    # RDP 登录建议
    if "3389" in open_ports or 3389 in open_ports:
        suggestions["rdp"] = {
            "port": 3389,
            "command": f"open rdp://{ip}",
            "command_rdesktop": f"rdesktop {ip}:3389",
            "try_credentials": DEFAULT_CREDENTIALS.get("Windows", []),
        }

    # VNC 登录建议
    if "5900" in open_ports or 5900 in open_ports:
        suggestions["vnc"] = {
            "port": 5900,
            "command": f"open vnc://{ip}",
            "common_passwords": ["", "123456", "password", "admin"],
        }

    # Telnet 登录建议
    if "23" in open_ports or 23 in open_ports:
        suggestions["telnet"] = {
            "port": 23,
            "command": f"telnet {ip} 23",
            "try_credentials": [
                {"username": "admin", "password": "admin"},
                {"username": "root", "password": "root"},
            ],
        }

    # Web 访问
    web_urls = {}
    if "80" in open_ports or 80 in open_ports:
        web_urls["http"] = f"http://{ip}"
    if "443" in open_ports or 443 in open_ports:
        web_urls["https"] = f"https://{ip}"
    if "8888" in open_ports or 8888 in open_ports:
        web_urls["baota"] = f"http://{ip}:8888"
    if web_urls:
        suggestions["web"] = web_urls

    return suggestions


def build_deploy_notes(scan_data, os_guess):
    """构建部署说明"""
    score = scan_data.get("analysis", {}).get("deploy_score", 0)
    open_ports = scan_data.get("open_ports", {})
    notes = []

    has_ssh = "22" in open_ports or 22 in open_ports or "2222" in open_ports or 2222 in open_ports

    if has_ssh:
        notes.append("SSH可达")
        if os_guess in ("Ubuntu Linux", "Debian Linux", "CentOS/RHEL", "Linux/BSD"):
            notes.append(f"{os_guess}系统，适合部署Docker/Agent")
        elif os_guess == "Windows":
            notes.append("Windows系统，可部署Windows Agent")
        elif os_guess == "嵌入式/路由器":
            notes.append("嵌入式设备，资源有限，可部署轻量Agent")
        else:
            notes.append("系统未知，需确认后部署")
    else:
        notes.append("无SSH，需通过其他方式部署")

    if "3389" in open_ports or 3389 in open_ports:
        notes.append("有RDP可远程桌面操作")
    if "8888" in open_ports or 8888 in open_ports:
        notes.append("有宝塔面板，可Web管理")

    return "; ".join(notes)


def process_scan_results(scan_json_path, mumayi_json_path=None):
    """处理扫描结果，构建完整文档并写入MongoDB"""

    client = pymongo.MongoClient(MONGO_URI)
    db = client['KR']
    matrix_coll = db['分布式矩阵IP']
    target_coll = db['分布式矩阵IP_已扫描']

    # 读取扫描结果
    print("读取扫描结果...")
    with open(scan_json_path, 'r') as f:
        scan_data = json.load(f)

    results = scan_data.get("results", {})
    scan_info = scan_data.get("scan_info", {})
    print(f"  扫描结果: {len(results):,} 个IP有端口")

    # 读取木蚂蚁结果
    mumayi_results = {}
    if mumayi_json_path and os.path.exists(mumayi_json_path):
        print("读取木蚂蚁扫描结果...")
        with open(mumayi_json_path, 'r') as f:
            mumayi_data = json.load(f)
        mumayi_results = mumayi_data.get("results", {})
        print(f"  木蚂蚁结果: {len(mumayi_results):,} 个IP有端口")

    # 合并所有扫描结果
    all_scan_results = {}
    all_scan_results.update(results)
    all_scan_results.update(mumayi_results)
    print(f"  合并总计: {len(all_scan_results):,} 个IP")

    # 清空目标表
    print("\n清空目标表 KR.分布式矩阵IP_已扫描...")
    target_coll.delete_many({})

    # 批量处理
    batch_docs = []
    batch_size = 5000
    total_processed = 0
    total_users_linked = 0

    scanned_ips = list(all_scan_results.keys())
    total_ips = len(scanned_ips)

    print(f"\n开始构建完整文档（{total_ips:,} 个IP）...")

    for ip in scanned_ips:
        scan = all_scan_results[ip]
        open_ports = scan.get("open_ports", {})
        analysis = scan.get("analysis", {})

        # 从分布式矩阵IP表查找所有关联用户
        user_records = list(matrix_coll.find(
            {"$or": [{"ip": ip}, {"ip_reg": ip}, {"ip_last": ip}]},
            {"_id": 0, "username": 1, "email": 1, "password": 1, "salt": 1,
             "phone": 1, "qq": 1, "region": 1, "country": 1, "province": 1, "city": 1,
             "source_db": 1, "source_col": 1, "reg_time": 1, "last_active_time": 1,
             "R_score": 1, "F_score": 1, "M_score": 1, "RFM_total": 1,
             "value_level": 1, "user_type": 1, "extra": 1, "ip": 1, "ip_reg": 1, "ip_last": 1}
        ))

        total_users_linked += len(user_records)

        # 选首要用户（最近活跃的）
        primary_user = {}
        if user_records:
            sorted_users = sorted(user_records, key=lambda x: str(x.get("last_active_time") or ""), reverse=True)
            primary_user = sorted_users[0]

        # 提取来源信息
        sources = list(set(f"{u.get('source_db', '')}.{u.get('source_col', '')}" for u in user_records))
        source_cols = list(set(u.get('source_col', '') for u in user_records))

        # 端口布尔标记
        op_keys = set(str(k) for k in open_ports.keys()) | set(int(k) for k in open_ports.keys() if str(k).isdigit())
        ssh_open = 22 in op_keys or "22" in op_keys or 2222 in op_keys or "2222" in op_keys
        ssh_port = 22 if (22 in op_keys or "22" in op_keys) else (2222 if (2222 in op_keys or "2222" in op_keys) else None)
        ssh_banner = ""
        if ssh_port:
            ssh_banner = open_ports.get(str(ssh_port), open_ports.get(ssh_port, {})).get("banner", "")

        os_guess = analysis.get("os_guess", "Unknown")

        # 构建用户摘要列表（精简，保留关键登录信息）
        users_summary = []
        for u in user_records[:20]:  # 最多20条
            users_summary.append({
                "username": u.get("username", ""),
                "email": u.get("email", ""),
                "password_hash": u.get("password", ""),
                "salt": u.get("salt", ""),
                "phone": u.get("phone", ""),
                "qq": u.get("qq", ""),
                "source_col": u.get("source_col", ""),
                "reg_time": u.get("reg_time", ""),
                "last_active": u.get("last_active_time", ""),
                "value_level": u.get("value_level", ""),
            })

        # SSH 难易度
        ssh_diff = analysis.get("ssh_difficulty", {})
        if isinstance(ssh_diff, dict):
            diff_val = ssh_diff.get("difficulty", 5)
            diff_stars = ssh_diff.get("difficulty_stars", "")
            diff_notes = ssh_diff.get("notes", [])
        else:
            diff_val = ssh_diff if isinstance(ssh_diff, int) else 5
            diff_stars = "★" * diff_val + "☆" * (5 - diff_val)
            diff_notes = []

        # 登录建议
        login_suggestions = build_login_suggestions(ip, scan, os_guess, user_records)

        # 部署说明
        deploy_notes = build_deploy_notes(scan, os_guess)

        # 构建完整文档
        doc = {
            # === 基本标识 ===
            "ip": ip,
            "sources": sources,
            "source_cols": source_cols,
            "primary_source": source_cols[0] if source_cols else "",

            # === 扫描结果 ===
            "scan_time": scan.get("scan_time", datetime.now().isoformat()),
            "port_count": scan.get("port_count", len(open_ports)),
            "open_ports": {str(k): v for k, v in open_ports.items()},

            # === 端口快捷标记（方便查询）===
            "ssh_open": ssh_open,
            "ssh_port": ssh_port,
            "ssh_banner": ssh_banner,
            "rdp_open": 3389 in op_keys or "3389" in op_keys,
            "vnc_open": 5900 in op_keys or "5900" in op_keys,
            "telnet_open": 23 in op_keys or "23" in op_keys,
            "http_open": 80 in op_keys or "80" in op_keys,
            "https_open": 443 in op_keys or "443" in op_keys,
            "baota_open": 8888 in op_keys or "8888" in op_keys,

            # === 服务器分类 ===
            "server_types": analysis.get("server_types", []),
            "os_guess": os_guess,
            "ssh_version": analysis.get("ssh_version", ""),

            # === 远程登录分析 ===
            "remote_methods": analysis.get("remote_methods", []),
            "remote_method_count": len(analysis.get("remote_methods", [])),
            "ssh_difficulty": diff_val,
            "ssh_difficulty_stars": diff_stars,
            "ssh_difficulty_label": DIFFICULTY_LABELS.get(diff_val, "未知"),
            "ssh_notes": diff_notes,

            # === 登录建议（核心：方便直接复制登录）===
            "login_suggestions": login_suggestions,
            "quick_ssh_cmd": login_suggestions.get("ssh", {}).get("command", ""),
            "quick_rdp_cmd": login_suggestions.get("rdp", {}).get("command", ""),
            "quick_vnc_cmd": login_suggestions.get("vnc", {}).get("command", ""),
            "quick_web_url": login_suggestions.get("web", {}).get("http", ""),
            "quick_baota_url": login_suggestions.get("web", {}).get("baota", ""),

            # === 部署评估 ===
            "deploy_score": analysis.get("deploy_score", 0),
            "deploy_ready": analysis.get("deploy_ready", False),
            "deploy_notes": deploy_notes,

            # === 用户链 ===
            "user_count": len(user_records),
            "users": users_summary,
            "primary_user": {
                "username": primary_user.get("username", ""),
                "email": primary_user.get("email", ""),
                "password_hash": primary_user.get("password", ""),
                "salt": primary_user.get("salt", ""),
                "phone": primary_user.get("phone", ""),
                "qq": primary_user.get("qq", ""),
                "region": primary_user.get("region", ""),
                "province": primary_user.get("province", ""),
                "city": primary_user.get("city", ""),
                "value_level": primary_user.get("value_level", ""),
                "user_type": primary_user.get("user_type", ""),
            } if primary_user else {},

            # === 元数据 ===
            "imported_at": datetime.now().isoformat(),
            "scan_source": "kr_full_scan" if ip in results else "mumayi_full_scan",
        }

        batch_docs.append(doc)
        total_processed += 1

        if len(batch_docs) >= batch_size:
            target_coll.insert_many(batch_docs, ordered=False)
            progress = total_processed / total_ips * 100
            print(f"  [{progress:5.1f}%] {total_processed:,}/{total_ips:,} 已导入 | 关联用户: {total_users_linked:,}")
            batch_docs = []

    # 插入剩余
    if batch_docs:
        target_coll.insert_many(batch_docs, ordered=False)

    print(f"\n全部导入完成: {total_processed:,} 条")
    print(f"关联用户总数: {total_users_linked:,}")

    # 创建索引
    print("\n创建索引...")
    indexes = [
        ("ip", 1),
        ("ssh_open", 1),
        ("rdp_open", 1),
        ("ssh_difficulty", 1),
        ("deploy_score", -1),
        ("deploy_ready", 1),
        ("primary_source", 1),
        ("os_guess", 1),
        ("user_count", -1),
        ("port_count", -1),
    ]
    for field, direction in indexes:
        target_coll.create_index([(field, direction)])
        print(f"  索引: {field}")

    # 复合索引
    target_coll.create_index([("ssh_open", 1), ("ssh_difficulty", 1)])
    target_coll.create_index([("deploy_ready", 1), ("deploy_score", -1)])
    print(f"  复合索引: ssh_open+ssh_difficulty, deploy_ready+deploy_score")

    # 统计
    print(f"\n=== 最终统计 ===")
    print(f"总记录: {target_coll.count_documents({}):,}")
    print(f"SSH可达: {target_coll.count_documents({'ssh_open': True}):,}")
    print(f"RDP可达: {target_coll.count_documents({'rdp_open': True}):,}")
    print(f"VNC可达: {target_coll.count_documents({'vnc_open': True}):,}")
    print(f"Telnet可达: {target_coll.count_documents({'telnet_open': True}):,}")
    print(f"宝塔面板: {target_coll.count_documents({'baota_open': True}):,}")
    print(f"可部署: {target_coll.count_documents({'deploy_ready': True}):,}")

    # SSH难度分布
    print(f"\nSSH难度分布:")
    for diff in range(1, 6):
        count = target_coll.count_documents({"ssh_difficulty": diff})
        label = DIFFICULTY_LABELS.get(diff, "")
        print(f"  {diff}星 {label}: {count:,}")

    # 各来源统计
    print(f"\n各来源统计:")
    pipeline = [
        {"$unwind": "$source_cols"},
        {"$group": {"_id": "$source_cols", "count": {"$sum": 1}, "ssh": {"$sum": {"$cond": ["$ssh_open", 1, 0]}}}},
        {"$sort": {"count": -1}}
    ]
    for doc in target_coll.aggregate(pipeline):
        print(f"  {doc['_id']}: {doc['count']:,} (SSH: {doc['ssh']:,})")

    return total_processed


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--scan-json", required=True, help="扫描结果JSON路径")
    parser.add_argument("--mumayi-json", default="", help="木蚂蚁扫描结果JSON路径")
    args = parser.parse_args()

    total = process_scan_results(args.scan_json, args.mumayi_json)
    print(f"\n完成! 共导入 {total:,} 条到 KR.分布式矩阵IP_已扫描")


if __name__ == "__main__":
    main()