数据中心同步

This commit is contained in:
乘风
2026-01-05 10:16:20 +08:00
parent 0457528dd0
commit ba0ebcf273
98 changed files with 28583 additions and 0 deletions

View File

@@ -0,0 +1,115 @@
<?php
namespace app\service\DataCollection\Handler;
use app\repository\DataSourceRepository;
use app\service\DataSourceService;
use app\utils\LoggerHelper;
use app\utils\MongoDBHelper;
use MongoDB\Client;
/**
 * Base class for data collection handlers.
 *
 * Shared facilities offered to concrete handlers:
 * - creation of a MongoDB client from a task configuration
 * - lookup of data source configuration
 * - connection to a target data source / database
 * - pre-built service instances (IdentifierService, ConsumptionService, StoreService)
 */
abstract class BaseCollectionHandler
{
    use Trait\DataCollectionHelperTrait;

    protected DataSourceService $dataSourceService;
    protected \app\service\IdentifierService $identifierService;
    protected \app\service\ConsumptionService $consumptionService;
    protected \app\service\StoreService $storeService;

    /**
     * Wires up the services every handler needs, so subclasses do not have to
     * instantiate them again.
     */
    public function __construct()
    {
        $dataSourceRepository = new DataSourceRepository();
        $this->dataSourceService = new DataSourceService($dataSourceRepository);

        // Identifier resolution: user profiles + phone relations.
        $identifierProfiles = new \app\repository\UserProfileRepository();
        $phoneRelations = new \app\repository\UserPhoneRelationRepository();
        $phoneService = new \app\service\UserPhoneService($phoneRelations);
        $this->identifierService = new \app\service\IdentifierService($identifierProfiles, $phoneService);

        // Consumption records reuse the identifier service built above.
        $consumptionRecords = new \app\repository\ConsumptionRecordRepository();
        $consumptionProfiles = new \app\repository\UserProfileRepository();
        $this->consumptionService = new \app\service\ConsumptionService(
            $consumptionRecords,
            $consumptionProfiles,
            $this->identifierService
        );

        $storeRepository = new \app\repository\StoreRepository();
        $this->storeService = new \app\service\StoreService($storeRepository);
    }

    /**
     * Build a MongoDB client for the data source named in the task configuration.
     *
     * The data source id is taken from `data_source_id`, then `data_source`,
     * and falls back to `sync_mongodb` when neither key is set.
     *
     * @param array<string, mixed> $taskConfig Task configuration
     * @return Client MongoDB client instance
     * @throws \InvalidArgumentException When no configuration exists for the data source
     */
    protected function getMongoClient(array $taskConfig): Client
    {
        $dataSourceId = 'sync_mongodb';
        foreach (['data_source_id', 'data_source'] as $candidateKey) {
            if (isset($taskConfig[$candidateKey])) {
                $dataSourceId = $taskConfig[$candidateKey];
                break;
            }
        }

        $resolvedConfig = $this->dataSourceService->getDataSourceConfigById($dataSourceId);
        if (!empty($resolvedConfig)) {
            return MongoDBHelper::createClient($resolvedConfig);
        }

        throw new \InvalidArgumentException("数据源配置不存在: {$dataSourceId}");
    }

    /**
     * Connect to a target data source and select a database on it.
     *
     * @param string $targetDataSourceId Target data source id
     * @param string|null $targetDatabase Database name; when null the `database` entry of the
     *                                    data source configuration is used, falling back to `ckb`
     * @return array{client: Client, database: \MongoDB\Database, dbName: string, config: array} Connection details
     * @throws \InvalidArgumentException When no configuration exists for the target data source
     */
    protected function connectToTargetDataSource(
        string $targetDataSourceId,
        ?string $targetDatabase = null
    ): array {
        $targetDataSourceConfig = $this->dataSourceService->getDataSourceConfigById($targetDataSourceId);
        if (empty($targetDataSourceConfig)) {
            throw new \InvalidArgumentException("目标数据源配置不存在: {$targetDataSourceId}");
        }

        $mongoClient = MongoDBHelper::createClient($targetDataSourceConfig);

        $dbName = $targetDatabase;
        if ($dbName === null) {
            $dbName = $targetDataSourceConfig['database'] ?? 'ckb';
        }

        $connection = [];
        $connection['client'] = $mongoClient;
        $connection['database'] = $mongoClient->selectDatabase($dbName);
        $connection['dbName'] = $dbName;
        $connection['config'] = $targetDataSourceConfig;

        return $connection;
    }

    /**
     * Collect data; implemented by each concrete handler.
     *
     * @param mixed $adapter Data source adapter
     * @param array<string, mixed> $taskConfig Task configuration
     * @return void
     */
    abstract public function collect($adapter, array $taskConfig): void;
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,427 @@
<?php
namespace app\service\DataCollection\Handler;
use app\service\DataSource\DataSourceAdapterInterface;
use app\service\DatabaseSyncService;
use app\service\DataSourceService;
use app\repository\DataSourceRepository;
use app\utils\LoggerHelper;
use MongoDB\Client;
/**
 * Database synchronisation handler.
 *
 * Responsibilities:
 * - copy data from a source database to a target database
 * - optional full sync followed by incremental sync (Change Streams)
 * - periodic progress reporting and retry of failed watchers
 */
class DatabaseSyncHandler
{
    /**
     * Delay (seconds) used to start a watcher "as soon as possible".
     *
     * NOTE(review): a strictly positive value is used because Workerman 4.x
     * appears to reject a 0 interval in Timer::add() — confirm against the
     * Workerman version deployed with this project.
     */
    private const WATCH_START_DELAY = 0.001;

    private DatabaseSyncService $syncService;
    private array $taskConfig;
    private int $progressTimerId = 0; // Progress log timer id (0 = no timer running)

    /**
     * Failed watch attempts per database name.
     *
     * Kept on the instance: a `static` variable inside the timer closure would be
     * re-initialised for every newly created closure and could never reach the limit.
     *
     * @var array<string, int>
     */
    private array $watchRetryCounts = [];

    /**
     * Run the database synchronisation task.
     *
     * @param DataSourceAdapterInterface $adapter Data source adapter (source database)
     * @param array<string, mixed> $taskConfig Task configuration
     * @return void
     * @throws \Throwable Any failure is logged and re-thrown
     */
    public function collect(DataSourceAdapterInterface $adapter, array $taskConfig): void
    {
        $this->taskConfig = $taskConfig;
        $this->watchRetryCounts = [];
        $taskId = $taskConfig['task_id'] ?? '';
        $taskName = $taskConfig['name'] ?? '数据库同步';

        LoggerHelper::logBusiness('database_sync_collection_started', [
            'task_id' => $taskId,
            'task_name' => $taskName,
        ]);
        // Also write straight to the error log so start-up of the task is visible on the console.
        error_log("[DatabaseSyncHandler] 数据库同步任务已启动task_id={$taskId}, task_name={$taskName}");

        try {
            // Build the sync service from the source/target data sources named in the task.
            $this->syncService = $this->createSyncService($taskConfig);

            $databases = $this->getDatabasesToSync($taskConfig);
            if (empty($databases)) {
                LoggerHelper::logBusiness('database_sync_no_databases', [
                    'task_id' => $taskId,
                    'message' => '没有找到要同步的数据库',
                ]);
                return;
            }

            // Drop a timer left over from a previous collect() call before starting a new one.
            $this->stopProgressTimer();
            $this->startProgressTimer($taskConfig);

            // Full sync is controlled by the business configuration.
            $businessConfig = $this->getBusinessConfig();
            $fullSyncEnabled = $businessConfig['change_stream']['full_sync_on_start'] ?? false;
            if ($fullSyncEnabled) {
                $this->performFullSync($databases, $taskConfig);
            }

            // Start the Change Streams watchers.
            $this->startIncrementalSync($databases, $taskConfig);
        } catch (\Throwable $e) {
            LoggerHelper::logError($e, [
                'component' => 'DatabaseSyncHandler',
                'action' => 'collect',
                'task_id' => $taskId,
            ]);
            // The task failed: do not leave the progress timer reporting forever.
            $this->stopProgressTimer();
            throw $e;
        }
    }

    /**
     * Business configuration for the sync (currently hard-coded defaults).
     *
     * @return array<string, mixed> Business configuration
     */
    private function getBusinessConfig(): array
    {
        return [
            // Database selection
            'databases' => [], // empty list = sync every database
            'exclude_databases' => ['admin', 'local', 'config'], // system databases to skip
            'exclude_collections' => ['system.profile', 'system.js'], // system collections to skip
            // Change Streams
            'change_stream' => [
                'batch_size' => 100,
                'max_await_time_ms' => 1000,
                'full_sync_on_start' => true, // run a full sync when the task starts
                'full_sync_batch_size' => 1000,
            ],
            // Retry
            'retry' => [
                'max_connect_retries' => 10,
                'retry_interval' => 5,
                'max_sync_retries' => 3,
                'sync_retry_interval' => 2,
            ],
            // Performance
            'performance' => [
                'concurrent_databases' => 5,
                'concurrent_collections' => 10,
                'batch_write_size' => 5000,
                // Collection-level parallel sync is switched off to keep resume-from-checkpoint
                // simple and reliable; it can be re-enabled once a sharded checkpoint strategy exists.
                'enable_parallel_sync' => false,
                'max_parallel_tasks_per_collection' => 4,
                'documents_per_task' => 100000,
            ],
            // Monitoring
            'monitoring' => [
                'log_sync' => true,
                'log_detail' => false,
                'stats_interval' => 10, // seconds between progress log entries
            ],
        ];
    }

    /**
     * Create the DatabaseSyncService for the task's source and target data sources.
     *
     * @param array<string, mixed> $taskConfig Task configuration
     * @return DatabaseSyncService
     * @throws \InvalidArgumentException When the source or target configuration is missing
     */
    private function createSyncService(array $taskConfig): DatabaseSyncService
    {
        $dataSourceService = new DataSourceService(new DataSourceRepository());
        $sourceDataSourceId = $taskConfig['source_data_source'] ?? 'kr_mongodb';
        $targetDataSourceId = $taskConfig['target_data_source'] ?? 'sync_mongodb';
        $sourceConfig = $dataSourceService->getDataSourceConfigById($sourceDataSourceId);
        $targetConfig = $dataSourceService->getDataSourceConfigById($targetDataSourceId);
        if (empty($sourceConfig) || empty($targetConfig)) {
            throw new \InvalidArgumentException("数据源配置不存在: source={$sourceDataSourceId}, target={$targetDataSourceId}");
        }

        $businessConfig = $this->getBusinessConfig();

        $syncConfig = [
            'enabled' => true,
            'source' => [
                'host' => $sourceConfig['host'],
                'port' => $sourceConfig['port'],
                'username' => $sourceConfig['username'] ?? '',
                'password' => $sourceConfig['password'] ?? '',
                'auth_source' => $sourceConfig['auth_source'] ?? 'admin',
                // Per-data-source options override these defaults.
                'options' => array_merge([
                    'connectTimeoutMS' => 10000,
                    'socketTimeoutMS' => 30000,
                    'serverSelectionTimeoutMS' => 10000,
                    'heartbeatFrequencyMS' => 10000,
                ], $sourceConfig['options'] ?? []),
            ],
            'target' => [
                'host' => $targetConfig['host'],
                'port' => $targetConfig['port'],
                'username' => $targetConfig['username'] ?? '',
                'password' => $targetConfig['password'] ?? '',
                'auth_source' => $targetConfig['auth_source'] ?? 'admin',
                'options' => array_merge([
                    'connectTimeoutMS' => 10000,
                    'socketTimeoutMS' => 30000,
                    'serverSelectionTimeoutMS' => 10000,
                ], $targetConfig['options'] ?? []),
            ],
            'sync' => [
                'databases' => $businessConfig['databases'],
                'exclude_databases' => $businessConfig['exclude_databases'],
                'exclude_collections' => $businessConfig['exclude_collections'],
                'change_stream' => $businessConfig['change_stream'],
                'retry' => $businessConfig['retry'],
                'performance' => $businessConfig['performance'],
            ],
            'monitoring' => $businessConfig['monitoring'],
        ];

        return new DatabaseSyncService($syncConfig);
    }

    /**
     * List the databases to synchronise (delegates to the sync service).
     *
     * @param array<string, mixed> $taskConfig Task configuration (currently unused)
     * @return array<string> Database names
     */
    private function getDatabasesToSync(array $taskConfig): array
    {
        return $this->syncService->getDatabasesToSync();
    }

    /**
     * Run the full sync for the databases assigned to this worker.
     *
     * A failure on one database is logged and does not stop the remaining ones.
     *
     * @param array<string> $databases Database list
     * @param array<string, mixed> $taskConfig Task configuration (`worker_id`, `worker_count`)
     * @return void
     */
    private function performFullSync(array $databases, array $taskConfig): void
    {
        $workerId = $taskConfig['worker_id'] ?? 0;
        $workerCount = $taskConfig['worker_count'] ?? 1;
        $assignedDatabases = $this->assignDatabasesToWorker($databases, $workerId, $workerCount);

        LoggerHelper::logBusiness('database_sync_full_sync_start', [
            'worker_id' => $workerId,
            'worker_count' => $workerCount,
            'total_databases' => count($databases),
            'assigned_databases' => $assignedDatabases,
            'assigned_count' => count($assignedDatabases),
        ]);

        foreach ($assignedDatabases as $databaseName) {
            try {
                $this->syncService->fullSyncDatabase($databaseName);
            } catch (\Throwable $e) {
                LoggerHelper::logError($e, [
                    'component' => 'DatabaseSyncHandler',
                    'action' => 'performFullSync',
                    'database' => $databaseName,
                    'worker_id' => $workerId,
                ]);
                // Deliberately continue with the next database.
            }
        }

        LoggerHelper::logBusiness('database_sync_full_sync_completed', [
            'worker_id' => $workerId,
            'databases' => $assignedDatabases,
        ]);
    }

    /**
     * Pick the databases this worker is responsible for.
     *
     * Assignment is a plain round-robin on the position in the list:
     * the database at position i goes to worker `i % $workerCount`.
     * No size information is used here; any balancing relies on the order
     * in which the caller supplies the list.
     *
     * @param array<string> $databases Database list
     * @param int $workerId Id of the current worker (0-based)
     * @param int $workerCount Total number of workers
     * @return array<string> Databases assigned to the current worker
     */
    private function assignDatabasesToWorker(array $databases, int $workerId, int $workerCount): array
    {
        // A single worker handles everything.
        if ($workerCount <= 1) {
            return $databases;
        }

        $assignedDatabases = [];
        // array_values(): use positions, not raw keys, so a list with gaps in its keys
        // is still distributed evenly and consistently across workers.
        foreach (array_values($databases) as $position => $databaseName) {
            if ($position % $workerCount === $workerId) {
                $assignedDatabases[] = $databaseName;
            }
        }

        return $assignedDatabases;
    }

    /**
     * Start the incremental (Change Streams) watchers for this worker's databases.
     *
     * @param array<string> $databases Database list
     * @param array<string, mixed> $taskConfig Task configuration (`worker_id`, `worker_count`)
     * @return void
     */
    private function startIncrementalSync(array $databases, array $taskConfig): void
    {
        $workerId = $taskConfig['worker_id'] ?? 0;
        $workerCount = $taskConfig['worker_count'] ?? 1;
        // Same assignment strategy as the full sync.
        $assignedDatabases = $this->assignDatabasesToWorker($databases, $workerId, $workerCount);

        LoggerHelper::logBusiness('database_sync_incremental_sync_start', [
            'worker_id' => $workerId,
            'worker_count' => $workerCount,
            'total_databases' => count($databases),
            'assigned_databases' => $assignedDatabases,
            'assigned_count' => count($assignedDatabases),
        ]);

        foreach ($assignedDatabases as $databaseName) {
            $this->scheduleDatabaseWatch($databaseName);
        }
    }

    /**
     * Schedule a one-shot timer that starts watching a single database.
     *
     * @param string $databaseName Database to watch
     * @param float $delay Seconds to wait before starting the watcher
     * @return void
     */
    private function scheduleDatabaseWatch(string $databaseName, float $delay = self::WATCH_START_DELAY): void
    {
        // Run from a timer so the caller is not blocked by the watcher.
        \Workerman\Timer::add(max($delay, self::WATCH_START_DELAY), function () use ($databaseName): void {
            try {
                $this->syncService->watchDatabase($databaseName);
            } catch (\Throwable $e) {
                LoggerHelper::logError($e, [
                    'component' => 'DatabaseSyncHandler',
                    'action' => 'startIncrementalSync',
                    'database' => $databaseName,
                ]);
                $this->retryDatabaseWatch($databaseName);
            }
        }, [], false);
    }

    /**
     * Re-schedule a failed watcher until `max_connect_retries` is reached.
     *
     * The database is re-scheduled directly instead of going through the worker
     * assignment again: re-assigning a one-element list would hand it to worker 0 only.
     *
     * @param string $databaseName Database whose watcher failed
     * @return void
     */
    private function retryDatabaseWatch(string $databaseName): void
    {
        $retryConfig = $this->getBusinessConfig()['retry'] ?? [];
        $maxRetries = $retryConfig['max_connect_retries'] ?? 10;
        $retryInterval = $retryConfig['retry_interval'] ?? 5;

        $attempts = $this->watchRetryCounts[$databaseName] ?? 0;
        if ($attempts >= $maxRetries) {
            LoggerHelper::logBusiness('database_sync_watch_retry_exhausted', [
                'database' => $databaseName,
                'retries' => $attempts,
            ]);
            return;
        }

        $this->watchRetryCounts[$databaseName] = $attempts + 1;
        $this->scheduleDatabaseWatch($databaseName, (float) $retryInterval);
    }

    /**
     * Start the periodic progress log timer.
     *
     * @param array<string, mixed> $taskConfig Task configuration
     * @return void
     */
    private function startProgressTimer(array $taskConfig): void
    {
        $businessConfig = $this->getBusinessConfig();
        $statsInterval = $businessConfig['monitoring']['stats_interval'] ?? 10; // seconds, default 10

        $this->progressTimerId = \Workerman\Timer::add($statsInterval, function () use ($taskConfig) {
            try {
                // Refresh the progress held by the sync service before reading it.
                $this->syncService->loadProgress();
                $progress = $this->syncService->getProgress();
                $stats = $this->syncService->getStats();

                $progressInfo = [
                    'task_id' => $taskConfig['task_id'] ?? '',
                    'task_name' => $taskConfig['name'] ?? '数据库同步',
                    'status' => $progress['status'],
                    'progress_percent' => $progress['progress_percent'] . '%',
                    'current_database' => $progress['current_database'] ?? '无',
                    'current_collection' => $progress['current_collection'] ?? '无',
                    'databases' => "{$progress['databases']['completed']}/{$progress['databases']['total']}",
                    'collections' => "{$progress['collections']['completed']}/{$progress['collections']['total']}",
                    'documents' => "{$progress['documents']['processed']}/{$progress['documents']['total']}",
                    'documents_inserted' => $stats['documents_inserted'],
                    'documents_updated' => $stats['documents_updated'],
                    'documents_deleted' => $stats['documents_deleted'],
                    'errors' => $stats['errors'],
                    'elapsed_time' => round($progress['time']['elapsed_seconds'], 2) . 's',
                    'estimated_remaining' => $progress['time']['estimated_remaining_seconds']
                        ? round($progress['time']['estimated_remaining_seconds'], 2) . 's'
                        : '计算中...',
                ];

                LoggerHelper::logBusiness('database_sync_progress_report', $progressInfo);

                // Surface the last error when the sync reports an error status.
                if ($progress['status'] === 'error' && isset($progress['last_error'])) {
                    LoggerHelper::logBusiness('database_sync_error_info', [
                        'error_message' => $progress['last_error']['message'] ?? '未知错误',
                        'error_database' => $progress['error_database'] ?? '未知',
                        'error_collection' => $progress['last_error']['collection'] ?? '未知',
                    ]);
                }
            } catch (\Throwable $e) {
                // Progress reporting must never break the sync itself.
                LoggerHelper::logError($e, [
                    'component' => 'DatabaseSyncHandler',
                    'action' => 'startProgressTimer',
                ]);
            }
        });
    }

    /**
     * Stop the progress log timer, if one is running.
     *
     * @return void
     */
    public function stopProgressTimer(): void
    {
        if ($this->progressTimerId > 0) {
            \Workerman\Timer::del($this->progressTimerId);
            $this->progressTimerId = 0;
        }
    }
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,74 @@
<?php
namespace app\service\DataCollection\Handler;
use app\service\TagTaskService;
use app\repository\TagTaskRepository;
use app\repository\TagTaskExecutionRepository;
use app\repository\UserProfileRepository;
use app\service\TagService;
use app\repository\TagDefinitionRepository;
use app\repository\UserTagRepository;
use app\repository\TagHistoryRepository;
use app\service\TagRuleEngine\SimpleRuleEngine;
use app\utils\LoggerHelper;
/**
 * Tag task handler.
 *
 * Responsibilities:
 * - run a tag computation task
 * - the actual work is delegated to TagTaskService::executeTask()
 */
class TagTaskHandler
{
    /**
     * Execute the tag task identified by the task configuration.
     *
     * @param mixed $adapter Data source adapter (not used by tag tasks)
     * @param array<string, mixed> $taskConfig Task configuration (`task_id`, `name`)
     * @return void
     * @throws \Throwable Any failure is logged and re-thrown
     */
    public function collect($adapter, array $taskConfig): void
    {
        $taskId = $taskConfig['task_id'] ?? '';
        $logContext = [
            'task_id' => $taskId,
            'task_name' => $taskConfig['name'] ?? '标签任务',
        ];

        LoggerHelper::logBusiness('tag_task_handler_started', $logContext);

        try {
            $this->buildTagTaskService()->executeTask($taskId);
            LoggerHelper::logBusiness('tag_task_handler_completed', $logContext);
        } catch (\Throwable $failure) {
            LoggerHelper::logError($failure, [
                'component' => 'TagTaskHandler',
                'action' => 'collect',
                'task_id' => $taskId,
            ]);
            throw $failure;
        }
    }

    /**
     * Assemble a TagTaskService together with the repositories and TagService it needs.
     *
     * @return TagTaskService
     */
    private function buildTagTaskService(): TagTaskService
    {
        $taskRepository = new TagTaskRepository();
        $executionRepository = new TagTaskExecutionRepository();
        $taskUserProfiles = new UserProfileRepository();

        $tagService = new TagService(
            new TagDefinitionRepository(),
            new UserProfileRepository(),
            new UserTagRepository(),
            new TagHistoryRepository(),
            new SimpleRuleEngine()
        );

        return new TagTaskService($taskRepository, $executionRepository, $taskUserProfiles, $tagService);
    }
}

View File

@@ -0,0 +1,172 @@
<?php
namespace app\service\DataCollection\Handler\Trait;
use MongoDB\BSON\UTCDateTime;
/**
 * Helper methods shared by the data collection handlers.
 */
trait DataCollectionHelperTrait
{
    /**
     * Convert a MongoDB document to a plain array.
     *
     * Arrays are returned unchanged, objects exposing `toArray()` are converted with it,
     * anything else goes through a JSON round-trip. Values that do not decode to an
     * array (scalars, unencodable data) yield an empty array.
     *
     * @param mixed $document MongoDB document object or array
     * @return array<string, mixed> Data as an array
     */
    protected function convertMongoDocumentToArray($document): array
    {
        if (is_array($document)) {
            return $document;
        }
        if (is_object($document) && method_exists($document, 'toArray')) {
            return $document->toArray();
        }

        $encoded = json_encode($document);
        if ($encoded === false) {
            return [];
        }
        $decoded = json_decode($encoded, true);

        // A scalar document decodes to a scalar; returning it would violate the array return type.
        return is_array($decoded) ? $decoded : [];
    }

    /**
     * Parse a date/time value into a DateTimeImmutable.
     *
     * @param mixed $dateTimeStr Date/time string, DateTime object or MongoDB UTCDateTime
     * @return \DateTimeImmutable|null Parsed value, or null when the input is empty or cannot be parsed
     */
    protected function parseDateTime($dateTimeStr): ?\DateTimeImmutable
    {
        if (empty($dateTimeStr)) {
            return null;
        }
        // MongoDB UTCDateTime
        if ($dateTimeStr instanceof UTCDateTime) {
            return \DateTimeImmutable::createFromMutable($dateTimeStr->toDateTime());
        }
        if ($dateTimeStr instanceof \DateTimeImmutable) {
            return $dateTimeStr;
        }
        if ($dateTimeStr instanceof \DateTime) {
            return \DateTimeImmutable::createFromMutable($dateTimeStr);
        }
        // Casting arrays or non-stringable objects to string would warn or throw an \Error
        // that the catch below does not cover; treat them as unparsable instead.
        if (!is_scalar($dateTimeStr) && !($dateTimeStr instanceof \Stringable)) {
            \app\utils\LoggerHelper::logBusiness('datetime_parse_failed', [
                'input' => $dateTimeStr,
                'error' => 'unsupported input type: ' . get_debug_type($dateTimeStr),
            ]);
            return null;
        }
        // Anything else is parsed as a date/time string.
        try {
            return new \DateTimeImmutable((string)$dateTimeStr);
        } catch (\Exception $e) {
            \app\utils\LoggerHelper::logBusiness('datetime_parse_failed', [
                'input' => $dateTimeStr,
                'error' => $e->getMessage(),
            ]);
            return null;
        }
    }

    /**
     * Parse an amount.
     *
     * Numeric input is cast directly. Other strings are stripped of everything except
     * digits and dots (currency symbols, thousands separators, ...); a minus sign that
     * appears before the first digit is honoured so "-1,234.50" stays negative.
     *
     * @param mixed $amount Amount as string or number
     * @return float Parsed amount, 0.0 when the input is neither numeric nor a string
     */
    protected function parseAmount($amount): float
    {
        if (is_numeric($amount)) {
            return (float)$amount;
        }
        if (is_string($amount)) {
            // Keep digits and the decimal point only.
            $cleaned = preg_replace('/[^\d.]/', '', $amount);
            $value = (float)$cleaned;
            // A '-' ahead of the first digit/dot marks a negative amount.
            $isNegative = preg_match('/^[^\d.]*-/', $amount) === 1;

            return ($isNegative && $value > 0.0) ? -$value : $value;
        }
        return 0.0;
    }

    /**
     * Strip every non-digit character from a phone number.
     *
     * @param string $phoneNumber Raw phone number
     * @return string Phone number containing digits only
     */
    protected function filterPhoneNumber(string $phoneNumber): string
    {
        // preg_replace() returns null on a PCRE error; fall back to an empty string.
        return preg_replace('/\D/', '', $phoneNumber) ?? '';
    }

    /**
     * Validate a mainland China mobile number.
     *
     * @param string $phone Phone number (already reduced to digits)
     * @return bool True for exactly 11 digits starting with 1 and a second digit of 3-9
     */
    protected function isValidPhone(string $phone): bool
    {
        // The D modifier makes `$` match only at the very end, so a trailing
        // newline is not silently accepted. An empty string fails the pattern.
        return preg_match('/^1[3-9]\d{9}$/D', $phone) === 1;
    }

    /**
     * Build the monthly collection name for a consumption time.
     *
     * @param string $baseCollectionName Base collection name
     * @param mixed $dateTimeStr Date/time string or object; the current time is used when it
     *                           is missing or cannot be parsed
     * @return string Collection name with a `_Ym` suffix, e.g. consumption_records_202512
     */
    protected function getMonthlyCollectionName(string $baseCollectionName, $dateTimeStr = null): string
    {
        $consumeTime = $this->parseDateTime($dateTimeStr) ?? new \DateTimeImmutable();
        $monthSuffix = $consumeTime->format('Ym');
        return "{$baseCollectionName}_{$monthSuffix}";
    }

    /**
     * Convert a date/time value to a MongoDB UTCDateTime.
     *
     * The conversion is based on the Unix timestamp in whole seconds, so any
     * sub-second part of the input is discarded.
     *
     * @param mixed $dateTimeStr Date/time string or object
     * @return UTCDateTime|null MongoDB UTCDateTime, or null when the input is empty or cannot be parsed
     */
    protected function convertToUTCDateTime($dateTimeStr): ?UTCDateTime
    {
        if (empty($dateTimeStr)) {
            return null;
        }
        // Already a UTCDateTime: return as is.
        if ($dateTimeStr instanceof UTCDateTime) {
            return $dateTimeStr;
        }
        if ($dateTimeStr instanceof \DateTime || $dateTimeStr instanceof \DateTimeImmutable) {
            return new UTCDateTime($dateTimeStr->getTimestamp() * 1000);
        }
        // Casting arrays or non-stringable objects to string would warn or throw an \Error
        // that the catch below does not cover; treat them as unconvertible instead.
        if (!is_scalar($dateTimeStr) && !($dateTimeStr instanceof \Stringable)) {
            \app\utils\LoggerHelper::logBusiness('convert_to_utcdatetime_failed', [
                'input' => $dateTimeStr,
                'error' => 'unsupported input type: ' . get_debug_type($dateTimeStr),
            ]);
            return null;
        }
        // Anything else is parsed as a date/time string.
        try {
            $dateTime = new \DateTimeImmutable((string)$dateTimeStr);
            return new UTCDateTime($dateTime->getTimestamp() * 1000);
        } catch (\Exception $e) {
            \app\utils\LoggerHelper::logBusiness('convert_to_utcdatetime_failed', [
                'input' => $dateTimeStr,
                'error' => $e->getMessage(),
            ]);
            return null;
        }
    }
}