"""
Phase 0 分析器测试
"""

import os
import sys
import shutil
import tempfile
from pathlib import Path
import pandas as pd
import numpy as np
import yaml
import pytest

# Make project modules importable.
# project_root is the directory three levels above this test file; its
# 'scripts' and 'backend' subdirectories are pushed to the front of sys.path
# (after both inserts, 'backend' sits at index 0 and 'scripts' at index 1).
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root / 'scripts'))
sys.path.insert(0, str(project_root / 'backend'))

# Must come after the sys.path changes above.
# NOTE(review): presumably analyze_orders lives in one of the two
# directories just added -- confirm against the repository layout.
from analyze_orders import OrderAnalyzer


@pytest.fixture
def temp_dir():
    """Yield the path of a fresh temporary directory, then delete it."""
    scratch_path = tempfile.mkdtemp()
    yield scratch_path
    # Teardown: remove the whole tree once the requesting test is done.
    shutil.rmtree(scratch_path)


@pytest.fixture
def sample_order_data():
    """Build a five-row sample order table as a DataFrame.

    The rows cover three order numbers: two lines for ORD001, one for
    ORD002 and two for ORD003.  The ORD003 rows have no payment time and
    carry a refund status.  The image column uses a different separator
    on each multi-image row (comma, semicolon, pipe, space).
    """
    row_count = 5
    download_time = '2025-01-20 08:00:00'
    first_payment = '2025-01-15 10:30:00'

    image_cells = [
        'http://img1.jpg,http://img2.jpg',
        'http://img3.jpg;http://img4.jpg',
        'http://img5.jpg|http://img6.jpg',
        'http://img7.jpg',
        'http://img8.jpg http://img9.jpg',
    ]

    columns = {
        '原始订单编号': ['ORD001', 'ORD001', 'ORD002', 'ORD003', 'ORD003'],
        '网店名称': ['店铺A', '店铺A', '店铺B', '店铺A', '店铺A'],
        '交易状态': ['交易成功'] * 3 + ['交易关闭'] * 2,
        '付款时间': [first_payment, first_payment, '2025-01-16 14:20:00', None, None],
        '下载时间': [download_time] * row_count,
        '线上宝贝名称': [
            'Nike 运动鞋 AJ1 Low',
            'Nike 运动T恤',
            'Adidas 外套 2025款',
            'Puma 背包',
            'Puma 帽子',
        ],
        '线上销售属性': [
            '颜色:黑色;尺码:42',
            '颜色:白色;尺码:L',
            '颜色:蓝色;尺码:XL',
            '颜色:红色',
            '颜色:黑色',
        ],
        '线上商家编码': ['NK001', 'NK002', 'AD001', 'PM001', 'PM002'],
        '商品编号': ['SP001', 'SP002', 'SP003', 'SP004', 'SP005'],
        'SKU编号': ['SKU001', 'SKU002', 'SKU003', 'SKU004', 'SKU005'],
        '图片': image_cells,
        '数量': [1, 2, 1, 1, 1],
        '订单单价': [599.00, 199.00, 899.00, 299.00, 99.00],
        '订单金额': [599.00, 398.00, 899.00, 299.00, 99.00],
        '退款状态': [''] * 3 + ['退款成功'] * 2,
        '买家留言': ['尽快发货'] * 2 + [''] * 3,
        '卖家备注': ['', '', '大客户', '', ''],
    }
    return pd.DataFrame(columns)


def test_analyzer_initialization(temp_dir):
    """Check that the constructor stores its arguments and that the output directories exist."""
    init_kwargs = {'shared_inbox': temp_dir, 'file_glob': "*.xlsx", 'limit': 5}
    analyzer = OrderAnalyzer(**init_kwargs)

    # Constructor arguments are exposed as attributes (inbox as a Path).
    assert analyzer.shared_inbox == Path(temp_dir)
    assert analyzer.file_glob == "*.xlsx"
    assert analyzer.limit == 5

    # Each output directory must be present after construction.
    for dir_attr in ('reports_dir', 'configs_dir', 'synonyms_dir'):
        assert getattr(analyzer, dir_attr).exists()


def test_column_analysis(temp_dir, sample_order_data):
    """Check the statistics and inferred types returned by ``analyze_column``."""
    analyzer = OrderAnalyzer(shared_inbox=temp_dir)

    # Order-number column: full statistics are verified.
    order_id_stats = analyzer.analyze_column(
        sample_order_data['原始订单编号'], '原始订单编号'
    )
    expected_order_id_stats = {
        'name': '原始订单编号',
        'total_count': 5,
        'null_count': 0,
        'unique_count': 3,
        'inferred_type': 'string',
    }
    for field, expected_value in expected_order_id_stats.items():
        assert order_id_stats[field] == expected_value

    # Quantity, unit-price and image columns: only the inferred type is verified.
    expected_types = (
        ('数量', 'int'),
        ('订单单价', 'decimal'),
        ('图片', 'list'),
    )
    for column_name, expected_type in expected_types:
        column_stats = analyzer.analyze_column(sample_order_data[column_name], column_name)
        assert column_stats['inferred_type'] == expected_type


def test_date_parsing(temp_dir, sample_order_data):
    """Check that the non-null payment times parse and that a date format is recorded."""
    analyzer = OrderAnalyzer(shared_inbox=temp_dir)

    # Only rows that actually carry a payment time are fed to the parser.
    payment_times = sample_order_data['付款时间'].dropna()
    parse_rate = analyzer._try_parse_dates(payment_times)

    # Most of the dates should parse successfully.
    assert parse_rate > 0.9
    assert len(analyzer.date_formats) > 0


def test_brand_extraction(temp_dir, sample_order_data):
    """Check that brands in the product titles are counted in ``brand_frequency``."""
    analyzer = OrderAnalyzer(shared_inbox=temp_dir)
    analyzer._extract_brands(sample_order_data['线上宝贝名称'])

    frequencies = analyzer.brand_frequency

    # Every brand present in the sample titles must have been picked up.
    for brand in ('Nike', 'Adidas', 'Puma'):
        assert brand in frequencies

    # Nike and Puma each appear in two sample titles.
    for brand, expected_count in (('Nike', 2), ('Puma', 2)):
        assert frequencies[brand] == expected_count


def test_color_size_extraction(temp_dir, sample_order_data):
    """Check that at least one sample colour and one sample size are extracted."""
    analyzer = OrderAnalyzer(shared_inbox=temp_dir)
    analyzer._extract_color_size_patterns(sample_order_data['线上销售属性'])

    # The check is deliberately loose: any one of the listed values suffices.
    colors_found = analyzer.color_patterns
    assert any(color in colors_found for color in ('黑色', '白色'))

    sizes_found = analyzer.size_patterns
    assert any(size in sizes_found for size in ('L', 'XL', '42'))


def test_image_splitter_detection(temp_dir, sample_order_data):
    """Check that a separator is detected for the image column."""
    analyzer = OrderAnalyzer(shared_inbox=temp_dir)
    detected = analyzer.analyze_image_splitters(sample_order_data)

    assert detected is not None

    # The result must mention a comma, semicolon, pipe or the ``\s`` token.
    candidate_tokens = (',', ';', '|', r'\s')
    present = [token for token in candidate_tokens if token in detected]
    assert present


def test_product_key_analysis(temp_dir, sample_order_data):
    """Check that product-key analysis returns a non-empty key set with positive uniqueness."""
    analyzer = OrderAnalyzer(shared_inbox=temp_dir)

    analysis_result = analyzer.analyze_product_keys(sample_order_data)
    key_columns, uniqueness_score = analysis_result

    assert len(key_columns) > 0
    assert uniqueness_score > 0

    # At least one of the identifying columns is expected in the chosen key.
    has_order_id = '原始订单编号' in key_columns
    assert has_order_id or 'SKU编号' in key_columns


def test_full_analysis_flow(temp_dir, sample_order_data):
    """Run the analyzer end to end on one generated workbook and check each artifact.

    The sample data is written to an ``.xlsx`` file in the temporary inbox,
    discovered and read back through the analyzer, and then every generator
    is invoked in turn, followed by an existence check on the file(s) it is
    expected to produce.
    """
    # Write the sample workbook into the inbox.
    workbook_path = Path(temp_dir) / 'test_orders.xlsx'
    sample_order_data.to_excel(workbook_path, index=False)

    analyzer = OrderAnalyzer(shared_inbox=temp_dir, file_glob="*.xlsx")

    # Discovery and loading.
    discovered = analyzer.find_excel_files()
    assert len(discovered) == 1

    frame = analyzer.read_excel_file(discovered[0])
    assert len(frame) == 5

    # Populate per-column statistics the same way a full run would.
    for column in frame.columns:
        analyzer.column_stats[column] = analyzer.analyze_column(frame[column], column)

    analyzer.all_data = [frame]

    # EDA report.
    analyzer.generate_eda_report()
    assert (analyzer.reports_dir / 'eda.md').exists()

    # Column map: must exist and contain the expected top-level keys.
    analyzer.generate_column_map()
    assert (analyzer.configs_dir / 'column_map.yaml').exists()

    with open(analyzer.configs_dir / 'column_map.yaml', 'r', encoding='utf-8') as handle:
        config = yaml.safe_load(handle)
    for top_level_key in ('columns', 'product_key'):
        assert top_level_key in config
    assert '原始订单编号' in config['columns']

    # Synonym tables.
    analyzer.generate_synonyms()
    for csv_name in ('brand_aliases.csv', 'color_aliases.csv', 'size_aliases.csv'):
        assert (analyzer.synonyms_dir / csv_name).exists()

    # Parsing-rules document.
    analyzer.generate_parsing_rules()
    assert (analyzer.reports_dir / 'parsing_rules.md').exists()


def test_enum_detection(temp_dir):
    """Check that a low-cardinality column is registered as an enum candidate.

    A 500-row series with only three distinct values is analysed; afterwards
    the column name must be a key of ``analyzer.enum_candidates`` and every
    distinct value must be listed under it.
    """
    analyzer = OrderAnalyzer(shared_inbox=temp_dir)

    # Three distinct values repeated many times.
    enum_data = pd.Series(['已付款', '未付款', '已付款', '已退款', '已付款'] * 100)

    # Called only for its effect on ``enum_candidates``; the returned
    # statistics are not needed by this test.
    analyzer.analyze_column(enum_data, '支付状态')

    assert '支付状态' in analyzer.enum_candidates
    detected_values = analyzer.enum_candidates['支付状态']
    for expected_value in ('已付款', '未付款', '已退款'):
        assert expected_value in detected_values


def test_sensitive_data_detection(temp_dir):
    """Check that the EDA report contains a sensitive-information section.

    Column statistics are injected directly (ID-number, phone, e-mail and
    one ordinary column) instead of being computed from a file, then the
    report is generated and its text inspected.

    The analyzer is given the ``temp_dir`` fixture as its inbox, like the
    other tests in this module, so the test does not depend on the current
    working directory of the pytest process.
    """
    analyzer = OrderAnalyzer(shared_inbox=temp_dir)
    analyzer.column_stats = {
        '买家身份证': {'sample_values': ['110101199001011234']},
        '手机号': {'sample_values': ['13812345678']},
        '邮箱': {'sample_values': ['test@example.com']},
        '普通字段': {'sample_values': ['普通数据']}
    }

    # Make sure the report directory is present before writing into it.
    analyzer.reports_dir.mkdir(exist_ok=True, parents=True)
    analyzer.generate_eda_report()

    # Read the report back and verify the sensitive-data section.
    with open(analyzer.reports_dir / 'eda.md', 'r', encoding='utf-8') as report_file:
        content = report_file.read()

    assert '敏感信息检测' in content
    # At least one of the sensitive categories must be mentioned.
    assert any(keyword in content for keyword in ('身份证', '手机', '邮箱'))


if __name__ == '__main__':
    # pytest.main returns the run's exit code; hand it to sys.exit so that
    # running this file directly reports test failures to the shell.
    sys.exit(pytest.main([__file__, '-v']))