#!/usr/bin/env python3
"""
Phase 0: order-data profiling and mapping generation.
Analyzes Excel files and produces an EDA report, a column-mapping config,
synonym tables, and a parsing-rules document.
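
Usage (the script filename below is illustrative):
    python phase0_profile.py --inbox /path/to/shared_inbox --limit 5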
"""

import os
import re
import sys
import argparse
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple
from collections import Counter
import warnings

import pandas as pd
import yaml

warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')
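# (openpyxl tends to emit noisy UserWarnings about unsupported styles etc.
# on real-world exports; they are harmless for plain data reads)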

# Put the project's backend/ directory on the Python path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root / 'backend'))

# Load environment variables from .env, if present
from dotenv import load_dotenv
env_path = project_root / '.env'
if env_path.exists():
    load_dotenv(env_path)
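# (.env may define SHARED_INBOX_DIR and FILE_GLOB; main() reads them below)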

class OrderAnalyzer:
    """订单数据分析器"""
    
    def __init__(self, 
                 shared_inbox: str,
                 file_glob: str = "*.xls*",
                 limit: Optional[int] = None):
        self.shared_inbox = Path(shared_inbox)
        self.file_glob = file_glob
        self.limit = limit
        self.reports_dir = project_root / 'reports'
        self.configs_dir = project_root / 'configs'
        self.synonyms_dir = self.configs_dir / 'synonyms'
        
        # Create output directories
        self.reports_dir.mkdir(exist_ok=True, parents=True)
        self.configs_dir.mkdir(exist_ok=True, parents=True)
        self.synonyms_dir.mkdir(exist_ok=True, parents=True)
        
        # Accumulated analysis state
        self.all_data = []
        self.column_stats = {}
        self.date_formats = []
        self.enum_candidates = {}
        self.brand_frequency = Counter()
        self.color_patterns = set()
        self.size_patterns = set()
        
    def find_excel_files(self) -> List[Path]:
        """查找目录下的 Excel 文件"""
        files = sorted(
            self.shared_inbox.glob(self.file_glob),
            key=lambda x: x.stat().st_mtime,
            reverse=True
        )
        if self.limit:
            files = files[:self.limit]
        return files
    
    def read_excel_file(self, file_path: Path) -> pd.DataFrame:
        """读取 Excel 文件，自动处理编码"""
        try:
            # .xlsx files need openpyxl; legacy .xls files need xlrd
            if file_path.suffix.lower() == '.xlsx':
                df = pd.read_excel(file_path, engine='openpyxl')
            else:
                df = pd.read_excel(file_path, engine='xlrd')
        except Exception:
            # Fall back to pandas' default engine selection
            df = pd.read_excel(file_path)
        
        print(f"  读取文件: {file_path.name}, 行数: {len(df)}, 列数: {len(df.columns)}")
        return df
    
    def analyze_column(self, series: pd.Series, col_name: str) -> Dict[str, Any]:
        """分析单个列的统计信息"""
        stats = {
            'name': col_name,
            'total_count': len(series),
            'null_count': series.isna().sum(),
            'null_rate': f"{series.isna().mean():.2%}",
            'unique_count': series.nunique(),
            'dtype': str(series.dtype),
            'sample_values': [],
            'inferred_type': 'string',
            'anomalies': []
        }
        
        # Work with non-null values only
        non_null = series.dropna()
        if len(non_null) == 0:
            return stats
        
        # Sample up to five values for the report
        sample_size = min(5, len(non_null))
        stats['sample_values'] = non_null.sample(sample_size).tolist()
        
        # Type inference
        stats['inferred_type'] = self._infer_column_type(non_null, col_name)
        
        # Enum-candidate detection (currently limited to status-like columns)
        if stats['unique_count'] < 20 and len(non_null) > 10:
            value_counts = non_null.value_counts()
            if '状态' in col_name or 'status' in col_name.lower():
                self.enum_candidates[col_name] = value_counts.index.tolist()
                stats['enum_values'] = value_counts.to_dict()
        
        # Brand extraction from product-name columns
        if '宝贝名称' in col_name or '商品名称' in col_name:
            self._extract_brands(non_null)
        
        # Color/size pattern extraction from attribute columns
        if '销售属性' in col_name or '属性' in col_name:
            self._extract_color_size_patterns(non_null)
        
        return stats
    
    def _infer_column_type(self, series: pd.Series, col_name: str) -> str:
        """推断列的数据类型"""
        # Date check: a name hint confirmed by parse success rate
        if '时间' in col_name or '日期' in col_name or 'time' in col_name.lower() or 'date' in col_name.lower():
            success_rate = self._try_parse_dates(series)
            if success_rate > 0.5:
                return 'datetime'
        
        # Numeric check: pd.to_numeric raises if any value fails to parse.
        # Classify by column-name hints so numeric values stored as strings
        # are still recognized.
        try:
            pd.to_numeric(series)
            if '数量' in col_name or 'quantity' in col_name.lower():
                return 'int'
            elif '金额' in col_name or '价格' in col_name or '单价' in col_name:
                return 'decimal'
            return 'numeric'
        except (ValueError, TypeError):
            pass
        
        # Image-link columns are treated as lists
        if '图片' in col_name or 'image' in col_name.lower() or 'img' in col_name.lower():
            return 'list'
        
        # Low-cardinality columns are treated as enums
        unique_ratio = series.nunique() / len(series)
        if unique_ratio < 0.1 and series.nunique() < 20:
            return 'enum'
        
        return 'string'
    
    def _try_parse_dates(self, series: pd.Series) -> float:
        """尝试解析日期并返回成功率"""
        date_formats = [
            '%Y-%m-%d %H:%M:%S',
            '%Y/%m/%d %H:%M:%S', 
            '%Y-%m-%d %H:%M',
            '%Y/%m/%d %H:%M',
            '%Y-%m-%d',
            '%Y/%m/%d',
            '%d/%m/%Y %H:%M:%S',
            '%d-%m-%Y %H:%M:%S',
            '%m/%d/%Y %H:%M:%S',
        ]
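        # Ambiguous formats (e.g. %d/%m/%Y vs %m/%d/%Y) are resolved
        # empirically: the format that parses the most values wins.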
        
        best_rate = 0
        best_format = None
        
        for fmt in date_formats:
            try:
                parsed = pd.to_datetime(series, format=fmt, errors='coerce')
                success_rate = 1 - parsed.isna().mean()
                if success_rate > best_rate:
                    best_rate = success_rate
                    best_format = fmt
            except (ValueError, TypeError):
                continue
        
        if best_format and best_format not in self.date_formats:
            self.date_formats.append(best_format)
        
        return best_rate
    
    def _extract_brands(self, series: pd.Series):
        """从商品名称中提取品牌"""
        for text in series.sample(min(100, len(series))):
            if pd.isna(text):
                continue
            text = str(text)
            
            # The first token is often the brand
            words = text.split()
            if words:
                first_word = words[0].strip()
                # Skip tokens that are clearly not brands
                if len(first_word) > 1 and not first_word.isdigit():
                    self.brand_frequency[first_word] += 1
            
            # Bracketed text often carries a brand as well
            bracket_match = re.findall(r'[（\(]([^）\)]+)[）\)]', text)
            for match in bracket_match:
                if len(match) > 1 and not match.isdigit():
                    self.brand_frequency[match] += 1
    
    def _extract_color_size_patterns(self, series: pd.Series):
        """提取颜色和尺码模式"""
        color_keywords = ['颜色', 'Color', 'colour', '色']
        size_keywords = ['尺码', 'Size', '尺寸', '号']
        
        for text in series.sample(min(100, len(series))):
            if pd.isna(text):
                continue
            text = str(text)
            
            # Color: keyword, optional separator, then the value
            for keyword in color_keywords:
                pattern = rf'{keyword}[：:=]?\s*([^,;，；\s]+)'
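                # e.g. "颜色:红色 尺码:XL" -> captures "红色"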
                matches = re.findall(pattern, text, re.IGNORECASE)
                self.color_patterns.update(matches)
            
            # Size: same pattern shape as color
            for keyword in size_keywords:
                pattern = rf'{keyword}[：:=]?\s*([^,;，；\s]+)'
                matches = re.findall(pattern, text, re.IGNORECASE)
                self.size_patterns.update(matches)
    
    def analyze_image_splitters(self, df: pd.DataFrame) -> str:
        """分析图片字段的分隔符"""
        image_cols = [col for col in df.columns if '图片' in col or 'image' in col.lower()]
        
        if not image_cols:
            return r'[,;|\s]+'
        
        splitter_scores = Counter()
        # Candidate delimiters as regex fragments; '|' must be escaped,
        # otherwise re.split('|', ...) splits on the empty pattern
        splitters = [',', ';', r'\|', r'\s+', '，', '；']
        
        for col in image_cols:
            non_null = df[col].dropna()
            sample = non_null.sample(min(100, len(non_null)))
            for text in sample:
                text = str(text)
                # Only score texts that actually contain URLs
                if 'http' in text or 'www' in text:
                    for splitter in splitters:
                        parts = re.split(splitter, text)
                        # A delimiter that yields multiple URLs earns points
                        url_count = sum(1 for p in parts if 'http' in p or 'www' in p)
                        if url_count > 1:
                            splitter_scores[splitter] += url_count
        
        # Fold the winner into a character class, stripping any trailing '+'
        # so r'\s+' becomes '[\s]+' rather than '[\s+]+' (which would also
        # match literal plus signs)
        if splitter_scores:
            best_splitter = max(splitter_scores, key=splitter_scores.get)
            return f"[{best_splitter.rstrip('+')}]+"
        
        return r'[,;|\s]+'
    
    def analyze_product_keys(self, df: pd.DataFrame) -> Tuple[List[str], float]:
        """分析产品唯一键组合"""
        key_candidates = [
            ['原始订单编号', '线上商家编码', 'SKU编号', '商品编号', '订单单价', '数量'],
            ['原始订单编号', '商品编号', 'SKU编号'],
            ['原始订单编号', '线上商家编码'],
        ]
        
        best_keys = []
        best_uniqueness = 0
        
        for keys in key_candidates:
            # All key columns must be present
            if all(k in df.columns for k in keys):
                # Build the combined key and measure its uniqueness
                df_test = df[keys].dropna()
                if len(df_test) > 0:
                    combined = df_test.apply(lambda x: '_'.join(str(v) for v in x), axis=1)
                    uniqueness = combined.nunique() / len(combined)
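                    # e.g. 98 distinct combined keys over 100 rows -> 0.98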
                    if uniqueness > best_uniqueness:
                        best_uniqueness = uniqueness
                        best_keys = keys
        
        return best_keys, best_uniqueness
    
    def generate_eda_report(self):
        """生成 EDA 报告"""
        report_path = self.reports_dir / 'eda.md'
        
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("# 订单数据探索性分析报告 (EDA)\n\n")
            f.write(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            
            # 文件概览
            f.write("## 1. 数据文件概览\n\n")
            f.write(f"- 分析文件数: {len(self.all_data)}\n")
            total_rows = sum(len(df) for df in self.all_data)
            f.write(f"- 总行数: {total_rows:,}\n")
            if self.all_data:
                f.write(f"- 列数: {len(self.all_data[0].columns)}\n\n")
            
            # 列统计
            f.write("## 2. 列统计分析\n\n")
            f.write("| 列名 | 空值率 | 唯一值数 | 推断类型 | 示例值 |\n")
            f.write("|------|--------|----------|----------|--------|\n")
            
            for col_name, stats in self.column_stats.items():
                # Use .get() so manually built stats dicts (e.g. in tests) still work
                null_rate = stats.get('null_rate', 'N/A')
                unique_count = stats.get('unique_count', 'N/A')
                inferred_type = stats.get('inferred_type', 'string')
                sample_values = stats.get('sample_values', [])
                
                sample_str = ', '.join(str(v)[:20] for v in sample_values[:3])
                f.write(f"| {col_name} | {null_rate} | {unique_count} | "
                       f"{inferred_type} | {sample_str} |\n")
            
            # Enum candidates
            f.write("\n## 3. Enum Column Candidates\n\n")
            for col_name, values in self.enum_candidates.items():
                f.write(f"### {col_name}\n")
                f.write(f"Possible enum values: {', '.join(str(v) for v in values[:10])}\n\n")
            
            # Date parsing
            f.write("## 4. Date Column Parsing\n\n")
            f.write("Detected date formats:\n")
            for fmt in self.date_formats:
                f.write(f"- `{fmt}`\n")
            
            # Image fields
            f.write("\n## 5. Image Field Analysis\n\n")
            best_splitter = self.analyze_image_splitters(self.all_data[0] if self.all_data else pd.DataFrame())
            f.write(f"Recommended splitter regex: `{best_splitter}`\n")
            
            # Product keys
            f.write("\n## 6. Product Unique-Key Analysis\n\n")
            if self.all_data:
                best_keys, uniqueness = self.analyze_product_keys(pd.concat(self.all_data, ignore_index=True))
                f.write(f"Recommended product-key columns: {', '.join(best_keys)}\n")
                f.write(f"Uniqueness: {uniqueness:.2%}\n")
            
            # Brand frequency
            f.write("\n## 7. Brand Frequency (Top 20)\n\n")
            f.write("| Brand Candidate | Count |\n")
            f.write("|-----------------|-------|\n")
            for brand, count in self.brand_frequency.most_common(20):
                f.write(f"| {brand} | {count} |\n")
            
            # Sensitive data: list the reference patterns, then flag columns
            # whose names suggest they hold personal information
            f.write("\n## 8. Sensitive Data Detection\n\n")
            sensitive_patterns = {
                'national ID': r'\d{15,18}',
                'mobile number': r'1[3-9]\d{9}',
                'email': r'[\w\.-]+@[\w\.-]+\.\w+'
            }
            f.write("Common sensitive-data patterns:\n")
            for label, pattern in sensitive_patterns.items():
                f.write(f"- {label}: `{pattern}`\n")
            f.write("\nFields recommended for masking:\n")
            for col_name in self.column_stats:
                if any(keyword in col_name for keyword in ['身份证', '手机', '电话', '邮箱', 'email']):
                    f.write(f"- {col_name}: use a SHA256 hash or keep only the last 4 digits\n")
        
        print(f"✓ EDA 报告已生成: {report_path}")
    
    def generate_column_map(self):
        """生成列映射配置"""
        config_path = self.configs_dir / 'column_map.yaml'
        
        # Best image splitter
        best_splitter = self.analyze_image_splitters(self.all_data[0] if self.all_data else pd.DataFrame())
        
        # Best product-key combination
        best_keys = []
        if self.all_data:
            best_keys, _ = self.analyze_product_keys(pd.concat(self.all_data, ignore_index=True))
        
        config = {
            'columns': {},
            'product_key': best_keys or ['原始订单编号', '线上商家编码', 'SKU编号', '商品编号', '订单单价', '数量'],
            'image_splitter': best_splitter
        }
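        # Illustrative shape of the resulting YAML (values depend on the data):
        #   columns:
        #     数量:
        #       required: true
        #       type: int
        #   image_splitter: '[,]+'
        #   product_key:
        #   - 原始订单编号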
        
        # Required fields (names match the source data columns)
        required_fields = {
            '原始订单编号': 'string',
            '交易状态': 'enum', 
            '线上宝贝名称': 'string',
            '数量': 'int',
            '订单单价': 'decimal'
        }
        
        for col_name, stats in self.column_stats.items():
            col_config = {
                'required': col_name in required_fields,
                'type': stats['inferred_type']
            }
            
            # Known enum values
            if col_name in self.enum_candidates:
                col_config['values'] = self.enum_candidates[col_name]
            
            # Detected date formats
            if stats['inferred_type'] == 'datetime' and self.date_formats:
                col_config['formats'] = self.date_formats
            
            # Image fields are split into URL lists
            if '图片' in col_name:
                col_config['type'] = 'list'
                col_config['splitter'] = best_splitter
            
            config['columns'][col_name] = col_config
        
        with open(config_path, 'w', encoding='utf-8') as f:
            yaml.dump(config, f, allow_unicode=True, default_flow_style=False)
        
        print(f"✓ 列映射配置已生成: {config_path}")
    
    def generate_synonyms(self):
        """生成同义词表"""
        # Brand aliases
        brand_aliases_path = self.synonyms_dir / 'brand_aliases.csv'
        brand_df = pd.DataFrame([
            {'alias': brand, 'canonical': brand, 'confidence': count}
            for brand, count in self.brand_frequency.most_common(50)
        ])
        
        # Seed with a few common brand alias examples
        common_aliases = [
            {'alias': 'Nike', 'canonical': 'NIKE', 'confidence': 100},
            {'alias': 'nike', 'canonical': 'NIKE', 'confidence': 100},
            {'alias': 'Adidas', 'canonical': 'ADIDAS', 'confidence': 100},
            {'alias': 'adidas', 'canonical': 'ADIDAS', 'confidence': 100},
        ]
        brand_df = pd.concat([brand_df, pd.DataFrame(common_aliases)], ignore_index=True)
        brand_df.to_csv(brand_aliases_path, index=False, encoding='utf-8')
        print(f"✓ 品牌别名表已生成: {brand_aliases_path}")
        
        # Color aliases
        color_aliases_path = self.synonyms_dir / 'color_aliases.csv'
        color_data = []
        
        # Preset color synonyms (canonical -> aliases)
        color_mapping = {
            '黑色': ['黑', 'Black', 'black', 'BLK', '黑色'],
            '白色': ['白', 'White', 'white', 'WHT', '白色'],
            '红色': ['红', 'Red', 'red', '红色'],
            '蓝色': ['蓝', 'Blue', 'blue', '蓝色'],
            '绿色': ['绿', 'Green', 'green', '绿色'],
            '黄色': ['黄', 'Yellow', 'yellow', '黄色'],
            '灰色': ['灰', 'Gray', 'Grey', 'gray', 'grey', '灰色'],
            '粉色': ['粉', 'Pink', 'pink', '粉红', '粉色'],
        }
        
        for canonical, aliases in color_mapping.items():
            for alias in aliases:
                color_data.append({'alias': alias, 'canonical': canonical, 'confidence': 100})
        
        # Colors extracted from the data, at lower confidence
        for color in list(self.color_patterns)[:20]:
            if color and len(color) > 1:
                color_data.append({'alias': color, 'canonical': color, 'confidence': 50})
        
        color_df = pd.DataFrame(color_data)
        color_df.to_csv(color_aliases_path, index=False, encoding='utf-8')
        print(f"✓ 颜色别名表已生成: {color_aliases_path}")
        
        # Size aliases
        size_aliases_path = self.synonyms_dir / 'size_aliases.csv'
        size_data = []
        
        # Preset size synonyms (canonical -> aliases)
        size_mapping = {
            'S': ['S', 's', 'Small', 'small', '小', '小码'],
            'M': ['M', 'm', 'Medium', 'medium', '中', '中码'],
            'L': ['L', 'l', 'Large', 'large', '大', '大码'],
            'XL': ['XL', 'xl', 'X-Large', 'x-large', '加大', '加大码'],
            'XXL': ['XXL', 'xxl', '2XL', '2xl', '特大', '特大码'],
            'XXXL': ['XXXL', 'xxxl', '3XL', '3xl'],
        }
        
        for canonical, aliases in size_mapping.items():
            for alias in aliases:
                size_data.append({'alias': alias, 'canonical': canonical, 'confidence': 100})
        
        # Sizes extracted from the data, at lower confidence
        for size in list(self.size_patterns)[:20]:
            if size and len(size) > 0:
                size_data.append({'alias': size, 'canonical': size, 'confidence': 50})
        
        size_df = pd.DataFrame(size_data)
        size_df.to_csv(size_aliases_path, index=False, encoding='utf-8')
        print(f"✓ 尺码别名表已生成: {size_aliases_path}")
    
    def generate_parsing_rules(self):
        """生成解析规则文档"""
        rules_path = self.reports_dir / 'parsing_rules.md'
        
        with open(rules_path, 'w', encoding='utf-8') as f:
            f.write("# 数据解析规则\n\n")
            f.write(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            
            f.write("## 1. 品牌提取规则\n\n")
            f.write("1. 优先从 `brand_aliases.csv` 中匹配别名映射\n")
            f.write("2. 若未匹配，从商品名称提取:\n")
            f.write("   - 提取首词（空格分隔的第一个词）\n")
            f.write("   - 提取括号内容 `[（(]([^）)]+)[）)]`\n")
            f.write("3. 使用 RapidFuzz 进行模糊匹配，相似度 > 85% 时生成 pending 别名建议\n\n")
            
            f.write("## 2. 货号提取规则\n\n")
            f.write("1. 从商品名称中去除品牌词\n")
            f.write("2. 提取字母数字组合: `[A-Za-z0-9]+`\n")
            f.write("3. 多个片段用下划线拼接，保持大写\n")
            f.write("4. 示例: `Nike AJ1234 Low` → 货号: `AJ1234_LOW`\n\n")
            
            f.write("## 3. 颜色提取规则\n\n")
            f.write("从以下字段提取:\n")
            f.write("- 线上销售属性\n")
            f.write("- 商品名称\n\n")
            f.write("匹配模式:\n")
            f.write("```regex\n")
            f.write(r"(颜色|Color|colour)[：:=]\s*([^,;，；\s]+)")
            f.write("\n```\n\n")
            
            f.write("## 4. 尺寸提取规则\n\n")
            f.write("从以下字段提取:\n")
            f.write("- 线上销售属性\n")
            f.write("- 商品名称\n\n")
            f.write("匹配模式:\n")
            f.write("```regex\n")
            f.write(r"(尺码|Size|尺寸|号)[：:=]\s*([^,;，；\s]+)")
            f.write("\n```\n\n")
            
            f.write("## 5. SKU ID 生成规则\n\n")
            f.write("```python\n")
            f.write("sku_id = f\"{slug(merchant_code or product_id or sku_number)}_{color}_{size}\"\n")
            f.write("```\n\n")
            f.write("其中 `slug()` 函数:\n")
            f.write("- 转小写\n")
            f.write("- 替换空格为下划线\n")
            f.write("- 移除特殊字符\n\n")
            
            f.write("## 6. 图片链接处理\n\n")
            f.write(f"分隔符正则: `{self.analyze_image_splitters(self.all_data[0] if self.all_data else pd.DataFrame())}`\n")
            f.write("处理步骤:\n")
            f.write("1. 按分隔符拆分\n")
            f.write("2. 验证 URL 格式（包含 http:// 或 https://）\n")
            f.write("3. 存储为 JSON 数组\n\n")
            
            f.write("## 7. 去重键 (product_key)\n\n")
            if self.all_data:
                best_keys, _ = self.analyze_product_keys(pd.concat(self.all_data, ignore_index=True))
                f.write(f"字段组合: {', '.join(best_keys)}\n")
            f.write("生成方式: SHA1(字段值拼接)\n")
        
        print(f"✓ 解析规则文档已生成: {rules_path}")
    
    def run(self):
        """运行完整的分析流程"""
        print("=" * 60)
        print("Phase 0: 订单数据剖析与映射生成")
        print("=" * 60)
        
        # Locate input files
        files = self.find_excel_files()
        if not files:
            print(f"错误: 在 {self.shared_inbox} 中未找到匹配 {self.file_glob} 的文件")
            return
        
        print(f"\n找到 {len(files)} 个文件:")
        for f in files:
            print(f"  - {f.name}")
        
        # Read and profile each file
        print("\nAnalyzing data...")
        for file_path in files:
            df = self.read_excel_file(file_path)
            self.all_data.append(df)
            
            # Profile each column once (stats come from the first file containing it)
            for col in df.columns:
                if col not in self.column_stats:
                    self.column_stats[col] = self.analyze_column(df[col], col)
        
        # Combine all files for whole-dataset analysis
        if self.all_data:
            combined_df = pd.concat(self.all_data, ignore_index=True)
            print(f"\nCombined data: {len(combined_df):,} rows, {len(combined_df.columns)} columns")
        
        # Generate all artifacts
        print("\nGenerating analysis artifacts...")
        self.generate_eda_report()
        self.generate_column_map()
        self.generate_synonyms()
        self.generate_parsing_rules()
        
        # Print a summary
        print("\n" + "=" * 60)
        print("Analysis complete! Generated files:")
        print("=" * 60)
        print("  1. EDA report: reports/eda.md")
        print("  2. Column mapping: configs/column_map.yaml")
        print("  3. Brand aliases: configs/synonyms/brand_aliases.csv")
        print("  4. Color aliases: configs/synonyms/color_aliases.csv")
        print("  5. Size aliases: configs/synonyms/size_aliases.csv")
        print("  6. Parsing rules: reports/parsing_rules.md")
        
        print("\n关键洞察:")
        print(f"  - 数据列数: {len(self.column_stats)}")
        print(f"  - 枚举列候选: {len(self.enum_candidates)}")
        print(f"  - 日期格式数: {len(self.date_formats)}")
        print(f"  - 品牌候选数: {len(self.brand_frequency)}")
        print(f"  - 颜色模式数: {len(self.color_patterns)}")
        print(f"  - 尺码模式数: {len(self.size_patterns)}")

def main():
    """主函数"""
    parser = argparse.ArgumentParser(description="订单数据分析工具")
    parser.add_argument('--limit', type=int, help='限制分析的文件数量')
    parser.add_argument('--inbox', type=str, help='共享文件夹路径')
    args = parser.parse_args()
    
    # Configuration falls back to environment variables
    shared_inbox = args.inbox or os.getenv('SHARED_INBOX_DIR', '.')
    file_glob = os.getenv('FILE_GLOB', '*.xls*')
    
    # Build the analyzer and run it
    analyzer = OrderAnalyzer(
        shared_inbox=shared_inbox,
        file_glob=file_glob,
        limit=args.limit
    )
    analyzer.run()

if __name__ == '__main__':
    main()