#!/usr/bin/env python3
"""
Investigate the systematic pattern of incorrect Tods brand extraction
"""
import sys
import asyncio
sys.path.append('backend')

from sqlalchemy import select, text, func
from app.core.database import get_db
from app.models.products_master import ProductMaster
from app.utils.text_parser import BrandExtractor
import re

async def investigate_brand_pattern():
    """Investigate the pattern of brand extraction errors"""
    print("深入调查品牌提取错误的规律...")
    print("=" * 80)
    
    try:
        async for db in get_db():
            # 1. Check the specific product mentioned
            print("\n1. 检查用户提到的具体产品:")
            print("-" * 80)
            
            specific_query = select(ProductMaster).where(
                ProductMaster.线上宝贝名称.like('%棕色鳄鱼压纹麻花扣乐福鞋XXW79A0DD00WENS801%')
            )
            
            specific_result = await db.execute(specific_query)
            specific_products = specific_result.scalars().all()
            
            if specific_products:
                for product in specific_products:
                    print(f"产品ID: {product.id}")
                    print(f"产品名称: {product.线上宝贝名称}")
                    print(f"数据库中的品牌: '{product.品牌}'")
                    print(f"品牌长度: {len(product.品牌)} 字符")
                    print(f"货号: '{product.货号}'")
                    
                    # Check if it matches the problematic pattern
                    expected_bad = "Tods棕色鳄鱼压纹麻花扣乐福鞋XXW79A0DD00WENS8018.15LA"
                    if product.品牌 == expected_bad:
                        print(f"❌ 确认品牌错误！")
                    elif product.品牌 == "Tods":
                        print(f"✅ 品牌正确")
                    else:
                        print(f"⚠️ 品牌异常: '{product.品牌}'")
            else:
                print("未找到该产品")
            
            # 2. Find ALL products with this systematic error pattern
            print("\n\n2. 查找所有符合错误模式的产品:")
            print("-" * 80)
            
            # Pattern: Brand includes product description and code
            pattern_query = text("""
                SELECT id, 线上宝贝名称, 品牌, 货号, length(品牌) as brand_len
                FROM products_master
                WHERE 线上宝贝名称 LIKE '%Tods%'
                AND 品牌 != 'Tods'
                AND length(品牌) > 20
                ORDER BY id
                LIMIT 20
            """)
            
            pattern_result = await db.execute(pattern_query)
            problematic = pattern_result.fetchall()
            
            if problematic:
                print(f"发现 {len(problematic)} 个符合错误模式的产品:\n")
                for i, row in enumerate(problematic, 1):
                    print(f"{i}. ID: {row[0]}")
                    print(f"   产品: {row[1]}")
                    print(f"   错误品牌: '{row[2]}'")
                    print(f"   品牌长度: {row[4]} 字符")
                    print()
                    
                    # Analyze the pattern
                    product_name = row[1]
                    bad_brand = row[2]
                    
                    # Check if brand contains parts of product name
                    if "Tods" in bad_brand and "女款" not in bad_brand and "代购" not in bad_brand:
                        print(f"   分析: 品牌包含产品描述，但移除了'女款'和'代购'")
            else:
                print("✅ 未发现符合错误模式的产品")
            
            # 3. Test brand extraction logic directly
            print("\n\n3. 测试品牌提取逻辑:")
            print("-" * 80)
            
            extractor = BrandExtractor()
            
            test_cases = [
                "Tods女款棕色鳄鱼压纹麻花扣乐福鞋XXW79A0DD00WENS801代购8.15LA",
                "Tods女款黑色一字带平底凉鞋 XXW15L0IH90MIDB999美国代购 8.15LA",
                "Tods女款牛仔拼色镂空锁扣装饰踩鞋XXW00G0GW40R43ZZEU代购8.15LA"
            ]
            
            for test in test_cases:
                print(f"\n测试输入: {test}")
                
                # Extract brand
                brand = extractor.extract_brand(test)
                print(f"提取的品牌: '{brand}'")
                
                # Check what would happen if we remove 女款 and 代购
                modified = test.replace("女款", "").replace("代购", "").replace("美国代购", "")
                print(f"如果去掉'女款'和'代购': {modified}")
                
                # Extract from modified
                bad_brand = extractor.extract_brand(modified)
                print(f"从修改后提取: '{bad_brand}'")
                
                # Manually construct the bad pattern
                pattern_match = re.match(r'Tods(.*?)(?:XXW|[A-Z0-9]{6,})', test)
                if pattern_match:
                    potential_bad = "Tods" + pattern_match.group(1).replace("女款", "")
                    print(f"潜在错误模式: '{potential_bad}'")
            
            # 4. Check for any preprocessing issues
            print("\n\n4. 检查数据预处理问题:")
            print("-" * 80)
            
            # Check if there are raw_orders with modified data
            raw_check_query = text("""
                SELECT COUNT(*) as count,
                       SUM(CASE WHEN 线上宝贝名称 LIKE '%Tods女款%' THEN 1 ELSE 0 END) as with_nvkuan,
                       SUM(CASE WHEN 线上宝贝名称 LIKE '%Tods%' AND 线上宝贝名称 NOT LIKE '%Tods女款%' THEN 1 ELSE 0 END) as without_nvkuan
                FROM raw_orders
                WHERE 线上宝贝名称 LIKE '%Tods%'
            """)
            
            raw_result = await db.execute(raw_check_query)
            raw_stats = raw_result.first()
            
            print(f"原始订单统计:")
            print(f"  总Tods订单: {raw_stats[0]}")
            print(f"  包含'Tods女款': {raw_stats[1]}")
            print(f"  只有'Tods'(无女款): {raw_stats[2]}")
            
            # 5. Check if this is happening during aggregation
            print("\n\n5. 检查是否在聚合过程中出错:")
            print("-" * 80)
            
            # Find products with wrong brands and check their source
            wrong_brand_query = text("""
                SELECT pm.id, pm.线上宝贝名称, pm.品牌, 
                       COUNT(DISTINCT ro.id) as order_count,
                       GROUP_CONCAT(DISTINCT ro.线上宝贝名称, '|||') as raw_names
                FROM products_master pm
                LEFT JOIN raw_orders ro ON ro.线上宝贝名称 LIKE '%' || SUBSTR(pm.货号, 1, 10) || '%'
                WHERE pm.品牌 LIKE 'Tods%' AND pm.品牌 != 'Tods'
                GROUP BY pm.id
                LIMIT 5
            """)
            
            wrong_result = await db.execute(wrong_brand_query)
            wrong_products = wrong_result.fetchall()
            
            if wrong_products:
                print(f"错误品牌产品的来源分析:")
                for row in wrong_products:
                    print(f"\n产品ID: {row[0]}")
                    print(f"产品主表名称: {row[1]}")
                    print(f"错误品牌: '{row[2]}'")
                    print(f"关联订单数: {row[3]}")
                    if row[4]:
                        raw_names = row[4].split('|||')[:3]  # Show first 3
                        print(f"原始订单名称样例:")
                        for name in raw_names:
                            if name:
                                print(f"  - {name}")
            
            break
            
    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    asyncio.run(investigate_brand_pattern())