"""
文本解析工具：品牌、货号、颜色、尺码提取
"""

import re
import logging
from typing import Optional, List, Dict, Any, Tuple
from rapidfuzz import fuzz, process

from app.utils.brand_config import get_brand_config

logger = logging.getLogger(__name__)


def slug(text: str) -> str:
    """
    Convert arbitrary text into a slug.

    Steps, applied in order:
    - lower-case the text
    - turn every run of whitespace, hyphens and dots into one underscore
    - drop every character outside ``a-z``, ``0-9`` and ``_`` (this removes
      all non-ASCII text, e.g. Chinese characters)
    - collapse repeated underscores and trim them from both ends

    Args:
        text: Input text. Falsy values (``None``, ``""``) yield ``""``.

    Returns:
        The slug, which may be empty.
    """
    if not text:
        return ""

    # (pattern, replacement) pairs; the order matters.
    substitutions = (
        (r'[\s\-\.]+', '_'),
        (r'[^a-z0-9_]', ''),
        (r'_+', '_'),
    )

    result = text.lower()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)

    return result.strip('_')


class BrandExtractor:
    """品牌提取器"""
    
    def __init__(self, brand_aliases: Optional[Dict[str, str]] = None):
        """
        Initialise the brand extractor.

        Args:
            brand_aliases: Mapping ``{alias: canonical_brand}``. ``None``
                (the default) is treated as an empty mapping.
        """
        self.brand_aliases = brand_aliases or {}
        # Lower-cased alias keys so that lookups can ignore case.
        self.case_insensitive_aliases = {
            alias.lower(): canonical
            for alias, canonical in self.brand_aliases.items()
        }
    
    def extract_brand(self, product_name: str) -> Optional[str]:
        """
        Extract the brand from a product name.

        Resolution order:
        1. a direct match from the configuration / alias table
        2. a fuzzy match of the extracted candidate words against the known
           canonical brands (only when an alias table is loaded)
        3. the first extracted candidate word

        Every returned value is passed through ``_clean_brand_name``.

        Args:
            product_name: Product name.

        Returns:
            The normalised brand name, or ``None`` when the input is empty,
            is not a string, or no brand could be found.
        """
        if not product_name or not isinstance(product_name, str):
            return None

        # 1. Alias / configuration match wins outright.
        aliased = self._match_from_aliases(product_name)
        if aliased:
            return self._clean_brand_name(aliased)

        # 2. Otherwise work from candidate words.
        candidates = self._extract_brand_candidates(product_name)
        if not candidates:
            return None

        # 3. Try to map a candidate onto a known brand.
        if self.brand_aliases:
            fuzzy = self._fuzzy_match_brands(candidates)
            if fuzzy:
                return self._clean_brand_name(fuzzy)

        # 4. Fall back to the most likely candidate (also cleaned).
        return self._clean_brand_name(candidates[0])
    
    def _match_from_aliases(self, product_name: str) -> Optional[str]:
        """
        Match a brand using the brand configuration and the alias table.

        The lower-cased product name is offered, in order, to the
        configuration's priority, compound and simple brand matchers. The
        first truthy result is returned. If none matches, the instance's
        alias table is scanned for an alias contained in the product name,
        longest alias first.

        Args:
            product_name: Product name.

        Returns:
            The matched brand, or ``None`` when nothing matches.
        """
        product_lower = product_name.lower()

        # Brand configuration (loaded elsewhere in the project).
        config = get_brand_config()

        # 1. Priority brands
        priority_match = config.match_priority_brand(product_lower)
        if priority_match:
            return priority_match

        # 2. Compound brands
        compound_match = config.match_compound_brand(product_lower)
        if compound_match:
            return compound_match

        # 3. Simple brands
        simple_match = config.match_simple_brand(product_lower)
        if simple_match:
            return simple_match

        # 4. Alias table: longest alias first so a longer alias is preferred
        #    over a shorter alias it contains.
        sorted_aliases = sorted(self.case_insensitive_aliases.items(),
                                key=lambda item: len(item[0]), reverse=True)

        for alias_lower, canonical in sorted_aliases:
            # An empty alias is a substring of every string and would match
            # every product name, so it is skipped.
            if alias_lower and alias_lower in product_lower:
                return canonical

        return None
    
    def _extract_brand_candidates(self, product_name: str) -> List[str]:
        """
        Extract brand candidate words from a product name.

        The name is pre-processed by the brand configuration, spaces are
        inserted at Latin/CJK boundaries and around ``&``, and the result is
        split on whitespace. The first word that is not an ignore keyword,
        is longer than one character and is not purely numeric decides the
        result, which is returned immediately as a one-element list:

        - the word itself, if the configuration reports it as a single-word
          brand
        - otherwise, if the following word is usable, the known multi-word
          brand reported by the configuration, or else the two words joined
          by a space
        - otherwise the word itself

        Only when no word qualifies is the text inside round/square brackets
        used. Those bracket candidates are de-duplicated case-insensitively
        and filtered (at most 25 characters, no descriptive keyword).

        Args:
            product_name: Product name.

        Returns:
            Candidate brand strings, most likely first; possibly empty.
        """
        config = get_brand_config()

        # Non-brand keywords to skip (from configuration).
        ignore_keywords = config.ignore_keywords

        # Configuration-driven clean-up of run-together brand names / spacing.
        preprocessed_name = config.preprocess_product_name(product_name)

        # Insert a space where a Latin letter directly meets a CJK character
        # (either order) so that the brand splits off as its own word.
        preprocessed_name = re.sub(r'([A-Za-z])([\u4e00-\u9fff])', r'\1 \2', preprocessed_name)
        preprocessed_name = re.sub(r'([\u4e00-\u9fff])([A-Za-z])', r'\1 \2', preprocessed_name)

        # Make sure '&' is surrounded by spaces.
        preprocessed_name = preprocessed_name.replace('&', ' & ')

        words = preprocessed_name.strip().split()

        # Punctuation trimmed from both ends of each word.
        # NOTE(review): the `''` in the middle of this literal is two adjacent
        # string literals being concatenated, so it contributes no characters
        # and straight single quotes are NOT stripped. Confirm whether curly
        # quotes were intended here.
        strip_chars = '，。！？；：""''()（）[]【】{}〈〉《》'

        for i, word in enumerate(words):
            clean_word = word.strip(strip_chars)

            if clean_word in ignore_keywords:
                continue
            if len(clean_word) <= 1 or clean_word.isdigit():
                continue

            # Compound token containing an ignore keyword: cut the keyword(s)
            # out and keep whatever remains.
            if any(ignore in clean_word for ignore in ignore_keywords):
                for ignore in ignore_keywords:
                    clean_word = clean_word.replace(ignore, '')
                clean_word = clean_word.strip()

            if len(clean_word) <= 1:
                continue

            # From here on the first qualifying word decides the result;
            # later words are never inspected.

            # A complete single-word brand.
            if config.is_single_word_brand(clean_word):
                return [clean_word]

            # Try to pair the word with the next one.
            if i + 1 < len(words):
                next_word = words[i + 1].strip(strip_chars)
                if (len(next_word) > 1 and not next_word.isdigit()
                        and not config.is_ignore_keyword(next_word)):
                    combined = f"{clean_word} {next_word}"
                    # Prefer the configuration's known multi-word brand;
                    # otherwise use the plain two-word combination.
                    known_brand = config.is_known_multi_word_brand(combined)
                    return [known_brand if known_brand else combined]

            # A valid single word that is not a known brand.
            return [clean_word]

        # No usable word: fall back to the contents of brackets in the
        # original (un-preprocessed) name.
        bracket_patterns = [
            r'[（\(]([^）\)]+)[）\)]',  # round brackets
            r'[【\[]([^】\]]+)[】\]]',  # square brackets
        ]

        candidates = []
        for pattern in bracket_patterns:
            for match in re.findall(pattern, product_name):
                match = match.strip()
                if len(match) > 1 and not match.isdigit():
                    candidates.append(match)

        # Keywords that mark a candidate as product description, not a brand.
        descriptive_keywords = [
            '颜色', '款式', '短款', '长款', '大衣', '外套', '上衣', '连衣裙', '羽绒',
            '毛毛', '羊毛', '羊绒', '黑色', '白色', '红色', '蓝色', '粉色', '灰色',
            'mini', '做旧', '麻花扣', '马鞍包', '方扣', '中跟鞋', '短靴',
        ]

        # De-duplicate case-insensitively, keep order, and drop candidates
        # that are too long or look like a description.
        seen = set()
        unique_candidates = []
        for candidate in candidates:
            lowered = candidate.lower()
            if lowered in seen or len(candidate) > 25:
                continue
            if any(keyword in lowered for keyword in descriptive_keywords):
                continue
            seen.add(lowered)
            unique_candidates.append(candidate)

        return unique_candidates
    
    def _fuzzy_match_brands(self, candidates: List[str], threshold: int = 85) -> Optional[str]:
        """
        Fuzzy-match candidate words against the known canonical brands.

        Candidates are tried in order. The first one whose best
        ``fuzz.ratio`` score against the canonical brand names reaches
        ``threshold`` determines the result.

        Args:
            candidates: Candidate brand strings, most likely first.
            threshold: Minimum score for a match to be accepted.

        Returns:
            The matched canonical brand, or ``None`` when there are no
            candidates, no aliases, or no candidate scores high enough.
        """
        if not candidates or not self.brand_aliases:
            return None

        # Distinct canonical brand names. They are sorted because iterating
        # a set of strings has no stable order between interpreter runs, and
        # the order of the choices decides which brand wins on equal scores.
        # None values cannot be a match target and are left out.
        canonical_brands = sorted(
            {brand for brand in self.brand_aliases.values() if brand is not None}
        )

        for candidate in candidates:
            result = process.extractOne(candidate, canonical_brands, scorer=fuzz.ratio)
            if result and result[1] >= threshold:
                logger.debug(f"Fuzzy matched brand: {candidate} -> {result[0]} (score: {result[1]})")
                return result[0]

        return None
    
    def suggest_new_alias(self, product_name: str, threshold: int = 85) -> List[Dict[str, Any]]:
        """
        Suggest new aliases for brand candidates that are not yet known.

        Each candidate word extracted from the product name that is not
        already an alias is fuzzy-matched (``fuzz.ratio``) against the
        canonical brand names. A suggestion is produced when the best score
        reaches ``threshold``.

        Args:
            product_name: Product name.
            threshold: Minimum score for a suggestion.

        Returns:
            A list of suggestions, each shaped
            ``{"alias": str, "canonical": str, "confidence": <score>}``.
            Empty when the input is empty or not a string, when no aliases
            are loaded, or when nothing scores high enough.
        """
        suggestions: List[Dict[str, Any]] = []

        # Same input guard as extract_brand().
        if not product_name or not isinstance(product_name, str):
            return suggestions

        # Without known brands there is nothing to suggest against, so skip
        # candidate extraction entirely.
        if not self.brand_aliases:
            return suggestions

        candidates = self._extract_brand_candidates(product_name)
        if not candidates:
            return suggestions

        # Sorted so that tie-breaking between equally scored brands does not
        # depend on set iteration order. None values are left out.
        canonical_brands = sorted(
            {brand for brand in self.brand_aliases.values() if brand is not None}
        )

        for candidate in candidates:
            if candidate.lower() in self.case_insensitive_aliases:
                continue  # already a known alias

            result = process.extractOne(candidate, canonical_brands, scorer=fuzz.ratio)
            if result and result[1] >= threshold:
                suggestions.append({
                    "alias": candidate,
                    "canonical": result[0],
                    "confidence": result[1]
                })

        return suggestions
    
    def _clean_brand_name(self, brand: str) -> str:
        """
        Reduce a mixed Latin/CJK brand name to its Latin part.

        A brand containing both Latin letters and CJK characters is replaced
        by its Latin words (a word may contain one inner apostrophe) joined
        with single spaces. Anything else is returned unchanged.

        Args:
            brand: Brand name; falsy values are returned as-is.

        Returns:
            The cleaned brand name.
        """
        if not brand:
            return brand

        contains_latin = re.search(r'[A-Za-z]', brand) is not None
        contains_cjk = re.search(r'[\u4e00-\u9fff]', brand) is not None
        if not (contains_latin and contains_cjk):
            return brand

        # Latin words, allowing an apostrophe as in "Arc'teryx".
        latin_words = re.findall(r'[A-Za-z]+(?:\'[A-Za-z]+)?', brand)
        if not latin_words:
            return brand

        joined = ' '.join(latin_words)

        # Special case: normalise every spelling that contains both halves.
        if 'Arc' in joined and 'teryx' in joined:
            return "Arc'teryx"
        return joined
    
    def extract(self, product_name: str) -> Optional[str]:
        """
        Public entry point for brand extraction.

        Delegates to ``extract_brand``. Per the original note, this name is
        kept for compatibility with the product master table service.
        """
        return self.extract_brand(product_name)


class ProductCodeExtractor:
    """货号提取器"""
    
    def extract_product_code(self, product_name: str, brand: Optional[str] = None) -> Optional[str]:
        """
        Extract the product code (style number) from a product name.

        Flow:
        1. Arc'teryx and Canada Goose brands first try their brand-specific
           extractor and return its result when it finds something.
        2. The brand text is removed and a set of generic regex patterns
           collects candidate codes.
        3. If the product name contains ``UMA WANG``, the multi-part
           extractor is tried and its parts are returned space-joined and
           upper-cased.
        4. Candidates are validated, trailing dates / procurement markers are
           stripped when the stripped form is still valid, and the
           highest-scoring candidate (see ``_score_code``) is returned with
           its original casing. On equal scores the earliest candidate wins.

        Args:
            product_name: Product name.
            brand: Already identified brand; it selects the brand-specific
                path and is removed from the text.

        Returns:
            The extracted code, or ``None`` when the input is empty, is not
            a string, or no valid code is found.
        """
        if not product_name or not isinstance(product_name, str):
            return None

        brand_lower = brand.lower() if brand else None

        # Arc'teryx: prefer codes starting with X.
        if brand_lower in ("arc'teryx", "arcteryx", "arc teryx"):
            arcteryx_code = self._extract_arcteryx_code(product_name)
            if arcteryx_code:
                return arcteryx_code

        # Canada Goose: prefer "4 digits + letters" codes.
        if brand_lower in ("canada goose", "canadagoose", "加拿大鹅"):
            canada_goose_code = self._extract_canada_goose_code(product_name)
            if canada_goose_code:
                return canada_goose_code

        # Remove the brand text.
        clean_name = self._remove_brand(product_name, brand)

        # Procurement markers that must not be taken for product codes.
        procurement_keywords = {'AT', 'MC', 'GN', 'NY', 'LA', 'AP', 'SS', 'ZT', 'CN', '现货', 'LAMC'}

        # Generic patterns: digits, letters, mixtures, with optional separators.
        code_patterns = [
            # 0. Pure digits, 8 or more (typical for e.g. Burberry)
            r'(?<=[\u4e00-\u9fff])\d{8,}(?=[\s\u4e00-\u9fff]|$)',  # directly after a CJK character
            r'\b\d{8,}\b',                                          # standalone

            # 1. Multi-part codes joined by space, '-', '/' or '.'
            r'[A-Za-z0-9]+(?:[-\s/\.][A-Za-z0-9]+)+',  # AP-TSH-0462, M33LB041O 1003, FARAH-D 368

            # 2. Single alphanumeric codes
            r'[A-Za-z]+\d+[A-Za-z0-9]*',                      # letter first: M33LB041O, X6592
            r'\d+[A-Za-z]+[A-Za-z0-9]*',                      # digit first: 8C2238047T

            # 3. Specific formats (word-bounded)
            r'\b[A-Z]{2,}[0-9]{2,}\b',                        # capitals + digits: UP5036
            r'\b\d{5,}\b',                                     # mid-length numbers: 32367, 117077

            # 4. Single upper-case product words
            r'(?<=\s|[\u4e00-\u9fff])[A-Z]{3,}(?=\s|[\u4e00-\u9fff]|$)',  # MAGAN, ELIN, ALIAH, KOSY

            # 5. Long codes followed by whitespace, a CJK character or the end
            r'[A-Za-z0-9]{6,}(?=\s|[\u4e00-\u9fff]|$)',
        ]

        codes = []
        for pattern in code_patterns:
            codes.extend(re.findall(pattern, clean_name))

        # UMA WANG uses multi-part codes (e.g. UP5036 UW400 DUDI).
        if 'UMA WANG' in product_name:
            multi_part_codes = self._extract_uma_wang_codes(clean_name, codes)
            if multi_part_codes:
                return ' '.join(multi_part_codes).upper()

        if not codes:
            return None

        # Keep valid candidates, preferring the form with trailing dates
        # removed when that form differs and is still a valid code.
        valid_codes = []
        for code in codes:
            if not self._is_valid_product_code(code, procurement_keywords):
                continue

            cleaned_code = self._clean_trailing_dates(code)
            if (cleaned_code and cleaned_code != code and
                    self._is_valid_product_code(cleaned_code, procurement_keywords)):
                valid_codes.append(cleaned_code)
            else:
                valid_codes.append(code)

        if not valid_codes:
            return None

        # max() keeps the first of equally scored candidates.
        return max(valid_codes, key=self._score_code)

    def _score_code(self, code: str) -> int:
        """
        Score a candidate product code; a higher score is a better candidate.

        The score is the sum of a penalty for descriptive words, one base
        score chosen by the first matching shape rule below, a stepped
        length bonus, and 10 points per character.
        """
        has_digits = bool(re.search(r'\d', code))
        has_letters = bool(re.search(r'[A-Za-z]', code))
        has_separators = bool(re.search(r'[-\s/\.]', code))

        score = 0

        # Descriptive words such as "Logo" heavily penalise a candidate.
        descriptive_words = ['LOGO', 'CLASSIC', 'VINTAGE', 'BASIC', 'COLLECTION']
        if any(word in code.upper() for word in descriptive_words):
            score -= 3000

        # Base score by shape. Only the first matching rule applies, so the
        # order of these branches is significant.
        if re.match(r'^\d{8,}$', code):
            # pure digits, 8+ (80721821)
            score += 6000
        elif has_separators and has_digits and len(code) >= 8:
            # complete multi-part code (A-03-10598-GRY1, 586402 1000)
            score += 5000
        elif ' ' in code and re.match(r'^\d+\s+\d+$', code):
            # space-separated pure digits
            score += 4500
        elif ' ' in code and re.search(r'\d', code):
            # other space-separated code with digits (M33LB041O 1003)
            score += 4000
        elif has_letters and ' ' in code and len(code.split()) >= 3:
            # multi-word letters-only code (MOSS MAI BOXY TEE)
            score += 3500
        elif has_letters and ' ' in code and len(code.split()) >= 2:
            score += 3000
        elif re.match(r'^\d+$', code) and len(code) >= 5:
            # pure digits (32367, 117077)
            score += 2800
        elif has_digits and has_letters and ' ' not in code:
            # alphanumeric mix (X6592, K91KQ21)
            score += 2500
        elif has_separators and '-' in code:
            # other hyphenated code (FARAH-D, TIMT-2O)
            score += 2200
        elif has_letters and ' ' not in code and len(code) >= 3:
            # single word (CLUNYE, MAGAN)
            score += 1800
        elif has_digits and len(code) >= 6:
            score += 1500
        elif has_separators:
            score += 1200

        # Stepped length bonus: longer codes score higher.
        length_bonuses = ((20, 1000), (15, 800), (12, 600), (10, 400), (8, 200), (6, 100))
        for min_length, bonus in length_bonuses:
            if len(code) >= min_length:
                score += bonus
                break

        # Every character counts.
        score += len(code) * 10

        return score
    
    def _clean_trailing_dates(self, code: str) -> Optional[str]:
        """
        Strip date-like suffixes and procurement markers from the end of a code.

        The removal patterns are applied one after another
        (case-insensitively), each to the output of the previous one, and the
        result is trimmed after every step.

        Args:
            code: Product code.

        Returns:
            The stripped code. The input is returned unchanged when it is
            falsy or when stripping would leave fewer than two characters.
        """
        if not code:
            return code

        # Ordered by priority; the order affects the result.
        suffix_patterns = (
            r'\s+\d{1,2}\.\d{1,2}\s*[A-Z]{2,4}$', # " 8.17 LA", " 6.18 MC", " 8.17 LAMC"
            r'\s+\d{1,2}\.\d{1,2}$',               # " 8.13", " 8.18"
            r'\s+\d{1,2}\.\d{1}$',                 # " 8.2", " 8.1"
            r'\d{1,2}\.\d{1,2}$',                  # directly attached "8.13", "8.16"
            r'[A-Za-z]\d{1,2}\.\d{1,2}$',         # letter + date: "TEE8.13", "E8.16"
            r'\s+\d{1,2}[A-Z]{2,4}$',             # " 15LA", " 19MC", " 17LAMC"
            r'\s+(MC|AT|GN|NY|LA|AP|SS|ZT|CN|现货|LAMC)$', # a lone procurement marker
            r'\s+\d{1,2}$',                       # a lone number " 8", " 13" (date fragment)
            r'\s+\d{1,2}[A-Z]$',                  # " 6M", " 20M"
        )

        stripped = code
        for suffix_pattern in suffix_patterns:
            stripped = re.sub(suffix_pattern, '', stripped, flags=re.IGNORECASE).strip()

        # Too little left: keep the original code instead.
        if len(stripped) < 2:
            return code
        return stripped
    
    def _extract_arcteryx_code(self, product_name: str) -> Optional[str]:
        """
        Extract an Arc'teryx product code.

        Looks for codes made of ``X`` followed by digits (e.g. X8485, X7411,
        X000009239), case-insensitively, and returns the longest one found.
        Among equally long matches the earliest one wins.

        Args:
            product_name: Product name.

        Returns:
            The code in upper case, or ``None`` if none is found.
        """
        if not product_name:
            return None

        # Most patterns avoid \b because the X may directly follow a CJK
        # character.
        x_patterns = [
            r'X\d{9,}',           # X + at least 9 digits (X000009239)
            r'X\d{4,8}(?![A-Za-z])',  # X + 4-8 digits, not followed by a letter (X8485)
            r'\bX\d{4,}',         # word-bounded X code (e.g. after a space)
        ]

        found = [
            match
            for pattern in x_patterns
            for match in re.findall(pattern, product_name, re.IGNORECASE)
        ]
        if not found:
            return None

        # max() returns the first of the longest matches.
        return max(found, key=len).upper()

    def _extract_canada_goose_code(self, product_name: str) -> Optional[str]:
        """
        Extract a Canada Goose product code.

        A code is 4 digits followed by 1-3 letters (e.g. 2054M, 3900W,
        2836LB, 4815MT), matched case-insensitively. The patterns are tried
        in order and the first match of the first pattern that matches is
        returned.

        Suffixes listed as typical in the original code are M, W, L, MB, MT,
        MCB, WT, LB, WB and MW. Any 1-3 letter suffix is accepted, so no
        suffix check is needed beyond what the patterns enforce.

        Args:
            product_name: Product name.

        Returns:
            The code in upper case, or ``None`` if none is found.
        """
        if not product_name:
            return None

        cg_patterns = [
            r'\b\d{4}[A-Z]{1,3}\b',     # standard: word-bounded
            r'(?<=[\u4e00-\u9fff])\d{4}[A-Z]{1,3}(?=[\s\u4e00-\u9fff]|$)',  # directly after a CJK character
            r'(?<=\s)\d{4}[A-Z]{1,3}(?=\s|$)',  # space separated
            r'(?<=[/])\d{4}[A-Z]{1,3}(?=[\s/]|$)',  # slash separated
        ]

        for pattern in cg_patterns:
            match = re.search(pattern, product_name, re.IGNORECASE)
            if match:
                return match.group().upper()

        return None
    
    def _extract_uma_wang_codes(self, text: str, found_codes: List[str]) -> Optional[List[str]]:
        """
        Extract UMA WANG multi-part product codes.

        UMA WANG codes consist of a ``US``/``UP`` part and a ``UW`` part
        (e.g. ``US9001 UW900``), optionally followed by ``DUDI``
        (``UP5036 UW400 DUDI``). If neither form is found in ``text``, up to
        two independent alphanumeric codes from ``found_codes`` are returned.

        Args:
            text: Product name with the brand removed.
            found_codes: Codes already collected by the generic patterns.

        Returns:
            The code parts in order, or ``None`` if nothing is found.
        """
        # The three-part pattern comes first: every text it matches is also
        # matched by the two-part pattern, so in the opposite order the
        # trailing DUDI part could never be captured.
        uma_patterns = [
            r'(UP\d+)\s+(UW\d+)\s+(DUDI)',  # special three-part form
            r'(U[SP]\d+)\s+(UW\d+)',  # US9001 UW900 or UP5036 UW400
        ]

        for pattern in uma_patterns:
            match = re.search(pattern, text)
            if match:
                return list(match.groups())

        # Fallback: generic multi-part detection without duplicates.
        if len(found_codes) >= 2:
            seen = set()
            independent_codes = []

            for code in found_codes:
                if code.upper() in seen:
                    continue

                # Skip a code that is merely a piece of another,
                # space-containing code.
                is_substring = any(
                    code != other_code and code in other_code and ' ' in other_code
                    for other_code in found_codes
                )
                if is_substring:
                    continue

                # Require at least 4 characters and a letter followed
                # (somewhere later) by a digit.
                if len(code) < 4 or not re.search(r'[A-Za-z].*\d', code):
                    continue

                if ' ' in code:
                    parts = code.split()
                    # Skip a combination whose parts were all seen already.
                    if all(part.upper() in seen for part in parts):
                        continue
                    # Record the combination and mark its parts as seen.
                    independent_codes.append(code)
                    seen.add(code.upper())
                    seen.update(part.upper() for part in parts)
                else:
                    independent_codes.append(code)
                    seen.add(code.upper())

            # At most the first two independent codes.
            if independent_codes:
                return independent_codes[:2]

        return None
    
    def _is_valid_product_code(self, code: str, procurement_keywords: set) -> bool:
        """
        Decide whether a candidate string is a plausible product code.

        Rules, evaluated in this order (the first decisive rule wins):
        1. Fewer than 2 characters: invalid.
        2. Exactly a procurement keyword: invalid.
        3. A date-like string (``8.19``, ``8.19MC``, ``6.20M``, ``19MC``,
           ``8.17 LA``): invalid.
        4. Pure digits with 5 or more characters: valid.
        5. Ends with whitespace plus 1-2 digits: valid only if the remainder
           is letters only, contains ``-``, ``/`` or ``.``, or mixes letters
           and digits.
        6. Ends with whitespace plus a ``d.d`` date: valid only if the
           remainder is at least two letters-only words, contains ``-`` or
           ``/``, or contains a letter and has 3 or more characters.
        7. Letters and whitespace only: known brand / descriptive words are
           invalid; a 2-5 word combination without such a word is valid; a
           single word is valid when it has 3 or more letters.
        8. Otherwise valid if it is a 6+ digit number, mixes letters and
           digits, or contains a separator alongside letters or digits,
           unless it has 4 or fewer characters and ends with a procurement
           keyword.

        Args:
            code: Candidate code.
            procurement_keywords: Upper-case procurement markers to reject.

        Returns:
            ``True`` when the candidate is accepted as a product code.
        """
        code_upper = code.upper()

        # 1. Minimum length
        if len(code) < 2:
            return False

        # 2. A bare procurement marker
        if code_upper in procurement_keywords:
            return False

        # 3. Date-like strings. The '^\d{1,2}[A-Z]{2,3}$' entry also rejects
        #    every short "number + procurement marker" string such as 19MC.
        date_patterns = [
            r'^\d{1,2}\.\d{1,2}[A-Z]{2,3}$',         # 8.19MC, 11.27MC
            r'^\d{1,2}\.\d{1,2}$',                    # 8.19, 11.27
            r'^\d{1,2}\.\d{1,2}[A-Z]$',              # 6.20M, 8.14M (date + one letter)
            r'^\d{1,2}[A-Z]{2,3}$',                   # 19MC, 27LA, 15LA
            r'^\d{1,2}\.\d{1,2}\s+[A-Z]{2,3}$',      # 8.17 LA, 6.18 MC
        ]

        for pattern in date_patterns:
            if re.match(pattern, code_upper):
                return False

        # 4. Pure digits with 5 or more characters
        if re.match(r'^\d+$', code) and len(code) >= 5:
            return True

        # 5. Code followed by a lone 1-2 digit number (a date fragment):
        #    judge the part in front of it.
        if re.search(r'\s\d{1,2}$', code) and len(code.split()[-1]) <= 2:
            without_trailing_digit = re.sub(r'\s\d{1,2}$', '', code).strip()
            return bool(
                # letters only, e.g. "Steffey Friend 8"
                re.match(r'^[A-Za-z\s]+$', without_trailing_digit)
                # contains a separator, e.g. "FARAH-D 368 8", "TIMT-2O 762 8"
                or re.search(r'[-/\.]', without_trailing_digit)
                # letters and digits mixed
                or (re.search(r'[A-Za-z]', without_trailing_digit)
                    and re.search(r'\d', without_trailing_digit))
            )

        # 6. Code followed by a date: judge the part in front of it.
        if re.search(r'\s\d{1,2}\.\d{1,2}$', code):
            without_date = re.sub(r'\s\d{1,2}\.\d{1,2}$', '', code).strip()
            return bool(
                # letters-only product name of 2+ words, e.g. "Steffey Friend"
                (re.match(r'^[A-Za-z\s]+$', without_date)
                 and len(without_date.split()) >= 2)
                # contains a separator, e.g. "TINO-D", "FARAH-D"
                or re.search(r'[-/]', without_date)
                # contains a letter and is long enough
                or (re.search(r'[A-Za-z]', without_date) and len(without_date) >= 3)
            )

        # 7. Letters and whitespace only
        if re.match(r'^[A-Za-z\s]+$', code):
            # Known brand names and common descriptive words; never codes.
            brand_and_descriptive_words = {
                'SKIMS', 'ALEXANDER WANG', 'ALEXANDER', 'WANG', 'MOOSE KNUCKLES',
                'MOOSE', 'KNUCKLES', 'UMA WANG', 'UMA', 'TODS', 'ACNE STUDIOS', 'ACNE',
                'ST JOHN', 'RAG BONE', 'ANINE BING', 'BOGNER', 'MONCLER', 'ARCTERYX',
                'POLO RALPH LAUREN', 'POLO', 'RALPH', 'LAUREN', 'ALEXANDER MCQUEEN',
                'MCQUEEN', 'STUART WEITZMAN', 'STUART', 'WEITZMAN', 'ARITZIA',
                'BOYFRIEND', 'YUKON', 'CLASSIC', 'VINTAGE', 'PREMIUM', 'BASIC',
                'LUXURY', 'CASUAL', 'FORMAL', 'SPORT', 'ACTIVE', 'COMFORT',
                'SLIM', 'REGULAR', 'OVERSIZED', 'FITTED', 'LOOSE', 'TIGHT',
                'BODYWEAR', 'COLLECTION', 'LIMITED', 'EDITION', 'SIMA', 'HOODY', 'LOGO'
            }

            if code_upper in brand_and_descriptive_words:
                return False

            if ' ' in code:
                words = code_upper.split()

                # Any known brand / descriptive word disqualifies the combination.
                if any(word in brand_and_descriptive_words for word in words):
                    return False

                # 2-5 words are accepted as a product name. Other word
                # counts fall through to the generic format check below.
                if 2 <= len(words) <= 5:
                    return True
            else:
                # Single word: 3 or more letters are enough.
                return len(code) >= 3

        # 8. Generic format check
        has_letters = re.search(r'[A-Za-z]', code)
        has_digits = re.search(r'\d', code)
        has_separators = re.search(r'[-\s/\.]', code)

        is_long_number = code.isdigit() and len(code) >= 6
        is_mixed_alphanumeric = has_letters and has_digits
        is_separated_format = has_separators and (has_letters or has_digits)

        if is_long_number or is_mixed_alphanumeric or is_separated_format:
            # Short codes ending with a procurement marker are rejected.
            if len(code) <= 4 and code_upper.endswith(tuple(procurement_keywords)):
                return False
            return True

        return False
    
    def _remove_brand(self, text: str, brand: Optional[str]) -> str:
        """
        Remove every occurrence of the brand from the text.

        The brand is matched literally and case-insensitively, and the
        result is trimmed. Without a brand the text is returned untouched
        (not trimmed).
        """
        if brand:
            return re.sub(re.escape(brand), '', text, flags=re.IGNORECASE).strip()
        return text


class AttributeExtractor:
    """属性提取器（颜色、尺码等）"""
    
    def __init__(self,
                 color_aliases: Optional[Dict[str, str]] = None,
                 size_aliases: Optional[Dict[str, str]] = None):
        """
        Initialise the attribute extractor.

        Args:
            color_aliases: Colour alias mapping ``{alias: canonical}``.
                ``None`` (the default) is treated as an empty mapping.
            size_aliases: Size alias mapping ``{alias: canonical}``.
                ``None`` (the default) is treated as an empty mapping.
        """
        self.color_aliases = color_aliases or {}
        self.size_aliases = size_aliases or {}

        # Lower-cased alias keys so that lookups can ignore case.
        self.case_insensitive_colors = {
            alias.lower(): canonical
            for alias, canonical in self.color_aliases.items()
        }
        self.case_insensitive_sizes = {
            alias.lower(): canonical
            for alias, canonical in self.size_aliases.items()
        }
    
    def extract_attributes(self, product_name: str, sales_attrs: str = "") -> Dict[str, Optional[str]]:
        """
        Extract colour and size from the product name and sales attributes.

        ``sales_attrs`` is interpreted first:
        - ``colour_size`` (underscore): the first part is the colour unless
          it is purely numeric; the last part, upper-cased, is the size.
        - ``colour/size`` (slash, only when there is no underscore): the
          first part is the colour; the last part is the size only if it
          looks like a letter size (X/S/M/L) or a number.

        Whatever is still missing is then searched for in the combined
        ``product_name`` + ``sales_attrs`` text (``颜色:黑色;尺码:L`` style
        fields first, then alias matching).

        Args:
            product_name: Product name. ``None`` or a non-string is treated
                as empty text.
            sales_attrs: Sales attributes, e.g. ``颜色:黑色;尺码:L`` or
                ``棕色_39``.

        Returns:
            ``{"color": ..., "size": ...}``; a value is ``None`` when it
            could not be determined.
        """
        result = {"color": None, "size": None}

        if sales_attrs and '_' in sales_attrs:
            # "colour_size" format. Splitting a string that contains the
            # separator always yields at least two parts.
            parts = sales_attrs.split('_')
            color_candidate = parts[0].strip()
            size_candidate = parts[-1].strip()

            if color_candidate and not color_candidate.isdigit():
                result["color"] = self._normalize_color(color_candidate)

            if size_candidate:
                # May be numeric (shoe size) or letters (clothing size).
                result["size"] = self._normalize_size(size_candidate.upper())

        elif sales_attrs and '/' in sales_attrs:
            # "colour/size" format.
            parts = sales_attrs.split('/')
            result["color"] = self._normalize_color(parts[0].strip())

            last_part = parts[-1].strip()
            if re.match(r'^[XSML]+$|^\d+\.?\d*$', last_part.upper()):
                result["size"] = self._normalize_size(last_part.upper())

        # Fall back to searching the combined text. A missing or non-string
        # product name must not be formatted into the text: f"{None}" is the
        # literal word "None", which could then match an alias.
        name_text = product_name if isinstance(product_name, str) else ''
        combined_text = f"{name_text} {sales_attrs or ''}"

        if not result["color"]:
            result["color"] = self._extract_color(combined_text)

        if not result["size"]:
            result["size"] = self._extract_size(combined_text)

        return result
    
    def _extract_color(self, text: str) -> Optional[str]:
        """
        Extract a colour from free text.

        A labelled field (``颜色:``, ``Color:``, ``colour:``, ``色:``,
        ``色彩:``) is tried first and its value is normalised through the
        alias table. Failing that, the text is searched for a known colour
        alias, longest alias first.

        An alias that starts or ends with an ASCII letter or digit must not
        be directly preceded or followed by another ASCII letter or digit,
        so ``red`` does not match inside ``Fred``. CJK aliases match
        anywhere.

        Args:
            text: Text to search.

        Returns:
            The colour (canonical if an alias matched), or ``None``.
        """
        if not text:
            return None

        # Labelled colour fields.
        color_patterns = [
            r'(?:颜色|Color|colour)[：:=]\s*([^,;，；\s]+)',
            r'(?:色|色彩)[：:=]\s*([^,;，；\s]+)',
        ]

        for pattern in color_patterns:
            for match in re.findall(pattern, text, re.IGNORECASE):
                color = self._normalize_color(match.strip())
                if color:
                    return color

        # Alias search, longest alias first so that a longer alias is
        # preferred over a shorter alias it contains.
        ascii_alnum = 'abcdefghijklmnopqrstuvwxyz0123456789'
        text_lower = text.lower()
        ordered_aliases = sorted(self.case_insensitive_colors.items(),
                                 key=lambda item: len(item[0]), reverse=True)

        for alias_lower, canonical in ordered_aliases:
            if not alias_lower:
                continue  # an empty alias would match any text

            alias_pattern = re.escape(alias_lower)
            # Token boundaries apply only on sides where the alias itself is
            # ASCII alphanumeric; CJK text has no spaces to rely on.
            if alias_lower[0] in ascii_alnum:
                alias_pattern = r'(?<![a-z0-9])' + alias_pattern
            if alias_lower[-1] in ascii_alnum:
                alias_pattern = alias_pattern + r'(?![a-z0-9])'

            if re.search(alias_pattern, text_lower):
                return canonical

        return None
    
    def _extract_size(self, text: str) -> Optional[str]:
        """
        Extract a size from free text.

        A labelled field (``尺码:``, ``Size:``, ``尺寸:``, ``号:``, ``码:``)
        is tried first and its value is normalised through the alias table.
        Failing that, the text is searched for a known size alias, longest
        alias first.

        An alias that starts or ends with an ASCII letter or digit must not
        be directly preceded or followed by another ASCII letter or digit,
        so ``l`` does not match inside ``wool`` and ``xl`` is preferred over
        ``l``. CJK aliases match anywhere.

        Args:
            text: Text to search.

        Returns:
            The size (canonical if an alias matched), or ``None``.
        """
        if not text:
            return None

        # Labelled size fields.
        size_patterns = [
            r'(?:尺码|Size|尺寸|号)[：:=]\s*([^,;，；\s]+)',
            r'(?:码|size)[：:=]\s*([^,;，；\s]+)',
        ]

        for pattern in size_patterns:
            for match in re.findall(pattern, text, re.IGNORECASE):
                size = self._normalize_size(match.strip())
                if size:
                    return size

        # Alias search, longest alias first so that a longer alias is
        # preferred over a shorter alias it contains.
        ascii_alnum = 'abcdefghijklmnopqrstuvwxyz0123456789'
        text_lower = text.lower()
        ordered_aliases = sorted(self.case_insensitive_sizes.items(),
                                 key=lambda item: len(item[0]), reverse=True)

        for alias_lower, canonical in ordered_aliases:
            if not alias_lower:
                continue  # an empty alias would match any text

            alias_pattern = re.escape(alias_lower)
            # Token boundaries apply only on sides where the alias itself is
            # ASCII alphanumeric; CJK text has no spaces to rely on.
            if alias_lower[0] in ascii_alnum:
                alias_pattern = r'(?<![a-z0-9])' + alias_pattern
            if alias_lower[-1] in ascii_alnum:
                alias_pattern = alias_pattern + r'(?![a-z0-9])'

            if re.search(alias_pattern, text_lower):
                return canonical

        return None
    
    def _normalize_color(self, color: str) -> Optional[str]:
        """
        Map a colour onto its canonical form.

        The lookup ignores case and surrounding whitespace. A colour without
        an alias entry is returned exactly as given; a falsy colour yields
        ``None``.
        """
        if not color:
            return None

        lookup_key = color.lower().strip()
        if lookup_key in self.case_insensitive_colors:
            return self.case_insensitive_colors[lookup_key]
        return color
    
    def _normalize_size(self, size: str) -> Optional[str]:
        """
        Map a size onto its canonical form.

        The lookup ignores case and surrounding whitespace. A size without
        an alias entry is returned exactly as given; a falsy size yields
        ``None``.
        """
        if not size:
            return None

        lookup_key = size.lower().strip()
        if lookup_key in self.case_insensitive_sizes:
            return self.case_insensitive_sizes[lookup_key]
        return size


def generate_sku_id(merchant_code: Optional[str], 
                   product_id: Optional[str], 
                   sku_number: Optional[str],
                   color: Optional[str], 
                   size: Optional[str]) -> str:
    """
    Build a SKU ID from an identifier, a colour and a size.

    The identifier is the first truthy value of ``merchant_code``,
    ``product_id`` and ``sku_number``, or ``"unknown"`` if all are empty.
    Identifier, colour and size are each passed through ``slug`` and the
    non-empty results are joined with underscores.

    Args:
        merchant_code: Online merchant code.
        product_id: Product number.
        sku_number: SKU number.
        color: Colour.
        size: Size.

    Returns:
        The SKU ID, or ``"unknown_sku"`` when every slugged part is empty.
    """
    # Pick the primary identifier.
    main_id = merchant_code or product_id or sku_number or "unknown"

    # Slug each component and keep only the non-empty ones.
    slugged = (slug(main_id), slug(color or ""), slug(size or ""))
    non_empty = [part for part in slugged if part]

    if not non_empty:
        return "unknown_sku"
    return "_".join(non_empty)