# %% [markdown] cell-dfc44c7f8266
## 🕐请从本地上传一个用电商记插件采集的新版淘宝搜索结果(含综合排序和销量排序)的Excel文件。[下载示例](https://www.dianshangji.cn/u/user5344/cfiles/browse/index?fid=3)
# %%--- [html] cell-ecbcd2c2eabd
# properties:
#   run_on_load: true
# ---%%
请上传一个Excel文件:<br><input class="btn btn-primary" type="file" id="fileInput" multiple />
# %%--- [python] cell-493af944326a
# properties:
#   run_on_load: true
# ---%%
import pandas as pd
import micropip
import js
await micropip.install("/pypi/dianshangji-0.1-py3-none-any.whl")
from dianshangji import loader, palette, taobao
font = await loader.init()
# %% [markdown] cell-8f9e8f9369a1
## 🕑设置文件名
# %%--- [javascript] cell-a4635ddf8ad4
# properties:
#   run_on_load: true
# ---%%
initFileInputs("fileInput", {
    "淘宝": "/file1.xlsx"
})
# %% [markdown] cell-834de6988198
## 🕒数据清洗和处理
# %% [python] cell-ea254ca39d1f
import re

def replace_numbers_with_X(s):
    # 使用正则表达式匹配数值,包括整数和浮点数,并将其替换为'X'
    return re.sub(r'-?\d+\.?\d*', 'X', s)

def process_service(df, field_attribute = '服务标签'):
    df[field_attribute] = df[field_attribute].apply(replace_numbers_with_X)    
# %% [python] cell-24a153c7d686
print("开始读取内存中的Excel文件……")
# 读取Excel文件,跳过前两行(即从第三行开始读取),只保留“价格”、“月销量”和“属性”三个字段
excel_file_path = '/file1.xlsx'
field_attribute = '服务标签'

df_zonghe = taobao.process_data(pd.read_excel(excel_file_path, skiprows=2, usecols=['商品ID','价格','月销量', field_attribute], sheet_name = "综合"), field_attribute = field_attribute, max_x = 1000, max_y = 10000)
df_xiaoliang = taobao.process_data(pd.read_excel(excel_file_path, skiprows=2, usecols=['商品ID','价格','月销量', field_attribute], sheet_name = "销量"), field_attribute = field_attribute, max_x = 1000, max_y = 10000)

process_service(df_zonghe)
process_service(df_xiaoliang)

print(df_zonghe.head(5))
print(df_xiaoliang.head(5))

attributes_zonghe = taobao.process_attribute(df_zonghe, field_attribute = field_attribute)
print("综合排序全部属性", attributes_zonghe)
attributes_zonghe = {k: v for k, v in attributes_zonghe.items() if 2 <= len(v) <= 20}
print("综合排序有效属性", attributes_zonghe)

attributes_xiaoliang = taobao.process_attribute(df_xiaoliang, field_attribute = field_attribute)
print("销量排序全部属性", attributes_xiaoliang)
attributes_xiaoliang = {k: v for k, v in attributes_xiaoliang.items() if 2 <= len(v) <= 20}
print("销量排序有效属性", attributes_xiaoliang)

color_names = palette.color_names(lang = 'en')

# import json
# js.window.openNotebookAssistant('cell-33f8e4eabe32', 'python', '你是一名商业分析师,为了进行有效的数据分析,尤其是当考虑使用Preattentive Attributes(如颜色、大小、形状和长度)来代表数据时,选择那些能够显著区分商品特征且对用户购买决策有重大影响的属性尤为重要。你的任务是从如下描述商品的属性名+一列属性值的JSON对象中选出最有意义的10个属性名,每个属性名的所有属性值应限定在同一分类维度内。它们的属性值将用Preattentive Attributes代表,优先考虑颜色,其次考虑大小、形状和长度。你要为每一个属性的第一个属性名挑选最适合的颜色作为视觉特征。任一颜色名必须从这个颜色名集合中选择:' + ",".join(color_names) + '。\n你把选好的这些属性放在一个JSON对象中取名为selected_attributes,作为key,每个属性的第一个属性值及其颜色作为value,即一个属性值+一个相应的颜色名作为一个嵌套的JSON对象。你用python编程语言书写这个JSON对象。\n\n商品描述的属性数据如下:' + json.dumps(attributes_zonghe, ensure_ascii = False))
# 综合排序全部属性 {'服务标签': ['包邮', '跨店每X减X', '退货宝', ' X期', '公益宝贝', '淘金币已抵X元', '全网低价', '淘工厂', '新品', '已补X元', 'X期', '官方立减X元', 'ifashion', 'X天价保', '限时直降X元', '淘金币抵X元', '全球购']} 
# %% [markdown] cell-cffb23f7d99a
## 🕓设置商品属性和代表色
# %% [python] cell-33f8e4eabe32
selected_attributes = {
    "服务标签": {"跨店每X减X": "Ochre Red"}
}
print(selected_attributes.keys())
# %% [markdown] cell-ec675b09e97c
## 🕔绘制属性分布散点图
# %% [python] cell-cfb28f8a843c
import random
import matplotlib.pyplot as plt

def plot_attributes(df, attributes, selected_attributes, field_x, field_y, field_attribute, caption):    
    for attribute_key, attribute_value in selected_attributes.items():            
        if not attribute_key in attributes:
            continue
        # attribute_key: 服务标签
        # attribute_value: {"跨店每X减X": "Ochre Red"}
        primary_value = next(iter(attribute_value))    
        # 跨店每X减X
        primary_color = attribute_value[primary_value]
        # Ochre Red
        values = attributes[attribute_key]
        # ['包邮', '跨店每X减X', '退货宝', ' X期', '淘金币已抵X元', '公益宝贝', '全网低价', '淘工厂', '新品', '已补X元', 'X期', '官方立减X元', 'ifashion', 'X天价保', '淘金币抵X元', '限时直降X元', '全球购']
        colors = palette.color_picker(primary_color, len(values), hex_only = True)
        # ['#ff7d72', '#ff5f32', '#ff9152', '#ffaa5b', '#ffba4b', '#ffd23a', '#ffdd00', '#f2ff60', '#c0ffb0', '#cfffea', '#00ffee', '#c9f4ff', '#80b9ff', '#9e7eff', '#ffb5f1', '#ff8eb3', '#ff627a']
        full_colors = []
        plt.figure(figsize=(18, 12))
        for i in range(0, len(values)):        
            attribute_value = values[i]
            # 跨店每X减X
            attribute_color = palette.set_brightness(colors[i], 1.0)
            # 饱和度 100%
            full_colors.append(attribute_color)            
            # 跨店每X减X
            pattern = attribute_value
            df_subset = df[df[field_attribute].str.contains(pattern)]               
            # print(pattern + ' ' + attribute_color + ' ' + colors[i])         
            plt.scatter(df_subset[field_x], [y + 50 * random.random() for y in df_subset[field_y]], c = attribute_color, edgecolors = '#A0A0A0', alpha=0.5, s = 15, label = attribute_value, linewidths=1)
        print(caption + ' ' + attribute_key, values, full_colors)
        plt.title(caption + ' ' + attribute_key, font=font)
        plt.xlabel(field_x, font=font)
        plt.ylabel(field_y, font=font)
        plt.grid(True)
        plt.legend(prop = font)
        plt.show()
        plt.close()

plot_attributes(df_zonghe, attributes_zonghe, selected_attributes, '价格', '月销量', field_attribute, '淘宝搜索综合排序')
plot_attributes(df_xiaoliang, attributes_xiaoliang, selected_attributes, '价格', '月销量', field_attribute, '淘宝搜索销量排序')