# %%--- [python] cell-98b1a89fdb99
# properties:
#   locked: true
# ---%%
import pyodide, os, requests, js, json, micropip, xlrd, pandas as pd
await micropip.install('et_xmlfile-2.0.0-py3-none-any.whl')
await micropip.install('openpyxl-3.1.5-py2.py3-none-any.whl')
await micropip.install('shuyouqi-0.0.0-py3-none-any.whl')
from shuyouqi import profiling
# %%--- [html] cell-57297c57aa83
# properties:
#   run_on_load: true
#   locked: true
# ---%%
<a href="https://public.juguandian.com/rpa4.html?fp=taobaoshopitemlist2025" target=_blank>👉用电商记插件采集淘宝天猫店铺内宝贝列表</a>
# %%--- [prompt] cell-763f7ada2d66
# properties:
#   locked: true
# ---%%
Excel文件中的“销量”字段包含多种格式的数据,诸如“100”、“100+”、“200”、“1300+”以及“5万”等。请编写Python代码,执行以下任务:
    将“销量”字段中的所有数据转换为整数类型:
        移除末尾的“+”字符;
        将“5万”这样的格式转换为实际的数值(例如将“5万”转化为50000)。
    将“宝贝ID”字段的类型转换为字符串。
    将转换后的结果保存并覆盖原Excel文件。
请确保在处理过程中正确地识别并处理各种格式的数据。
# %% [python] cell-fb088f82c49f
input_file = '/home/RPA_20250426_161959.xlsx'  # 输入文件路径

# 读取原始Excel文件,转换“销量”字段的数值
import pandas as pd
import re

def convert_value(value):
    # 将输入转换为字符串,以便处理
    value_str = str(value)
    
    # 使用正则表达式匹配模式
    match = re.match(r"(\d+)([万])?", value_str)
    
    if match:
        number = int(match.group(1))
        unit = match.group(2)
        
        # 根据单位进行相应的转换
        if unit == '万':
            return number * 10000
        else:
            # 如果没有单位或者有其他情况(比如末尾带+),直接处理
            return int(re.sub(r"[^\d]", "", value_str))
    else:
        # 如果不匹配任何已知模式,尝试直接转换为整数
        return int(value)

# 读取原始Excel文件
output_file = input_file  # 输出文件路径

# 假设目标列名为 'YourColumnName'
df = pd.read_excel(input_file)

# 处理目标列,应用convert_value函数
df['销量'] = df['销量'].apply(convert_value)

df['宝贝ID'] = df['宝贝ID'].astype(str)

# 将结果写入新的Excel文件
df.to_excel(output_file, index=False)
# %%--- [markdown] cell-6fd45dfcfd20
# properties:
#   locked: true
# ---%%
运行Python数据清洗程序后,在内存文件系统窗口点击“保存”按钮💾,更新电脑上的Excel文件。
# %%--- [sql] cell-fcba7ee79803
# properties:
#   locked: true
# ---%%
SHOW TABLES
# %% [plaintext] cell-12b0a310d5c0
[
  {"name": "rpa_20250426_161959"},
  {"name": "rpa_20250427_152444"},
  {"name": "rpa_20250428_140934"},
  {"name": "rpa_20250429_143958"}
]
# %% [prompt] cell-edd5764c21d0
你是一个精通DuckDB SQL的分析师,你的任务是将《资料》中指定名称的所有表格合并为一个名为itemlist的视图(如果该表已存在,则先删除该表)。请根据表格名称生成一个SQL查询语句,并确保包含适当的连接条件(如有需要)。
# %% [sql] cell-ce345cb8869d
DROP VIEW IF EXISTS itemlist;
CREATE VIEW itemlist AS
SELECT * FROM rpa_20250426_161959
UNION ALL
SELECT * FROM rpa_20250427_152444
UNION ALL
SELECT * FROM rpa_20250428_140934
UNION ALL
SELECT * FROM rpa_20250429_143958;
# %%--- [sql] cell-11c43244067
# properties:
#   locked: true
# ---%%
SELECT count(*) FROM itemlist;
# %% [plaintext] cell-59d79f853299
[
  {"count_star()": 3992}
]
# %%--- [sql] cell-98b8bba7c0eb
# properties:
#   locked: true
# ---%%
SELECT * FROM itemlist LIMIT 5;
# %% [plaintext] cell-b77a24bf0622
[
  {"店铺域名": "https://shop412023277.taobao.com", "采集日期": "2025-04-26", "宝贝ID": "905003071844", "标题": "网红吸管水杯女生高颜值卡通可爱学生大容量塑料杯子防摔便携新款", "销量": 0, "图片": "https://gw.alicdn.com/imgextra/O1CN01bTnlQl1hNQGW70143_!!4611686018427382793-0-item_pic.jpg", "价格": ""},
  {"店铺域名": "https://shop412023277.taobao.com", "采集日期": "2025-04-26", "宝贝ID": "906174376427", "标题": "黄油小熊水杯女网红可爱简约吸管杯高颜值大容量家用防摔便携杯子", "销量": 2, "图片": "https://gw.alicdn.com/imgextra/O1CN01NfuCCv1hNQGZIDVtB_!!4611686018427382793-0-item_pic.jpg", "价格": ""},
  {"店铺域名": "https://shop412023277.taobao.com", "采集日期": "2025-04-26", "宝贝ID": "906139460937", "标题": "双饮水杯女生高颜值网红可爱吸管杯大容量便携耐高温塑料杯子ins", "销量": 0, "图片": "https://gw.alicdn.com/imgextra/O1CN01zjtN5b1hNQGYh8tUx_!!4611686018427382793-0-item_pic.jpg", "价格": ""}
]
# %% [prompt] cell-06534e80af6d
你是一个精通DuckDB SQL的分析师,你的任务是筛选出每天销量都没有减少的商品,并根据销量增幅对这些商品进行排序,选出前30个商品的完整销量记录。假设你已经每天采集了一批指定淘宝商品的销量记录,并且已采集若干天数据,保存在表格itemlist中。请根据《资料》提供的表格结构生成SQL查询语句,确保输出的记录按照销量增幅排序,并且只包含前50个商品,最终查询结果保存在名为“topitemlist”的视图中(如果该表已存在,则先删除该表)。
# %% [sql] cell-19cd41da7d84
DROP VIEW IF EXISTS topitemlist;
CREATE VIEW topitemlist AS (
WITH
-- 步骤1: 筛选出每日销量未减少的商品ID
非递减商品 AS (
    SELECT 宝贝ID
    FROM (
        SELECT 
            宝贝ID,
            销量,
            LAG(销量) OVER (PARTITION BY 宝贝ID ORDER BY 采集日期) AS 前日销量
        FROM itemlist
    )
    GROUP BY 宝贝ID
    HAVING BOOL_AND(COALESCE(销量 >= 前日销量, TRUE))
),

-- 步骤2: 计算每个商品的销量总增幅(最后一日 - 第一日)
增幅计算 AS (
    SELECT 
        宝贝ID,
        (MAX(CASE WHEN is_last THEN 销量 END) - 
         MIN(CASE WHEN is_first THEN 销量 END)) AS 总增幅
    FROM (
        SELECT 
            宝贝ID,
            销量,
            采集日期 = MAX(采集日期) OVER (PARTITION BY 宝贝ID) AS is_last,
            采集日期 = MIN(采集日期) OVER (PARTITION BY 宝贝ID) AS is_first
        FROM itemlist
        WHERE 宝贝ID IN (SELECT 宝贝ID FROM 非递减商品)
    )
    GROUP BY 宝贝ID
),

-- 步骤3: 按增幅排序并取前50名
TOP商品 AS (
    SELECT 
        宝贝ID,
        总增幅,
        ROW_NUMBER() OVER (ORDER BY 总增幅 DESC) AS 排名
    FROM 增幅计算
    WHERE 总增幅 >= 0  -- 确保非负增长
    QUALIFY 排名 <= 50
)

-- 最终输出:获取TOP商品的完整记录并按规则排序
SELECT 
    il.*,
    t.总增幅,
    t.排名
FROM itemlist il
JOIN TOP商品 t ON il.宝贝ID = t.宝贝ID
ORDER BY 
    t.排名,         -- 先按总增幅排名排序
    il.采集日期    -- 同商品内按日期排序
);

SELECT * FROM topitemlist LIMIT 8;
# %% [plaintext] cell-3d21432f7632
[
  {"店铺域名": "https://shop332388057.taobao.com", "采集日期": "2025-04-26", "宝贝ID": "861839485931", "标题": "高颜值保温杯女ins2024新款316不锈钢水杯男生大容量学生专用杯子", "销量": 2000, "图片": "https://gw.alicdn.com/imgextra/O1CN01reQcYn1sX0IX2N89q_!!4611686018427380639-0-item_pic.jpg", "价格": "", "总增幅": 1000, "排名": 1},
  {"店铺域名": "https://shop332388057.taobao.com", "采集日期": "2025-04-27", "宝贝ID": "861839485931", "标题": "高颜值保温杯女ins2024新款316不锈钢水杯男生大容量学生专用杯子", "销量": 3000, "图片": "https://gw.alicdn.com/imgextra/O1CN01reQcYn1sX0IX2N89q_!!4611686018427380639-0-item_pic.jpg", "价格": "[1_7ij1w4tp#51#GRkZGRRXvkWoSh1D1Ul1RDtFaE6HTJVCcHl1eTdBDUjZQn9EaUVoShlN2kQUTTFKgkQ7QflH0UmwT7ZGIkzqV41NMXgkQM55cXmqeepPjkO6Q8RI1086]", "总增幅": 1000, "排名": 1},
  {"店铺域名": "https://shop332388057.taobao.com", "采集日期": "2025-04-28", "宝贝ID": "861839485931", "标题": "高颜值保温杯女ins2024新款316不锈钢水杯男生大容量学生专用杯子", "销量": 3000, "图片": "https://gw.alicdn.com/imgextra/O1CN01reQcYn1sX0IX2N89q_!!4611686018427380639-0-item_pic.jpg", "价格": "[1_7ij1w4tp#51#YmJiYm8AZD0mPmwtoDNCPOgxoTyVMKw/rD+ePAgCjzJOMVw4zze+LFk8mjRgP8U/NzCYNig/bjcmNN8ypjQtPjktYzCYMJg1jzhnNVoxuTDtPxA/QD/D]", "总增幅": 1000, "排名": 1},
  {"店铺域名": "https://shop332388057.taobao.com", "采集日期": "2025-04-29", "宝贝ID": "861839485931", "标题": "高颜值保温杯女ins2024新款316不锈钢水杯男生大容量学生专用杯子", "销量": 3000, "图片": "https://gw.alicdn.com/imgextra/O1CN01reQcYn1sX0IX2N89q_!!4611686018427380639-0-item_pic.jpg", "价格": "[1_7ij1w4tp#51#ISEhISxxQHk1ef50DkHSdRZ6KXPbc6J/sHQxcF5xW3x7b0NvjXXMQX55639ScVJ9DXQ/chd/8HGqbyJ+JkHtcZV7BG8jQNF9hXqkc8V57HboccN/tnYZeplA9Xhj]", "总增幅": 1000, "排名": 1},
  {"店铺域名": "https://shop285494718.taobao.com", "采集日期": "2025-04-26", "宝贝ID": "838310049660", "标题": "高颜值保温杯女25款家用咖啡杯外带车载杯子不锈钢冰霸水杯礼物男", "销量": 5000, "图片": "https://gw.alicdn.com/imgextra/O1CN01zMdfPM1rqrq3h2Lgh_!!0-item_pic.jpg", "价格": "", "总增幅": 1000, "排名": 2},
  {"店铺域名": "https://shop285494718.taobao.com", "采集日期": "2025-04-27", "宝贝ID": "838310049660", "标题": "高颜值保温杯女25款家用咖啡杯外带车载杯子不锈钢冰霸水杯礼物男", "销量": 6000, "图片": "https://gw.alicdn.com/imgextra/O1CN01zMdfPM1rqrq3h2Lgh_!!0-item_pic.jpg", "价格": "[1_7ij1w4tp#51#UVFRUVwOFg3ZCCIEkjEiMYUI/QMkHzQK+APFAXYHTActABsPwDFpMeQC7wZQBGwBcAUbD7kI/QFxDx8H7AnXDAQNBgNQAjoI/Q1fC7IAOwWhB1wE7Q==]", "总增幅": 1000, "排名": 2},
  {"店铺域名": "https://shop285494718.taobao.com", "采集日期": "2025-04-28", "宝贝ID": "838310049660", "标题": "高颜值保温杯女25款家用咖啡杯外带车载杯子不锈钢冰霸水杯礼物男", "销量": 6000, "图片": "https://gw.alicdn.com/imgextra/O1CN01zMdfPM1rqrq3h2Lgh_!!0-item_pic.jpg", "价格": "[1_7ij1w4tp#51#ZGRkZGk0RT5lBGoEvwUYMFA4zzKRPIU0FjZlMXQwDzyFPnsy8TgRMhg+gDb4N2s0qz0KMC4+ZTtaMmY5RTAcNngECCuNMjI1CTfUPK47LyrHOcA6GQ==]", "总增幅": 1000, "排名": 2},
  {"店铺域名": "https://shop285494718.taobao.com", "采集日期": "2025-04-29", "宝贝ID": "838310049660", "标题": "高颜值保温杯女25款家用咖啡杯外带车载杯子不锈钢冰霸水杯礼物男", "销量": 6000, "图片": "https://gw.alicdn.com/imgextra/O1CN01zMdfPM1rqrq3h2Lgh_!!0-item_pic.jpg", "价格": "[1_7ij1w4tp#51#WFhYWFUCwQk7CnYB/TgzAfQEvhejCmoMJw9/FtwPdA89FgsE/Be3A84LxQy1BjEEZQg4CxYIeQ9/OHwKWTg0BJYJTDinB/AJIwV6AB8I0g==]", "总增幅": 1000, "排名": 2}
]
# %% [prompt] cell-c81aca793e72
你是精通Vega-Lite的可视化专家,你的任务是生成Vega-Lite JSON代码,用于绘制折线图。具体要求如下:
    X轴:使用“采集日期”字段。
    Y轴:使用“销量”字段。
    颜色:不同颜色的折线表示不同的“宝贝ID”,排名越前面的颜色越深。
    数据源:表格名为“topitemlist”,包含“宝贝ID”、“采集日期”和“销量”字段,描述参看《资料》。
生成的Vega-Lite JSON代码应该能够清晰地展示每个宝贝ID随时间变化的销量趋势,使用颜色区分宝贝ID的排名。在工具提示中显示宝贝的所有信息。
# %% [vegalite] cell-8a6c304f3dca
{
  "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
  "width": 1000,
  "height": 1000,
  "data": {
    "name": "topitemlist"
  },
  "transform": [
    {
      "calculate": "toDate(datum['采集日期'])",
      "as": "采集日期"
    }
  ],
  "mark": {
    "type": "line",
    "tooltip": true
  },
  "encoding": {
    "x": {
      "field": "采集日期",
      "type": "temporal",
      "axis": {
        "title": "采集日期",
        "labelAngle": 45
      }
    },
    "y": {
      "field": "销量",
      "type": "quantitative",
      "axis": {
        "title": "销量"
      }
    },
    "color": {
      "field": "宝贝ID",
      "type": "nominal",
      "scale": {
        "domain": {
          "data": "topitemlist",
          "field": "宝贝ID",
          "sort": {
            "field": "排名",
            "op": "min",
            "order": "ascending"
          }
        },
        "scheme": "viridis"
      },
      "legend": {
        "titleFontSize": 14,
        "labelFontSize": 12,
        "orient": "right",
        "columns": 2
      }
    },
    "tooltip": [
      {"field": "宝贝ID", "type": "nominal", "title": "商品ID"},
      {"field": "标题", "type": "ordinal", "title": "标题"},
      {"field": "采集日期", "type": "temporal", "title": "日期", "format": "%Y-%m-%d"},
      {"field": "排名", "type": "ordinal", "title": "当前排名"},
      {"field": "销量", "type": "quantitative", "title": "销量"},            
      {"field": "总增幅", "type": "ordinal", "title": "总增幅"}      
    ]
  },
  "config": {
    "view": {
      "stroke": "transparent"
    }
  }
}