# %%--- [html] cell-081b3edd6cb6
# properties:
#   run_on_load: true
# ---%%
请上传一个淘宝搜索的Excel文件:<br><input class="btn btn-primary" type="file" id="fileInput" multiple />
# %%--- [python] cell-31cc88e0bfa6
# properties:
#   run_on_load: true
# ---%%
import pandas as pd
import micropip
await micropip.install("/pypi/dianshangji-0.1-py3-none-any.whl")
from dianshangji import loader, palette, taobao
font = await loader.init()
# %%--- [javascript] cell-1090a7390838
# properties:
#   run_on_load: true
# ---%%
initFileInputs("fileInput", {
    "淘宝": "/file1.xlsx"
})
# %% [python] cell-94addd9ab5d1
import json
import re

# 假设这是你的 DataFrame

data = {
    '标题': [
        '骨传导助听一体机骨传感器辅听器不入耳老人耳背聋蓝牙助听拾音器',
        '仁和助听器老人专用正品老年人耳聋耳背老年隐形式年轻人高端耳机',
        '海尔老人专用助听器重度耳聋耳背新型正品年轻人旗舰店老年人耳机'
    ],
    '产品特性': [
        '佩戴方式:骨传导|适用人群:老人|核心功能:耳背|核心功能:聋|智能化与便捷性:蓝牙|核心功能:助听|核心功能:拾音器',
        '商标:仁和|适用人群:老人专用|核心功能:耳聋|核心功能:耳背|佩戴方式:老年隐形式|适用人群:年轻人|核心功能:高端',
        '商标:海尔|适用人群:老人专用|核心功能:重度耳聋|核心功能:耳背|智能化与便捷性:新型|适用人群:年轻人'
    ]
}

df = pd.DataFrame(data)

df = pd.read_excel('/file1.xlsx', skiprows=2, usecols=['标题','产品特性'], sheet_name = "销量")

# 去除重复标题的记录
df = df.drop_duplicates(subset=['标题'])

# 去掉"产品特性"列中为空或非字符串的记录
df = df[df['产品特性'].apply(lambda x: isinstance(x, str) and bool(x.strip()))]

# 提取“产品特性”中的所有特性名称(字段名),这些字段名会作为新的列名
def extract_feature_names(row):
    # 匹配特性名称(如佩戴方式、适用人群等)    
    feature_names = re.findall(r'([^:|]+):', row)
    return feature_names

# 获取所有不同的特性字段名
all_features = set()
df['特性字段'] = df['产品特性'].apply(extract_feature_names)
for feature_list in df['特性字段']:
    all_features.update(feature_list)

print(all_features)

# 提取每个特性对应的值
def extract_features(row, feature):
    # 使用正则表达式提取所有符合条件的特性值
    matches = re.findall(f'{feature}:(.*?)(?=\||$)', row)
    
    # 如果找到了匹配项,返回一个列表,若没有找到,则返回None
    return matches if matches else None

# 动态创建新的列,提取对应的特性值
for feature in all_features:
    df[feature] = df['产品特性'].apply(lambda row: extract_features(row, feature))


# 打印结果查看
print(df[list(all_features)])


# %% [python] cell-8eb8eb8e7cac
# 提取所有相关字段的关键词并汇总到一个列表中
all_keywords = []

# 将商标、适用人群和核心功能字段的关键词提取到一个列表
for col in all_features:
    # 分割并去除多余的空格
    all_keywords.extend(df[col].explode().dropna().tolist())

# 将所有的关键词转换为Series并计算频率
keyword_counts = pd.Series(all_keywords).value_counts()

# 显示关键词频率
#print(keyword_counts)

# 根据2/8原则筛选最常见的20%关键词
total_keywords = len(keyword_counts)
sorted_keywords = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)

# 计算前20%关键词的个数
top_20_percent_count = int(total_keywords * 0.2)
top_keywords = sorted_keywords[:top_20_percent_count]

# 输出最值得研究的关键词及其频率
print("最值得研究的前100个关键词:")
#for keyword, count in top_keywords:
#    print(f"{keyword}: {count}")
print(top_keywords[0:100])

# 将top_keywords转换为DataFrame格式
word_values = [{'Keyword': keyword, 'Frequency': frequency} for keyword, frequency in top_keywords[0:100]]

# 将Vega-Lite 数值对象转为字符串并存储在全局变量中
import js
window = js.window  # 在Pyodide中使用js.window模拟JavaScript的window
window.word_values = json.dumps(word_values)
# %% [html] cell-9b4c78a7a46e
<div id="vis" style="width: 800px; height: 600px;"></div>  <!-- 这里将显示图表 -->
# %% [esm] cell-cc1306aa8ad4
import "https://juguandian.com/vega5.min.js";
import "https://juguandian.com/vegalite5.min.js";
import "https://juguandian.com/vega-embed.min.js";

// 在 JavaScript 中调用 word_values
const values = JSON.parse(window.word_values);  // 获取存储的 JSON 数据

// 创建Vega-Lite JSON模板
const spec = {
    "data": {
        "values": values
    },
    "mark": "point",
    "encoding": {
        "x": {
            "field": "Keyword",
            "type": "nominal",
            "axis": {"title": "关键词"}
        },
        "y": {
            "field": "Frequency",
            "type": "quantitative",
            "axis": {"title": "频次"}
        },
        "size": {
            "field": "Frequency",
            "type": "quantitative",
            "legend": {"title": "关键词大小"}
        },
        "color": {
            "field": "Keyword",
            "type": "nominal",
            "legend": {"title": "关键词"}
        },
        "tooltip": [
            {"field": "Keyword", "type": "nominal", "title": "关键词"},
            {"field": "Frequency", "type": "quantitative", "title": "频次"}
        ]
    },
    "title": "关键词频次气泡图"
}



// 配置项可以在此处添加,如设置默认字体、颜色等
const config =  {
    // default view background color
    // covers the entire view component
    background: "#efefef",
    axis: {
        labelFont: "serif",
        labelFontSize: 16,
        tickWidth: 3,
        tickColor: "red",
    }
};

await vegaEmbed("#vis", spec, {
    config: config,
    tooltip: { theme: "dark" },
});
# %% [python] cell-8a6959a1c8ff
# 将word_values转化为d3-cloud需要的格式 {text: "Keyword", size: "Font Size"}
# 你可以根据需求设置字体大小的比例
min_size = 12  # 设置最小字体大小
max_size = 64  # 设置最大字体大小

word_values = [{'Keyword': keyword, 'Frequency': frequency} for keyword, frequency in top_keywords[0:300]]
# 计算词频的最大值和最小值
min_frequency = min([d['Frequency'] for d in word_values])
max_frequency = max([d['Frequency'] for d in word_values])

# 将Frequency映射到font size
def frequency_to_size(frequency, min_frequency, max_frequency, min_size, max_size):
    return min_size + (frequency - min_frequency) * (max_size - min_size) / (max_frequency - min_frequency)

# 转化为d3-cloud格式
d3_cloud_data = [{
    'text': word['Keyword'],
    'size': frequency_to_size(word['Frequency'], min_frequency, max_frequency, min_size, max_size)
} for word in word_values]
print(d3_cloud_data)
window.d3_cloud_data = json.dumps(d3_cloud_data)
# %% [html] cell-89df33194696
<div id="wordcloud"></div>
# %% [esm] cell-51ed9a17572d
import 'https://juguandian.com/d3.v5.min.js'
import 'https://juguandian.com/d3.layout.cloud.min.js'


// 在 JavaScript 中调用 word_values
const values = JSON.parse(window.d3_cloud_data);  // 获取存储的 JSON 数据


var layout = d3.layout.cloud()
    .size([1800, 500])
    .words(values)
    .padding(5)
    .rotate(function() { return ~~(Math.random() * 2) * 90; })
    .font("Impact")
    .fontSize(function(d) { return d.size; })
    .on("end", draw);

layout.start();

function draw(words) {
  d3.select("#wordcloud").append("svg")
      .attr("width", layout.size()[0])
      .attr("height", layout.size()[1])
    .append("g")
      .attr("transform", "translate(" + layout.size()[0] / 2 + "," + layout.size()[1] / 2 + ")")
    .selectAll("text")
      .data(words)
    .enter().append("text")
      .style("font-size", function(d) { return d.size + "px"; })
      .style("font-family", "Impact")
      .attr("text-anchor", "middle")
      .attr("transform", function(d) {
        return "translate(" + [d.x, d.y] + ")rotate(" + d.rotate + ")";
      })
      .text(function(d) { return d.text; });      

}

# %% [python] cell-a01cbd089dc3
# 展示适用人群的分布情况
#print(all_features)
#print(top_keywords[0:10])

from itertools import combinations

keyword_dict = {item['Keyword']: item['Frequency'] for item in word_values}

# 计算关键词对的共现频率
co_occurrence = {}

# 遍历所有商品标题
for idx, row in df.iterrows():
    # 获取当前商品标题中的所有关键词
    keywords_in_title = set()
    for col in all_features:
        if row[col]:
            keywords_in_title.update(row[col])
    
    # 计算每对关键词的共现
    for keyword1, keyword2 in combinations(keywords_in_title, 2):
        if keyword1 in keyword_dict and keyword2 in keyword_dict:
            if (keyword1, keyword2) not in co_occurrence:
                co_occurrence[(keyword1, keyword2)] = 0
            co_occurrence[(keyword1, keyword2)] += 1

# 打印共现矩阵的示例样本
sample_co_occurrence = {key: co_occurrence[key] for key in list(co_occurrence)[:5]}  # 取前5对示例
print(sample_co_occurrence)

# 转化为d3力导向图的link形式
word_associations = [
    {'source': pair[0], 'target': pair[1], 'value': freq}
    for pair, freq in co_occurrence.items()
]

word_associations = sorted(word_associations, key = lambda x: x['value'], reverse = True)
# 输出结果
print(word_associations[:5])

import js
window = js.window
window.word_associations = json.dumps(word_associations)
# %% [html] cell-d63e49261caf
<div id="wordcloud2"></div>
# %% [esm] cell-3cc982d8b497
// 词频数据(词与词的大小关联)
const values = JSON.parse(window.d3_cloud_data);  // 获取存储的 JSON 数据
const links = JSON.parse(window.word_associations);

console.log(values)
console.log(links)

// 创建节点(词汇)
const nodes = [];
values.forEach(word => {
    nodes.push({id: word.text, size: word.size})
});
console.log(nodes[0])

// 创建D3力导向图布局
const width = 1200;
const height = 600;

const simulation = d3.forceSimulation(nodes)
    .force("link", d3.forceLink(links).id(d => d.id).distance(d => Math.max(300 - d.value / 4, 50)))  // 调整距离基于频次
    .force("charge", d3.forceManyBody().strength(-200))  // 设置节点之间的斥力
    .force("center", d3.forceCenter(width / 2, height / 2));  // 使图形居中


// 创建SVG元素
const svg = d3.select("#wordcloud2").append("svg")
    .attr("width", width)
    .attr("height", height);

// 绘制链接(词对之间的关系)
const link = svg.append("g")
    .selectAll(".link")
    .data(links)
    .enter().append("line")
    .attr("class", "link")
    .style("stroke-width", d => Math.sqrt(d.value) / 10);  // 线条粗细基于频次

// 绘制节点(词汇)
const node = svg.append("g")
    .selectAll(".node")
    .data(nodes)
    .enter().append("circle")
    .attr("class", "node")
    .attr("r", d => d.size)  // 节点大小
    .style("fill", "#69b3a2")
    .call(d3.drag()  // 允许拖动节点
        .on("start", dragStarted)
        .on("drag", dragged)
        .on("end", dragEnded));

// 节点标签(显示词汇)
const labels = svg.append("g")
    .selectAll(".label")
    .data(nodes)
    .enter().append("text")
    .attr("class", "label")
    .attr("dx", 12)
    .attr("dy", ".35em")
    .text(d => d.id);

// 更新图形的位置
simulation.on("tick", () => {
    link
        .attr("x1", d => d.source.x)
        .attr("y1", d => d.source.y)
        .attr("x2", d => d.target.x)
        .attr("y2", d => d.target.y);

    node
        .attr("cx", d => d.x)
        .attr("cy", d => d.y);

    labels
        .attr("x", d => d.x)
        .attr("y", d => d.y);
});

// 拖动行为
function dragStarted(event, d) {
    if (!event.active) simulation.alphaTarget(0.3).restart();
    d.fx = d.x;
    d.fy = d.y;
}

function dragged(event, d) {
    d.fx = event.x;
    d.fy = event.y;
}

function dragEnded(event, d) {
    if (!event.active) simulation.alphaTarget(0);
    d.fx = null;
    d.fy = null;
}