事先我已经跑好数据,产出了一张包含以下三个指标的表(对应下文代码中的 r、f、m 三列):
R:Recency,最近一次消费距今天数
F:Frequency,最近一周订单数
M:Monetary,最近一周用户消费额
import pandas as pd

# Load the precomputed RFM table; the cells below read its 'r', 'f' and 'm' columns.
df = pd.read_excel(io='/Users/zhaoningbo/dataset/rfm.xlsx')
应该做一个异常值检验,这次先跳过。
import plotly.plotly as py  # legacy online-plotting alias; `py` itself is not used in this cell
# `import plotly.plotly as py` binds only `py`, so `plotly` must be imported
# explicitly for the plotly.offline calls at the bottom of this cell to resolve.
import plotly.offline
import pandas as pd

# Single 3D scatter trace of the raw RFM points: x = r, y = f, z = m.
scatter = dict(
    mode="markers",
    name="y",
    type="scatter3d",
    x=df['r'], y=df['f'], z=df['m'],
    marker=dict(size=2, color="rgb(23, 190, 207)"),
)

# Figure layout with the zero line switched off on all three scene axes.
layout = dict(
    title='3d point clustering',
    scene=dict(
        xaxis=dict(zeroline=False),
        yaxis=dict(zeroline=False),
        zaxis=dict(zeroline=False),
    ),
)

fig = dict(data=[scatter], layout=layout)

# Render inline through plotly's offline mode.
plotly.offline.init_notebook_mode()  # initialise plotting mode inside the Jupyter notebook
plotly.offline.iplot(fig, filename='3d point clustering')
# numpy is needed for np.array below; it was never imported in this script.
import numpy as np

# Convert the three RFM columns into an (n_samples, 3) array: columns are r, f, m.
tmp = np.array([df.r, df.f, df.m]).T
# Use the KMeans implementation from scikit-learn.
from sklearn.cluster import KMeans
# Fit 3 clusters and get the cluster label assigned to each row.
kms = KMeans(n_clusters=3)
y = kms.fit_predict(tmp)
#将分类结果以散点图形式展示
y
array([0, 2, 1, …, 2, 1, 0], dtype=int32)
# Attach each row's cluster label as a new "type" column.
df["type"] = y

# One sub-frame per cluster label (0, 1, 2).
d0, d1, d2 = (df[df["type"] == label] for label in range(3))

# Number of rows assigned to each cluster.
df["type"].value_counts()
0 587
2 264
1 227
Name: type, dtype: int64
import plotly.plotly as py  # legacy online-plotting alias; `py` itself is not used in this cell
# `import plotly.plotly as py` binds only `py`, so `plotly` must be imported
# explicitly for the plotly.offline calls at the bottom of this cell to resolve.
import plotly.offline
import pandas as pd


def _cluster_trace(points, label, color):
    """Build a scatter3d trace dict for one cluster's rows (x = r, y = f, z = m)."""
    return dict(
        mode="markers",
        name=label,
        type="scatter3d",
        x=points['r'], y=points['f'], z=points['m'],
        marker=dict(size=2, color=color),
    )


# One trace per cluster, each with its own colour.
scatter0 = _cluster_trace(d0, "d0", "rgb(23, 190, 207)")
scatter1 = _cluster_trace(d1, "d1", "rgb(49,54,149)")
scatter2 = _cluster_trace(d2, "d2", "rgb(253,174,97)")

# `layout` is the layout dict defined for the earlier, unclustered plot.
fig = dict(data=[scatter0, scatter1, scatter2], layout=layout)

# Render inline through plotly's offline mode.
plotly.offline.init_notebook_mode()  # initialise plotting mode inside the Jupyter notebook
plotly.offline.iplot(fig, filename='3d point clustering')
因为一开始就选择了用户价值模型,所以没有做降维。