基于SVD的推荐系统实现

本文复现了唐宇迪老师推荐算法实战课程的源码,原理部分尚未详细展开;不足之处,望多多指正。

导入所需工具包

import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import recmetrics
import matplotlib.pyplot as plt
from surprise import Reader,SVD,Dataset
from surprise.model_selection import train_test_split

加载评分数据,(这里只过滤筛选一部分数据用于实验)

# Load the raw ratings, keep only rows rated 3 or higher, and renumber the index from 0.
ratings = (
    pd.read_csv("ratings.csv")
    .query('rating>=3')
    .reset_index(drop=True)
)
ratings.head()
userId movieId rating timestamp
0 1 2 3.5 1112486027
1 1 29 3.5 1112484676
2 1 32 3.5 1112484819
3 1 47 3.5 1112484727
4 1 50 3.5 1112484580

数据过滤:过滤出评分超过1000部电影的用户,方便起见避免出现稀疏矩阵的情况

# Keep only the ratings made by users who have more than `n` ratings in the table.
n = 1000
users = ratings['userId'].value_counts()
users = users.loc[users > n].index.tolist()
ratings = ratings.query('userId in @users')
print(ratings.shape)
ratings.head()
(1317902, 4)
userId movieId rating timestamp
15918 156 1 5.0 1037739266
15919 156 2 5.0 1040937649
15920 156 4 3.0 1038801803
15921 156 5 3.0 1040944583
15922 156 6 4.0 1037822117

从movies.csv中取出对应的电影数据(电影的title、genres)

# Load the movie metadata, then show its dimensions and first rows.
movies = pd.read_csv('movies.csv')
print(movies.shape)
movies.head()
(27278, 3)
movieId title genres
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
1 2 Jumanji (1995) Adventure|Children|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama|Romance
4 5 Father of the Bride Part II (1995) Comedy
# Keep only the movies that appear in the filtered ratings table and
# key the result by movieId.
rated_movies = ratings['movieId'].tolist()
movies = movies.query('movieId in @rated_movies').set_index('movieId', drop=True)

制作数据集里的题材特征(将两张表实现关联)

# Split the pipe-separated genres string into one column per genre, then
# move the index back into an ordinary column.
movies = movies['genres'].str.split("|", expand=True).reset_index()
movies.head()
movieId 0 1 2 3 4 5 6 7 8 9
0 1 Adventure Animation Children Comedy Fantasy None None None None None
1 2 Adventure Children Fantasy None None None None None None None
2 3 Comedy Romance None None None None None None None None
3 4 Comedy Drama Romance None None None None None None None
4 5 Comedy None None None None None None None None None

Long tail绘图

  • 使用recmetrics工具包中的long_tail_plot函数
  • 其中percentage是红色虚线的百分比
import matplotlib.pyplot as plt
# Long-tail plot of how the ratings are distributed over movies.
# `percentage` sets where the red dashed line is drawn.
fig = plt.figure(figsize=(15, 7))
recmetrics.long_tail_plot(
    df=ratings,
    item_id_column='movieId',
    interaction_type='movie ratings',
    percentage=0.5,
    x_labels=False,
)

在这里插入图片描述

利用surprise包构建推荐系统

# Wrap the (userId, movieId, rating) columns in the dataset format surprise expects.
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(
    ratings[['userId', 'movieId', 'rating']],
    reader,
)
# Hold out 25% of the ratings as the test set.
trainset, testset = train_test_split(data, test_size=0.25)
# Train an SVD model with default settings on the training split.
algo = SVD()
algo.fit(trainset)
<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1af42397080>
# Predict on the held-out set and tabulate the results without the
# "details" column, using friendlier column names.
test = pd.DataFrame(algo.test(testset)).drop(columns="details")
test.columns = ['userId', 'movieId', 'actual', 'predictions']
test.head()
userId movieId actual predictions
0 96601 34405 3.5 3.892328
1 119148 608 4.5 4.328727
2 130459 2917 4.0 3.565206
3 94013 2287 4.0 3.967104
4 32984 9 3.0 3.383482
# Evaluation metrics: MSE on the first line, RMSE on the second.
print(
    recmetrics.mse(test.actual, test.predictions),
    recmetrics.rmse(test.actual, test.predictions),
    sep="\n",
)
0.26556372554189756
0.5153287548176383
# Build the lookup table used for recommendations: one row per user, one
# column per movie, holding the predicted rating. Pairs that have no
# prediction in `test` are filled with 0.
cf_model = test.pivot_table(
    values='predictions',
    index='userId',
    columns='movieId',
).fillna(0)
cf_model.head()
movieId 1 2 3 4 5 6 7 8 9 10 ... 131021 131106 131118 131122 131126 131132 131168 131174 131176 131250
userId
156 0.000000 0.0 0.000000 0.0 0.0 4.318701 0.0 0.0 0.0 4.106054 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
208 4.080567 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
359 4.143984 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
394 0.000000 0.0 0.000000 0.0 0.0 0.000000 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
572 0.000000 0.0 3.366122 0.0 0.0 0.000000 0.0 0.0 0.0 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 16632 columns

# 推荐系统设计
def get_users_predictions(user_id,n,model):
    """Return the ids of the ``n`` highest-scored items for one user.

    Args:
        user_id: Row label of the user in ``model``.
        n: Number of item ids to return.
        model: DataFrame with one row per user and one column per item,
            holding predicted ratings.

    Returns:
        A list of column labels of ``model``, ordered from the highest
        predicted rating to the lowest.
    """
    scores = pd.DataFrame({'predicted_rating': model.loc[user_id]})
    ranked = scores.sort_values(by='predicted_rating', ascending=False)
    return ranked.iloc[:n].index.tolist()
# Usage example: the 10 highest-scored movies for user 156.
get_users_predictions(user_id=156, n=10, model=cf_model)
[2028, 2762, 1198, 1704, 1242, 593, 1210, 919, 2268, 1136]
# Batch evaluation: collapse the test predictions to one row per user, with
# the distinct movies that user has in the test set stored in "actual".
# The dict form `.agg({'actual': func})` on a single grouped column is the
# "nested renamer" spelling that pandas removed in 1.0 (it raises
# SpecificationError), so build the lists first and name the column explicitly.
test = (
    test.groupby('userId')['movieId']
    .apply(lambda movie_ids: list(set(movie_ids)))
    .to_frame(name='actual')
)

# Top-10 recommendations from the SVD prediction table for every user,
# in the same order as test.index.
cf_recs = [get_users_predictions(user, 10, cf_model) for user in test.index]

test['cf_predictions'] = cf_recs
test.head()
actual cf_predictions
userId
156 [6, 2056, 10, 15, 17, 4117, 22, 23, 24, 2073, ... [2028, 2762, 1198, 1704, 1242, 593, 1210, 919,...
208 [3072, 1, 69122, 2567, 3079, 2570, 44555, 1036... [912, 608, 924, 1207, 898, 922, 1256, 44555, 7...
359 [1, 32770, 515, 39427, 2565, 37382, 4103, 6964... [1272, 953, 2804, 2762, 2918, 1207, 1233, 1201...
394 [1537, 33794, 26116, 3077, 4617, 2058, 2571, 5... [858, 922, 2019, 608, 5291, 1228, 3435, 1219, ...
572 [3, 108548, 7173, 4104, 54281, 91658, 2571, 30... [2571, 50, 589, 79132, 47, 58559, 7361, 2959, ...

实验对比

# Popularity baseline: every user gets the same list, the 10 movies that
# occur most often in `ratings`.
popularity_recs = ratings['movieId'].value_counts().head(10).index.tolist()

pop_recs = [popularity_recs for _ in test.index]

test['pop_predictions'] = pop_recs
test.head()
actual cf_predictions pop_predictions
userId
156 [6, 2056, 10, 15, 17, 4117, 22, 23, 24, 2073, ... [2028, 2762, 1198, 1704, 1242, 593, 1210, 919,... [1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...
208 [3072, 1, 69122, 2567, 3079, 2570, 44555, 1036... [912, 608, 924, 1207, 898, 922, 1256, 44555, 7... [1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...
359 [1, 32770, 515, 39427, 2565, 37382, 4103, 6964... [1272, 953, 2804, 2762, 2918, 1207, 1233, 1201... [1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...
394 [1537, 33794, 26116, 3077, 4617, 2058, 2571, 5... [858, 922, 2019, 608, 5291, 1228, 3435, 1219, ... [1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...
572 [3, 108548, 7173, 4104, 54281, 91658, 2571, 30... [2571, 50, 589, 79132, 47, 58559, 7361, 2959, ... [1198, 1270, 593, 2762, 318, 2571, 260, 1240, ...
# Random baseline: for each user, draw 10 movieId values at random from the
# rows of `ratings`.
ran_recs = [
    ratings.movieId.sample(10).values.tolist()
    for _ in test.index
]
test['random_predictions'] = ran_recs
test.head()
actual cf_predictions pop_predictions random_predictions
userId
156 [6, 2056, 10, 15, 17, 4117, 22, 23, 24, 2073, ... [2028, 2762, 1198, 1704, 1242, 593, 1210, 919,... [1198, 1270, 593, 2762, 318, 2571, 260, 1240, ... [6310, 88125, 7458, 3182, 60684, 5582, 2975, 1...
208 [3072, 1, 69122, 2567, 3079, 2570, 44555, 1036... [912, 608, 924, 1207, 898, 922, 1256, 44555, 7... [1198, 1270, 593, 2762, 318, 2571, 260, 1240, ... [3823, 2109, 6924, 5120, 4238, 1407, 5266, 299...
359 [1, 32770, 515, 39427, 2565, 37382, 4103, 6964... [1272, 953, 2804, 2762, 2918, 1207, 1233, 1201... [1198, 1270, 593, 2762, 318, 2571, 260, 1240, ... [57421, 1032, 8379, 2539, 5010, 2100, 111, 196...
394 [1537, 33794, 26116, 3077, 4617, 2058, 2571, 5... [858, 922, 2019, 608, 5291, 1228, 3435, 1219, ... [1198, 1270, 593, 2762, 318, 2571, 260, 1240, ... [77201, 1372, 7484, 3250, 521, 1396, 5971, 260...
572 [3, 108548, 7173, 4104, 54281, 91658, 2571, 30... [2571, 50, 589, 79132, 47, 58559, 7361, 2959, ... [1198, 1270, 593, 2762, 318, 2571, 260, 1240, ... [5872, 6982, 832, 4495, 70742, 65596, 1, 971, ...

topK 求精度与召回率Precision与Recall

覆盖率

def prediction_coverage(predicted, catalog):
    """Return the percentage of catalog items that were recommended at least once.

    Args:
        predicted: Iterable of recommendation lists, one per user.
        catalog: Iterable of all item ids that could be recommended.
            Duplicate ids are counted once.

    Returns:
        The number of distinct recommended items divided by the number of
        distinct catalog items, as a percentage rounded to two decimals.
        Returns 0.0 when the catalog is empty.
    """
    # Deduplicate so repeated ids in the catalog cannot deflate the score.
    catalog_items = set(catalog)
    if not catalog_items:
        # Nothing to cover; avoid dividing by zero.
        return 0.0
    recommended_items = {item for recs in predicted for item in recs}
    return round(len(recommended_items) / len(catalog_items) * 100, 2)
# Every distinct movie in the filtered ratings counts as recommendable.
catalog = ratings.movieId.unique().tolist()
random_coverage, pop_coverage, cf_coverage = (
    prediction_coverage(recs, catalog)
    for recs in (ran_recs, pop_recs, cf_recs)
)

# Compare the coverage of the three recommenders in one bar chart.
coverage_scores = [random_coverage, pop_coverage, cf_coverage]
# NOTE(review): the last label is spelled "Fillter"; kept as-is so the plot
# output does not change.
model_names = [
    'Random Recommender',
    'Popular Recommender',
    'Collaborative Fillter',
]

fig = plt.figure(figsize=(7, 5))
recmetrics.coverage_plot(coverage_scores, model_names)

在这里插入图片描述

问题(review)

*  以数据为基础的论文查找技巧;
*  以SVD为基础的推荐算法原理;(之前接触SVD主要是在矩阵计算中将其用于图像压缩,对SVD在推荐排序中的原理还不太清楚,需要查资料学习);
*  原来(大三阶段)看过的研究文献,数据集大多来自Kaggle,其实验过程与唐老师课程的区别在哪?

版权声明:本文为Zengmeng1998原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
THE END
< <上一篇
下一篇>>