基于用户的协同过滤算法 python实现

阅读: 评论:0

基于用户的协同过滤算法 python实现

基于用户的协同过滤算法 python实现

基于用户的协同过滤算法

参考文章:
参考代码和数据:
大部分代码有修改。

python实现

import csv
import math as mh

import pandas as pd

# Join the movie metadata with the ratings on movieId and dump the merged
# table, ordered by userId, to a header-less CSV file.
movies = pd.read_csv("D:/Python/Python36/ML/datasets/ml-latest-small/movies.csv")
ratings = pd.read_csv("D:/Python/Python36/ML/datasets/ml-latest-small//ratings.csv")
data = pd.merge(movies, ratings, on='movieId')
csv_file = "D:/Python/Python36/ML/datasets/ml-latest-small/data.csv"
data[['userId', 'movieId', 'rating', 'title', 'genres']].sort_values('userId').to_csv(
    csv_file, index=False, header=False)

# Use a Python dictionary to hold the movies each user rated and the ratings.
# Data structure: {userId: {title: rating}}
# Keys and ratings stay as the strings read from the CSV file; consumers
# convert ratings with float() when they need numbers.
data = {}
with open(csv_file, 'r', encoding='UTF-8', newline='') as file:
    # csv.reader understands quoted fields. A plain split(',') would cut a
    # title such as "Some Movie, The (1995)" at its embedded comma.
    for row in csv.reader(file):
        if len(row) < 4:
            # Blank or malformed line: nothing usable to record.
            continue
        user_id, rating, title = row[0], row[2], row[3]
        data.setdefault(user_id, {})[title] = rating

# Computing the similarity between any two users: because users do not all
# rate the same movies, first find the movies both users have rated, then
# compute the Euclidean distance between their ratings, and finally derive
# the similarity from that distance.

# Similarity between users as the inverse of the Euclidean distance
def euclidean(userId1, userId2):
    """Return a similarity score derived from the Euclidean distance.

    Only movies rated by both users take part in the distance. The score is
    ``1 / (1 + distance)``, so identical ratings give ``1.0`` and larger
    distances give values closer to ``0``.

    Args:
        userId1: Key of the first user in the module-level ``data`` dict.
        userId2: Key of the second user in the module-level ``data`` dict.

    Returns:
        The similarity as a float, or ``0.0`` when the two users have no
        movie in common.

    Raises:
        KeyError: If either user id is not a key of ``data``.
    """
    ratings1 = data[userId1]
    ratings2 = data[userId2]
    # The inner dicts are keyed by movie title (not by movieId).
    common = [title for title in ratings1 if title in ratings2]
    if not common:
        # Without this guard the distance would stay 0 and two users with
        # nothing in common would receive the maximum similarity of 1.0.
        return 0.0
    distance = sum(
        (float(ratings1[title]) - float(ratings2[title])) ** 2
        for title in common
    )
    return 1.0 / (1.0 + mh.sqrt(distance))


# Compute the Pearson correlation coefficient between two users
def pearson(userId1, userId2):
    """Return the Pearson correlation coefficient between two users.

    The coefficient is computed over the movies both users have rated, using
    the ratings stored in the module-level ``data`` dict.

    Args:
        userId1: Key of the first user in ``data``.
        userId2: Key of the second user in ``data``.

    Returns:
        The correlation as a float. ``0.0`` is returned when the users share
        no movie or when the denominator is zero (for example when one of
        them gave the same rating to every shared movie).

    Raises:
        KeyError: If either user id is not a key of ``data``.
    """
    ratings1 = data[userId1]
    ratings2 = data[userId2]
    # Keep only the movies rated by both users, as (rating1, rating2) pairs.
    pairs = [
        (float(rating), float(ratings2[title]))
        for title, rating in ratings1.items()
        if title in ratings2
    ]
    n = len(pairs)
    if n == 0:
        return 0.0

    # Sum of the pairwise products
    sum12 = sum(a * b for a, b in pairs)
    # Sums of the ratings
    sum1 = sum(a for a, _ in pairs)
    sum2 = sum(b for _, b in pairs)
    # Sums of the squared ratings
    square_sum1 = sum(a * a for a, _ in pairs)
    square_sum2 = sum(b * b for _, b in pairs)

    # Numerator
    num = n * sum12 - sum1 * sum2
    # Denominator; rounding can push a zero variance term slightly below
    # zero, which would make sqrt() raise, so clamp at zero first.
    var1 = max(n * square_sum1 - sum1 * sum1, 0.0)
    var2 = max(n * square_sum2 - sum2 * sum2, 0.0)
    den = mh.sqrt(var1) * mh.sqrt(var2)
    if den == 0:
        return 0.0
    return num / den


# Test
# Print both similarity measures for users '1' and '8'.
e1 = euclidean(userId1='1', userId2='8')
print("e1 = ", e1)
p1 = pearson(userId1='1', userId2='8')
print("p1 = ", p1)


# Find the K users most similar to a given user
def similar_users(userId, K):
    """Return the K users most similar to ``userId``.

    Every other user in the module-level ``data`` dict is scored with
    ``euclidean`` and the scores are sorted from high to low.

    Args:
        userId: Key of the target user in ``data``.
        K: Number of neighbours to keep; ``None`` keeps all of them.

    Returns:
        A list of ``(user_id, similarity)`` tuples, most similar first,
        truncated to the first K entries.
    """
    if K is None:
        K = len(data)
    # Score every user except the target itself.
    sim_users = [
        (other, euclidean(userId, other))
        for other in data
        if other != userId
    ]
    sim_users.sort(key=lambda pair: pair[1], reverse=True)
    return sim_users[:K]


# Build the rating matrix of the target user's similar users over the movies
# the target user has not rated
def unrated_movies_matrix(userId, k):
    """Build the neighbours-by-unrated-movies rating matrix for a user.

    The rows are the k most similar users returned by ``similar_users``. The
    columns are ``'user'``, ``'simi'`` and then one column per movie that the
    target user has not rated but at least one neighbour has. A neighbour
    that did not rate a movie gets ``0.0`` in that column. The matrix is also
    written to ``unweighted_score_matrix_users.csv``.

    Args:
        userId: Key of the target user in the module-level ``data`` dict.
        k: Number of similar users to include.

    Returns:
        A ``pandas.DataFrame`` with one row per similar user.
    """
    # The k users most similar to the target user, as (user, similarity).
    users_scores = similar_users(userId, k)

    # Movies rated by at least one neighbour but not by the target user.
    already_rated = data[userId]
    unrated = set()
    for user, _ in users_scores:
        unrated.update(
            title for title in data[user] if title not in already_rated
        )
    # Sorting gives a reproducible column order; plain set iteration order
    # can differ from one interpreter run to the next.
    unratedMovies = sorted(unrated)

    cols = ['user', 'simi'] + unratedMovies
    # Build all rows first and create the frame once. DataFrame.append was
    # removed in pandas 2.0, and this also yields an empty frame with the
    # right columns when there are no similar users.
    rows = [
        [user, simi]
        + [float(data[user].get(title, 0.0)) for title in unratedMovies]
        for user, simi in users_scores
    ]
    rated_matrix = pd.DataFrame(rows, columns=cols)

    csv1_file = "D:/Python/Python36/ML/datasets/ml-latest-small/unweighted_score_matrix_users.csv"
    # NOTE(review): the receiver of this call was truncated in the original
    # text ("_csv(csv1_file)"); writing rated_matrix is the presumed intent.
    rated_matrix.to_csv(csv1_file)
    return rated_matrix


# Weight the matrix of movies the target user has not rated (but similar
# users have) to estimate how much the target user would like each of them
def unrated_interest_of_user(userId, k, topN):
    """Estimate the target user's interest in movies they have not rated.

    Each unrated movie is scored as the similarity-weighted average of the
    ratings in the matrix from ``unrated_movies_matrix``:
    ``sum(simi * rating) / sum(simi)``. The matrix, with the weighted scores
    added as a final row labelled ``'users'``, is written to
    ``weighted_score_matrix_users.csv``.

    Args:
        userId: Key of the target user in the module-level ``data`` dict.
        k: Number of similar users to take into account.
        topN: Number of recommendations to return.

    Returns:
        A list of up to ``topN`` ``(title, score)`` tuples, highest score
        first. An empty list is returned when the similarities sum to zero,
        because no weighted average can be formed.
    """
    unrated_matrix = unrated_movies_matrix(userId, k)
    cols = list(unrated_matrix.columns)
    # The first two columns are 'user' and 'simi'; the rest are movie titles.
    movies_name = cols[2:]
    simi_col = unrated_matrix['simi']
    simi_sum = float(simi_col.sum())
    if not simi_sum:
        # No neighbours, or all similarities are zero: avoid dividing by 0.
        return []

    rated_simi = [
        (movie, float((simi_col * unrated_matrix[movie]).sum() / simi_sum))
        for movie in movies_name
    ]

    # Append the weighted scores as one extra row below the neighbour rows.
    weighted_rated_simi = ['users', simi_sum] + [score for _, score in rated_simi]
    df_weighted_score = pd.DataFrame([weighted_rated_simi], columns=cols)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    rated_matrix = pd.concat([unrated_matrix, df_weighted_score], ignore_index=True)
    csv1_file = "D:/Python/Python36/ML/datasets/ml-latest-small/weighted_score_matrix_users.csv"
    # NOTE(review): the receiver of this call was truncated in the original
    # text ("_csv(csv1_file)"); writing rated_matrix is the presumed intent.
    rated_matrix.to_csv(csv1_file)

    unrated_simi = sorted(rated_simi, key=lambda x: x[1], reverse=True)
    return unrated_simi[:topN]


# Recall the topN movies
# Score the unrated movies for user '4' using 5 similar users and print the
# 10 highest-scoring ones.
unrated_interest = unrated_interest_of_user(userId='4', k=5, topN=10)
print(unrated_interest)

本文发布于:2024-01-28 18:48:17,感谢您对本站的认可!

本文链接:https://www.4u4v.net/it/17064389009498.html

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。

标签:算法   用户   python
留言与评论(共有 0 条评论)
   
验证码:

Copyright ©2019-2022 Comsenz Inc.Powered by ©

网站地图1 网站地图2 网站地图3 网站地图4 网站地图5 网站地图6 网站地图7 网站地图8 网站地图9 网站地图10 网站地图11 网站地图12 网站地图13 网站地图14 网站地图15 网站地图16 网站地图17 网站地图18 网站地图19 网站地图20 网站地图21 网站地图22 网站地图23