# Build a user-by-user similarity matrix from the ratings file, scoring each
# pair of users by the Euclidean distance between their ratings of the movies
# both of them rated.

# The code below indexes the loaded object as ratings[user][movie] -> rating.
with open('../data/ratings.json', 'r') as f:
    ratings = json.load(f)

users, scmat = list(ratings.keys()), []
for user1 in users:
    scrow = []
    for user2 in users:
        # Only movies rated by both users can be compared.
        movies = ratings[user1].keys() & ratings[user2].keys()
        if not movies:
            # No movie in common: treat the pair as not similar at all.
            score = 0
        else:
            # Iterate `movies` in the same order for both users so that
            # x[k] and y[k] always refer to the same movie.
            x = np.array([ratings[user1][movie] for movie in movies])
            y = np.array([ratings[user2][movie] for movie in movies])
            # Map the distance d >= 0 to a score in (0, 1]: identical ratings
            # (d == 0) give 1, and the score shrinks as d grows.
            score = 1 / (1 + np.sqrt(((x - y) ** 2).sum()))
        scrow.append(score)
    scmat.append(scrow)

# Convert to arrays so later code can use NumPy indexing on both.
users = np.array(users)
scmat = np.array(scmat)

# Print the matrix, one row per user, with two decimals per score.
for scrow in scmat:
    print(' '.join('{:.2f}'.format(score) for score in scrow))
皮尔逊相关系数
1 2 3
# Two sample series; every element of B is the matching element of A plus 2.
A = [1, 2, 3, 1, 2]
B = [3, 4, 5, 3, 4]
# np.corrcoef returns the 2x2 correlation-coefficient matrix of the two
# series; m[0, 1] (equal to m[1, 0]) is the Pearson coefficient of A and B.
m = np.corrcoef(A, B)
皮尔逊相关系数 = 协方差 / 标准差之积,即 $r_{XY} = \dfrac{\operatorname{cov}(X, Y)}{\sigma_X \, \sigma_Y}$。
相关系数处于 [-1, 1] 区间。越靠近 -1 代表两组样本反相关,越靠近 1 代表两组样本正相关,越靠近 0 代表两组样本的线性相关性越弱。
案例:使用皮尔逊相关系数计算两用户对一组电影评分的相关性。
1
# Pearson correlation between x and y: np.corrcoef returns the 2x2
# correlation matrix, whose off-diagonal entry is the coefficient of the pair.
# NOTE(review): NumPy yields NaN here when either input has zero variance
# (e.g. all ratings equal) -- confirm whether callers need to handle that.
score = np.corrcoef(x, y)[0][1]
按照相似度从高到低排列每个用户的相似用户
1 2 3 4 5 6 7 8
# Each row of scmat holds one user's similarity score against every user
# (the Pearson correlation coefficient in this example).
for i, user in enumerate(users):
    # Indices of all users, ordered from the highest score to the lowest.
    sorted_indices = scmat[i].argsort()[::-1]
    # Drop the user's own index so nobody is listed as similar to themselves.
    sorted_indices = sorted_indices[sorted_indices != i]
    # Names and scores of the remaining users, in that same order.
    similar_users = users[sorted_indices]
    similar_scores = scmat[i, sorted_indices]
    print(user, similar_users, similar_scores, sep='\n')