def _get_embedding_version(self):
    """Return a short fingerprint of the loaded problem embeddings.

    The raw bytes of the first row of ``self.problem_embeddings`` are hashed
    with SHA-256 and the first five hexadecimal characters of the digest are
    returned. ``get_user_embedding`` passes this value along as the
    ``embedding_version`` argument of the cached ``_get_user_embedding``.
    """
    digest = hashlib.sha256(self.problem_embeddings[0].tobytes()).hexdigest()
    return digest[:5]
def get_user_embedding(self, user_id):
    """Return the embedding for ``user_id``.

    Delegates to ``_get_user_embedding``, supplying the value of
    ``_get_embedding_version()`` as its second positional argument.
    """
    return self._get_user_embedding(user_id, self._get_embedding_version())
def get_user_recommended_problems(
    user_id,
    problem_ids,
    recommendation_types,
    limits,
    shuffle=False,
):
    """Collect recommended problems for a user as one de-duplicated list.

    Each entry of ``recommendation_types`` is paired positionally with the
    entry of ``limits`` at the same index (``zip`` semantics: surplus entries
    of the longer sequence are ignored). The per-type results are
    concatenated in the given order, optionally shuffled, and finally
    de-duplicated, keeping the first occurrence of every problem id.

    Args:
        user_id: Id forwarded to ``CollabFilter.user_recommendations``.
        problem_ids: Iterable of candidate problem ids. Hot problems are
            kept only if their id is among these; it is forwarded unchanged
            to the collaborative-filtering models.
        recommendation_types: ``RecommendationType`` members or their
            integer values.
        limits: Limit handed to the recommender of the matching type.
        shuffle: When true, the concatenated results are shuffled with an
            RNG seeded by the current date (``%d%m%Y``), so the order stays
            the same for the whole day.

    Returns:
        A flat list of problem ids without duplicates.

    Raises:
        Http404: If an integer recommendation type matches no
            ``RecommendationType`` member.
    """
    cf_model = CollabFilter("collab_filter")
    cf_time_model = CollabFilter("collab_filter_time")

    # Which model to query, and with which measure, per CF variant.
    cf_variants = {
        RecommendationType.CF_DOT: (cf_model, cf_model.DOT),
        RecommendationType.CF_COSINE: (cf_model, cf_model.COSINE),
        RecommendationType.CF_TIME_DOT: (cf_time_model, cf_time_model.DOT),
        RecommendationType.CF_TIME_COSINE: (cf_time_model, cf_time_model.COSINE),
    }

    def get_problem_ids_from_type(rec_type, limit):
        if isinstance(rec_type, int):
            try:
                rec_type = RecommendationType(rec_type)
            except ValueError:
                raise Http404()
        if not isinstance(rec_type, RecommendationType):
            # Unknown kinds of input contribute nothing.
            return []

        if rec_type == RecommendationType.HOT_PROBLEM:
            # Build the membership set once, not once per hot problem.
            candidate_ids = set(problem_ids)
            return [
                problem.id
                for problem in hot_problems(timedelta(days=7), limit)
                if problem.id in candidate_ids
            ]

        model, measure = cf_variants[rec_type]
        return model.user_recommendations(user_id, problem_ids, measure, limit)

    all_problems = []
    for rec_type, limit in zip(recommendation_types, limits):
        all_problems += get_problem_ids_from_type(rec_type, limit)

    if shuffle:
        # Date-based seed: same order for every request made on the same day.
        seed = datetime.now().strftime("%d%m%Y")
        random.Random(seed).shuffle(all_problems)

    # Entries may be bare ids or tuples whose second item is the id.
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    pids = (obj[1] if isinstance(obj, tuple) else obj for obj in all_problems)
    return list(dict.fromkeys(pids))
def get_recommended_problem_ids(self, queryset):
    """Return recommended problem ids for the requesting profile.

    Candidates are the ids of ``queryset``. Normally all four
    collaborative-filtering variants (limit 100 each) plus hot problems
    (limit 20) are requested and the result is shuffled. When the request
    user is impersonating or is a superuser and passes ``debug_type`` in
    the query string, only that single recommendation type is requested
    (limit 100) and no shuffling is applied.

    Raises:
        Http404: If ``debug_type`` is not an integer.
    """
    request = self.request
    user_id = request.profile.id
    problem_ids = queryset.values_list("id", flat=True)

    may_debug = request.user.is_impersonate or request.user.is_superuser
    if may_debug and "debug_type" in request.GET:
        try:
            forced_type = int(request.GET.get("debug_type"))
        except ValueError:
            raise Http404()
        # Debug path: one explicit type, unshuffled.
        return get_user_recommended_problems(
            user_id, problem_ids, [forced_type], [100], False
        )

    # Default feed: (recommendation type, limit) pairs, in request order.
    plan = (
        (RecommendationType.CF_DOT, 100),
        (RecommendationType.CF_COSINE, 100),
        (RecommendationType.CF_TIME_DOT, 100),
        (RecommendationType.CF_TIME_COSINE, 100),
        (RecommendationType.HOT_PROBLEM, 20),
    )
    rec_types = [rec_type for rec_type, _ in plan]
    limits = [limit for _, limit in plan]
    return get_user_recommended_problems(
        user_id, problem_ids, rec_types, limits, True
    )
self.get_recommended_problem_ids(queryset) - queryset = queryset.values_list("id", flat=True) - hot_problems_recommendations = [ - problem.id - for problem in hot_problems(timedelta(days=7), 20) - if problem.id in set(queryset) - ] - - q = self.merge_recommendation( - [ - cf_model.user_recommendations(user, queryset, cf_model.DOT, 100), - cf_model.user_recommendations( - user, - queryset, - cf_model.COSINE, - 100, - ), - cf_time_model.user_recommendations( - user, - queryset, - cf_time_model.COSINE, - 100, - ), - cf_time_model.user_recommendations( - user, - queryset, - cf_time_model.DOT, - 100, - ), - hot_problems_recommendations, - ] - ) queryset = Problem.objects.filter(id__in=q) queryset = queryset.add_i18n_name(self.request.LANGUAGE_CODE)