Refactor problem feed code

This commit is contained in:
cuom1999 2023-11-09 02:43:11 -06:00
parent b6c9ce4763
commit 0b4eeb8751
4 changed files with 134 additions and 67 deletions

View file

@ -40,7 +40,10 @@ def cache_wrapper(prefix, timeout=None):
def _get(key): def _get(key):
if not l0_cache: if not l0_cache:
return cache.get(key) return cache.get(key)
return l0_cache.get(key) or cache.get(key) result = l0_cache.get(key)
if result is None:
result = cache.get(key)
return result
def _set_l0(key, value): def _set_l0(key, value):
if l0_cache: if l0_cache:
@ -56,7 +59,7 @@ def cache_wrapper(prefix, timeout=None):
result = _get(cache_key) result = _get(cache_key)
if result is not None: if result is not None:
_set_l0(cache_key, result) _set_l0(cache_key, result)
if result == NONE_RESULT: if type(result) == str and result == NONE_RESULT:
result = None result = None
return result return result
result = func(*args, **kwargs) result = func(*args, **kwargs)

View file

@ -1,7 +1,9 @@
import numpy as np import numpy as np
from django.conf import settings
import os import os
import hashlib
from django.core.cache import cache from django.core.cache import cache
from django.conf import settings
from judge.caching import cache_wrapper from judge.caching import cache_wrapper
@ -12,14 +14,13 @@ class CollabFilter:
# name = 'collab_filter' or 'collab_filter_time' # name = 'collab_filter' or 'collab_filter_time'
def __init__(self, name): def __init__(self, name):
embeddings = np.load( self.embeddings = np.load(
os.path.join(settings.ML_OUTPUT_PATH, name + "/embeddings.npz"), os.path.join(settings.ML_OUTPUT_PATH, name + "/embeddings.npz"),
allow_pickle=True, allow_pickle=True,
) )
arr0, arr1 = embeddings.files _, problem_arr = self.embeddings.files
self.name = name self.name = name
self.user_embeddings = embeddings[arr0] self.problem_embeddings = self.embeddings[problem_arr]
self.problem_embeddings = embeddings[arr1]
def __str__(self): def __str__(self):
return self.name return self.name
@ -43,18 +44,32 @@ class CollabFilter:
scores = u.dot(V.T) scores = u.dot(V.T)
return scores return scores
def _get_embedding_version(self):
    """Return a short fingerprint of the currently loaded embeddings.

    The fingerprint is the first five hex characters of the SHA-256 digest
    computed over the raw bytes of the first row of
    ``self.problem_embeddings``.
    """
    # Only row 0 is hashed, so the value changes only when that row's
    # bytes change.
    raw = self.problem_embeddings[0].tobytes()
    return hashlib.sha256(raw).hexdigest()[:5]
@cache_wrapper(prefix="CFgue", timeout=86400)
def _get_user_embedding(self, user_id, embedding_version):
    """Fetch the embedding row for ``user_id`` from the loaded archive.

    ``embedding_version`` is never read in the body; it only takes part in
    the call arguments (presumably so that cached entries are keyed by the
    embedding version -- confirm against ``cache_wrapper``).

    Ids that lie past the end of the user array fall back to row 0.
    """
    # The archive is expected to hold exactly two arrays; the first one is
    # the user table, the second one is ignored here.
    user_key, _ignored = self.embeddings.files
    table = self.embeddings[user_key]
    row = user_id if user_id < len(table) else 0
    return table[row]
def get_user_embedding(self, user_id):
    """Return the embedding for ``user_id`` under the current embedding version.

    The version string from ``_get_embedding_version`` is forwarded as the
    second argument of ``_get_user_embedding``.
    """
    return self._get_user_embedding(user_id, self._get_embedding_version())
@cache_wrapper(prefix="user_recommendations", timeout=3600) @cache_wrapper(prefix="user_recommendations", timeout=3600)
def user_recommendations(self, user, problems, measure=DOT, limit=None): def user_recommendations(self, user_id, problems, measure=DOT, limit=None):
uid = user.id user_embedding = self.get_user_embedding(user_id)
if uid >= len(self.user_embeddings): scores = self.compute_scores(user_embedding, self.problem_embeddings, measure)
uid = 0
scores = self.compute_scores(
self.user_embeddings[uid], self.problem_embeddings, measure
)
res = [] # [(score, problem)] res = [] # [(score, problem)]
for pid in problems: for pid in problems:
# pid = problem.id
if pid < len(scores): if pid < len(scores):
res.append((scores[pid], pid)) res.append((scores[pid], pid))

View file

@ -1,7 +1,8 @@
from collections import defaultdict from collections import defaultdict
from math import e from math import e
from datetime import datetime from datetime import datetime, timedelta
import random import random
from enum import Enum
from django.conf import settings from django.conf import settings
from django.core.cache import cache from django.core.cache import cache
@ -9,6 +10,7 @@ from django.db.models import Case, Count, ExpressionWrapper, F, Max, Q, When
from django.db.models.fields import FloatField from django.db.models.fields import FloatField
from django.utils import timezone from django.utils import timezone
from django.utils.translation import gettext as _, gettext_noop from django.utils.translation import gettext as _, gettext_noop
from django.http import Http404
from judge.models import Problem, Submission from judge.models import Problem, Submission
from judge.ml.collab_filter import CollabFilter from judge.ml.collab_filter import CollabFilter
@ -248,3 +250,72 @@ def finished_submission(sub):
keys += ["contest_complete:%d" % participation.id] keys += ["contest_complete:%d" % participation.id]
keys += ["contest_attempted:%d" % participation.id] keys += ["contest_attempted:%d" % participation.id]
cache.delete_many(keys) cache.delete_many(keys)
class RecommendationType(Enum):
    """Identifiers for the individual sources of recommended problems."""

    # Problems returned by hot_problems().
    HOT_PROBLEM = 1
    # "collab_filter" model, DOT measure.
    CF_DOT = 2
    # "collab_filter" model, COSINE measure.
    CF_COSINE = 3
    # "collab_filter_time" model, DOT measure.
    CF_TIME_DOT = 4
    # "collab_filter_time" model, COSINE measure.
    CF_TIME_COSINE = 5
# Return one flat, de-duplicated list of problem ids collected from every
# requested recommendation type, in the order the types are given.
def get_user_recommended_problems(
    user_id,
    problem_ids,
    recommendation_types,
    limits,
    shuffle=False,
):
    """Collect recommended problem ids for a user from several sources.

    Args:
        user_id: id passed to ``CollabFilter.user_recommendations``.
        problem_ids: iterable of candidate problem ids; only these may
            appear in the hot-problem results and they are forwarded to the
            collaborative-filter models.
        recommendation_types: iterable of ``RecommendationType`` members or
            their integer values.
        limits: iterable of per-type limits, paired positionally with
            ``recommendation_types`` (extra entries on either side are
            ignored by ``zip``).
        shuffle: when true, shuffle the combined results with a seed derived
            from the current date, so the order is stable within one day.

    Returns:
        A list of problem ids without duplicates, first occurrence wins.

    Raises:
        Http404: if an integer type does not map to a ``RecommendationType``.
    """
    cf_model = CollabFilter("collab_filter")
    cf_time_model = CollabFilter("collab_filter_time")

    # Each collaborative-filter type maps to (model, measure); the measure is
    # taken from the same model that is queried.
    cf_sources = {
        RecommendationType.CF_DOT: (cf_model, cf_model.DOT),
        RecommendationType.CF_COSINE: (cf_model, cf_model.COSINE),
        RecommendationType.CF_TIME_DOT: (cf_time_model, cf_time_model.DOT),
        RecommendationType.CF_TIME_COSINE: (cf_time_model, cf_time_model.COSINE),
    }

    def get_problem_ids_from_type(rec_type, limit):
        if isinstance(rec_type, int):
            try:
                rec_type = RecommendationType(rec_type)
            except ValueError:
                raise Http404()
        if not isinstance(rec_type, RecommendationType):
            # Anything that is neither a known member nor an int yields nothing.
            return []
        if rec_type == RecommendationType.HOT_PROBLEM:
            # Build the membership set once instead of once per hot problem.
            allowed_ids = set(problem_ids)
            return [
                problem.id
                for problem in hot_problems(timedelta(days=7), limit)
                if problem.id in allowed_ids
            ]
        source = cf_sources.get(rec_type)
        if source is None:
            return []
        model, measure = source
        return model.user_recommendations(user_id, problem_ids, measure, limit)

    all_problems = []
    for rec_type, limit in zip(recommendation_types, limits):
        all_problems += get_problem_ids_from_type(rec_type, limit)

    if shuffle:
        # Day-granular seed: same order for every call made on the same date.
        seed = datetime.now().strftime("%d%m%Y")
        random.Random(seed).shuffle(all_problems)

    # De-duplicate while keeping first-seen order. Entries may be plain ids
    # or tuples whose second element is the problem id (presumably
    # (score, problem_id) pairs from the collaborative-filter models).
    unique_ids = []
    seen = set()
    for entry in all_problems:
        pid = entry[1] if isinstance(entry, tuple) else entry
        if pid not in seen:
            seen.add(pid)
            unique_ids.append(pid)
    return unique_ids

View file

@ -1,10 +1,8 @@
import logging import logging
import os import os
import shutil import shutil
from datetime import timedelta, datetime
from operator import itemgetter from operator import itemgetter
from random import randrange from random import randrange
import random
from copy import deepcopy from copy import deepcopy
from django.core.cache import cache from django.core.cache import cache
@ -77,6 +75,8 @@ from judge.utils.problems import (
user_attempted_ids, user_attempted_ids,
user_completed_ids, user_completed_ids,
get_related_problems, get_related_problems,
get_user_recommended_problems,
RecommendationType,
) )
from judge.utils.strings import safe_float_or_none, safe_int_or_none from judge.utils.strings import safe_float_or_none, safe_int_or_none
from judge.utils.tickets import own_ticket_filter from judge.utils.tickets import own_ticket_filter
@ -834,24 +834,34 @@ class ProblemFeed(ProblemList, FeedView):
title = _("Problem feed") title = _("Problem feed")
feed_type = None feed_type = None
# arr = [[], [], ..] def get_recommended_problem_ids(self, queryset):
def merge_recommendation(self, arr): user_id = self.request.profile.id
seed = datetime.now().strftime("%d%m%Y") problem_ids = queryset.values_list("id", flat=True)
merged_array = [] rec_types = [
for a in arr: RecommendationType.CF_DOT,
merged_array += a RecommendationType.CF_COSINE,
random.Random(seed).shuffle(merged_array) RecommendationType.CF_TIME_DOT,
RecommendationType.CF_TIME_COSINE,
RecommendationType.HOT_PROBLEM,
]
limits = [100, 100, 100, 100, 20]
shuffle = True
res = [] allow_debug_type = (
used_pid = set() self.request.user.is_impersonate or self.request.user.is_superuser
)
if allow_debug_type and "debug_type" in self.request.GET:
try:
debug_type = int(self.request.GET.get("debug_type"))
except ValueError:
raise Http404()
rec_types = [debug_type]
limits = [100]
shuffle = False
for obj in merged_array: return get_user_recommended_problems(
if type(obj) == tuple: user_id, problem_ids, rec_types, limits, shuffle
obj = obj[1] )
if obj not in used_pid:
res.append(obj)
used_pid.add(obj)
return res
def get_queryset(self): def get_queryset(self):
if self.feed_type == "volunteer": if self.feed_type == "volunteer":
@ -885,40 +895,8 @@ class ProblemFeed(ProblemList, FeedView):
if not settings.ML_OUTPUT_PATH or not user: if not settings.ML_OUTPUT_PATH or not user:
return queryset.order_by("?").add_i18n_name(self.request.LANGUAGE_CODE) return queryset.order_by("?").add_i18n_name(self.request.LANGUAGE_CODE)
cf_model = CollabFilter("collab_filter") q = self.get_recommended_problem_ids(queryset)
cf_time_model = CollabFilter("collab_filter_time")
queryset = queryset.values_list("id", flat=True)
hot_problems_recommendations = [
problem.id
for problem in hot_problems(timedelta(days=7), 20)
if problem.id in set(queryset)
]
q = self.merge_recommendation(
[
cf_model.user_recommendations(user, queryset, cf_model.DOT, 100),
cf_model.user_recommendations(
user,
queryset,
cf_model.COSINE,
100,
),
cf_time_model.user_recommendations(
user,
queryset,
cf_time_model.COSINE,
100,
),
cf_time_model.user_recommendations(
user,
queryset,
cf_time_model.DOT,
100,
),
hot_problems_recommendations,
]
)
queryset = Problem.objects.filter(id__in=q) queryset = Problem.objects.filter(id__in=q)
queryset = queryset.add_i18n_name(self.request.LANGUAGE_CODE) queryset = queryset.add_i18n_name(self.request.LANGUAGE_CODE)