Add ML to problem feed

This commit is contained in:
cuom1999 2022-04-11 21:18:01 -05:00
parent 34523ab53f
commit 2fe571379c
8 changed files with 157 additions and 6 deletions

View file

@ -15,3 +15,4 @@ class Command(BaseCommand):
judge.name = options['name']
judge.auth_key = options['auth_key']
judge.save()

View file

@ -0,0 +1,49 @@
from django.core.management.base import BaseCommand
from judge.models import *
from collections import defaultdict
import csv
import os
from django.conf import settings
def gen_submissions():
    """Write one ``(uid, pid)`` row per distinct problem each user submitted to.

    Rows go to ``submissions.csv`` under ``settings.ML_DATA_PATH``. For every
    profile, submissions are scanned newest first and a problem id is written
    the first time it is seen, so later (older) submissions to the same
    problem are ignored.
    """
    headers = ['uid', 'pid']
    path = os.path.join(settings.ML_DATA_PATH, 'submissions.csv')
    # newline='' is what the csv module expects of files it writes to; without
    # it, platforms that translate '\n' end up with a blank line after each row.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        f = csv.writer(csvfile)
        f.writerow(headers)

        for u in Profile.objects.all():
            used = set()
            print('Processing user', u.id)
            # Read the foreign-key column directly instead of going through
            # s.problem, which loads the related problem row just for its id.
            problem_ids = (
                Submission.objects.filter(user=u)
                .order_by('-date')
                .values_list('problem_id', flat=True)
            )
            for pid in problem_ids:
                if pid not in used:
                    used.add(pid)
                    f.writerow([u.id, pid])
def gen_users():
    """Dump every profile to ``profiles.csv`` under ``settings.ML_DATA_PATH``.

    Columns are ``uid``, ``username``, ``rating`` and ``points``; the
    ``points`` column holds the profile's ``performance_points``.
    """
    headers = ['uid', 'username', 'rating', 'points']
    path = os.path.join(settings.ML_DATA_PATH, 'profiles.csv')
    # newline='' is what the csv module expects of files it writes to, and the
    # encoding is pinned so usernames are written identically on every host
    # instead of depending on the process locale.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        f = csv.writer(csvfile)
        f.writerow(headers)
        f.writerows(
            [u.id, u.username, u.rating, u.performance_points]
            for u in Profile.objects.all()
        )
def gen_problems():
    """Dump every problem to ``problems.csv`` under ``settings.ML_DATA_PATH``.

    Columns are ``pid``, ``code``, ``name``, ``points`` and ``url``; the URL
    is the fixed prefix ``lqdoj.edu.vn/problem/`` followed by the problem code.
    """
    headers = ['pid', 'code', 'name', 'points', 'url']
    path = os.path.join(settings.ML_DATA_PATH, 'problems.csv')
    # newline='' is what the csv module expects of files it writes to, and the
    # encoding is pinned so problem names are written identically on every
    # host instead of depending on the process locale.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        f = csv.writer(csvfile)
        f.writerow(headers)
        f.writerows(
            [p.id, p.code, p.name, p.points, 'lqdoj.edu.vn/problem/' + p.code]
            for p in Problem.objects.all()
        )
class Command(BaseCommand):
    """Management command that regenerates the CSV inputs for ML training."""

    help = 'generate data for ML'

    def handle(self, *args, **options):
        """Run the three exporters: users, then problems, then submissions."""
        exporters = (gen_users, gen_problems, gen_submissions)
        for export in exporters:
            export()

64
judge/ml/collab_filter.py Normal file
View file

@ -0,0 +1,64 @@
import numpy as np
from django.conf import settings
import os
class CollabFilter:
    """Rank problems using pre-computed collaborative-filtering embeddings.

    The embeddings are read at construction time from
    ``collab_filter/embeddings.npz`` under ``settings.ML_OUTPUT_PATH``. Rows of
    both matrices are looked up directly by ``user.id`` / ``problem.id``.
    """

    DOT = 'dot'
    COSINE = 'cosine'

    def __init__(self):
        path = os.path.join(settings.ML_OUTPUT_PATH, 'collab_filter/embeddings.npz')
        # NOTE(review): allow_pickle=True lets np.load unpickle objects stored
        # in the archive, which can execute arbitrary code. Keep this file
        # trusted, or drop the flag if the arrays are plain numeric.
        # The context manager closes the archive's file handle once both
        # arrays have been read into memory.
        with np.load(path, allow_pickle=True) as embeddings:
            # Relies on the archive holding exactly two arrays, users first.
            arr0, arr1 = embeddings.files
            self.user_embeddings = embeddings[arr0]
            self.problem_embeddings = embeddings[arr1]

    def compute_scores(self, query_embedding, item_embeddings, measure=DOT):
        """Computes the scores of the candidates given a query.

        Args:
            query_embedding: a vector of shape [k], representing the query
                embedding.
            item_embeddings: a matrix of shape [N, k], such that row i is the
                embedding of item i.
            measure: a string specifying the similarity measure to be used.
                Can be either DOT or COSINE.

        Returns:
            scores: a vector of shape [N], such that scores[i] is the score
                of item i.
        """
        u = query_embedding
        V = item_embeddings
        if measure == self.COSINE:
            item_norms = np.linalg.norm(V, axis=1, keepdims=True)
            # A zero vector has no direction; dividing by 1 instead of 0 gives
            # it a score of 0 rather than NaN, which would corrupt sorting.
            V = V / np.where(item_norms == 0, 1.0, item_norms)
            u = u / (np.linalg.norm(u) or 1.0)
        return u.dot(V.T)

    @staticmethod
    def _rank(scores, candidates, limit):
        """Return up to ``limit`` ``(score, candidate)`` pairs, best first.

        Candidates whose id has no entry in ``scores`` are left out.
        """
        res = [(scores[c.id], c) for c in candidates if c.id < len(scores)]
        # Sort on the score only: comparing whole tuples falls through to
        # comparing the candidate objects when two scores are equal, and those
        # objects need not be orderable.
        res.sort(key=lambda pair: pair[0], reverse=True)
        return res[:limit]

    def user_recommendations(self, user, problems, measure=DOT, limit=None):
        """Score ``problems`` for ``user`` and return them best first.

        Args:
            user: object whose ``id`` selects the user embedding row.
            problems: iterable of objects with an ``id`` attribute.
            measure: DOT or COSINE.
            limit: maximum number of results; None returns all of them.

        Returns:
            A list of ``(score, problem)`` tuples in descending score order.
        """
        uid = user.id
        if uid >= len(self.user_embeddings):
            # No embedding row for this user: fall back to row 0.
            uid = 0
        scores = self.compute_scores(
            self.user_embeddings[uid], self.problem_embeddings, measure)
        return self._rank(scores, problems, limit)

    def problems_neighbors(self, problem, problemset, measure=DOT, limit=None):
        """Rank ``problemset`` by similarity to ``problem``.

        Returns:
            A list of ``(score, problem)`` tuples in descending score order,
            or None when ``problem`` has no embedding row.
        """
        pid = problem.id
        if pid >= len(self.problem_embeddings):
            return None
        scores = self.compute_scores(
            self.problem_embeddings[pid], self.problem_embeddings, measure)
        return self._rank(scores, problemset, limit)

View file

@ -115,7 +115,7 @@ def hot_problems(duration, limit):
qs = cache.get(cache_key)
if qs is None:
qs = Problem.get_public_problems() \
.filter(submission__date__gt=timezone.now() - duration, points__gt=3, points__lt=25)
.filter(submission__date__gt=timezone.now() - duration)
qs0 = qs.annotate(k=Count('submission__user', distinct=True)).order_by('-k').values_list('k', flat=True)
if not qs0:
@ -141,7 +141,7 @@ def hot_problems(duration, limit):
qs = qs.filter(unique_user_count__gt=max(mx / 3.0, 1))
qs = qs.annotate(ordering=ExpressionWrapper(
0.5 * F('points') * (0.4 * F('ac_volume') / F('submission_volume') + 0.6 * F('ac_rate')) +
0.02 * F('points') * (0.4 * F('ac_volume') / F('submission_volume') + 0.6 * F('ac_rate')) +
100 * e ** (F('unique_user_count') / mx), output_field=FloatField(),
)).order_by('-ordering').defer('description')[:limit]

View file

@ -39,6 +39,7 @@ from judge.utils.strings import safe_float_or_none, safe_int_or_none
from judge.utils.tickets import own_ticket_filter
from judge.utils.views import QueryStringSortMixin, SingleObjectFormView, TitleMixin, generic_message
from judge.views.blog import FeedView
from judge.ml.collab_filter import CollabFilter
def get_contest_problem(problem, profile):
@ -611,9 +612,44 @@ class ProblemFeed(FeedView):
.values_list('problem__id', flat=True))
return queryset.distinct()
def merge_recommendation(self, arr):
    """Interleave several recommendation lists into one de-duplicated list.

    Args:
        arr: a list of lists. A tuple item is unwrapped to its second
            element; any other item is treated as a hot problem and is only
            consumed on every third round, so hot problems appear less often.

    Returns:
        The merged items in round-robin order, each item at most once.
    """
    idx = [0] * len(arr)
    res = []
    seen = set()
    cnt = 0
    pending = True
    while pending:
        cnt += 1
        pending = False
        for i, source in enumerate(arr):
            if idx[i] >= len(source):
                continue
            # This list still has items, so another round is needed even when
            # the item is skipped below. Otherwise the merge would stop early
            # and drop the remaining hot problems once the scored lists ran out.
            pending = True
            obj = source[idx[i]]
            if isinstance(obj, tuple):
                obj = obj[1]
            elif cnt % 3 != 0:  # hot problems appear less
                continue
            if obj not in seen:
                res.append(obj)
                seen.add(obj)
            idx[i] += 1
    return res
def get_queryset(self):
    """Return the unsolved problems to show in the feed.

    When no ML output path is configured, or the request has no profile, the
    unsolved problems are returned in random order. Otherwise the result is a
    list (not a queryset) merging the dot-product and cosine recommendations
    (up to 100 each) with up to 10 hot problems from the last 7 days.
    """
    queryset = self.get_unsolved_queryset()
    user = self.request.profile
    # getattr: a deployment that never defines ML_OUTPUT_PATH is treated the
    # same as one that leaves it empty, instead of raising AttributeError.
    if not getattr(settings, 'ML_OUTPUT_PATH', None) or not user:
        return queryset.order_by('?')

    # NOTE(review): CollabFilter() loads the embeddings file on construction,
    # so this happens on every call; consider caching the model.
    cl_model = CollabFilter()
    dot_rec = cl_model.user_recommendations(user, queryset, cl_model.DOT, 100)
    cosine_rec = cl_model.user_recommendations(user, queryset, cl_model.COSINE, 100)
    hot_problems_rec = hot_problems(timedelta(days=7), 10)

    return self.merge_recommendation([dot_rec, cosine_rec, hot_problems_rec])
def get_context_data(self, **kwargs):
context = super(ProblemFeed, self).get_context_data(**kwargs)

View file

@ -35,4 +35,5 @@ netaddr
redis
lupa
websocket-client
python-memcached
python-memcached
numpy

View file

@ -77,7 +77,7 @@
})
$('.blog-description').each(function() {
if ($(this).prop('scrollHeight') > $(this).height() ) {
$(this).parent().css('background-image', '-webkit-linear-gradient(bottom, lightgray, lightgray 3%, transparent 8%, transparent 100%)');
$(this).parent().css('background-image', '-webkit-linear-gradient(bottom, gray, lightgray 3%, transparent 8%, transparent 100%)');
$(this).parent().css('padding-bottom', '0');
$(this).css('cursor', 'pointer');
}

View file

@ -17,7 +17,7 @@
<i class="fa fa-tag"></i>
{% for type in problem.types_list %}
<span class="type-tag">{{ type }}</span>{% if not loop.last %}, {% endif %}
{% endfor %}
{% endfor %}, {{problem.points | int}}
</div>
{% endif %}
<div class='blog-description content-description'>