Add ML to problem feed
This commit is contained in:
parent
34523ab53f
commit
2fe571379c
8 changed files with 157 additions and 6 deletions
|
@ -15,3 +15,4 @@ class Command(BaseCommand):
|
||||||
judge.name = options['name']
|
judge.name = options['name']
|
||||||
judge.auth_key = options['auth_key']
|
judge.auth_key = options['auth_key']
|
||||||
judge.save()
|
judge.save()
|
||||||
|
|
||||||
|
|
49
judge/management/commands/generate_data.py
Normal file
49
judge/management/commands/generate_data.py
Normal file
|
@ -0,0 +1,49 @@
|
||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
from judge.models import *
|
||||||
|
from collections import defaultdict
|
||||||
|
import csv
|
||||||
|
import os
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
|
||||||
|
def gen_submissions():
    """Write ``submissions.csv`` listing which problems each user submitted to.

    The file is created (or overwritten) inside ``settings.ML_DATA_PATH`` with
    the header ``uid,pid``. For every profile, submissions are scanned from the
    newest to the oldest and a ``(uid, pid)`` row is written the first time a
    problem is seen, so each user/problem pair appears at most once.
    """
    headers = ['uid', 'pid']
    path = os.path.join(settings.ML_DATA_PATH, 'submissions.csv')
    # The csv module expects files opened with newline=''; without it an
    # extra '\r' is emitted on platforms that translate '\n' to '\r\n'.
    with open(path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)

        for u in Profile.objects.all():
            used = set()
            print('Processing user', u.id)
            # Fetch only the foreign-key column so no Problem row has to be
            # loaded per submission (assumes Submission.problem is a regular
            # ForeignKey, so problem_id equals problem.id -- TODO confirm).
            problem_ids = (Submission.objects.filter(user=u)
                           .order_by('-date')
                           .values_list('problem_id', flat=True))
            for pid in problem_ids:
                if pid not in used:
                    used.add(pid)
                    writer.writerow([u.id, pid])
|
||||||
|
|
||||||
|
def gen_users():
    """Write ``profiles.csv`` with one row per profile.

    The file is created (or overwritten) inside ``settings.ML_DATA_PATH`` with
    the columns ``uid, username, rating, points``; ``points`` is taken from the
    profile's ``performance_points``.
    """
    headers = ['uid', 'username', 'rating', 'points']
    path = os.path.join(settings.ML_DATA_PATH, 'profiles.csv')
    # newline='' is required by the csv module to avoid doubled line endings;
    # an explicit UTF-8 encoding keeps non-ASCII usernames from failing (or
    # being written differently) depending on the host locale.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)

        for u in Profile.objects.all():
            writer.writerow([u.id, u.username, u.rating, u.performance_points])
|
||||||
|
|
||||||
|
def gen_problems():
    """Write ``problems.csv`` with one row per problem.

    The file is created (or overwritten) inside ``settings.ML_DATA_PATH`` with
    the columns ``pid, code, name, points, url``; ``url`` is the problem code
    appended to ``lqdoj.edu.vn/problem/``.
    """
    headers = ['pid', 'code', 'name', 'points', 'url']
    path = os.path.join(settings.ML_DATA_PATH, 'problems.csv')
    # newline='' is required by the csv module to avoid doubled line endings;
    # an explicit UTF-8 encoding keeps non-ASCII problem names from failing
    # (or being written differently) depending on the host locale.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)

        for p in Problem.objects.all():
            writer.writerow([p.id, p.code, p.name, p.points, 'lqdoj.edu.vn/problem/' + p.code])
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """Management command that dumps the CSV files used as ML input."""

    help = 'generate data for ML'

    def handle(self, *args, **options):
        """Run the exporters in order: users, then problems, then submissions."""
        for export in (gen_users, gen_problems, gen_submissions):
            export()
|
64
judge/ml/collab_filter.py
Normal file
64
judge/ml/collab_filter.py
Normal file
|
@ -0,0 +1,64 @@
|
||||||
|
import numpy as np
|
||||||
|
from django.conf import settings
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
class CollabFilter:
    """Scores problems for users (or for other problems) from stored embeddings.

    Rows of the embedding matrices are addressed directly by object ``id``:
    row ``i`` of ``user_embeddings`` is used for the user whose ``id`` is ``i``
    and row ``j`` of ``problem_embeddings`` for the problem whose ``id`` is ``j``.
    """

    DOT = 'dot'
    COSINE = 'cosine'

    def __init__(self):
        """Load both embedding matrices from ``collab_filter/embeddings.npz``.

        The archive is read from ``settings.ML_OUTPUT_PATH``. Its first stored
        array is taken as the user embeddings and its second as the problem
        embeddings.
        """
        path = os.path.join(settings.ML_OUTPUT_PATH, 'collab_filter/embeddings.npz')
        # NOTE(review): allow_pickle=True lets numpy unpickle object arrays,
        # which can execute arbitrary code; only safe while this file is
        # produced by a trusted process.
        # The context manager closes the underlying zip file once both arrays
        # have been read into memory.
        with np.load(path, allow_pickle=True) as embeddings:
            arr0, arr1 = embeddings.files
            self.user_embeddings = embeddings[arr0]
            self.problem_embeddings = embeddings[arr1]

    def compute_scores(self, query_embedding, item_embeddings, measure=DOT):
        """Computes the scores of the candidates given a query.

        Args:
            query_embedding: a vector of shape [k], representing the query embedding.
            item_embeddings: a matrix of shape [N, k], such that row i is the embedding
                of item i.
            measure: a string specifying the similarity measure to be used. Can be
                either DOT or COSINE.
        Returns:
            scores: a vector of shape [N], such that scores[i] is the score of item i.
        """
        u = query_embedding
        V = item_embeddings
        if measure == self.COSINE:
            item_norms = np.linalg.norm(V, axis=1, keepdims=True)
            query_norm = np.linalg.norm(u)
            # A zero vector has no direction: keep it as zeros (score 0)
            # instead of dividing by zero and spreading NaN into the ranking.
            V = V / np.where(item_norms == 0, 1.0, item_norms)
            if query_norm != 0:
                u = u / query_norm
        return u.dot(V.T)

    @staticmethod
    def _rank(scores, candidates, limit):
        """Pair each candidate that has a score with it and sort best first.

        Candidates whose ``id`` is not a valid index into ``scores`` are
        dropped. Sorting uses the score only, so tied scores never fall back
        to comparing the candidate objects themselves; ties keep the order in
        which the candidates were supplied.
        """
        ranked = [(scores[c.id], c) for c in candidates if c.id < len(scores)]
        ranked.sort(key=lambda pair: pair[0], reverse=True)
        return ranked[:limit]

    def user_recommendations(self, user, problems, measure=DOT, limit=None):
        """Rank ``problems`` for ``user``.

        Args:
            user: object whose ``id`` selects the row of ``user_embeddings``.
                An id beyond the last row falls back to row 0.
            problems: iterable of objects with an ``id`` attribute.
            measure: DOT or COSINE.
            limit: maximum number of results, or None for all of them.
        Returns:
            A list of ``(score, problem)`` tuples, highest score first.
        """
        uid = user.id
        if uid >= len(self.user_embeddings):
            uid = 0
        scores = self.compute_scores(
            self.user_embeddings[uid], self.problem_embeddings, measure)
        return self._rank(scores, problems, limit)

    def problems_neighbors(self, problem, problemset, measure=DOT, limit=None):
        """Rank ``problemset`` by similarity to ``problem``.

        Args:
            problem: object whose ``id`` selects the query row of
                ``problem_embeddings``.
            problemset: iterable of objects with an ``id`` attribute.
            measure: DOT or COSINE.
            limit: maximum number of results, or None for all of them.
        Returns:
            A list of ``(score, problem)`` tuples, highest score first, or
            None when ``problem`` has no embedding row.
        """
        pid = problem.id
        if pid >= len(self.problem_embeddings):
            return None
        scores = self.compute_scores(
            self.problem_embeddings[pid], self.problem_embeddings, measure)
        return self._rank(scores, problemset, limit)
|
|
@ -115,7 +115,7 @@ def hot_problems(duration, limit):
|
||||||
qs = cache.get(cache_key)
|
qs = cache.get(cache_key)
|
||||||
if qs is None:
|
if qs is None:
|
||||||
qs = Problem.get_public_problems() \
|
qs = Problem.get_public_problems() \
|
||||||
.filter(submission__date__gt=timezone.now() - duration, points__gt=3, points__lt=25)
|
.filter(submission__date__gt=timezone.now() - duration)
|
||||||
qs0 = qs.annotate(k=Count('submission__user', distinct=True)).order_by('-k').values_list('k', flat=True)
|
qs0 = qs.annotate(k=Count('submission__user', distinct=True)).order_by('-k').values_list('k', flat=True)
|
||||||
|
|
||||||
if not qs0:
|
if not qs0:
|
||||||
|
@ -141,7 +141,7 @@ def hot_problems(duration, limit):
|
||||||
qs = qs.filter(unique_user_count__gt=max(mx / 3.0, 1))
|
qs = qs.filter(unique_user_count__gt=max(mx / 3.0, 1))
|
||||||
|
|
||||||
qs = qs.annotate(ordering=ExpressionWrapper(
|
qs = qs.annotate(ordering=ExpressionWrapper(
|
||||||
0.5 * F('points') * (0.4 * F('ac_volume') / F('submission_volume') + 0.6 * F('ac_rate')) +
|
0.02 * F('points') * (0.4 * F('ac_volume') / F('submission_volume') + 0.6 * F('ac_rate')) +
|
||||||
100 * e ** (F('unique_user_count') / mx), output_field=FloatField(),
|
100 * e ** (F('unique_user_count') / mx), output_field=FloatField(),
|
||||||
)).order_by('-ordering').defer('description')[:limit]
|
)).order_by('-ordering').defer('description')[:limit]
|
||||||
|
|
||||||
|
|
|
@ -39,6 +39,7 @@ from judge.utils.strings import safe_float_or_none, safe_int_or_none
|
||||||
from judge.utils.tickets import own_ticket_filter
|
from judge.utils.tickets import own_ticket_filter
|
||||||
from judge.utils.views import QueryStringSortMixin, SingleObjectFormView, TitleMixin, generic_message
|
from judge.utils.views import QueryStringSortMixin, SingleObjectFormView, TitleMixin, generic_message
|
||||||
from judge.views.blog import FeedView
|
from judge.views.blog import FeedView
|
||||||
|
from judge.ml.collab_filter import CollabFilter
|
||||||
|
|
||||||
|
|
||||||
def get_contest_problem(problem, profile):
|
def get_contest_problem(problem, profile):
|
||||||
|
@ -611,10 +612,45 @@ class ProblemFeed(FeedView):
|
||||||
.values_list('problem__id', flat=True))
|
.values_list('problem__id', flat=True))
|
||||||
return queryset.distinct()
|
return queryset.distinct()
|
||||||
|
|
||||||
|
# arr = [[], [], ..]
|
||||||
|
def merge_recommendation(self, arr):
    """Interleave several recommendation lists into one de-duplicated list.

    Args:
        arr: a list of sequences. A sequence item is either a tuple whose
            second element is the recommended object (e.g. ``(score, obj)``)
            or the recommended object itself.
    Returns:
        A list of objects without duplicates. The sequences are visited
        round-robin: a sequence whose current item is a tuple contributes one
        item every round, while one whose current item is a plain object
        contributes only on every third round, so it shows up less often.
    """
    positions = [0] * len(arr)
    merged = []
    seen = set()
    round_no = 0
    pending = True
    while pending:
        round_no += 1
        pending = False
        for i, source in enumerate(arr):
            if positions[i] >= len(source):
                continue
            # This source still has items, so another round is needed even if
            # it is skipped below; otherwise its remaining plain items would
            # be dropped as soon as every tuple source ran out.
            pending = True
            entry = source[positions[i]]
            if isinstance(entry, tuple):
                entry = entry[1]
            elif round_no % 3 != 0:  # plain items are taken every third round only
                continue
            if entry not in seen:
                merged.append(entry)
                seen.add(entry)
            positions[i] += 1
    return merged
|
||||||
|
|
||||||
|
|
||||||
def get_queryset(self):
    """Return the problems shown in the feed for the current request.

    Without a configured ``ML_OUTPUT_PATH`` setting, or without a profile on
    the request, the unsolved problems are returned in random order.
    Otherwise the result is a plain list that merges the dot-product and
    cosine recommendations (up to 100 each, computed over the unsolved
    problems) with up to 10 hot problems of the last 7 days.
    """
    queryset = self.get_unsolved_queryset()
    user = self.request.profile
    # getattr with a default: a deployment that never defined ML_OUTPUT_PATH
    # falls back to the random feed instead of raising AttributeError.
    if not getattr(settings, 'ML_OUTPUT_PATH', None) or not user:
        return queryset.order_by('?')

    cl_model = CollabFilter()
    dot_rec = cl_model.user_recommendations(user, queryset, cl_model.DOT, 100)
    cosine_rec = cl_model.user_recommendations(user, queryset, cl_model.COSINE, 100)
    hot_problems_rec = hot_problems(timedelta(days=7), 10)

    return self.merge_recommendation([dot_rec, cosine_rec, hot_problems_rec])
|
||||||
|
|
||||||
def get_context_data(self, **kwargs):
|
def get_context_data(self, **kwargs):
|
||||||
context = super(ProblemFeed, self).get_context_data(**kwargs)
|
context = super(ProblemFeed, self).get_context_data(**kwargs)
|
||||||
context['first_page_href'] = self.request.path
|
context['first_page_href'] = self.request.path
|
||||||
|
|
|
@ -36,3 +36,4 @@ redis
|
||||||
lupa
|
lupa
|
||||||
websocket-client
|
websocket-client
|
||||||
python-memcached
|
python-memcached
|
||||||
|
numpy
|
|
@ -77,7 +77,7 @@
|
||||||
})
|
})
|
||||||
$('.blog-description').each(function() {
|
$('.blog-description').each(function() {
|
||||||
if ($(this).prop('scrollHeight') > $(this).height() ) {
|
if ($(this).prop('scrollHeight') > $(this).height() ) {
|
||||||
$(this).parent().css('background-image', '-webkit-linear-gradient(bottom, lightgray, lightgray 3%, transparent 8%, transparent 100%)');
|
$(this).parent().css('background-image', '-webkit-linear-gradient(bottom, gray, lightgray 3%, transparent 8%, transparent 100%)');
|
||||||
$(this).parent().css('padding-bottom', '0');
|
$(this).parent().css('padding-bottom', '0');
|
||||||
$(this).css('cursor', 'pointer');
|
$(this).css('cursor', 'pointer');
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,7 +17,7 @@
|
||||||
<i class="fa fa-tag"></i>
|
<i class="fa fa-tag"></i>
|
||||||
{% for type in problem.types_list %}
|
{% for type in problem.types_list %}
|
||||||
<span class="type-tag">{{ type }}</span>{% if not loop.last %}, {% endif %}
|
<span class="type-tag">{{ type }}</span>{% if not loop.last %}, {% endif %}
|
||||||
{% endfor %}
|
{% endfor %}, {{problem.points | int}}
|
||||||
</div>
|
</div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
<div class='blog-description content-description'>
|
<div class='blog-description content-description'>
|
||||||
|
|
Loading…
Reference in a new issue