# File: judge/management/commands/generate_data.py
"""Management command that dumps users, problems and submissions to CSV.

The three files (``profiles.csv``, ``problems.csv`` and ``submissions.csv``)
are written into ``settings.ML_DATA_PATH``.
"""
import csv
import os

from django.conf import settings
from django.core.management.base import BaseCommand

from judge.models import Problem, Profile, Submission


def _write_csv(filename, headers, rows):
    """Write ``headers`` followed by every row of ``rows`` to a CSV file.

    Args:
        filename: name of the file to create inside ``settings.ML_DATA_PATH``.
        headers: list of column names written as the first row.
        rows: iterable of row sequences; it is consumed lazily, so a
            generator can be passed to avoid building every row in memory.
    """
    path = os.path.join(settings.ML_DATA_PATH, filename)
    # newline='' is what the csv module expects (otherwise extra blank lines
    # appear on platforms that translate '\n'); utf-8 keeps non-ASCII user and
    # problem names from depending on the process locale.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)
        writer.writerows(rows)


def _latest_submission_rows():
    """Yield ``[uid, pid]`` once per (user, problem) pair.

    For each user the problems come out in order of that user's most recent
    submission first.
    """
    for profile in Profile.objects.all():
        print('Processing user', profile.id)
        seen = set()
        # Reading the foreign-key column directly avoids loading the related
        # Problem row for every single submission.
        problem_ids = (Submission.objects.filter(user=profile)
                       .order_by('-date')
                       .values_list('problem_id', flat=True))
        for pid in problem_ids:
            if pid not in seen:
                seen.add(pid)
                yield [profile.id, pid]


def gen_submissions():
    """Write ``submissions.csv`` with columns ``uid, pid``."""
    _write_csv('submissions.csv', ['uid', 'pid'], _latest_submission_rows())


def gen_users():
    """Write ``profiles.csv`` with columns ``uid, username, rating, points``."""
    rows = ([u.id, u.username, u.rating, u.performance_points]
            for u in Profile.objects.all())
    _write_csv('profiles.csv', ['uid', 'username', 'rating', 'points'], rows)


def gen_problems():
    """Write ``problems.csv`` with columns ``pid, code, name, points, url``."""
    rows = ([p.id, p.code, p.name, p.points, 'lqdoj.edu.vn/problem/' + p.code]
            for p in Problem.objects.all())
    _write_csv('problems.csv', ['pid', 'code', 'name', 'points', 'url'], rows)


class Command(BaseCommand):
    """``manage.py generate_data``: export users, problems, then submissions."""

    help = 'generate data for ML'

    def handle(self, *args, **options):
        """Run the three exports in order: users, problems, submissions."""
        gen_users()
        gen_problems()
        gen_submissions()
# File: judge/ml/collab_filter.py
"""Collaborative-filtering recommendations from precomputed embeddings."""
import os
from operator import itemgetter

import numpy as np


class CollabFilter:
    """Score problems for a user (or for another problem) using embeddings.

    The embedding matrices are indexed directly by ``user.id`` and
    ``problem.id``: row ``i`` is used as the embedding of the object whose
    id is ``i``.
    """

    DOT = 'dot'
    COSINE = 'cosine'

    def __init__(self, embeddings_path=None):
        """Load the user and problem embeddings from an ``.npz`` archive.

        Args:
            embeddings_path: path of the archive. When omitted,
                ``<settings.ML_OUTPUT_PATH>/collab_filter/embeddings.npz``
                is used. The archive must contain exactly two arrays: user
                embeddings first, problem embeddings second.

        Raises:
            OSError: if the archive cannot be opened.
            ValueError: if the archive does not hold exactly two arrays.
        """
        if embeddings_path is None:
            # Imported lazily so the class can be used (and tested) with an
            # explicit path outside a configured Django project.
            from django.conf import settings
            embeddings_path = os.path.join(
                settings.ML_OUTPUT_PATH, 'collab_filter/embeddings.npz')

        # NOTE(security): allow_pickle=True lets np.load unpickle object
        # arrays, which can execute arbitrary code if the file is tampered
        # with. It is kept for compatibility with existing archives; switch
        # to allow_pickle=False once the archive is known to hold plain
        # numeric arrays.
        # The context manager closes the underlying zip file; the arrays are
        # read into memory while it is still open.
        with np.load(embeddings_path, allow_pickle=True) as embeddings:
            arr0, arr1 = embeddings.files
            self.user_embeddings = embeddings[arr0]
            self.problem_embeddings = embeddings[arr1]

    def compute_scores(self, query_embedding, item_embeddings, measure=DOT):
        """Compute the scores of the candidates given a query.

        Args:
            query_embedding: a vector of shape [k], the query embedding.
            item_embeddings: a matrix of shape [N, k], such that row i is the
                embedding of item i.
            measure: the similarity measure to use, either DOT or COSINE.

        Returns:
            A vector of shape [N] such that element i is the score of item i.
            With COSINE, a zero-length query or item embedding scores 0
            instead of producing NaN.
        """
        u = query_embedding
        V = item_embeddings
        if measure == self.COSINE:
            item_norms = np.linalg.norm(V, axis=1, keepdims=True)
            query_norm = np.linalg.norm(u)
            # A zero vector has no direction; dividing it by 1 keeps it zero,
            # so its score is 0 rather than NaN (which would break sorting).
            V = V / np.where(item_norms == 0, 1.0, item_norms)
            u = u / (query_norm if query_norm else 1.0)
        return u.dot(V.T)

    @staticmethod
    def _rank(scores, candidates, limit):
        """Return up to ``limit`` ``(score, candidate)`` pairs, best first.

        Candidates whose ``id`` has no entry in ``scores`` are skipped.
        """
        count = len(scores)
        ranked = [(scores[c.id], c) for c in candidates if 0 <= c.id < count]
        # Sort on the score only: comparing whole tuples would fall back to
        # comparing the candidate objects whenever two scores are equal, and
        # that raises TypeError for objects without an ordering.
        ranked.sort(key=itemgetter(0), reverse=True)
        return ranked[:limit]

    def user_recommendations(self, user, problems, measure=DOT, limit=None):
        """Rank ``problems`` for ``user``.

        Args:
            user: object with an ``id`` used as row index into the user
                embeddings. An id beyond the matrix falls back to row 0.
            problems: iterable of objects with an ``id``.
            measure: DOT or COSINE.
            limit: maximum number of results; ``None`` means no limit.

        Returns:
            A list of ``(score, problem)`` tuples sorted by descending score.
        """
        uid = user.id
        if uid >= len(self.user_embeddings):
            uid = 0
        scores = self.compute_scores(
            self.user_embeddings[uid], self.problem_embeddings, measure)
        return self._rank(scores, problems, limit)

    def problems_neighbors(self, problem, problemset, measure=DOT, limit=None):
        """Rank the problems of ``problemset`` by similarity to ``problem``.

        Returns:
            A list of ``(score, problem)`` tuples sorted by descending score,
            or ``None`` when ``problem`` has no embedding.
        """
        pid = problem.id
        if pid >= len(self.problem_embeddings):
            return None
        scores = self.compute_scores(
            self.problem_embeddings[pid], self.problem_embeddings, measure)
        return self._rank(scores, problemset, limit)


# ---------------------------------------------------------------------------
# NOTE(review): the same patch also edits hot_problems() in
# judge/utils/problems.py; only fragments of that function are visible here:
#   * the queryset filter keeps `submission__date__gt=timezone.now() - duration`
#     but drops `points__gt=3, points__lt=25`;
#   * in the `ordering` expression the factor applied to F('points') changes
#     from 0.5 to 0.02.
# File: judge/views/problem.py -- methods of ProblemFeed (the class header and
# the neighbouring methods are not visible in this view).


def merge_recommendation(self, arr):
    """Interleave several recommendation lists into one de-duplicated list.

    Args:
        arr: list of sources, ``[[...], [...], ...]``. A source contains
            either ``(score, problem)`` tuples or bare problems. Tuple
            sources give one item per round; bare-problem sources (the hot
            problems) give one item only every third round, so they appear
            less often.

    Returns:
        A list of problems in merge order, each problem at most once.
    """
    # Materialise every source once so len() and indexing work on plain
    # lists, whatever iterable was passed in.
    sources = [list(source) for source in arr]
    positions = [0] * len(sources)
    merged = []
    seen = set()
    round_no = 0
    # Keep going while any source still has items. Stopping as soon as a
    # round consumed nothing would drop pending bare problems whenever the
    # tuple sources run out on a round that is not a multiple of 3.
    while any(pos < len(src) for pos, src in zip(positions, sources)):
        round_no += 1
        for i, src in enumerate(sources):
            if positions[i] >= len(src):
                continue
            item = src[positions[i]]
            if isinstance(item, tuple):
                item = item[1]
            elif round_no % 3 != 0:  # hot problems appear less
                continue
            positions[i] += 1
            if item not in seen:
                seen.add(item)
                merged.append(item)
    return merged


def get_queryset(self):
    """Return the problems shown in the feed for the current user.

    Without ``settings.ML_OUTPUT_PATH``, without a profile on the request,
    or when the embeddings cannot be loaded, this is the unsolved queryset
    in random order. Otherwise it is a list that merges dot-product
    recommendations, cosine recommendations and hot problems.
    """
    import logging

    from judge.ml.collab_filter import CollabFilter

    queryset = self.get_unsolved_queryset()
    user = self.request.profile
    # getattr: the setting is optional, a missing one must not raise.
    if not getattr(settings, 'ML_OUTPUT_PATH', None) or not user:
        return queryset.order_by('?')

    try:
        cl_model = CollabFilter()
    except (OSError, ValueError):
        # Missing or malformed embeddings file: serve the random feed
        # instead of failing the request, but leave a trace in the logs.
        logging.getLogger(__name__).exception(
            'Could not load collaborative filter embeddings')
        return queryset.order_by('?')

    dot_rec = cl_model.user_recommendations(user, queryset, cl_model.DOT, 100)
    cosine_rec = cl_model.user_recommendations(user, queryset, cl_model.COSINE, 100)
    hot_problems_rec = hot_problems(timedelta(days=7), 10)

    return self.merge_recommendation([dot_rec, cosine_rec, hot_problems_rec])
websocket-client -python-memcached \ No newline at end of file +python-memcached +numpy \ No newline at end of file diff --git a/templates/blog/list.html b/templates/blog/list.html index 1419703..d8eee93 100644 --- a/templates/blog/list.html +++ b/templates/blog/list.html @@ -77,7 +77,7 @@ }) $('.blog-description').each(function() { if ($(this).prop('scrollHeight') > $(this).height() ) { - $(this).parent().css('background-image', '-webkit-linear-gradient(bottom, lightgray, lightgray 3%, transparent 8%, transparent 100%)'); + $(this).parent().css('background-image', '-webkit-linear-gradient(bottom, gray, lightgray 3%, transparent 8%, transparent 100%)'); $(this).parent().css('padding-bottom', '0'); $(this).css('cursor', 'pointer'); } diff --git a/templates/problem/feed.html b/templates/problem/feed.html index 8de3bd0..a013836 100644 --- a/templates/problem/feed.html +++ b/templates/problem/feed.html @@ -17,7 +17,7 @@ {% for type in problem.types_list %} {{ type }}{% if not loop.last %}, {% endif %} - {% endfor %} + {% endfor %}, {{problem.points | int}} {% endif %}