Add ML to problem feed

2022-04-11 21:18:01 -05:00 · 2022-04-11 21:18:01 -05:00 · 2fe571379c
commit 2fe571379c
parent 34523ab53f
8 changed files with 157 additions and 6 deletions
--- a/judge/management/commands/addjudge.py
+++ b/judge/management/commands/addjudge.py
@ -15,3 +15,4 @@ class Command(BaseCommand):
        judge.name = options['name']
        judge.auth_key = options['auth_key']
        judge.save()
+
--- a/judge/management/commands/generate_data.py
+++ b/judge/management/commands/generate_data.py
@ -0,0 +1,49 @@
+from django.core.management.base import BaseCommand
+from judge.models import *
+from collections import defaultdict
+import csv
+import os
+from django.conf import settings
+
+
+def gen_submissions():
+    """Write ``submissions.csv`` with one ``(uid, pid)`` row per distinct user/problem pair.
+
+    For every profile, that user's submissions are walked newest-first and
+    each problem id is written the first time it is seen. The file is created
+    (or overwritten) inside ``settings.ML_DATA_PATH``.
+    """
+    headers = ['uid', 'pid']
+    path = os.path.join(settings.ML_DATA_PATH, 'submissions.csv')
+    # The csv module expects files opened with newline='' (otherwise row
+    # terminators are translated a second time on some platforms); an explicit
+    # encoding keeps the output independent of the process locale.
+    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(headers)
+
+        for u in Profile.objects.all():
+            print('Processing user', u.id)
+            seen = set()
+            # Read the foreign-key column directly instead of s.problem.id so
+            # no Problem row is loaded per submission.
+            # NOTE(review): assumes Submission.problem is a ForeignKey, so the
+            # column is exposed as problem_id - confirm against the model.
+            pids = (Submission.objects.filter(user=u)
+                    .order_by('-date')
+                    .values_list('problem_id', flat=True))
+            for pid in pids:
+                if pid not in seen:
+                    seen.add(pid)
+                    writer.writerow([u.id, pid])
+
+def gen_users():
+    """Write ``profiles.csv`` with one row per profile.
+
+    Columns are ``uid, username, rating, points``; the ``points`` column is
+    filled from ``performance_points``. The file is created (or overwritten)
+    inside ``settings.ML_DATA_PATH``.
+    """
+    headers = ['uid', 'username', 'rating', 'points']
+    path = os.path.join(settings.ML_DATA_PATH, 'profiles.csv')
+    # newline='' is what the csv module expects for its file objects; the
+    # explicit encoding keeps non-ASCII usernames writable under any locale.
+    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(headers)
+        writer.writerows(
+            [u.id, u.username, u.rating, u.performance_points]
+            for u in Profile.objects.all()
+        )
+
+def gen_problems():
+    """Write ``problems.csv`` with one row per problem.
+
+    Columns are ``pid, code, name, points, url``; the url is the fixed prefix
+    ``lqdoj.edu.vn/problem/`` followed by the problem code. The file is
+    created (or overwritten) inside ``settings.ML_DATA_PATH``.
+    """
+    headers = ['pid', 'code', 'name', 'points', 'url']
+    path = os.path.join(settings.ML_DATA_PATH, 'problems.csv')
+    # newline='' is what the csv module expects for its file objects; the
+    # explicit encoding keeps non-ASCII problem names writable under any locale.
+    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(headers)
+        writer.writerows(
+            [p.id, p.code, p.name, p.points, 'lqdoj.edu.vn/problem/' + p.code]
+            for p in Problem.objects.all()
+        )
+
+
+class Command(BaseCommand):
+    """Management command that generates the CSV data files for ML."""
+
+    help = 'generate data for ML'
+
+    def handle(self, *args, **options):
+        """Run each exporter in order: users, problems, then submissions."""
+        for export in (gen_users, gen_problems, gen_submissions):
+            export()
--- a/judge/ml/collab_filter.py
+++ b/judge/ml/collab_filter.py
@ -0,0 +1,64 @@
+import numpy as np
+from django.conf import settings
+import os
+
+
+class CollabFilter:
+    """Score problems for a user (or for another problem) from stored embeddings.
+
+    The embeddings are read from ``collab_filter/embeddings.npz`` inside
+    ``settings.ML_OUTPUT_PATH``. The first array in the archive is used as the
+    user embeddings and the second as the problem embeddings; rows are indexed
+    directly by ``user.id`` and ``problem.id``.
+    """
+
+    DOT = 'dot'
+    COSINE = 'cosine'
+
+    def __init__(self):
+        path = os.path.join(settings.ML_OUTPUT_PATH, 'collab_filter/embeddings.npz')
+        # NOTE(review): allow_pickle=True lets numpy unpickle object arrays,
+        # which can execute arbitrary code if the file is not trusted. It is
+        # kept for compatibility with the existing archive; switch to False if
+        # the arrays are plain numeric data.
+        # The context manager closes the archive's file handle once both
+        # arrays have been read into memory.
+        with np.load(path, allow_pickle=True) as embeddings:
+            arr0, arr1 = embeddings.files
+            self.user_embeddings = embeddings[arr0]
+            self.problem_embeddings = embeddings[arr1]
+
+    def compute_scores(self, query_embedding, item_embeddings, measure=DOT):
+        """Computes the scores of the candidates given a query.
+        Args:
+        query_embedding: a vector of shape [k], representing the query embedding.
+        item_embeddings: a matrix of shape [N, k], such that row i is the embedding
+            of item i.
+        measure: a string specifying the similarity measure to be used. Can be
+            either DOT or COSINE.
+        Returns:
+        scores: a vector of shape [N], such that scores[i] is the score of item i.
+        """
+        u = query_embedding
+        V = item_embeddings
+        if measure == self.COSINE:
+            item_norms = np.linalg.norm(V, axis=1, keepdims=True)
+            query_norm = np.linalg.norm(u)
+            # An all-zero embedding has norm 0 and dividing by it would turn
+            # the scores into NaN; dividing by 1 instead leaves the vector at
+            # zero so it simply scores 0.
+            V = V / np.where(item_norms == 0, 1, item_norms)
+            u = u / (query_norm if query_norm != 0 else 1)
+        return u.dot(V.T)
+
+    @staticmethod
+    def _rank(scores, candidates, limit):
+        """Pair every candidate whose id has a score with that score, best first."""
+        ranked = [(scores[c.id], c) for c in candidates if c.id < len(scores)]
+        # Sort on the score alone: comparing whole tuples falls through to
+        # comparing the candidate objects whenever two scores tie, which
+        # raises TypeError for objects that define no ordering.
+        ranked.sort(key=lambda pair: pair[0], reverse=True)
+        return ranked[:limit]
+
+    def user_recommendations(self, user, problems, measure=DOT, limit=None):
+        """Return up to ``limit`` ``(score, problem)`` pairs for ``user``, best first.
+
+        A user whose id is beyond the end of the user embeddings is scored
+        with row 0 instead. Problems whose id is beyond the end of the score
+        vector are left out.
+        """
+        uid = user.id
+        if uid >= len(self.user_embeddings):
+            uid = 0
+        scores = self.compute_scores(
+            self.user_embeddings[uid], self.problem_embeddings, measure)
+        return self._rank(scores, problems, limit)
+
+    def problems_neighbors(self, problem, problemset, measure=DOT, limit=None):
+        """Return up to ``limit`` ``(score, problem)`` pairs from ``problemset``, best first.
+
+        Scores measure similarity to ``problem``. Returns ``None`` when
+        ``problem.id`` is beyond the end of the problem embeddings.
+        """
+        pid = problem.id
+        if pid >= len(self.problem_embeddings):
+            return None
+        scores = self.compute_scores(
+            self.problem_embeddings[pid], self.problem_embeddings, measure)
+        return self._rank(scores, problemset, limit)
--- a/judge/utils/problems.py
+++ b/judge/utils/problems.py
@ -115,7 +115,7 @@ def hot_problems(duration, limit):
    qs = cache.get(cache_key)
    if qs is None:
        qs = Problem.get_public_problems() \
-                    .filter(submission__date__gt=timezone.now() - duration, points__gt=3, points__lt=25)
+                    .filter(submission__date__gt=timezone.now() - duration)
        qs0 = qs.annotate(k=Count('submission__user', distinct=True)).order_by('-k').values_list('k', flat=True)

        if not qs0:
@ -141,7 +141,7 @@ def hot_problems(duration, limit):
        qs = qs.filter(unique_user_count__gt=max(mx / 3.0, 1))

        qs = qs.annotate(ordering=ExpressionWrapper(
-            0.5 * F('points') * (0.4 * F('ac_volume') / F('submission_volume') + 0.6 * F('ac_rate')) +
+            0.02 * F('points') * (0.4 * F('ac_volume') / F('submission_volume') + 0.6 * F('ac_rate')) +
            100 * e ** (F('unique_user_count') / mx), output_field=FloatField(),
        )).order_by('-ordering').defer('description')[:limit]

--- a/judge/views/problem.py
+++ b/judge/views/problem.py
@ -39,6 +39,7 @@ from judge.utils.strings import safe_float_or_none, safe_int_or_none
 from judge.utils.tickets import own_ticket_filter
 from judge.utils.views import QueryStringSortMixin, SingleObjectFormView, TitleMixin, generic_message
 from judge.views.blog import FeedView
+from judge.ml.collab_filter import CollabFilter


 def get_contest_problem(problem, profile):
@ -611,9 +612,44 @@ class ProblemFeed(FeedView):
                                        .values_list('problem__id', flat=True))
        return queryset.distinct()

+    # arr = [[], [], ..]
+    def merge_recommendation(self, arr):
+        """Interleave several recommendation lists into one de-duplicated list.
+
+        ``arr`` is a list of lists. Lists are consumed round-robin, one item
+        per list per round. An item that is a tuple is treated as a
+        ``(score, obj)`` pair and only ``obj`` is kept; any other item (the
+        hot-problem entries) is taken as-is but only on every third round, so
+        those appear less often. An object that was already emitted is
+        consumed without being added again.
+
+        Returns the merged list of objects in emission order.
+        """
+        idx = [0] * len(arr)
+        res = []
+        seen = set()
+        cnt = 0
+        pending = True
+        while pending:
+            cnt += 1
+            pending = False
+            for i, source in enumerate(arr):
+                if idx[i] >= len(source):
+                    continue
+                # This list still has items, so another round is required even
+                # if the item is skipped this round. Without this, throttled
+                # items were silently dropped once every other list ran out.
+                pending = True
+                obj = source[idx[i]]
+                if isinstance(obj, tuple):
+                    obj = obj[1]
+                elif cnt % 3 != 0:  # hot problems appear less
+                    continue
+                if obj not in seen:
+                    res.append(obj)
+                    seen.add(obj)
+                idx[i] += 1
+        return res
+
+
    def get_queryset(self):
+        """Return the unsolved problems ordered for the feed.
+
+        When ``ML_OUTPUT_PATH`` is unset/empty or the request has no profile,
+        this is the unsolved queryset in random order. Otherwise it is a list
+        built by merging the dot-product recommendations, the cosine
+        recommendations (100 each at most) and 10 hot problems from the last
+        7 days.
+        """
        queryset = self.get_unsolved_queryset()
-        return queryset.order_by('?')    
+        user = self.request.profile
+        # getattr: the setting may not be defined at all, and a plain
+        # attribute access would then raise AttributeError on every request.
+        if not getattr(settings, 'ML_OUTPUT_PATH', None) or not user:
+            return queryset.order_by('?')
+
+        cl_model = CollabFilter()
+        dot_rec = cl_model.user_recommendations(user, queryset, cl_model.DOT, 100)
+        cosine_rec = cl_model.user_recommendations(user, queryset, cl_model.COSINE, 100)
+        hot_problems_rec = hot_problems(timedelta(days=7), 10)
+
+        return self.merge_recommendation([dot_rec, cosine_rec, hot_problems_rec])

    def get_context_data(self, **kwargs):
        context = super(ProblemFeed, self).get_context_data(**kwargs)
--- a/requirements.txt
+++ b/requirements.txt
@ -35,4 +35,5 @@ netaddr
 redis
 lupa
 websocket-client
-python-memcached
+python-memcached
+numpy
--- a/templates/blog/list.html
+++ b/templates/blog/list.html
@ -77,7 +77,7 @@
            })
            $('.blog-description').each(function() {
                if ($(this).prop('scrollHeight') > $(this).height() ) {
-                    $(this).parent().css('background-image', '-webkit-linear-gradient(bottom, lightgray, lightgray 3%, transparent 8%, transparent 100%)');
+                    $(this).parent().css('background-image', '-webkit-linear-gradient(bottom, gray, lightgray 3%, transparent 8%, transparent 100%)');
                    $(this).parent().css('padding-bottom', '0');
                    $(this).css('cursor', 'pointer');
                }
--- a/templates/problem/feed.html
+++ b/templates/problem/feed.html
@ -17,7 +17,7 @@
            <i class="fa fa-tag"></i>
            {% for type in problem.types_list %}
                <span class="type-tag">{{ type }}</span>{% if not loop.last %}, {% endif %}
-            {% endfor %}
+            {% endfor %}, {{problem.points | int}}
        </div>
    {% endif %}
    <div class='blog-description content-description'>