Test Formatter (#100)

2024-01-09 02:27:20 +08:00 · 2024-01-09 02:27:20 +08:00 · 04c6af1dff
commit 04c6af1dff
parent 14ecef649e
19 changed files with 1624 additions and 0 deletions
--- a/judge/views/test_formatter/tf_pattern.py
+++ b/judge/views/test_formatter/tf_pattern.py
@ -0,0 +1,268 @@
+import os
+import random
+from judge.views.test_formatter import tf_utils as utils
+
+SAMPLE_SIZE = 16
+NUMBERED_MM = ["0", "1", "00", "01", "000", "001", "0000", "0001"]
+VALID_MM = ["*"] + NUMBERED_MM
+
+MSG_TOO_MANY_OCCURRENCES = (
+    "400: Invalid pattern: Pattern cannot have more than one '{}'"
+)
+MSG_MM_NOT_FOUND = "400: Invalid pattern: Wildcard not found. Wildcard list: {}"
+
+
+class Pattern:
+    def __init__(self, ll, mm, rr):
+        assert mm in VALID_MM, "Invalid wildcard"
+        self.ll = ll
+        self.mm = mm
+        self.rr = rr
+
+    def __repr__(self):
+        return "Pattern('{}', '{}', '{}')".format(self.ll, self.mm, self.rr)
+
+    def __eq__(self, other):
+        return self.__repr__() == other.__repr__()
+
+    def __hash__(self):
+        return self.__repr__().__hash__()
+
+    @classmethod
+    def from_string(cls, text):
+        for mm in ["*"] + sorted(NUMBERED_MM, key=len, reverse=True):
+            if mm in text:
+                if text.count(mm) > 1:
+                    raise Exception(MSG_TOO_MANY_OCCURRENCES.format(mm))
+                i = text.index(mm)
+                return cls(text[:i], mm, text[i + len(mm) :])
+        raise Exception(MSG_MM_NOT_FOUND.format(",".join(VALID_MM)))
+
+    def to_string(self):
+        return self.ll + self.mm + self.rr
+
+    def is_valid_test_id(self, test_id):
+        if self.mm == "*":
+            return True
+        if self.mm in NUMBERED_MM:
+            return test_id.isdigit() and len(test_id) >= len(self.mm)
+        raise NotImplementedError
+
+    def matched(self, name):
+        return (
+            name.startswith(self.ll)
+            and name.endswith(self.rr)
+            and len(name) >= len(self.ll) + len(self.rr)
+            and self.is_valid_test_id(self.get_test_id(name))
+        )
+
+    def get_test_id(self, name):
+        return name[len(self.ll) : len(name) - len(self.rr)]
+
+    def get_test_id_from_index(self, index):
+        assert self.mm in NUMBERED_MM, "Wildcard is not a number"
+        return str(int(self.mm) + index).zfill(len(self.mm))
+
+    def get_name(self, test_id, index=None, use_index=False):
+        if use_index and self.mm in NUMBERED_MM:
+            return self.ll + self.get_test_id_from_index(index) + self.rr
+        return self.ll + test_id + self.rr
+
+    def matches(self, names, returns):
+        if returns == "test_id":
+            result = [n for n in names]
+            result = [n for n in result if self.matched(n)]
+            result = [self.get_test_id(n) for n in result]
+            return result
+        else:
+            raise NotImplementedError
+
+
+class PatternPair:
+    def __init__(self, x: Pattern, y: Pattern):
+        assert x.mm == y.mm, "Input wildcard and output wildcard must be equal"
+        self.x = x
+        self.y = y
+
+    def __repr__(self):
+        return "PatternPair({}, {})".format(self.x, self.y)
+
+    def __eq__(self, other):
+        return self.__repr__() == other.__repr__()
+
+    def __hash__(self):
+        return self.__repr__().__hash__()
+
+    @classmethod
+    def from_string_pair(cls, inp_format, out_format):
+        return cls(Pattern.from_string(inp_format), Pattern.from_string(out_format))
+
+    def matches(self, names, returns):
+        x_test_ids = self.x.matches(names, returns="test_id")
+        y_test_ids = self.y.matches(names, returns="test_id")
+
+        test_ids = set(x_test_ids) & set(y_test_ids)
+        test_ids = list(sorted(test_ids, key=utils.natural_sorting_key))
+
+        if returns == "fast_count":
+            if self.x.mm == "*":
+                return len(test_ids)
+            elif self.x.mm in NUMBERED_MM:
+                count_valid = 0
+                for t in test_ids:
+                    if t == self.x.get_test_id_from_index(count_valid):
+                        count_valid += 1
+
+                return count_valid
+
+        extra_files = list(names)
+        valid_test_ids = []
+        for t in test_ids:
+            if self.x.mm in NUMBERED_MM:
+                if t != self.x.get_test_id_from_index(len(valid_test_ids)):
+                    continue
+
+            inp_name = self.x.get_name(t)
+            out_name = self.y.get_name(t)
+
+            if inp_name == out_name:
+                continue
+            if inp_name not in extra_files:
+                continue
+            if out_name not in extra_files:
+                continue
+
+            valid_test_ids.append(t)
+            extra_files.remove(inp_name)
+            extra_files.remove(out_name)
+
+        if returns == "count":
+            return len(valid_test_ids)
+        elif returns == "test_id":
+            return valid_test_ids
+        elif returns == "test_id_with_extra_files":
+            return valid_test_ids, extra_files
+        else:
+            raise NotImplementedError
+
+    def score(self, names):
+        def ls(s):
+            return len(s) - s.count("0")
+
+        def zs(s):
+            return -s.count("0")
+
+        def vs(s):
+            return sum(
+                s.lower().count(c) * w
+                for c, w in [("a", -1), ("e", -1), ("i", +1), ("o", -1), ("u", -1)]
+            )
+
+        count_score = self.matches(names, returns="fast_count")
+
+        len_score = ls(self.x.ll + self.x.rr + self.y.ll + self.y.rr)
+        zero_score = zs(self.x.ll + self.x.rr + self.y.ll + self.y.rr)
+
+        assert self.x.mm in ["*"] + NUMBERED_MM
+        specific_score = 0 if self.x.mm == "*" else len(self.x.mm)
+
+        vowel_score = vs(self.x.ll + self.x.rr) - vs(self.y.ll + self.y.rr)
+
+        return count_score, specific_score, len_score, zero_score, vowel_score
+
+    def is_string_safe(self):
+        try:
+            x = Pattern.from_string(self.x.to_string())
+            y = Pattern.from_string(self.y.to_string())
+            return self == PatternPair(x, y)
+        except:
+            return False
+
+
+def maximal(a, key):
+    max_score = max(map(key, a))
+    result = [x for x in a if key(x) == max_score]
+    if len(result) == 1:
+        return result[0]
+    else:
+        print(result)
+        raise Exception("More than one maximum values")
+
+
+def get_all_star_pattern_pairs(names):
+    sample = random.sample(names, min(len(names), SAMPLE_SIZE))
+
+    star_pattern_pairs = []
+
+    all_prefixes = [n[:i] for n in sample for i in range(len(n) + 1)]
+    all_prefixes = list(sorted(set(all_prefixes)))
+    all_suffixes = [n[i:] for n in sample for i in range(len(n) + 1)]
+    all_suffixes = list(sorted(set(all_suffixes)))
+
+    for prefix in all_prefixes:
+        matched_names = [n for n in names if n.startswith(prefix)]
+        if len(matched_names) == 2:
+            mn0, mn1 = matched_names
+            for i in range(len(prefix) + 1):
+                x = Pattern(prefix[:i], "*", mn0[len(prefix) :])
+                y = Pattern(prefix[:i], "*", mn1[len(prefix) :])
+                star_pattern_pairs.append(PatternPair(x, y))
+
+    for suffix in all_suffixes:
+        matched_names = [n for n in names if n.endswith(suffix)]
+        if len(matched_names) == 2:
+            mn0, mn1 = matched_names
+            for i in range(len(suffix) + 1):
+                x = Pattern(mn0[: len(mn0) - len(suffix)], "*", suffix[i:])
+                y = Pattern(mn1[: len(mn1) - len(suffix)], "*", suffix[i:])
+                star_pattern_pairs.append(PatternPair(x, y))
+
+    star_pattern_pairs = list(set(star_pattern_pairs))
+    return star_pattern_pairs
+
+
+def get_variant_pattern_pairs(pp):
+    return [
+        PatternPair(Pattern(pp.x.ll, mm, pp.x.rr), Pattern(pp.y.ll, mm, pp.y.rr))
+        for mm in VALID_MM
+    ] + [
+        PatternPair(Pattern(pp.y.ll, mm, pp.y.rr), Pattern(pp.x.ll, mm, pp.x.rr))
+        for mm in VALID_MM
+    ]
+
+
+def find_best_pattern_pair(names):
+    star_pattern_pairs = get_all_star_pattern_pairs(names)
+    star_pattern_pairs = [
+        pp for pp in star_pattern_pairs if pp.matches(names, returns="fast_count") >= 2
+    ]
+    # for pp in star_pattern_pairs:
+    #     print(pp, pp.is_string_safe(), pp.score(names))
+
+    if len(star_pattern_pairs) == 0:
+        return PatternPair(Pattern("", "*", ""), Pattern("", "*", ""))
+    best_star_pattern_pair = maximal(star_pattern_pairs, key=lambda pp: pp.score(names))
+
+    pattern_pairs = get_variant_pattern_pairs(best_star_pattern_pair)
+    # for pp in pattern_pairs:
+    #     print(pp, pp.is_string_safe(), pp.score(names))
+    pattern_pairs = [pp for pp in pattern_pairs if pp.is_string_safe()]
+    best_pattern_pair = maximal(pattern_pairs, key=lambda pp: pp.score(names))
+
+    return best_pattern_pair
+
+
+def list_dir_recursively(folder):
+    old_cwd = os.getcwd()
+    os.chdir(folder)
+    result = []
+    for root, _, filenames in os.walk("."):
+        for filename in filenames:
+            result.append(os.path.join(root, filename))
+    os.chdir(old_cwd)
+    return result
+
+
+def test_with_dir(folder):
+    names = list_dir_recursively(folder)
+    print(folder, find_best_pattern_pair(names))