NDOJ/judge/views/test_formatter/tf_pattern.py

269 lines
8.7 KiB
Python
Raw Normal View History

2024-01-08 18:27:20 +00:00
import os
import random
from judge.views.test_formatter import tf_utils as utils
# Upper bound on how many file names are sampled when enumerating candidate
# prefixes/suffixes in get_all_star_pattern_pairs().
SAMPLE_SIZE = 16

# Numeric wildcards: for each width 1..4, the zero-padded "0" and "1" forms,
# i.e. "0", "1", "00", "01", "000", "001", "0000", "0001".
NUMBERED_MM = [
    "0" * (width - 1) + last_digit
    for width in range(1, 5)
    for last_digit in "01"
]

# Every wildcard a Pattern accepts: the free-form "*" plus the numeric ones.
VALID_MM = ["*", *NUMBERED_MM]

# Error templates; "{}" is filled in by Pattern.from_string().
MSG_TOO_MANY_OCCURRENCES = "400: Invalid pattern: Pattern cannot have more than one '{}'"
MSG_MM_NOT_FOUND = "400: Invalid pattern: Wildcard not found. Wildcard list: {}"
class Pattern:
    """A file-name template: literal prefix + one wildcard + literal suffix.

    Attributes:
        ll: Literal text before the wildcard.
        mm: The wildcard, one of ``VALID_MM`` ("*" or a numeric form such as
            "0", "01", "000").
        rr: Literal text after the wildcard.

    The part of a file name that sits where the wildcard is gets called the
    "test id" throughout this class.
    """

    def __init__(self, ll, mm, rr):
        assert mm in VALID_MM, "Invalid wildcard"
        self.ll = ll
        self.mm = mm
        self.rr = rr

    def _key(self):
        """Return the tuple of fields that defines identity."""
        return (self.ll, self.mm, self.rr)

    def __repr__(self):
        return "Pattern('{}', '{}', '{}')".format(self.ll, self.mm, self.rr)

    def __eq__(self, other):
        # Compare fields, not repr() strings: the repr does not escape quotes,
        # so different patterns whose parts contain "', '" can share one repr.
        if not isinstance(other, Pattern):
            return NotImplemented
        return self._key() == other._key()

    def __hash__(self):
        return hash(self._key())

    @classmethod
    def from_string(cls, text):
        """Parse ``text`` into a Pattern by locating its wildcard.

        "*" is tried first, then the numeric wildcards from longest to
        shortest. The first wildcard found in ``text`` is used.

        Raises:
            Exception: if the chosen wildcard occurs more than once
                (``MSG_TOO_MANY_OCCURRENCES``) or no wildcard is present
                (``MSG_MM_NOT_FOUND``).
        """
        for mm in ["*"] + sorted(NUMBERED_MM, key=len, reverse=True):
            if mm not in text:
                continue
            if text.count(mm) > 1:
                raise Exception(MSG_TOO_MANY_OCCURRENCES.format(mm))
            ll, _, rr = text.partition(mm)
            return cls(ll, mm, rr)
        raise Exception(MSG_MM_NOT_FOUND.format(",".join(VALID_MM)))

    def to_string(self):
        """Return the pattern as text: prefix, wildcard, suffix concatenated."""
        return self.ll + self.mm + self.rr

    def is_valid_test_id(self, test_id):
        """Tell whether ``test_id`` may stand in for this pattern's wildcard.

        "*" accepts anything. A numeric wildcard accepts digit-only strings
        that are at least as long as the wildcard itself.
        """
        if self.mm == "*":
            return True
        if self.mm in NUMBERED_MM:
            return test_id.isdigit() and len(test_id) >= len(self.mm)
        raise NotImplementedError

    def matched(self, name):
        """Tell whether ``name`` fits this pattern."""
        return (
            name.startswith(self.ll)
            and name.endswith(self.rr)
            # Prefix and suffix must not overlap inside the name.
            and len(name) >= len(self.ll) + len(self.rr)
            and self.is_valid_test_id(self.get_test_id(name))
        )

    def get_test_id(self, name):
        """Return what remains of ``name`` after cutting off prefix and suffix."""
        return name[len(self.ll) : len(name) - len(self.rr)]

    def get_test_id_from_index(self, index):
        """Return the test id for the ``index``-th test of a numeric wildcard.

        The wildcard's numeric value is the id of index 0; the result is
        zero-padded to the wildcard's width.
        """
        assert self.mm in NUMBERED_MM, "Wildcard is not a number"
        return str(int(self.mm) + index).zfill(len(self.mm))

    def get_name(self, test_id, index=None, use_index=False):
        """Build a file name from this pattern.

        With ``use_index`` set and a numeric wildcard, the id is derived from
        ``index``; otherwise ``test_id`` is inserted as given.
        """
        if use_index and self.mm in NUMBERED_MM:
            return self.ll + self.get_test_id_from_index(index) + self.rr
        return self.ll + test_id + self.rr

    def matches(self, names, returns):
        """Return the test ids of all ``names`` that fit this pattern.

        Only ``returns="test_id"`` is supported; any other value raises
        NotImplementedError.
        """
        if returns != "test_id":
            raise NotImplementedError
        return [self.get_test_id(n) for n in names if self.matched(n)]
class PatternPair:
    """An input-file pattern ``x`` paired with an output-file pattern ``y``.

    Both patterns must use the same wildcard, so that one test id yields one
    input name and one output name.
    """

    def __init__(self, x: "Pattern", y: "Pattern"):
        assert x.mm == y.mm, "Input wildcard and output wildcard must be equal"
        self.x = x
        self.y = y

    def _key(self):
        """Return the tuple of pattern fields that defines identity."""
        return (
            self.x.ll, self.x.mm, self.x.rr,
            self.y.ll, self.y.mm, self.y.rr,
        )

    def __repr__(self):
        return "PatternPair({}, {})".format(self.x, self.y)

    def __eq__(self, other):
        # Compare fields, not repr() strings: the repr does not escape quotes,
        # so different pairs whose parts contain "', '" can share one repr.
        if not isinstance(other, PatternPair):
            return NotImplemented
        return self._key() == other._key()

    def __hash__(self):
        return hash(self._key())

    @classmethod
    def from_string_pair(cls, inp_format, out_format):
        """Build a pair by parsing both formats with ``Pattern.from_string``."""
        return cls(Pattern.from_string(inp_format), Pattern.from_string(out_format))

    def matches(self, names, returns):
        """Find the tests in ``names`` that have both an input and an output file.

        Args:
            names: File names to inspect.
            returns: Selects the result:
                "fast_count": number of candidate test ids, without checking
                    that the files are distinct and actually present;
                "count": number of fully validated tests;
                "test_id": list of validated test ids;
                "test_id_with_extra_files": ``(test ids, names left unused)``.

        For numeric wildcards only a gap-free run of ids starting at the
        wildcard's own value is accepted.

        Raises:
            NotImplementedError: for any other ``returns`` value.
        """
        x_test_ids = self.x.matches(names, returns="test_id")
        y_test_ids = self.y.matches(names, returns="test_id")
        # Ids seen on both sides, in natural order.
        test_ids = sorted(
            set(x_test_ids) & set(y_test_ids), key=utils.natural_sorting_key
        )

        if returns == "fast_count":
            if self.x.mm == "*":
                return len(test_ids)
            elif self.x.mm in NUMBERED_MM:
                # Count only ids that continue the expected sequence.
                count_valid = 0
                for t in test_ids:
                    if t == self.x.get_test_id_from_index(count_valid):
                        count_valid += 1
                return count_valid

        extra_files = list(names)
        valid_test_ids = []
        for t in test_ids:
            if self.x.mm in NUMBERED_MM:
                # Skip ids that are not the next one in the sequence.
                if t != self.x.get_test_id_from_index(len(valid_test_ids)):
                    continue

            inp_name = self.x.get_name(t)
            out_name = self.y.get_name(t)

            # One file cannot serve as both input and output.
            if inp_name == out_name:
                continue
            # Each file may be claimed by at most one test.
            if inp_name not in extra_files:
                continue
            if out_name not in extra_files:
                continue

            valid_test_ids.append(t)
            extra_files.remove(inp_name)
            extra_files.remove(out_name)

        if returns == "count":
            return len(valid_test_ids)
        elif returns == "test_id":
            return valid_test_ids
        elif returns == "test_id_with_extra_files":
            return valid_test_ids, extra_files
        else:
            raise NotImplementedError

    def score(self, names):
        """Rank this pair against ``names``; larger tuples compare as better.

        Returns:
            ``(count_score, specific_score, len_score, zero_score, vowel_score)``
            where
            count_score is the "fast_count" of matched tests,
            specific_score is 0 for "*" and the wildcard width otherwise,
            len_score is the number of non-"0" characters in the literal parts,
            zero_score is minus the number of "0" characters in them,
            vowel_score is the weighted vowel count of the input pattern's
            literals minus that of the output pattern's ("i" weighs +1;
            "a", "e", "o", "u" weigh -1).
        """

        def ls(s):
            return len(s) - s.count("0")

        def zs(s):
            return -s.count("0")

        def vs(s):
            return sum(
                s.lower().count(c) * w
                for c, w in [("a", -1), ("e", -1), ("i", +1), ("o", -1), ("u", -1)]
            )

        count_score = self.matches(names, returns="fast_count")

        literals = self.x.ll + self.x.rr + self.y.ll + self.y.rr
        len_score = ls(literals)
        zero_score = zs(literals)

        assert self.x.mm in ["*"] + NUMBERED_MM
        specific_score = 0 if self.x.mm == "*" else len(self.x.mm)

        vowel_score = vs(self.x.ll + self.x.rr) - vs(self.y.ll + self.y.rr)

        return count_score, specific_score, len_score, zero_score, vowel_score

    def is_string_safe(self):
        """Tell whether the pair survives a to_string / from_string round trip.

        Returns False when re-parsing either pattern fails or yields a
        different pair.
        """
        try:
            x = Pattern.from_string(self.x.to_string())
            y = Pattern.from_string(self.y.to_string())
            return self == PatternPair(x, y)
        except Exception:
            # Parsing and pairing failures mean "not safe"; unlike a bare
            # except, this lets KeyboardInterrupt/SystemExit propagate.
            return False
def maximal(a, key):
    """Return the single element of ``a`` whose ``key`` value is the largest.

    ``key`` is called exactly once per element, so ``a`` may be a one-shot
    iterable and ``key`` may be expensive.

    Raises:
        ValueError: if ``a`` is empty (raised by ``max``).
        Exception: if several elements share the largest key; the tied
            elements are printed first.
    """
    scored = [(key(x), x) for x in a]
    max_score = max(score for score, _ in scored)
    result = [x for score, x in scored if score == max_score]
    if len(result) == 1:
        return result[0]
    print(result)
    raise Exception("More than one maximum values")
def get_all_star_pattern_pairs(names):
    """Enumerate candidate "*" pattern pairs suggested by ``names``.

    Up to ``SAMPLE_SIZE`` names are drawn at random; every prefix and every
    suffix of a sampled name is tried. Whenever exactly two names share a
    prefix (or suffix), they are treated as a potential input/output couple
    and one pair is emitted for each way of splitting the shared part between
    literal text and wildcard.

    Returns:
        A list of distinct PatternPair objects, in no particular order.
    """
    sampled = random.sample(names, min(len(names), SAMPLE_SIZE))
    prefixes = sorted({name[:cut] for name in sampled for cut in range(len(name) + 1)})
    suffixes = sorted({name[cut:] for name in sampled for cut in range(len(name) + 1)})

    pairs = []

    # Two names sharing a prefix: the differing tails become the suffixes.
    for prefix in prefixes:
        hits = [name for name in names if name.startswith(prefix)]
        if len(hits) != 2:
            continue
        first_tail, second_tail = (hit[len(prefix) :] for hit in hits)
        for cut in range(len(prefix) + 1):
            head = prefix[:cut]
            x = Pattern(head, "*", first_tail)
            y = Pattern(head, "*", second_tail)
            pairs.append(PatternPair(x, y))

    # Two names sharing a suffix: the differing heads become the prefixes.
    for suffix in suffixes:
        hits = [name for name in names if name.endswith(suffix)]
        if len(hits) != 2:
            continue
        first_head, second_head = (hit[: len(hit) - len(suffix)] for hit in hits)
        for cut in range(len(suffix) + 1):
            tail = suffix[cut:]
            x = Pattern(first_head, "*", tail)
            y = Pattern(second_head, "*", tail)
            pairs.append(PatternPair(x, y))

    return list(set(pairs))
def get_variant_pattern_pairs(pp):
    """Return every wildcard variant of ``pp``, in both orientations.

    The literal parts of ``pp`` are kept and combined with each wildcard in
    ``VALID_MM``: first with ``pp.x`` as input and ``pp.y`` as output, then
    with the two roles swapped.
    """

    def with_each_wildcard(inp, out):
        return [
            PatternPair(Pattern(inp.ll, mm, inp.rr), Pattern(out.ll, mm, out.rr))
            for mm in VALID_MM
        ]

    return with_each_wildcard(pp.x, pp.y) + with_each_wildcard(pp.y, pp.x)
def find_best_pattern_pair(names):
    """Pick the best-scoring input/output pattern pair for ``names``.

    Candidate "*" pairs that match at least two tests are ranked by
    ``PatternPair.score``. The winner is then expanded into all its wildcard
    variants, those that do not survive a string round trip are dropped, and
    the best remaining variant is returned. When there is no candidate at
    all, the catch-all pair ``("*", "*")`` is returned.

    Raises:
        Whatever ``maximal`` raises when the best score is not unique or no
        variant is left to choose from.
    """

    def by_score(pp):
        return pp.score(names)

    candidates = [
        pp
        for pp in get_all_star_pattern_pairs(names)
        if pp.matches(names, returns="fast_count") >= 2
    ]
    if not candidates:
        return PatternPair(Pattern("", "*", ""), Pattern("", "*", ""))

    best_star = maximal(candidates, key=by_score)

    safe_variants = [
        pp for pp in get_variant_pattern_pairs(best_star) if pp.is_string_safe()
    ]
    return maximal(safe_variants, key=by_score)
def list_dir_recursively(folder):
    """Return the paths of all files under ``folder``, relative to it.

    Each path starts with ``"."`` (e.g. ``"./a.in"``, ``"./sub/b.out"``),
    exactly as if ``os.walk(".")`` had been run from inside ``folder``. The
    process working directory is never changed, so the function has no
    process-global side effect and nothing to restore on error.

    Raises:
        OSError: if ``folder`` does not exist, is not a directory, or cannot
            be read.
    """
    # os.walk() silently yields nothing for a bad top directory; probe it
    # first so that callers still get an OSError in that case.
    with os.scandir(folder):
        pass

    result = []
    for root, _, filenames in os.walk(folder):
        rel_root = os.path.relpath(root, folder)
        if rel_root != os.curdir:
            # Sub-directories are reported as "./sub", not "sub".
            rel_root = os.path.join(os.curdir, rel_root)
        result.extend(os.path.join(rel_root, filename) for filename in filenames)
    return result
def test_with_dir(folder):
    """Print ``folder`` followed by the best pattern pair found for its files."""
    best_pair = find_best_pattern_pair(list_dir_recursively(folder))
    print(folder, best_pair)