.*", '', content, flags=re.DOTALL)
# odstranění HTML entit
content = re.sub(r'&.*?;', ' ', content, flags=re.DOTALL)
# odstranění tagů
content = re.sub(r'<.*?>', ' ', content, flags=re.DOTALL)
content = content.lower()
for word in re.findall(r'\w+', content):
hash_map[word] = f"/{path}"
def preprocess():
    """Run ``process_content`` over every generated page path.

    One relative path is produced for each combination of root node
    (``h/ulohy``, ``z/ulohy``), year (0-32), series (1-5) and page name
    (``zadani``, ``reseni``, ``komentare``). The paths are visited with the
    root node varying slowest and the page name varying fastest.
    """
    base_url = "http://ksp.mff.cuni.cz"
    sections = ('h/ulohy', 'z/ulohy')
    page_names = ('zadani', 'reseni', 'komentare')

    # Building the paths has no side effects, so collecting them first keeps
    # the sequence of process_content calls unchanged.
    relative_paths = [
        f"{section}/{year_no}/{page_name}{series_no}.html"
        for section in sections
        for year_no in range(33)
        for series_no in range(1, 6)
        for page_name in page_names
    ]
    for relative_path in relative_paths:
        process_content(base_url, relative_path)
# Plain-text index: written by the `-ini` run and read back on every lookup
# run. Each line has the form "<word> <path>".
PREPROCESSED_FILE = "preprocessed.txt"

if len(sys.argv) >= 2:
    if sys.argv[1] == '-ini':
        # Rebuild the index (preprocess() is expected to fill hash_map) and
        # persist it sorted by word.
        preprocess()
        # Pin the encoding so non-ASCII words round-trip regardless of the
        # platform's locale default.
        with open(PREPROCESSED_FILE, "w", encoding="utf-8") as db:
            db.writelines(
                f"{word} {hash_map[word]}\n" for word in sorted(hash_map)
            )
        print("Success", file=sys.stderr)
        sys.exit(0)
    # Any argument other than `-ini` shows the usage text.
    # NOTE(review): execution continues below unless print_help() exits on
    # its own - confirm against its definition.
    print_help()

# Load the previously generated index into hash_map.
try:
    with open(PREPROCESSED_FILE, "r", encoding="utf-8") as file:
        for line in file:
            key, value = line.split()
            hash_map[key] = value
except FileNotFoundError:
    # No index has been generated yet.
    print_help()

# Query protocol on stdin: first the number of queries, then one word per
# line; the stored path for each word is printed to stdout.
n = int(input())
for _ in range(n):
    word_to_find = input()
    if word_to_find not in hash_map:
        # Explicit check instead of `assert`, which is stripped under
        # `python -O`. sys.exit(str) prints the message to stderr and exits
        # with status 1.
        sys.exit(f"Word `{word_to_find}`not found in {PREPROCESSED_FILE}")
    print(hash_map[word_to_find])