tagmerger/tagmerger

141 lines
5.6 KiB
Text
Raw Normal View History

2024-10-27 21:13:10 +00:00
#!/usr/bin/env python3
import sys
import argparse
import yaml
import rapidfuzz as rf
def find_similar(tag, tag_list, *, score_cutoff=0.23):
    """Return the entries of *tag_list* that closely resemble *tag*.

    Candidates are compared to *tag* with the Jaro-Winkler distance, after
    both sides have been pre-processed by ``rapidfuzz.utils.default_process``.

    Args:
        tag: The tag to look up.
        tag_list: Collection of candidate tags (a list or a dict key view).
        score_cutoff: Largest Jaro-Winkler distance that still counts as a
            match. Defaults to 0.23, the value that used to be hard-coded.

    Returns:
        A list holding only the matching candidates (scores are dropped).
    """
    # NOTE: earlier, commented-out variants of this call used WRatio
    # (cutoff 70) and the Damerau-Levenshtein distance (cutoff 5) as scorers.
    matches = rf.process.extract(
        tag,
        tag_list,
        limit=None,  # keep every candidate under the cutoff, not just the top few
        scorer=rf.distance.JaroWinkler.distance,
        score_cutoff=score_cutoff,
        processor=rf.utils.default_process,
    )
    # Each match is a tuple whose first item is the matched candidate.
    return [match[0] for match in matches]
def _parse_indices(text, count):
    """Parse space-separated indices into a sorted, duplicate-free list.

    Returns None when a token is not an integer or lies outside
    ``range(count)``, so the caller can ask again instead of crashing.
    """
    try:
        indices = {int(token) for token in text.split()}
    except ValueError:
        return None
    if any(not 0 <= index < count for index in indices):
        return None
    return sorted(indices)


def merge(tag, old_tags, new_tags):
    """Interactively merge the dirty tag *tag* into a tag of the new set.

    The user picks an existing similar tag (by index or by name) or types a
    new merge tag. Afterwards, still-unmerged dirty tags that resemble *tag*
    are offered for merging into the same merge tag.

    Args:
        tag: The dirty tag to merge.
        old_tags: Dict mapping every dirty tag to its merge tag, or to None
            while it is unmerged. Updated in place.
        new_tags: Dict mapping every merge tag to the list of dirty tags
            merged into it. Updated in place.
    """
    merge_candidates = find_similar(tag, new_tags.keys())
    print(f"\nTag : {tag}")
    if merge_candidates:
        print("Merge candidates:")
        for i, candidate in enumerate(merge_candidates):
            print(f"\t{i}. {candidate}")
        choice = input(f"Tag name or [0-{len(merge_candidates) - 1}] or new merge tag > ")
    else:
        print("No reasonable merge candidate")
        choice = input("New merge tag > ")

    # Only a number within the displayed range counts as an index. Any other
    # number (e.g. "2024" when nothing was listed) is taken as a tag name
    # rather than raising IndexError or silently picking from the end.
    try:
        index = int(choice)
    except ValueError:
        index = None

    if index is not None and 0 <= index < len(merge_candidates):
        # Existing tag chosen by index
        merge_tag = merge_candidates[index]
        print(f"Merging in tag {merge_tag}")
    elif choice in merge_candidates:
        # Existing tag chosen by name
        merge_tag = choice
        print(f"Merging in tag {merge_tag}")
    else:
        # New tag
        merge_tag = choice
        while not merge_tag:
            merge_tag = input("New tag cannot be empty. Enter new tag > ")
        while merge_tag in new_tags:
            print(f"{merge_tag} already exists in the new tag set !",
                  "Appending '(dup)'")
            merge_tag = merge_tag + "(dup)"
        print(f"Merging in new tag {merge_tag}")
        new_tags[merge_tag] = []

    old_tags[tag] = merge_tag
    new_tags[merge_tag].append(tag)

    # Suggest merging other similar, still unmerged tags into the merge tag
    unmerged_tags = [t for t, m in old_tags.items() if m is None]
    similar_unmerged_tags = find_similar(tag, unmerged_tags)
    if not similar_unmerged_tags:
        return

    print(f"Might also fit in {merge_tag}:")
    for i, candidate in enumerate(similar_unmerged_tags):
        print(f"\t{i}. {candidate}")
    prompt = f"Space-separated indices from [0-{len(similar_unmerged_tags) - 1}] > "
    chosen = _parse_indices(input(prompt), len(similar_unmerged_tags))
    while chosen is None:
        # A typo here must not abort the session and lose unsaved merges.
        print("Invalid selection: enter only indices from the list, or nothing")
        chosen = _parse_indices(input(prompt), len(similar_unmerged_tags))

    for index in chosen:
        chosen_tag = similar_unmerged_tags[index]
        print(f"Merging old tag {chosen_tag} in new tag {merge_tag}")
        old_tags[chosen_tag] = merge_tag
        new_tags[merge_tag].append(chosen_tag)
def new_from_old_tags(old_tags):
    """Rebuild the new tag set from an old-tag -> merge-tag mapping.

    Args:
        old_tags: Dict mapping every dirty tag to its merge tag, or to None
            when the tag has not been merged yet.

    Returns:
        A dict mapping each merge tag to the list of dirty tags merged into
        it. Unmerged tags (value None) are left out, so that None never
        shows up as a merge tag of the new set.
    """
    new_tags = {}
    for tag, merge_tag in old_tags.items():
        if merge_tag is None:
            # Not merged yet: it belongs to no tag of the new set.
            continue
        new_tags.setdefault(merge_tag, []).append(tag)
    return new_tags
def load_tags_yaml(path):
    """Load the dirty tag set, and the new set derived from it, from yaml.

    Args:
        path: Path to a yaml file holding a flat mapping from each dirty tag
            to its merge tag, or to null when the tag is not merged yet.

    Returns:
        A tuple ``(old_tags, new_tags)``: the mapping read from the file and
        the new tag set computed from it by ``new_from_old_tags``.

    Raises:
        ValueError: If the yaml document is not a mapping.
    """
    print(f"Loading tags from yaml file {path}")
    with open(path, "r", encoding="utf-8") as filein:
        old_tags = yaml.safe_load(filein)
    if old_tags is None:
        # An empty yaml document loads as None; treat it as "no tags".
        old_tags = {}
    elif not isinstance(old_tags, dict):
        raise ValueError(
            f"{path} must contain a flat yaml mapping of tags, "
            f"not a {type(old_tags).__name__}"
        )
    print(f"{len(old_tags)} tags loaded in dirty set")
    print("Extracting new_tags from yaml")
    new_tags = new_from_old_tags(old_tags)
    print(f"{len(new_tags)} tags loaded in new set")
    return old_tags, new_tags
def load_tags_plain(path):
    """Load the dirty tag set from a plain text file, one tag per line.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped so that they do not produce an empty tag. A tag repeated in the
    file is kept once.

    Args:
        path: Path to the text file.

    Returns:
        A tuple ``(old_tags, new_tags)``: a dict mapping every tag to None
        (nothing merged yet) and an empty new tag set.
    """
    print(f"Loading tags from plain file {path}")
    with open(path, "r", encoding="utf-8") as filein:
        stripped_lines = (line.strip() for line in filein)
        old_tags = {t: None for t in stripped_lines if t}
    print(f"{len(old_tags)} tags loaded in dirty set")
    print("Initialising empty new set")
    new_tags = {}
    return old_tags, new_tags
def export_tags(old_tags, new_tags, path):
    """Dump the old-tag -> merge-tag mapping to *path* as yaml.

    Only *old_tags* is written; *new_tags* is accepted but not used here.
    """
    with open(path, "w") as destination:
        yaml.dump(old_tags, stream=destination)
    print(f"Tags exported to {path}")
def _prompt_export_path():
    """Ask for an export path until a non-empty answer is given."""
    export_path = ""
    while not export_path:
        export_path = input("Export path > ")
    return export_path


def _export_interactively(old_tags, new_tags):
    """Export the tags, asking for another path if one cannot be written."""
    while True:
        export_path = _prompt_export_path()
        try:
            export_tags(old_tags, new_tags, export_path)
        except OSError as err:
            # A mistyped path must not throw away the whole merging session.
            print(f"Could not write {export_path}: {err}")
        else:
            return


def main():
    """Parse the command line, then walk the user through every unmerged tag.

    Before each unmerged tag the user may stop and export the current state.
    Once every tag is merged, the result is exported.
    """
    parser = argparse.ArgumentParser(description = "A simple tool to semi-automate merging a dirty set of tags into a new set of tags.")
    parser.add_argument("tags", type=str, help="Path to a yaml file containing the dirty set of tags. The yaml directory must be flat, with one key per tag in the dirty set with value null or set to the merge tag in the new set.")
    parser.add_argument("--plain-input", action="store_true", help="If set, the input file is assumed to contain one tag of the dirty set per line. No character is escaped, the newline character is used as separator.")
    args = parser.parse_args()

    if args.plain_input:
        old_tags, new_tags = load_tags_plain(args.tags)
    else:
        old_tags, new_tags = load_tags_yaml(args.tags)

    # merge() only reassigns values of existing keys, so iterating the dict
    # while merging is safe.
    for tag in old_tags:
        if old_tags[tag] is not None:
            # Already merged, either in the input file or by an earlier merge().
            continue
        choice = input("\nContinue [c, default] or export merged tags [e] > ")
        if choice == "e":
            _export_interactively(old_tags, new_tags)
            sys.exit(0)
        merge(tag, old_tags, new_tags)

    print("\nAll tags merged !")
    _export_interactively(old_tags, new_tags)


if __name__ == "__main__":
    main()