#!/usr/bin/env python3
# Provenance: recovered from commit ba580b64ec2901af1200ee9934b6a1b9c7681939
# ("Initial commit", author: sim, Sun Oct 27 22:13:10 2024 +0100), which added
# this script as `tagmerger` (mode 100755) together with a `.gitignore`
# listing `*.swp` and `venv`.
"""Semi-automate merging a dirty set of tags into a new, clean set of tags.

Two dictionaries are maintained throughout the session:

* ``old_tags`` maps every tag of the dirty set to the tag of the new set it
  was merged into, or to ``None`` while it is still unmerged.
* ``new_tags`` maps every tag of the new set to the list of dirty tags that
  were merged into it.

Only ``old_tags`` is exported; ``new_tags`` is rebuilt from it on load.
"""

import sys
import argparse

import yaml
import rapidfuzz as rf


def find_similar(tag, tag_list, score_cutoff=0.23):
    """Return the entries of *tag_list* that look similar to *tag*.

    Similarity is the Jaro-Winkler distance computed by rapidfuzz after both
    strings went through ``rapidfuzz.utils.default_process``.

    Args:
        tag: The tag to compare against.
        tag_list: A collection of candidate tags (not a mapping: rapidfuzz
            would compare against a mapping's values instead of its keys).
        score_cutoff: Largest distance still considered similar. The default
            is the threshold the tool has always used.

    Returns:
        The matching tags as a list, in the order rapidfuzz reports them.
    """
    matches = rf.process.extract(
        tag,
        tag_list,
        limit=None,
        scorer=rf.distance.JaroWinkler.distance,
        score_cutoff=score_cutoff,
        processor=rf.utils.default_process,
    )
    # Each match is a (choice, score, index) tuple; only the choice is needed.
    return [match[0] for match in matches]


def _choose_merge_tag(prompt, merge_candidates, new_tags):
    """Ask the user which tag of the new set the current tag goes into.

    The answer may be the index of a candidate, the name of a candidate, or
    the name of a brand new tag. A new tag is registered in *new_tags* with
    an empty member list before returning.

    Returns:
        The name of the chosen (possibly newly created) merge tag.
    """
    while True:
        choice = input(prompt)
        try:
            index = int(choice)
        except ValueError:
            break  # Not a number: the answer is a tag name.
        if not merge_candidates:
            # Nothing can be picked by index, so a number is a tag name.
            break
        if 0 <= index < len(merge_candidates):
            merge_tag = merge_candidates[index]
            print(f"Merging in tag {merge_tag}")
            return merge_tag
        # Re-prompt rather than crash and lose the whole session.
        print(f"Index {index} is out of range, "
              f"expected [0-{len(merge_candidates) - 1}]")

    merge_tag = choice
    if merge_tag in merge_candidates:  # Existing tag chosen by name
        print(f"Merging in tag {merge_tag}")
        return merge_tag

    # New tag
    while not merge_tag:
        merge_tag = input("New tag cannot be empty. Enter new tag > ")
    # NOTE(review): a name that already exists in the new set but was not
    # listed as a candidate is deliberately kept apart with a "(dup)" suffix
    # instead of being merged into the existing tag.
    while merge_tag in new_tags:
        print(f"{merge_tag} already exists in the new tag set !",
              "Appending '(dup)'")
        merge_tag = merge_tag + "(dup)"
    print(f"Merging in new tag {merge_tag}")
    new_tags[merge_tag] = []
    return merge_tag


def _parse_indices(text, count):
    """Parse space-separated indices, keeping the valid ones in ``[0, count)``.

    Tokens that are not integers or that fall out of range are reported and
    skipped. Duplicates are removed.

    Returns:
        The valid indices as a sorted list.
    """
    indices = set()
    for token in text.split():
        try:
            index = int(token)
        except ValueError:
            print(f"Ignoring '{token}': not an index")
            continue
        if 0 <= index < count:
            indices.add(index)
        else:
            print(f"Ignoring {index}: out of range")
    return sorted(indices)


def merge(tag, old_tags, new_tags):
    """Interactively merge *tag* into a tag of the new set.

    Existing tags of the new set that resemble *tag* are offered as merge
    candidates. Once the merge tag is chosen, the still unmerged dirty tags
    that resemble *tag* are offered for merging into the same merge tag.

    Args:
        tag: The dirty tag to merge; must be a key of *old_tags*.
        old_tags: Dirty tag -> merge tag (or ``None``). Updated in place.
        new_tags: Merge tag -> list of dirty tags. Updated in place.
    """
    # Pass a list of names: a dict would be treated as a mapping by rapidfuzz.
    merge_candidates = find_similar(tag, list(new_tags))

    print(f"\nTag : {tag}")
    if merge_candidates:
        print("Merge candidates:")
        for i, candidate in enumerate(merge_candidates):
            print(f"\t{i}. {candidate}")
        prompt = (f"Tag name or [0-{len(merge_candidates) - 1}] "
                  "or new merge tag > ")
    else:
        print("No reasonable merge candidate")
        prompt = "New merge tag > "

    merge_tag = _choose_merge_tag(prompt, merge_candidates, new_tags)
    old_tags[tag] = merge_tag
    new_tags[merge_tag].append(tag)

    # Suggest merging more similar unmerged tags into the merge tag. `tag`
    # itself is no longer unmerged at this point, so it is not offered.
    unmerged_tags = [t for t, m in old_tags.items() if m is None]
    similar_unmerged_tags = find_similar(tag, unmerged_tags)
    if not similar_unmerged_tags:
        return

    print(f"Might also fit in {merge_tag}:")
    for i, candidate in enumerate(similar_unmerged_tags):
        print(f"\t{i}. {candidate}")

    choice = input("Space-separated indices from "
                   f"[0-{len(similar_unmerged_tags) - 1}] > ")
    for index in _parse_indices(choice, len(similar_unmerged_tags)):
        chosen_tag = similar_unmerged_tags[index]
        print(f"Merging old tag {chosen_tag} in new tag {merge_tag}")
        old_tags[chosen_tag] = merge_tag
        new_tags[merge_tag].append(chosen_tag)


def new_from_old_tags(old_tags):
    """Rebuild the new tag set from a dirty-tag -> merge-tag mapping.

    Dirty tags whose value is ``None`` are still unmerged and therefore do
    not contribute an entry to the new set.

    Returns:
        A dict mapping each merge tag to the list of dirty tags merged in it.
    """
    new_tags = {}
    for tag, merge_tag in old_tags.items():
        if merge_tag is None:
            continue
        new_tags.setdefault(merge_tag, []).append(tag)
    return new_tags


def load_tags_yaml(path):
    """Load a (possibly partially merged) dirty set from a flat yaml mapping.

    Args:
        path: Path of the yaml file, read as UTF-8.

    Returns:
        The ``(old_tags, new_tags)`` pair.

    Raises:
        ValueError: If the yaml document is not a mapping.
    """
    print(f"Loading tags from yaml file {path}")
    with open(path, "r", encoding="utf-8") as filein:
        old_tags = yaml.safe_load(filein)
    if old_tags is None:
        # An empty yaml document loads as None; treat it as an empty set.
        old_tags = {}
    if not isinstance(old_tags, dict):
        raise ValueError(f"{path} must contain a flat yaml mapping of tags")
    print(f"{len(old_tags)} tags loaded in dirty set")

    print("Extracting new_tags from yaml")
    new_tags = new_from_old_tags(old_tags)
    print(f"{len(new_tags)} tags loaded in new set")
    return old_tags, new_tags


def load_tags_plain(path):
    """Load a dirty set from a plain text file holding one tag per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped, so they do not produce an empty tag.

    Args:
        path: Path of the text file, read as UTF-8.

    Returns:
        The ``(old_tags, new_tags)`` pair; every tag is unmerged and the new
        set is empty.
    """
    print(f"Loading tags from plain file {path}")
    with open(path, "r", encoding="utf-8") as filein:
        stripped = (line.strip() for line in filein)
        tags = [t for t in stripped if t]
    old_tags = dict.fromkeys(tags)
    print(f"{len(old_tags)} tags loaded in dirty set")
    print("Initialising empty new set")
    new_tags = {}
    return old_tags, new_tags


def export_tags(old_tags, new_tags, path):
    """Write the dirty-tag -> merge-tag mapping to *path* as yaml.

    Args:
        old_tags: The mapping to export.
        new_tags: Unused; it can be rebuilt from *old_tags* on load. Kept so
            the signature stays stable.
        path: Destination file, written as UTF-8.
    """
    with open(path, "w", encoding="utf-8") as fileout:
        # allow_unicode keeps non-ASCII tags readable instead of escaped.
        yaml.dump(old_tags, fileout, allow_unicode=True)
    print(f"Tags exported to {path}")


def _prompt_export_path():
    """Ask for an export path until a non-empty answer is given."""
    export_path = ""
    while not export_path:
        export_path = input("Export path > ")
    return export_path


parser = argparse.ArgumentParser(description="A simple tool to semi-automate merging a dirty set of tags into a new set of tags.")

parser.add_argument("tags", type=str, help="Path to a yaml file containing the dirty set of tags. The yaml dictionary must be flat, with one key per tag in the dirty set with value null or set to the merge tag in the new set.")
parser.add_argument("--plain-input", action="store_true", help="If set, the input file is assumed to contain one tag of the dirty set per line. No character is escaped, the newline character is used as separator.")


def main():
    """Run the interactive merge session described by the command line."""
    args = parser.parse_args()

    if args.plain_input:
        old_tags, new_tags = load_tags_plain(args.tags)
    else:
        old_tags, new_tags = load_tags_yaml(args.tags)

    # Iterate over a snapshot of the keys: merge() rewrites values (and may
    # resolve later tags ahead of time), so each tag is re-checked here.
    for tag in list(old_tags):
        if old_tags[tag] is not None:
            continue
        choice = input("\nContinue [c, default] or export merged tags [e] > ")
        if choice == "e":
            export_tags(old_tags, new_tags, _prompt_export_path())
            sys.exit(0)
        merge(tag, old_tags, new_tags)

    print("\nAll tags merged !")
    export_tags(old_tags, new_tags, _prompt_export_path())


if __name__ == "__main__":
    main()