tagmerger/tagmerger

#!/usr/bin/env python3

import sys
import argparse
import yaml
import rapidfuzz as rf


def find_similar(tag, tag_list, score_cutoff=0.23):
    """Return the entries of ``tag_list`` that look similar to ``tag``.

    Similarity is the Jaro-Winkler distance computed by rapidfuzz after both
    strings went through ``rf.utils.default_process``.

    Args:
        tag: The tag to compare against.
        tag_list: Collection of candidate tags (a list or a dict keys view).
        score_cutoff: Maximum distance for a candidate to be kept. The
            default, 0.23, is the value this tool was tuned with.

    Returns:
        A list of the matching entries of ``tag_list``, in the order
        rapidfuzz reports them.
    """
    # Other scorers were tried on sample data before settling on this one:
    # fuzz.WRatio (cutoff 70) and DamerauLevenshtein distance (cutoff 5).
    result = rf.process.extract(
        tag,
        tag_list,
        limit=None,  # keep every candidate under the cutoff, not just the top few
        scorer=rf.distance.JaroWinkler.distance,
        score_cutoff=score_cutoff,
        processor=rf.utils.default_process,
    )
    # Each result is a tuple whose first element is the matched choice.
    return [r[0] for r in result]

def _ask_merge_tag(prompt, merge_candidates):
    """Prompt for a merge tag and return it as a tag name.

    A number inside ``[0, len(merge_candidates))`` selects the candidate at
    that index. Any non-numeric answer is returned unchanged as a tag name.
    A number outside the valid range is rejected and the prompt is shown
    again, unless there are no candidates at all, in which case the answer
    can only be a (numeric) tag name.
    """
    while True:
        choice = input(prompt)
        try:
            index = int(choice)
        except ValueError:
            return choice
        if 0 <= index < len(merge_candidates):
            return merge_candidates[index]
        if not merge_candidates:
            return choice
        print(f"No merge candidate with index {index},",
            f"pick one in [0-{len(merge_candidates) - 1}]")

def _ask_indices(prompt, count):
    """Prompt for space-separated indices in ``range(count)``.

    Asks again until every token is an integer within range. Returns the
    selected indices sorted and without duplicates; an empty answer yields
    an empty list.
    """
    while True:
        try:
            indices = {int(token) for token in input(prompt).split()}
        except ValueError:
            print("Indices must be whole numbers separated by spaces")
            continue
        if all(0 <= index < count for index in indices):
            return sorted(indices)
        print(f"Indices must be in [0-{count - 1}]")

def merge(tag, old_tags, new_tags):
    """Interactively merge the dirty tag ``tag`` into a tag of the new set.

    The user picks an existing merge candidate (by index or by name) or types
    a new tag name. Afterwards, still-unmerged dirty tags similar to ``tag``
    are offered for merging into the same new tag.

    Args:
        tag: Key of ``old_tags`` to merge.
        old_tags: Dict mapping each dirty tag to its merge tag, or None while
            unmerged. Updated in place.
        new_tags: Dict mapping each new tag to the list of dirty tags merged
            into it. Updated in place.
    """
    merge_candidates = find_similar(tag, new_tags.keys())

    print(f"\nTag : {tag}")
    if merge_candidates:
        print("Merge candidates:")
        for i, candidate in enumerate(merge_candidates):
            print(f"\t{i}. {candidate}")
        prompt = f"Tag name or [0-{len(merge_candidates) - 1}] or new merge tag > "
    else:
        print("No reasonable merge candidate")
        prompt = "New merge tag > "

    merge_tag = _ask_merge_tag(prompt, merge_candidates)
    if merge_tag in merge_candidates: # Existing tag chosen by index or name
        print(f"Merging in tag {merge_tag}")
    else: # New tag
        while len(merge_tag) == 0:
            merge_tag = input("New tag cannot be empty. Enter new tag > ")
        # A name that clashes with a new tag that was not offered as a
        # candidate is kept apart rather than silently merged into it.
        while merge_tag in new_tags:
            print(f"{merge_tag} already exists in the new tag set !",
                "Appending '(dup)'")
            merge_tag = merge_tag + "(dup)"
        print(f"Merging in new tag {merge_tag}")
        new_tags[merge_tag] = []
    old_tags[tag] = merge_tag
    new_tags[merge_tag].append(tag)

    # Suggest merging more similar unmerged tags into the merge tag
    unmerged_tags = [t for t, m in old_tags.items() if m is None]
    similar_unmerged_tags = find_similar(tag, unmerged_tags)
    if similar_unmerged_tags:
        print(f"Might also fit in {merge_tag}:")
        for i, candidate in enumerate(similar_unmerged_tags):
            print(f"\t{i}. {candidate}")

        prompt = f"Space-separated indices from [0-{len(similar_unmerged_tags) - 1}] > "
        for index in _ask_indices(prompt, len(similar_unmerged_tags)):
            chosen_tag = similar_unmerged_tags[index]
            print(f"Merging old tag {chosen_tag} in new tag {merge_tag}")
            old_tags[chosen_tag] = merge_tag
            new_tags[merge_tag].append(chosen_tag)

def new_from_old_tags(old_tags):
    """Build the new tag set from a dirty-tag mapping.

    Args:
        old_tags: Dict mapping each dirty tag to its merge tag, or None when
            the dirty tag has not been merged yet.

    Returns:
        A dict mapping each merge tag to the list of dirty tags merged into
        it, in the iteration order of ``old_tags``. Unmerged dirty tags
        (value None) are left out, so None never shows up as a new tag.
    """
    new_tags = {}
    for tag, merge_tag in old_tags.items():
        if merge_tag is None:
            # Not merged yet: it belongs to no new tag.
            continue
        new_tags.setdefault(merge_tag, []).append(tag)
    return new_tags

def load_tags_yaml(path):
    """Load a dirty tag set, and the new set it implies, from a YAML file.

    Args:
        path: Path of a YAML file holding a flat mapping from dirty tag to
            merge tag (or null when not merged yet).

    Returns:
        A tuple ``(old_tags, new_tags)``: the mapping read from the file and
        the new tag set derived from it by ``new_from_old_tags``.

    Raises:
        ValueError: If the YAML document is not a mapping.
    """
    print(f"Loading tags from yaml file {path}")
    with open(path, "r") as filein:
        old_tags = yaml.safe_load(filein)
    if old_tags is None:
        # safe_load returns None for an empty document: treat as "no tags".
        old_tags = {}
    if not isinstance(old_tags, dict):
        raise ValueError(f"{path} must contain a flat yaml mapping of tags, "
            f"got {type(old_tags).__name__}")
    print(f"{len(old_tags)} tags loaded in dirty set")

    print("Extracting new_tags from yaml")
    new_tags = new_from_old_tags(old_tags)
    print(f"{len(new_tags)} tags loaded in new set")
    return old_tags, new_tags

def load_tags_plain(path):
    """Load a dirty tag set from a plain text file with one tag per line.

    Surrounding whitespace is stripped from every line. Lines that are empty
    after stripping are ignored, so blank lines and a trailing newline do not
    produce an empty tag. Duplicate lines collapse into a single tag.

    Args:
        path: Path of the text file to read.

    Returns:
        A tuple ``(old_tags, new_tags)`` where ``old_tags`` maps every tag to
        None (nothing merged yet) and ``new_tags`` is an empty dict.
    """
    print(f"Loading tags from plain file {path}")
    with open(path, "r") as filein:
        stripped = (line.strip() for line in filein)
        old_tags = {t: None for t in stripped if t}
    print(f"{len(old_tags)} tags loaded in dirty set")
    print("Initialising empty new set")
    new_tags = {}
    return old_tags, new_tags

def export_tags(old_tags, new_tags, path):
    """Write the dirty-tag mapping to ``path`` as YAML.

    Only ``old_tags`` is serialised; ``new_tags`` is accepted but not
    written to the file.
    """
    with open(path, "w") as handle:
        handle.write(yaml.dump(old_tags))
    print(f"Tags exported to {path}")

def _ask_export_path():
    """Prompt until the user enters a non-empty export path and return it."""
    path = ""
    while not path:
        path = input("Export path > ")
    return path

# Command line: one positional input file, optionally in plain-text format.
parser = argparse.ArgumentParser(description = "A simple tool to semi-automate merging a dirty set of tags into a new set of tags.")

parser.add_argument("tags", type=str, help="Path to a yaml file containing the dirty set of tags. The yaml dictionary must be flat, with one key per tag in the dirty set with value null or set to the merge tag in the new set.")
parser.add_argument("--plain-input", action="store_true", help="If set, the input file is assumed to contain one tag of the dirty set per line. No character is escaped, the newline character is used as separator.")

args = parser.parse_args()

if args.plain_input:
    old_tags, new_tags = load_tags_plain(args.tags)
else:
    old_tags, new_tags = load_tags_yaml(args.tags)

# Walk the dirty set. The value is looked up again on every iteration because
# merge() may also have merged tags that come later in the loop.
for tag in old_tags:
    if old_tags[tag] is not None:
        continue
    choice = input("\nContinue [c, default] or export merged tags [e] > ")
    if choice == "e":
        # Save the partial result so the session can be resumed from the
        # exported yaml file later.
        export_path = _ask_export_path()
        export_tags(old_tags, new_tags, export_path)
        sys.exit(0)
    merge(tag, old_tags, new_tags)

print("\nAll tags merged !")
export_path = _ask_export_path()
export_tags(old_tags, new_tags, export_path)
Initial commit 2024-10-27 21:13:10 +00:00			`#!/usr/bin/env python3`

			`import sys`
			`import argparse`
			`import yaml`
			`import rapidfuzz as rf`


			`def find_similar(tag, tag_list):`
			`#result = rf.process.extract("feminisme", tags, limit=None, scorer=rf.fuzz.WRatio, score_cutoff=70)`
			`#result = rf.process.extract("feminisme", tags, limit=None, scorer=rf.distance.DamerauLevenshtein.distance, score_cutoff=5, processor=rf.utils.default_process)`
			`#result = rf.process.extract("feminisme", tags, limit=None, scorer=rf.distance.JaroWinkler.distance, score_cutoff=0.23, processor=rf.utils.default_process)`
			`#result = rf.process.extract("feminisme", tags, scorer=rf.distance.DamerauLevenshtein.distance, processor=rf.utils.default_process, score_cutoff=None)`
			`#result = rf.process.cdist(["feminisme"], tags, scorer=rf.distance.DamerauLevenshtein.distance, processor=rf.utils.default_process)`
			`result = rf.process.extract(tag, tag_list,`
			`limit=None,`
			`scorer=rf.distance.JaroWinkler.distance,`
			`score_cutoff=0.23,`
			`processor=rf.utils.default_process)`
			`return [r[0] for r in result]`

			`def merge(tag, old_tags, new_tags):`
			`merge_candidates = find_similar(tag, new_tags.keys())`

			`print(f"\nTag : {tag}")`
			`if len(merge_candidates) > 0:`
			`print("Merge candidates:")`
			`for i in range(len(merge_candidates)):`
			`print(f"\t{i}. {merge_candidates[i]}")`

			`choice = input(f"Tag name or [0-{len(merge_candidates) - 1}] or new merge tag > ")`
			`else:`
			`print("No reasonable merge candidate")`
			`choice = input("New merge tag > ")`

			`try:`
			`# Existing tag chosen by index`
			`choice = int(choice)`
			`merge_tag = merge_candidates[choice]`
			`print(f"Merging in tag {merge_tag}")`
			`except ValueError:`
			`merge_tag = choice`
			`if merge_tag in merge_candidates: # Existing tag chosen by name`
			`print(f"Merging in tag {merge_tag}")`
			`else: # New tag`
			`while len(merge_tag) == 0:`
			`merge_tag = input("New tag cannot be empty. Enter new tag > ")`
			`while merge_tag in new_tags.keys():`
			`print(f"{merge_tag} already exists in the new tag set !",`
			`"Appending '(dup)'")`
			`merge_tag = merge_tag + "(dup)"`
			`print(f"Merging in new tag {merge_tag}")`
			`new_tags[merge_tag] = []`
			`old_tags[tag] = merge_tag`
			`new_tags[merge_tag].append(tag)`

			`# Suggest merging more similar unmerged tags the merge tag`
			`unmerged_tags = [t for t, m in old_tags.items() if m is None]`
			`similar_unmerged_tags = find_similar(tag, unmerged_tags)`
			`if len(similar_unmerged_tags) > 0:`
			`print(f"Might also fit in {merge_tag}:")`
			`for i in range(len(similar_unmerged_tags)):`
			`print(f"\t{i}. {similar_unmerged_tags[i]}")`

			`choice = input(f"Space-separated indices from [0-{len(similar_unmerged_tags) - 1}] > ")`

			`# list(set()) removes duplicates`
			`choice = list(set([int(c) for c in choice.split()]))`

			`for c in choice:`
			`chosen_tag = similar_unmerged_tags[c]`
			`print(f"Merging old tag {chosen_tag} in new tag {merge_tag}")`
			`old_tags[chosen_tag] = merge_tag`
			`new_tags[merge_tag].append(chosen_tag)`

			`def new_from_old_tags(old_tags):`
			`new_tags = {}`
			`for tag, merge_tag in old_tags.items():`
			`if merge_tag in new_tags:`
			`new_tags[merge_tag].append(tag)`
			`else:`
			`new_tags[merge_tag] = [tag]`
			`return new_tags`

			`def load_tags_yaml(path):`
			`print(f"Loading tags from yaml file {path}")`
			`with open(path, "r") as filein:`
			`old_tags = yaml.safe_load(filein)`
			`print(f"{len(old_tags)} tags loaded in dirty set")`

			`print("Extracting new_tags from yaml")`
			`new_tags = new_from_old_tags(old_tags)`
			`print(f"{len(new_tags)} tags loaded in new set")`
			`return old_tags, new_tags`

			`def load_tags_plain(path):`
			`print(f"Loading tags from plain file {path}")`
			`with open(path, "r") as filein:`
			`tags = filein.readlines()`
			`tags = [t.strip() for t in tags]`
			`old_tags = {t: None for t in tags}`
			`print(f"{len(old_tags)} tags loaded in dirty set")`
			`print("Initialising empty new set")`
			`new_tags = {}`
			`return old_tags, new_tags`

			`def export_tags(old_tags, new_tags, path):`
			`with open(path, "w") as fileout:`
			`yaml.dump(old_tags, fileout)`
			`print(f"Tags exported to {path}")`

			`parser = argparse.ArgumentParser(description = "A simple tool to semi-automate merging a dirty set of tags into a new set of tags.")`

			`parser.add_argument("tags", type=str, help="Path to a yaml file containing the dirty set of tags. The yaml directory must be flat, with one key per tag in the dirty set with value null or set to the merge tag in the new set.")`
			`parser.add_argument("--plain-input", action="store_true", help="If set, the input file is assumed to contain one tag of the dirty set per line. No character is escaped, the newline character is used as separator.")`

			`args = parser.parse_args()`

			`if args.plain_input == True:`
			`old_tags, new_tags = load_tags_plain(args.tags)`
			`else:`
			`old_tags, new_tags = load_tags_yaml(args.tags)`

			`for tag in old_tags.keys():`
			`if old_tags[tag] == None:`
			`choice = input("\nContinue [c, default] or export merged tags [e] > ")`
			`if choice == "e":`
			`export_path = ""`
			`while len(export_path) == 0:`
			`export_path = input("Export path > ")`
			`export_tags(old_tags, new_tags, export_path)`
			`sys.exit(0)`
			`else:`
			`merge(tag, old_tags, new_tags)`

			`print("\nAll tags merged !")`
			`export_path = ""`
			`while len(export_path) == 0:`
			`export_path = input("Export path > ")`
			`export_tags(old_tags, new_tags, export_path)`