#!/usr/bin/env python3

import argparse
import gzip
import os
import pickle


def get_tokens(raw, inds):
    """Return the elements of ``raw`` at positions ``inds`` joined into one string.

    The elements are concatenated in the order in which ``inds`` lists them,
    with no separator. An empty ``inds`` yields an empty string.
    """
    selected = (raw[position] for position in inds)
    return "".join(selected)


def extract_output(pickle_file, language_file, english_file, output_file, remove_discont):
    """Write the annotated entries of one parallel corpus to a tab-separated file.

    The pickle is loaded into a mapping from zero-based line number to the
    annotations for that line pair. Each annotation unpacks into
    ``(lng_inds, eng_inds, notes, lng_discont, eng_discont)``:

    * ``lng_inds`` / ``eng_inds``: character indices into the foreign-language
      line and the English line, respectively.
    * ``notes``: pairs of ``(label, indices)`` where the indices point into the
      foreign-language line.
    * ``lng_discont`` / ``eng_discont``: flags marking a discontiguous span.

    One row is written per kept annotation with the columns: line number,
    foreign text, English text, formatted notes, ``str(lng_inds)``,
    ``str(eng_inds)``, ``str(lng_discont)``, ``str(eng_discont)``.

    Args:
        pickle_file: Path of the pickled annotation mapping.
        language_file: Path of the gzipped foreign-language corpus file.
        english_file: Path of the gzipped English corpus file, read in
            lockstep with ``language_file``.
        output_file: Path of the TSV file to create (overwritten if present).
        remove_discont: If true, annotations flagged as discontiguous on
            either side are not written.
    """
    # NOTE: pickle.load can execute arbitrary code while unpickling; only use
    # annotation files that come from a trusted source.
    with open(pickle_file, "rb") as f:
        index_dct = pickle.load(f)

    # The encoding is given explicitly so that reading and writing non-ASCII
    # text does not depend on the platform's default locale encoding.
    # NOTE(review): assumes the corpus files are UTF-8 encoded -- confirm.
    with gzip.open(language_file, 'rt', encoding='utf-8') as lngf, \
            gzip.open(english_file, 'rt', encoding='utf-8') as engf, \
            open(output_file, 'wt', encoding='utf-8') as outf:
        # Walk both corpus files line by line, in lockstep.
        for ii, (ln, en) in enumerate(zip(lngf, engf)):
            # Skip lines that have no annotation in this pickle.
            if ii not in index_dct:
                continue

            # Loop over all annotations for the line.
            for lng_inds, eng_inds, notes, lng_discont, eng_discont in index_dct[ii]:
                # lng_inds: list of ints
                # eng_inds: list of ints
                # notes: list of tuples
                # lng_discont: boolean
                # eng_discont: boolean

                # Drop discontiguous entries, if requested, before doing any
                # extraction work for them.
                if remove_discont and (lng_discont or eng_discont):
                    continue

                # Extract tokens from character indices.
                lng = get_tokens(ln, lng_inds)
                eng = get_tokens(en, eng_inds)

                # Format each note as "label(text)" and join them with ":".
                notes2 = ":".join([pair[0] + "(" + get_tokens(ln, pair[1]) + ")" for pair in notes])

                fields = [str(ii), lng, eng, notes2, str(lng_inds), str(eng_inds), str(lng_discont),
                          str(eng_discont)]
                outf.write("\t".join(fields) + "\n")


if __name__ == "__main__":

    # The summary must be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``, which would put the whole
    # sentence into the usage line as the program name.
    parser = argparse.ArgumentParser(
        description='Create the datasets for "HABLex: Human Annotated Bilingual Lexicons for Experiments in Machine Translation"')
    parser.add_argument('coppa_v2_dir',
                        help='WIPO Coppa V2 dataset directory, which can be obtained free of charge for research purposes from the world intellectual property organization: https://www.wipo.int/patentscope/en/data/forms/products.jsp')

    parser.add_argument("--remove_discont",
                        help="Remove discontiguous entries (to match paper exactly, run without this flag)",
                        action="store_true")

    args = parser.parse_args()

    # Output goes to ./data relative to the current working directory.
    os.makedirs('./data', exist_ok=True)

    for lang in ('ko', 'ru', 'zh'):
        # The corpus files depend only on the language pair, not on the
        # test/dev split, so build their paths once per language.
        en_file = '%s/moses/%s_en/CoppaV2.en.gz' % (args.coppa_v2_dir, lang)
        fl_file = '%s/moses/%s_en/CoppaV2.%s.gz' % (args.coppa_v2_dir, lang, lang)

        for test_dev in ('test', 'dev'):
            # Annotations are read from ./annotations relative to the current
            # working directory.
            pickle_file = './annotations/%s_en.%s.pickle' % (lang, test_dev)
            out_file = './data/%s_en.%s.tsv' % (lang, test_dev)

            extract_output(pickle_file, fl_file, en_file, out_file, args.remove_discont)
