#!/usr/bin/env python3

"""
Extract glossary entries from chapters 1-19 and appendix B and print
a consolidated glossary to standard output, one definition per line,
with fields delimited by '|' (pipe characters).
"""

import glob
import re
import os
import collections

# Start of a glossary section in the reST source: a title beginning with
# "Gloss", then 3-7 non-dash characters (which may include the newline that
# ends the title line), then an underline of 8-15 dashes and a newline.
# Group 1 captures everything that follows the underline.
GLOSSARY = r'Gloss[^-]{3,7}-{8,15}\n(.*)'

# Fallback used by scan_files() when no later dash run exists: with
# re.DOTALL, group 1 runs from the glossary underline to the end of the text.
GLOSSARY_RE = re.compile(GLOSSARY, re.DOTALL)

# Tried first by scan_files(): the glossary text must be followed by another
# run of 8-15 dashes. Because `(.*)` is greedy, group 1 extends up to the
# LAST such run in the text, not the first one after the glossary.
# NOTE(review): presumably that later run is the underline of a following
# section title; confirm the greedy match is intended.
GLOSSARY_SECTION_RE = re.compile(GLOSSARY + r'-{8,15}', re.DOTALL)

# Expected number of glossary entries for each source file, as
# (chapter_id, count) pairs. scan_files() derives the chapter id from the
# part of the file name before the first '-' and checks the number of
# parsed entries against this table.
expected_entries = [  # glossary entries per chapter
    ('01', 21),
    ('02', 19),
    ('03', 22),
    ('04', 11),
    ('05', 15),
    ('06',  5),
    ('07',  8),
    ('08', 12),
    ('09',  3),
    ('10', 14),
    ('11', 20),
    ('12',  8),
    ('13',  6),
    ('14', 14),
    ('15',  9),
    ('16',  7),
    ('17',  9),
    ('18', 13),
    ('19',  5),
    ( 'B', 11),
]

# The same table as a dict, for direct lookup by chapter id.
expected_entries_dic = dict(expected_entries)

# One glossary entry: a line ending in ':' (the term, group 1) followed by a
# line that starts with spaces. Group 2 is the definition; it is matched
# lazily and ends at the first blank line. The `\Z` alternative also accepts
# a final entry that is terminated by the end of the text rather than by a
# blank line, so the last entry of a file ending in a single newline is not
# silently dropped.
ENTRY_RE = re.compile(r'([^\n]+):\n[ ]+(.*?)(?:\n\n|\Z)', re.DOTALL)

GlossaryEntry = collections.namedtuple('GlossaryEntry', 'term definition')
Definition = collections.namedtuple('Definition', 'chapter_id position text')


def parse_entries(text, chapter_id):
    """Return the glossary entries found in *text*, in order of appearance.

    Args:
        text: the body of a glossary section.
        chapter_id: identifier stored in every returned definition.

    Returns:
        A list of ``GlossaryEntry(term, definition)`` tuples, where
        ``definition`` is a ``Definition(chapter_id, position, text)``.
        ``position`` counts entries from 1. Runs of whitespace in the
        definition text (including line breaks) are collapsed to single
        spaces and leading/trailing whitespace is removed.
    """
    return [
        GlossaryEntry(
            match.group(1),
            Definition(chapter_id, position,
                       ' '.join(match.group(2).split())),
        )
        for position, match in enumerate(ENTRY_RE.finditer(text), 1)
    ]


def scan_files(*paths):
    """Print a consolidated, pipe-delimited glossary for the given directories.

    Every ``*.rst`` file directly inside each directory in *paths* is read
    as UTF-8 and searched for a glossary section (GLOSSARY_SECTION_RE first,
    then GLOSSARY_RE). Files without a glossary are skipped. The chapter id
    of a file is the part of its base name before the first ``-``.

    One line is printed per definition, in the form
    ``term|chapter_id|position|definition``. Terms are ordered
    case-insensitively; the definitions of one term are ordered by
    (chapter_id, position, text), and every definition after the first
    shows a tab character in place of the term.

    Raises:
        KeyError: a file containing a glossary has a chapter id that is
            not in ``expected_entries_dic``.
        AssertionError: the number of entries parsed from a file differs
            from the expected count; the exception argument is the tuple
            ``(chapter_id, expected, found)``.
    """
    entries = collections.defaultdict(list)
    for path in paths:
        # glob returns names in file-system order; sorting makes the
        # processing order (and with it the relative order of terms that
        # differ only in case) reproducible across machines.
        for name in sorted(glob.glob(os.path.join(path, '*.rst'))):
            chapter_id = os.path.basename(name).split('-')[0]

            with open(name, encoding='utf-8') as infile:
                rst = infile.read()

            gloss_match = (GLOSSARY_SECTION_RE.search(rst) or
                           GLOSSARY_RE.search(rst))
            if not gloss_match:
                continue

            new_entries = parse_entries(gloss_match.group(1), chapter_id)
            for term, definition in new_entries:
                entries[term].append(definition)

            # Explicit raise instead of an `assert` statement so the check
            # still runs under `python -O`; exception type and payload are
            # the same as before.
            expected = expected_entries_dic[chapter_id]
            found = len(new_entries)
            if expected != found:
                raise AssertionError((chapter_id, expected, found))

    for term in sorted(entries, key=str.upper):
        ordered = sorted(entries[term])
        for i, (chapter_id, position, definition) in enumerate(ordered):
            # Only the first definition of a term carries the term itself.
            label = '\t' if i else term
            print(label, chapter_id, position, definition, sep='|')


if __name__ == '__main__':
    # Script entry point: every command-line argument is handed to
    # scan_files() as a directory to search for *.rst files.
    import sys
    directories = sys.argv[1:]
    scan_files(*directories)