Source code for markovipy.utils

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import codecs


def fix_caps(word):
    """Normalise the capitalisation of a word so that variants compare equal.

    Rules, applied in order:

    * an all-uppercase word other than ``"I"`` is lowercased
      (``'AAA'`` -> ``'aaa'``)
    * a word whose first character is uppercase keeps only that leading
      capital (``'AvA'`` -> ``'Ava'``, ``'I'`` -> ``'I'``)
    * anything else is lowercased (``'aVA'`` -> ``'ava'``)

    An empty word is returned unchanged.

    :param word: the word to be fixed
    :type word: <str>
    :return: <str>
    """
    if not word:
        # Nothing to normalise; also guards the word[0] lookup below.
        return word
    if word.isupper() and word != "I":
        return word.lower()
    if word[0].isupper():
        return word.lower().capitalize()
    return word.lower()


def get_word_list(file):
    r"""Parse the corpus file into a flat list of word and punctuation tokens.

    Something like = ["once", "upon", "a", ...]

    The file is read as UTF-8 and tokenised with the regex
    ``[\w']+|[.,!?;]`` (it can be tried on https://regex101.com/ with any
    file inside the corpus/ dir):

    * ``[\w']+`` matches a run of word characters and apostrophes
    * ``[.,!?;]`` matches a single one of those punctuation characters

    Every token is then passed through :func:`fix_caps`.

    :param file: path of the file used to create the list of words
    :type file: <str>
    :return: <list> of tokens; if the file cannot be opened or read
        (``OSError``) the message ``"Did you pass a valid file name/path?"``
        is returned as a <str> instead
    """
    try:
        # codecs.open with an explicit encoding is used instead of the
        # normal open() to fix UnicodeDecodeError while reading files
        with codecs.open(file, 'r', encoding='utf-8') as f:
            text = f.read()
    except OSError:
        return "Did you pass a valid file name/path?"
    # Tokenising happens outside the try: only the file I/O above is
    # expected to raise OSError.
    return [fix_caps(token) for token in re.findall(r"[\w']+|[.,!?;]", text)]