apply formatting with black (#1038)

* applying black (with the default 88-column limit)
* add flake8
* add isort
* fix isort
2023-03-06 18:50:37 -05:00
parent 500d0fe966
commit b80bcf610d
21 changed files with 533 additions and 227 deletions
@@ -1,2 +1,2 @@
-from .basic import BasicTextNormalizer
-from .english import EnglishTextNormalizer
+from .basic import BasicTextNormalizer as BasicTextNormalizer
+from .english import EnglishTextNormalizer as EnglishTextNormalizer
@@ -48,13 +48,16 @@ def remove_symbols(s: str):
    Replace any other markers, symbols, punctuations with a space, keeping diacritics
    """
    return "".join(
-        " " if unicodedata.category(c)[0] in "MSP" else c for c in unicodedata.normalize("NFKC", s)
+        " " if unicodedata.category(c)[0] in "MSP" else c
+        for c in unicodedata.normalize("NFKC", s)
    )


 class BasicTextNormalizer:
    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
-        self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols
+        self.clean = (
+            remove_symbols_and_diacritics if remove_diacritics else remove_symbols
+        )
        self.split_letters = split_letters

    def __call__(self, s: str):
@@ -66,6 +69,8 @@ class BasicTextNormalizer:
        if self.split_letters:
            s = " ".join(regex.findall(r"\X", s, regex.U))

-        s = re.sub(r"\s+", " ", s)  # replace any successive whitespace characters with a space
+        s = re.sub(
+            r"\s+", " ", s
+        )  # replace any successive whitespace characters with a space

        return s
@@ -84,7 +84,8 @@ class EnglishNumberNormalizer:
            name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
        }
        self.tens_ordinal = {
-            name.replace("y", "ieth"): (value, "th") for name, value in self.tens.items()
+            name.replace("y", "ieth"): (value, "th")
+            for name, value in self.tens.items()
        }
        self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}

@@ -108,7 +109,10 @@ class EnglishNumberNormalizer:
        self.multipliers_ordinal = {
            name + "th": (value, "th") for name, value in self.multipliers.items()
        }
-        self.multipliers_suffixed = {**self.multipliers_plural, **self.multipliers_ordinal}
+        self.multipliers_suffixed = {
+            **self.multipliers_plural,
+            **self.multipliers_ordinal,
+        }
        self.decimals = {*self.ones, *self.tens, *self.zeros}

        self.preceding_prefixers = {
@@ -128,7 +132,8 @@ class EnglishNumberNormalizer:
            "cents": "¢",
        }
        self.prefixes = set(
-            list(self.preceding_prefixers.values()) + list(self.following_prefixers.values())
+            list(self.preceding_prefixers.values())
+            + list(self.following_prefixers.values())
        )
        self.suffixers = {
            "per": {"cent": "%"},
@@ -218,7 +223,9 @@ class EnglishNumberNormalizer:
                if value is None:
                    value = ones
                elif isinstance(value, str) or prev in self.ones:
-                    if prev in self.tens and ones < 10:  # replace the last zero with the digit
+                    if (
+                        prev in self.tens and ones < 10
+                    ):  # replace the last zero with the digit
                        assert value[-1] == "0"
                        value = value[:-1] + str(ones)
                    else:
@@ -522,14 +529,14 @@ class EnglishTextNormalizer:
        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
        s = re.sub(self.ignore_patterns, "", s)
-        s = re.sub(r"\s+'", "'", s)  # standardize when there's a space before an apostrophe
+        s = re.sub(r"\s+'", "'", s)  # when there's a space before an apostrophe

        for pattern, replacement in self.replacers.items():
            s = re.sub(pattern, replacement, s)

        s = re.sub(r"(\d),(\d)", r"\1\2", s)  # remove commas between digits
        s = re.sub(r"\.([^0-9]|$)", r" \1", s)  # remove periods not followed by numbers
-        s = remove_symbols_and_diacritics(s, keep=".%$¢€£")  # keep some symbols for numerics
+        s = remove_symbols_and_diacritics(s, keep=".%$¢€£")  # keep numeric symbols

        s = self.standardize_numbers(s)
        s = self.standardize_spellings(s)
@@ -538,6 +545,6 @@ class EnglishTextNormalizer:
        s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
        s = re.sub(r"([^0-9])%", r"\1 ", s)

-        s = re.sub(r"\s+", " ", s)  # replace any successive whitespace characters with a space
+        s = re.sub(r"\s+", " ", s)  # replace any successive whitespaces with a space

        return s