Added example handling for abbreviations and compound words.
This commit is contained in:
@@ -40,13 +40,32 @@ def main():
|
|||||||
|
|
||||||
# Convert to sentences
|
# Convert to sentences
|
||||||
for line in lines:
|
for line in lines:
|
||||||
|
# abbreviations (in alphabetical order)
|
||||||
|
line = line.replace("%", "per cent")
|
||||||
|
line = line.replace("5G", "5 G")
|
||||||
|
line = line.replace("CO2", "C O 2")
|
||||||
|
line = line.replace("EUR", "Euro")
|
||||||
|
line = line.replace("II", "2")
|
||||||
|
line = line.replace("IBM", "I B M")
|
||||||
|
line = line.replace("IMF", "I M F")
|
||||||
|
line = line.replace("OECD", "O E C D")
|
||||||
|
line = line.replace("UN", "U N")
|
||||||
|
line = line.replace("USB", "U S B")
|
||||||
|
line = line.replace("WHO", "W H O")
|
||||||
|
line = line.replace("WTO", "W T O")
|
||||||
|
# compound words
|
||||||
|
line = line.replace("biotechnology", "bio technology")
|
||||||
|
line = line.replace("Coronavirus", "Corona virus")
|
||||||
|
line = line.replace("immunocompetence", "immuno competence")
|
||||||
|
# punctuation marks
|
||||||
line = line.replace("-", " - ")
|
line = line.replace("-", " - ")
|
||||||
line = line.replace("/", ", ")
|
line = line.replace("/", ", ")
|
||||||
line = line.replace("—", ". ")
|
line = line.replace("—", ". ")
|
||||||
line = line.replace(":", ". ")
|
line = line.replace(":", ". ")
|
||||||
line = line.replace(";", ". ")
|
line = line.replace(";", ". ")
|
||||||
|
line = line.replace("?", "?. ")
|
||||||
line = line.replace("(", ". ")
|
line = line.replace("(", ". ")
|
||||||
line = line.replace(")", ". ")
|
# line = line.replace(")", ". ") # TODO: Check immune system article
|
||||||
for x in line.split(". "):
|
for x in line.split(". "):
|
||||||
sentences.append(x.strip())
|
sentences.append(x.strip())
|
||||||
sentences.append("<PAUSE>")
|
sentences.append("<PAUSE>")
|
||||||
|
|||||||
Reference in New Issue
Block a user