Added example handling for abbreviations and compound words.

This commit is contained in:
Heiko J Schick
2022-08-31 11:20:08 +02:00
parent 27837b270b
commit 0c09e3a91e
+20 -1
View File
@@ -40,13 +40,32 @@ def main():
# Convert to sentences
for line in lines:
# abbreviations (in alphabetical order)
line = line.replace("%", "per cent")
line = line.replace("5G", "5 G")
line = line.replace("CO2", "C O 2")
line = line.replace("EUR", "Euro")
line = line.replace("II", "2")
line = line.replace("IBM", "I B M")
line = line.replace("IMF", "I M F")
line = line.replace("OECD", "O E C D")
line = line.replace("UN", "U N")
line = line.replace("USB", "U S B")
line = line.replace("WHO", "W H O")
line = line.replace("WTO", "W T O")
# compound words
line = line.replace("biotechnology", "bio technology")
line = line.replace("Coronavirus", "Corona virus")
line = line.replace("immunocompetence", "immuno competence")
# punctuation marks
line = line.replace("-", " - ")
line = line.replace("/", ", ")
line = line.replace("", ". ")
line = line.replace(":", ". ")
line = line.replace(";", ". ")
line = line.replace("?", "?. ")
line = line.replace("(", ". ")
line = line.replace(")", ". ")
# line = line.replace(")", ". ") # TODO: Check immune system article
for x in line.split(". "):
sentences.append(x.strip())
sentences.append("<PAUSE>")