Как вывести результат работы этой программы в текстовый файл?
Как вывести результат работы этой программы в текстовый файл?
import os
from UzMorphAnalyser import UzMorphAnalyser
import re
import pandas as pd
from nltk.tokenize import word_tokenize
class Tagger:
def __word_tokenize(self, text: str):
tokens = re.findall(r'[A-Za-z\‘’]+(?:\`[A-Za-z]+)?|["!;:.,?-]|\w+', text)
return tokens
def pos_tag(self, text: str):
dirname = os.path.dirname(__file__) + "/"
tagged_text = []
# 0. text cleaning
text = self.__text_cleaning(text)
# 1. tokenize to words
tokens = self.__word_tokenize(text)
# 2. read data
with open(os.path.join(dirname + "non_affixed_word.csv"), "r", encoding='utf-8') as f:
df_non_aff = pd.read_csv(f)
with open(os.path.join(dirname + "lexicon.csv"), "r", encoding='utf-8') as f:
df_lexicon = pd.read_csv(f)
df_lexicon = df_lexicon.sort_values(by="stem", key=lambda x: x.str.len()) # sorting to stands a small one at top, after that get first item as a result
uzmorph = UzMorphAnalyser()
punctuation = {'.', ',', '!', '?', ':', ';', '–', '–', '—', '(', ')', '[', ']', '{', '}', '<', '>', '“', '”'} # punctuation list, set pos tag as "PUNC"
symbol = {'$', '*', '#', '@', '%', '&', '⟨', '⟩', '/', '~', '\'', '^', '|'} # symbol list, set pos tag as "SYM" (Symbol)
for token in tokens:
# 3. check from ready list
# 3.1 check from punctuation list
if token in punctuation:
tagged_text.append((token, self.POS.PUNC)) # punctuation
continue
# 3.2 check from symbol list
if token in symbol:
tagged_text.append((token, self.POS.SYM)) # symbol
continue
token_l = token.lower()
# 3.3 check from non_affixed_words
df1 = df_non_aff.loc[df_non_aff['stem'] == token_l]
if not df1.empty:
tagged_text.append((token, df1["pos"].iloc[0])) # "non_aff"
continue
# 3.4. lookup from lexicon.csv, in reverse order by remove character
df1 = df_lexicon[df_lexicon['stem'].str.startswith(token_l)]
if not df1.empty:
tagged_text.append((token, df1["pos"].iloc[0])) # found in "lexicon" file
continue
# 3.5. UzMorphAnalyser checking process
res = uzmorph.analyze(token)
if res[0]["pos"] != None:
tagged_text.append((token, res[0]["pos"])) # tagged from "uzmorph" UzMorphAnalyser
continue
# 7. still can not find, make any decision
tagged_text.append((token, self.POS.NOUN)) # other
return tagged_text
class POS:
NOUN = "NOUN" # Noun
VERB = "VERB" # Verb
ADJ = "ADJ" # Adjective
NUM = "NUM" # Numeric
ADV = "ADV" # Adverb
PRN = "PRN" # Pronoun
CNJ = "CNJ" # Conjunction
ADP = "ADP" # Adposition
PRT = "PRT" # Particle
INTJ = "INTJ" # Interjection
MOD = "MOD" # Modal
IMIT = "IMIT" # Imitation
AUX = "AUX" # Auxiliary verb
PPN = "PPN" # Proper noun
PUNC = "PUNC" # Punctuation
SYM = "SYM" # Symbol
def help(self):
return "<list of tuple> pos_tag(<text>) \n pos_tag_list()"
def pos_tag_list(self):
return [
('NOUN', 'Noun', 'Ot'),
('ADJ', 'Adjective', 'Sifat'),
('NUM', 'Number', 'Son'),
('PRON', 'Pronoun', 'Olmosh'),
('ADV', 'Adverb', 'Ravish'),
('VERB', 'Verb', 'Fel'),
('CNJ', 'Conjuction', 'Bog`lovchi'),
('ADP', 'Adposition', 'Ko`makchi'),
('PRT', 'Particle', 'Yuklama'),
('INTJ', 'Interjection', 'Undov'),
('MOD', 'Modal', 'Modal'),
('IMIT', 'Imitation', 'Taqlid'),
('AUX', 'Auxiliary verb', 'Yodamchi fel'),
('PPN', 'Proper Noun', 'Atoqli ot'),
('PUNC', 'Punctuation', 'Tinish belgi'),
('SYM', 'Symbol', 'Belgi')
]
def __text_cleaning(self, text: str):
text = text.replace("g'", "g‘")
text = text.replace("o'", "o‘")
text = text.replace("g`", "g‘")
text = text.replace("o`", "o‘")
text = text.replace("g’", "g‘")
text = text.replace("o’", "o‘")
text = text.replace("gʻ", "g‘")
text = text.replace("oʻ", "o‘")
text = text.replace("G'", "G‘")
text = text.replace("O'", "O‘")
text = text.replace("G`", "G‘")
text = text.replace("O`", "O‘")
text = text.replace("G’", "G‘")
text = text.replace("O’", "O‘")
text = text.replace("Gʻ", "G‘")
text = text.replace("Oʻ", "O‘")
text = text.replace("'", "’") # boshqa belgilarni ъ ni kodiga utirish
text = text.replace("ʼ", "’") # boshqa belgilarni ъ ni kodiga utirish
text = text.replace("’", "’") # boshqa belgilarni ъ ni kodiga utirish
return text