NILC-ICMC-USP committed on
Commit
e139db3
·
verified ·
1 Parent(s): ec63fa6

Upload postprocess.py

Browse files
Files changed (1) hide show
  1. src/postproc/postprocess.py +359 -308
src/postproc/postprocess.py CHANGED
@@ -1,8 +1,8 @@
1
  #################################################
2
- ### Post Processing Program to Portparser.v2
3
  #################################################
4
  #
5
- # (c) Lucelene Lopes 2024
6
  #
7
  ##################
8
  # main function: fixLemmaFeatures()
@@ -83,23 +83,26 @@ def parseOptions(arguments):
83
  #################################################
84
def getUsualAbbr():
    """Load the usual-abbreviation table from ``./src/postproc/usAbbr.tsv``.

    Each non-comment row is tab-separated; the second column selects which
    list the row goes to ("week", "month", "abbr" or "ordinal") and the row
    is stored as ``[col0, col2, col3, col4]`` (the file's own comment names
    these form, POS, lemma and feats).  Rows of any other kind are ignored.

    Returns:
        tuple: ``(dayW, month, abbr, ordinal)``, four lists of 4-item lists.

    Raises:
        OSError: if the table cannot be opened.
        IndexError: if a non-blank data row has too few columns.
    """
    dayW, month, abbr, ordinal = [], [], [], []
    by_kind = {"week": dayW, "month": month, "abbr": abbr, "ordinal": ordinal}
    # 'with' guarantees the handle is closed; the original leaked it.
    with open("./src/postproc/usAbbr.tsv", "r", encoding="utf-8") as infile:
        for line in infile:
            # Skip comments and blank lines (a blank line used to crash on buf[1]).
            if line.startswith("#") or not line.strip():
                continue
            # rstrip("\n") instead of [:-1]: a final line without a trailing
            # newline must not lose its last character.
            buf = line.rstrip("\n").split("\t")
            target = by_kind.get(buf[1])
            if target is not None:
                target.append([buf[0], buf[2], buf[3], buf[4]])
    return dayW, month, abbr, ordinal
100
 
101
  #################################################
102
- ### Function - Check if word is in an abbreviation list
 
 
 
 
 
 
 
 
 
103
  #################################################
104
  def isWithin(listAbbr, form):
105
  for a in listAbbr:
@@ -108,7 +111,7 @@ def isWithin(listAbbr, form):
108
  return None, None, None
109
 
110
  #################################################
111
- ### Function - Check if word is in an abbreviation list
112
  #################################################
113
  def print_reps(repfile, accName, acc):
114
  print("\n==========================================================\n", file=repfile)
@@ -117,16 +120,145 @@ def print_reps(repfile, accName, acc):
117
  print("{:8} - fixed: {:6>}".format(accName[i], acc[i]))
118
 
119
  #################################################
120
- ### Main Function - Fix Lemma and Features
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  #################################################
122
- def fixLemmaFeatures():
123
- lexCloseTags = ["ADP", "ADV", "CCONJ", "DET", "PRON", "SCONJ"]
124
- lexOpenTags = ["ADJ", "INTJ", "NOUN", "NUM"]
125
- lexVerbTags = ["AUX", "VERB"]
126
- lexOutOfTags = ["PROPN", "PUNCT", "SYM"]
127
- # the POS tag "X" is dealt differently
 
 
 
 
 
 
 
128
  if (len(sys.argv) == 1):
129
- arguments = ["xxx.conllu", "yyy.conllu", True, True, False]
130
  print("Assumindo default: 'yyy.conllu' como arquivo de entrada, 'xxx.conllu' como arquivo de saída, e executando correção de lemas e features.")
131
  else:
132
  arguments = parseOptions(sys.argv)
@@ -135,306 +267,225 @@ def fixLemmaFeatures():
135
  print("Assumindo 'xxx.conllu' como arquivo de saída")
136
  arguments[0] = 'xxx.conllu'
137
  if not os.path.isfile(arguments[1]):
138
- print("Arquivo de entrada inválido - por favor corrija e tente novamente")
139
  else:
140
  outfile = open(arguments[0], "w")
141
- repfile = open(arguments[0]+".rep.tsv", "w")
142
  base = conlluFile(arguments[1])
143
- # contadores
144
- accName = ["Lchanged", "LnoLEX", "L1LEX", "LmLEX", "LunkTAG", \
145
- "LdaysW", "Lmonth", "LuAbbr", "Lord", "LrepTAG", \
146
- "Fchanged", "FnoLEX", "F1LEX", "FmLEX", "FunkTAG", \
147
- "FdaysW", "Fmonth", "FuAbbr", "Ford", "FrepTAG", \
148
- ]
149
  acc = [0]*len(accName)
150
- # usual Abbr (lidas de um .tsv no formato "forma", "tipo", "POS", "lemma", "feats")
151
- dayW, month, abbr, ordinal = getUsualAbbr()
152
  # main loop
153
  for i in range(base.getS()):
154
  b = base.getSentByIndex(i)
 
155
  for tk in b[4]:
156
- # skip contracted words
157
  if ("-" in tk[0]):
 
158
  continue
159
- # check if abbreviation
160
- pos, lem, feat = isWithin(abbr, tk[1].lower())
161
- if (pos == tk[3]):
162
- if (arguments[2]): # fix Lemma
163
- if (tk[2] != lem):
164
- acc[accName.index("Lchanged")] += 1
165
- acc[accName.index("LuAbbr")] += 1
166
- if (not arguments[4]):
167
- print(b[0], tk[0], tk[1], "LuAbbr", tk[2], lem, sep="\t", file=repfile)
168
- tk[2] = lem
169
- if (arguments[3]): # fix Features
170
- if (tk[5] != feat):
171
- acc[accName.index("Fchanged")] += 1
172
- acc[accName.index("FuAbbr")] += 1
173
- if (not arguments[4]):
174
- print(b[0], tk[0], tk[1], "FuAbbr", tk[5], feat, sep="\t", file=repfile)
175
- tk[5] = feat
176
- # check if day of the week
177
- elif ("NOUN" == tk[3]):
178
- pos, lem, feat = isWithin(dayW, tk[1].lower())
179
- if (arguments[2]) and (pos is not None): # fix Lemma
180
- if (tk[2] != lem):
181
- acc[accName.index("Lchanged")] += 1
182
- acc[accName.index("LdaysW")] += 1
183
- if (not arguments[4]):
184
- print(b[0], tk[0], tk[1], "LdaysW", tk[2], lem, sep="\t", file=repfile)
185
- tk[2] = lem
186
- if (arguments[3]) and (pos is not None): # fix Features
187
- if (tk[5] != feat):
188
- acc[accName.index("Fchanged")] += 1
189
- acc[accName.index("FdaysW")] += 1
190
- if (not arguments[4]):
191
- print(b[0], tk[0], tk[1], "FdaysW", tk[5], feat, sep="\t", file=repfile)
192
- tk[5] = feat
193
- # check if month
194
- elif ("NOUN" == tk[3]):
195
- pos, lem, feat = isWithin(month, tk[1].lower())
196
- if (arguments[2]) and (pos is not None): # fix Lemma
197
- if (tk[2] != lem):
198
- acc[accName.index("Lchanged")] += 1
199
- acc[accName.index("Lmonth")] += 1
200
- if (not arguments[4]):
201
- print(b[0], tk[0], tk[1], "Lmonth", tk[2], lem, sep="\t", file=repfile)
202
- tk[2] = lem
203
- if (arguments[3]) and (pos is not None): # fix Features
204
- if (tk[5] != feat):
205
- acc[accName.index("Fchanged")] += 1
206
- acc[accName.index("Fmonth")] += 1
207
- if (not arguments[4]):
208
- print(b[0], tk[0], tk[1], "Fmonth", tk[5], feat, sep="\t", file=repfile)
209
- tk[5] = feat
210
- # check if ordinal
211
- elif ("ADJ" == tk[3]):
212
- pos, lem, feat = isWithin(ordinal, tk[1].lower())
213
- if (arguments[2]) and (pos is not None): # fix Lemma
214
- if (tk[2] != lem):
215
- acc[accName.index("Lchanged")] += 1
216
- acc[accName.index("Lord")] += 1
217
- if (not arguments[4]):
218
- print(b[0], tk[0], tk[1], "Lord", tk[2], lem, sep="\t", file=repfile)
219
- tk[2] = lem
220
- if (arguments[3]) and (pos is not None): # fix Features
221
- if (tk[5] != feat):
222
- acc[accName.index("Fchanged")] += 1
223
- acc[accName.index("Ford")] += 1
224
- if (not arguments[4]):
225
- print(b[0], tk[0], tk[1], "Ford", tk[5], feat, sep="\t", file=repfile)
226
- tk[5] = feat
227
- # check if POS tag X
228
- elif (tk[3] == "X"):
229
- if (arguments[2]): # fix Lemma
230
- if (tk[1] != tk[2]):
231
- acc[accName.index("Lchanged")] += 1
232
- acc[accName.index("LrepTAG")] += 1
233
- if (not arguments[4]):
234
- print(b[0], tk[0], tk[1], "LrepTAG", tk[2], tk[1], sep="\t", file=repfile)
235
- tk[2] = tk[1]
236
- if (arguments[3]): # fix Features
237
- if (tk[5] not in ["Foreign=Yes", "_"]):
238
- acc[accName.index("Fchanged")] += 1
239
- acc[accName.index("FrepTAG")] += 1
240
- if (not arguments[4]):
241
- print(b[0], tk[0], tk[1], "FrepTAG", tk[5], "Foreign=Yes", sep="\t", file=repfile)
242
- tk[5] = "Foreign=Yes"
243
- # check if POS tag out of the Lexicon
244
- elif (tk[3] in lexOutOfTags):
245
- if (arguments[2]): # fix Lemma
246
- if (tk[1] != tk[2]):
247
- acc[accName.index("Lchanged")] += 1
248
- acc[accName.index("LrepTAG")] += 1
249
- if (not arguments[4]):
250
- print(b[0], tk[0], tk[1], "LrepTAG", tk[2], tk[1], sep="\t", file=repfile)
251
- tk[2] = tk[1]
252
- if (arguments[3]): # fix Features
253
- if (tk[5] != "_"):
254
- acc[accName.index("Fchanged")] += 1
255
- acc[accName.index("FrepTAG")] += 1
256
- if (not arguments[4]):
257
- print(b[0], tk[0], tk[1], "FrepTAG", tk[5], "_", sep="\t", file=repfile)
258
- tk[5] = "_"
259
- # check NUM in numeric form
260
- elif (tk[3] == "NUM") and ((tk[1][0].isdigit()) or \
261
- ((tk[1][0] in ["-", "+"]) and (tk[1][1].isdigit()))):
262
- if (arguments[2]): # fix Lemma
263
- if (tk[1] != tk[2]):
264
- acc[accName.index("Lchanged")] += 1
265
- acc[accName.index("LrepTAG")] += 1
266
- if (not arguments[4]):
267
- print(b[0], tk[0], tk[1], "LrepTAG", tk[2], tk[1], sep="\t", file=repfile)
268
- tk[2] = tk[1]
269
- if (arguments[3]): # fix Features
270
- if (tk[5] not in ["NumType=Card", "_"]):
271
- acc[accName.index("Fchanged")] += 1
272
- acc[accName.index("FrepTAG")] += 1
273
- if (not arguments[4]):
274
- print(b[0], tk[0], tk[1], "FrepTAG", tk[5], "NumType=Card", sep="\t", file=repfile)
275
- tk[5] = "NumType=Card"
276
- # check Close POS tags (ADP, ADV, CCONJ, DET, PRON, SCONJ)
277
  elif (tk[3] in lexCloseTags):
278
- lowForm = tk[1].lower()
279
- options = lex.pget(lowForm, tk[3])
280
- if (len(options) == 0): ### not found
281
- if (arguments[2]): # fix Lemma
282
- if (tk[1] != tk[2]):
283
- acc[accName.index("Lchanged")] += 1
284
- acc[accName.index("LnoLEX")] += 1
285
- if (not arguments[4]):
286
- print(b[0], tk[0], tk[1], "LnoLEX", tk[2], tk[1], sep="\t", file=repfile)
287
- tk[2] = tk[1]
288
- if (arguments[3]): # fix Features
289
- if (tk[5] != "_"):
290
- acc[accName.index("Fchanged")] += 1
291
- acc[accName.index("FnoLEX")] += 1
292
- if (not arguments[4]):
293
- print(b[0], tk[0], tk[1], "FnoLEX", tk[5], "_", sep="\t", file=repfile)
294
- tk[5] = "_"
295
- elif (len(options) == 1): ### single option
296
- if (arguments[2]): # fix Lemma
297
- if (options[0][0] != tk[2]):
298
- acc[accName.index("Lchanged")] += 1
299
- acc[accName.index("L1LEX")] += 1
300
- if (not arguments[4]):
301
- print(b[0], tk[0], tk[1], "L1LEX", tk[2], options[0][0], sep="\t", file=repfile)
302
- tk[2] = options[0][0]
303
- if (arguments[3]): # fix Features
304
- if (options[0][2] != tk[5]):
305
- acc[accName.index("Fchanged")] += 1
306
- acc[accName.index("F1LEX")] += 1
307
- if (not arguments[4]):
308
- print(b[0], tk[0], tk[1], "F1LEX", tk[5], options[0][2], sep="\t", file=repfile)
309
- tk[5] = options[0][2]
310
- elif (len(options) > 1): ### multiple option
311
- lemmas, guess = [], "x"*100
312
- for o in options:
313
- lemmas.append(o[0])
314
- if (len(o[0]) < len(guess)):
315
- guess = o[0]
316
- if (arguments[2]): # fix Lemma
317
- if (tk[2] not in lemmas):
318
- acc[accName.index("Lchanged")] += 1
319
- acc[accName.index("LmLEX")] += 1
320
- if (not arguments[4]):
321
- print(b[0], tk[0], tk[1], "LmLEX", tk[2], guess, sep="\t", file=repfile)
322
- tk[2] = guess
323
- feats, guess = [], "x"*100
324
- for o in options:
325
- feats.append(o[2])
326
- if (tk[3] in ["DET", "PRON"]) and ("Person" in o[2]) and ("Person" not in guess):
327
- guess = o[2]
328
- elif (tk[3] in ["DET", "PRON"]) and ("Person" in o[2]) and ("Person" in guess) and (len(o[2]) < len(guess)):
329
- guess = o[2]
330
- if (arguments[3]): # fix Features
331
- if (tk[5] not in feats):
332
- acc[accName.index("Fchanged")] += 1
333
- acc[accName.index("FmLEX")] += 1
334
- if (not arguments[4]):
335
- print(b[0], tk[0], tk[1], "FmLEX", tk[5], guess, sep="\t", file=repfile)
336
- tk[5] = guess
337
- # check Open POS tags (ADJ, INTJ, NOUN, NUM)
 
 
338
  elif (tk[3] in lexOpenTags):
339
- lowForm = tk[1].lower()
340
- options = lex.pget(lowForm, tk[3])
341
- if (len(options) == 0): ### not found
342
- continue
343
- elif (len(options) == 1): ### single option
344
- if (arguments[2]): # fix Lemma
345
- if (options[0][0] != tk[2]):
346
- acc[accName.index("Lchanged")] += 1
347
- acc[accName.index("L1LEX")] += 1
348
- if (not arguments[4]):
349
- print(b[0], tk[0], tk[1], "L1LEX", tk[2], options[0][0], sep="\t", file=repfile)
350
- tk[2] = options[0][0]
351
- if (arguments[3]): # fix Features
352
- if (options[0][2] != tk[5]):
353
- acc[accName.index("Fchanged")] += 1
354
- acc[accName.index("F1LEX")] += 1
355
- if (not arguments[4]):
356
- print(b[0], tk[0], tk[1], "F1LEX", tk[5], options[0][2], sep="\t", file=repfile)
357
- tk[5] = options[0][2]
358
- elif (len(options) > 1): ### multiple option
359
- lemmas, guess = [], "x"*100
360
- for o in options:
361
- lemmas.append(o[0])
362
- if (len(o[0]) < len(guess)):
363
- guess = o[0]
364
- if (arguments[2]): # fix Lemma
365
- if (tk[2] not in lemmas) and (guess[:5] != "xxxxx"):
366
- acc[accName.index("Lchanged")] += 1
367
- acc[accName.index("LmLEX")] += 1
368
- if (not arguments[4]):
369
- print(b[0], tk[0], tk[1], "LmLEX", tk[2], guess, sep="\t", file=repfile)
370
- tk[2] = guess
371
- feats, guess = [], "x"*100
372
- for o in options:
373
- feats.append(o[2])
374
- if (len(o[2]) < len(guess)):
375
- guess = o[2]
376
- if (arguments[3]): # fix Features
377
- if (tk[5] not in feats) and (guess[:5] != "xxxxx"):
378
- acc[accName.index("Fchanged")] += 1
379
- acc[accName.index("FmLEX")] += 1
380
- if (not arguments[4]):
381
- print(b[0], tk[0], tk[1], "FmLEX", tk[5], guess, sep="\t", file=repfile)
382
- tk[5] = guess
383
- # check VERB and AUX POS tags
384
  elif (tk[3] in lexVerbTags):
385
- lowForm = tk[1].lower()
386
- options = lex.pget(lowForm, tk[3])
387
- if (len(options) == 0): ### not found
388
- continue
389
- elif (len(options) == 1): ### single option
390
- if (arguments[2]): # fix Lemma
391
- if (options[0][0] != tk[2]):
392
- acc[accName.index("Lchanged")] += 1
393
- acc[accName.index("L1LEX")] += 1
394
- if (not arguments[4]):
395
- print(b[0], tk[0], tk[1], "L1LEX", tk[2], options[0][0], sep="\t", file=repfile)
396
- tk[2] = options[0][0]
397
- if (arguments[3]): # fix Features
398
- if (options[0][2] != tk[5].replace("|Voice=Pass", "")):
399
- acc[accName.index("Fchanged")] += 1
400
- acc[accName.index("F1LEX")] += 1
401
- if (not arguments[4]):
402
- print(b[0], tk[0], tk[1], "F1LEX", tk[5], options[0][2], sep="\t", file=repfile)
403
- tk[5] = options[0][2]
404
- elif (len(options) > 1): ### multiple option
405
- lemmas, guess = [], "x"*100
406
- for o in options:
407
- lemmas.append(o[0])
408
- if (len(o[0]) < len(guess)):
409
- guess = o[0]
410
- if (arguments[2]): # fix Lemma
411
- if (tk[2] not in lemmas) and (guess[:5] != "xxxxx"):
412
- acc[accName.index("Lchanged")] += 1
413
- acc[accName.index("LmLEX")] += 1
414
- if (not arguments[4]):
415
- print(b[0], tk[0], tk[1], "LmLEX", tk[2], guess, sep="\t", file=repfile)
416
- tk[2] = guess
417
- feats, guess = [], "x"*100
418
- for o in options:
419
- feats.append(o[2])
420
- if ("Person=3" in o[2]) and ("Person=3" not in guess):
421
- guess = o[2]
422
- elif ("Person=3" in o[2]) and ("Person=3" in guess):
423
- if (len(o[2]) < len(guess)):
424
- guess = o[2]
425
- if (arguments[3]): # fix Features
426
- if (tk[5].replace("|Voice=Pass", "") not in feats) and (guess[:5] != "xxxxx"):
427
- acc[accName.index("Fchanged")] += 1
428
- acc[accName.index("FmLEX")] += 1
429
- if (not arguments[4]):
430
- print(b[0], tk[0], tk[1], "FmLEX", tk[5], guess, sep="\t", file=repfile)
431
- tk[5] = guess
432
-
433
- print_reps(repfile, accName, acc)
 
 
 
 
 
 
 
 
 
 
434
  base.printNoHeader(outfile)
435
- repfile.close()
436
  outfile.close()
437
 
438
- fixLemmaFeatures()
439
-
440
-
 
1
  #################################################
2
+ ### Post Processing Program to Portparser.v3
3
  #################################################
4
  #
5
+ # (c) Lucelene Lopes 2025
6
  #
7
  ##################
8
  # main function: fixLemmaFeatures()
 
83
  #################################################
84
def getUsualAbbr():
    """Load the known abbreviations from ``./src/postproc/usAbbr.tsv``.

    Each non-comment row is tab-separated.  Only rows whose second column is
    ``"abbr"`` are kept, each stored as ``[col0, col2, col3, col4]``
    (presumably form, UPOS, lemma and feats -- confirm against the .tsv).

    Returns:
        list: one 4-item list per abbreviation row, in file order.

    Raises:
        OSError: if the table cannot be opened.
        IndexError: if a non-blank data row has too few columns.
    """
    abbr = []
    # 'with' guarantees the handle is closed; the original leaked it.
    with open("./src/postproc/usAbbr.tsv", "r", encoding="utf-8") as infile:
        for line in infile:
            # Skip comments and blank lines (a blank line used to crash on buf[1]).
            if line.startswith("#") or not line.strip():
                continue
            # rstrip("\n") instead of [:-1]: a final line without a trailing
            # newline must not lose its last character.
            buf = line.rstrip("\n").split("\t")
            if buf[1] == "abbr":
                abbr.append([buf[0], buf[2], buf[3], buf[4]])
    return abbr
 
 
94
 
95
  #################################################
96
+ ### Function - Check if word is in the abbreviation list
97
+ #################################################
98
def isAbbr(listAbbr, form):
    """Return True when `form` equals the first item of any entry in `listAbbr`.

    Args:
        listAbbr: iterable of indexable entries; only item 0 is inspected.
        form: the value to look for (callers pass a lower-cased word form).

    Returns:
        bool: True on the first match, False if no entry matches.
    """
    return any(form == entry[0] for entry in listAbbr)
103
+
104
+ #################################################
105
+ ### Function - Get the info of a word found in an abbreviation list
106
  #################################################
107
  def isWithin(listAbbr, form):
108
  for a in listAbbr:
 
111
  return None, None, None
112
 
113
  #################################################
114
+ ### Function - Print a frequency list
115
  #################################################
116
  def print_reps(repfile, accName, acc):
117
  print("\n==========================================================\n", file=repfile)
 
120
  print("{:8} - fixed: {:6>}".format(accName[i], acc[i]))
121
 
122
  #################################################
123
+ ### Function - fix upper-case letters in compound words
124
+ #################################################
125
def fixCompoundUpper(form, lemma, upos, feats):
    """Normalise the lemma (and, for some tags, the feats) of a compound word.

    Args:
        form: surface form of the token.
        lemma: current lemma of the token.
        upos: part-of-speech tag of the token.
        feats: current feature string of the token.

    Returns:
        tuple: ``(upos, lemma, feats)`` to assign to the token.  For the tags
        PROPN, SYM, X and PUNCT the lemma is the form itself and the feats
        are ``"_"``; for every other tag the lemma is lower-cased and the
        feats are passed through unchanged.
    """
    # NOTE: a finer treatment that would only lower-case upper-case letters
    # inside each hyphen-separated segment of the lemma was drafted but is
    # disabled, and no feature handling is implemented yet.
    if upos not in ("PROPN", "SYM", "X", "PUNCT"):
        return upos, lemma.lower(), feats
    return upos, form, "_"
150
+
151
+ #################################################
152
+ ### Function - assemble feats
153
+ #################################################
154
def _toggle_feature(feats, item, wanted):
    """Return `feats` with the on/off feature `item` present iff `wanted`."""
    if wanted:
        if item not in feats:
            feats.append(item)
        return feats
    return [f for f in feats if f != item]


def _pin_feature(feats, name, value):
    """Return `feats` with `name=value` as the only `name=` entry.

    An empty `value` means "no such feature": every other `name=...` entry
    is dropped and nothing is added.
    """
    prefix = name + "="
    keep = prefix + value
    if value != "" and keep not in feats:
        feats.append(keep)
    return [f for f in feats if not f.startswith(prefix) or f == keep]


def featsFull(feat, abbr=False, extpos="", voicepass=False, prontype="", verbform="", numtype=""):
    """Assemble a feature string, forcing selected features on or off.

    Args:
        feat: input feature string, ``"_"`` or ``name=value`` items joined
            by ``|``.  Empty fragments (empty string, stray ``|``) are ignored.
        abbr: when true ``Abbr=Yes`` is ensured, otherwise it is removed.
        extpos: value to pin for ``ExtPos``; ``""`` removes any ``ExtPos=``.
        voicepass: when true ``Voice=Pass`` is ensured, otherwise removed.
        prontype: value to pin for ``PronType``; ``""`` removes it and
            ``None`` leaves whatever the input carries untouched.
        verbform: same contract as `prontype`, for ``VerbForm``.
        numtype: same contract as `prontype`, for ``NumType``.

    Returns:
        str: the items sorted case-insensitively and joined by ``|``, or
        ``"_"`` when no item is left.
    """
    # Dropping empty fragments keeps "" (or "a||b") from producing an empty
    # result or a leading "|" instead of a well-formed string.
    feats = [] if feat == "_" else [f for f in feat.split("|") if f]
    feats = _toggle_feature(feats, "Abbr=Yes", abbr)
    feats = _pin_feature(feats, "ExtPos", extpos)
    feats = _toggle_feature(feats, "Voice=Pass", voicepass)
    # None means "keep what is there" for these three features only.
    for name, value in (("PronType", prontype), ("VerbForm", verbform), ("NumType", numtype)):
        if value is not None:
            feats = _pin_feature(feats, name, value)
    if not feats:
        return "_"
    return "|".join(sorted(feats, key=str.lower))
220
+
221
+ #################################################
222
+ ### Function - locate the fixed heads in the sentence
223
+ #################################################
224
def locateExtPos(tks):
    """List the heads of the "fixed" relations found among `tks`.

    Args:
        tks: iterable of indexable tokens; item 7 is compared with "fixed"
            and item 6 is collected (presumably the DEPREL and HEAD columns
            of a CoNLL-U token -- confirm against the caller).

    Returns:
        list: the distinct item-6 values of the tokens whose item 7 is
        "fixed", in order of first appearance.
    """
    heads = []
    for token in tks:
        if token[7] != "fixed":
            continue
        head = token[6]
        if head in heads:
            continue
        heads.append(head)
    return heads
230
+
231
+ #################################################
232
+ ### Function - check options separating lemma and features
233
+ #################################################
234
def sepLEMMA_FEATS(options):
    """Split lexicon options into their distinct lemmas and distinct feats.

    Args:
        options: iterable of indexable entries; item 0 is taken as the lemma
            and item 2 as the feature string.

    Returns:
        tuple: ``(lemmas, feats)``, two lists holding each distinct value
        once, in order of first appearance.
    """
    lemmas, feature_sets = [], []
    for option in options:
        for value, bucket in ((option[0], lemmas), (option[2], feature_sets)):
            if value not in bucket:
                bucket.append(value)
    return lemmas, feature_sets
243
+
244
+ #################################################
245
+ ### Main Function - Postprocess fix of UPOS, LEMMA and FEATS
246
  #################################################
247
+ def posprocFix():
248
+ # if compound word # fix - replace upper case in Lemma only
249
+ # if the word is within known unambiguous abbr # correct arbitrarily
250
+ lexOutOfTags = ["PROPN", "PUNCT", "SYM", "X"] # correct arbitrarily
251
+ lexCloseTags = ["ADP", "ADV", "CCONJ", "SCONJ"] # correct if unique in lex, erase feats (features are impossible)
252
+ lexPronDetTags = ["DET", "PRON"] # correct if unique in lex, require 'PronType', erase impossible features
253
+ lexOpenTags = ["ADJ", "INTJ", "NOUN", "NUM"] # correct if unique in lex, erase impossible features
254
+ lexVerbTags = ["AUX", "VERB"] # correct if unique in lex, require 'VerbForm', erase impossible features
255
+ digits = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
256
+ ordinalsignsFem = ['ª', 'a']
257
+ ordinalsignsMasc = ['º', '°', 'o']
258
+ ordinalsignsNeut = ['.']
259
+
260
  if (len(sys.argv) == 1):
261
+ arguments = ["xxx.conllu", "yyy.conllu", True, True, False] # output file, input file, do lemmas, do features, run quiet(false)
262
  print("Assumindo default: 'yyy.conllu' como arquivo de entrada, 'xxx.conllu' como arquivo de saída, e executando correção de lemas e features.")
263
  else:
264
  arguments = parseOptions(sys.argv)
 
267
  print("Assumindo 'xxx.conllu' como arquivo de saída")
268
  arguments[0] = 'xxx.conllu'
269
  if not os.path.isfile(arguments[1]):
270
+ print(arguments[1], "Arquivo de entrada inválido - por favor corrija e tente novamente")
271
  else:
272
  outfile = open(arguments[0], "w")
273
+ if (not arguments[4]): repfile = open(arguments[0]+".rep.tsv", "w")
274
  base = conlluFile(arguments[1])
275
+ # counters
276
+ accName = ["Pchanged", "Lchanged", "Fchanged"]
 
 
 
 
277
  acc = [0]*len(accName)
278
+ # usual Abbr (read from .tsv with "form", "kind", "UPOS", "LEMMA", "FEATS")
279
+ usualAbbr = getUsualAbbr()
280
  # main loop
281
  for i in range(base.getS()):
282
  b = base.getSentByIndex(i)
283
+ fixeds = locateExtPos(b[4])
284
  for tk in b[4]:
285
+ # level down contracted tokens info, but ID and FORM
286
  if ("-" in tk[0]):
287
+ tk[2], tk[3], tk[4], tk[5], tk[6], tk[7], tk[8], tk[9] = "-", "-", "-", "-", "-", "-", "-", "-"
288
  continue
289
+ # fix out of lexikon tokens
290
+ if (tk[3] in lexOutOfTags):
291
+ if (tk[3] in ["PROPN", "PUNCT", "SYM"]):
292
+ pos, lem, feat = tk[3], tk[1], "_"
293
+ elif (tk[3] == "X"):
294
+ if ("Foreign=Yes" in tk[5]):
295
+ pos, lem, feat = tk[3], tk[1], "Foreign=Yes"
296
+ else:
297
+ pos, lem, feat = tk[3], tk[1], "_"
298
+ # fix only lemma in compound words
299
+ elif ("-" in tk[1]):
300
+ pos, lem, feat = fixCompoundUpper(tk[1], tk[2], tk[3], tk[5])
301
+ # fix known abbreviations
302
+ elif (isAbbr(usualAbbr, tk[1].lower())) and (tk[3] in ["ADP", "NOUN"]):
303
+ pos, lem, feat = isWithin(usualAbbr, tk[1].lower())
304
+ # fix numerical NUM, ADJ, NOUN
305
+ elif (tk[3] in ["ADJ", "NOUN", "NUM"]) and (not tk[1].isalpha()):
306
+ if (tk[3] == "NOUN"):
307
+ pos, lem, feat = tk[3], tk[1], "_"
308
+ elif (tk[3] == "ADJ"):
309
+ if (tk[1][-1] in ordinalsignsMasc):
310
+ pos, lem, feat = tk[3], tk[1], "Gender=Masc|NumType=Ord"
311
+ elif (tk[1][-1] in ordinalsignsFem):
312
+ pos, lem, feat = tk[3], tk[1], "Gender=Fem|NumType=Ord"
313
+ elif (tk[1][-1] in ordinalsignsNeut):
314
+ pos, lem, feat = tk[3], tk[1], "NumType=Ord"
315
+ else:
316
+ pos, lem, feat = tk[3], tk[1], "_"
317
+ elif (tk[3] == "NUM"):
318
+ if (tk[1][-1] in ordinalsignsMasc):
319
+ pos, lem, feat = tk[3], tk[1], "Gender=Masc|NumType=Ord"
320
+ elif (tk[1][-1] in ordinalsignsFem):
321
+ pos, lem, feat = tk[3], tk[1], "Gender=Fem|NumType=Ord"
322
+ elif (tk[1][-1] in ordinalsignsNeut):
323
+ pos, lem, feat = tk[3], tk[1], "NumType=Ord"
324
+ else:
325
+ pos, lem, feat = tk[3], tk[1], "NumType=Card"
326
+ # fix closed tags - ADP, ADV, CCONJ, SCONJ
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
  elif (tk[3] in lexCloseTags):
328
+ options = lex.pget(tk[1].lower(), tk[3])
329
+ opLEMMA, opFEATS = sepLEMMA_FEATS(options)
330
+ abbr = ("Abbr=Yes" in tk[5]) and (tk[1].lower() != tk[2])
331
+ if (tk[0] in fixeds):
332
+ if (tk[7] == "cc"):
333
+ extpos = "CCONJ"
334
+ elif (tk[7] == "advmod"):
335
+ extpos = "ADV"
336
+ elif (tk[7] == "case"):
337
+ extpos = "ADP"
338
+ elif (tk[7] == "mark"):
339
+ extpos = "SCONJ"
340
+ elif (tk[3] == "PRON"):
341
+ extpos = "PRON"
342
+ else:
343
+ extpos = tk[3]
344
+ else:
345
+ extpos = ""
346
+ if (len(options) == 0): # out of the lex
347
+ pos, lem, feat = tk[3], tk[2].lower(), featsFull("_", abbr, extpos=extpos)
348
+ elif (len(options) == 1): # unambiguous in the lex
349
+ pos, lem, feat = tk[3], options[0][0], featsFull(options[0][2], abbr, extpos=extpos)
350
+ else: # ambiguous in the lex - do nothing
351
+ pos = tk[3]
352
+ lem = opLEMMA[0] if (len(opLEMMA) == 1) else tk[2].lower()
353
+ feat = opFEATS[0] if (len(opFEATS) == 1) else tk[5]
354
+ # fix Pron and Det tags - PRON, DET
355
+ elif (tk[3] in lexPronDetTags):
356
+ options = lex.pget(tk[1].lower(), tk[3])
357
+ opLEMMA, opFEATS = sepLEMMA_FEATS(options)
358
+ abbr = ("Abbr=Yes" in tk[5]) and ((tk[1].lower() != tk[2]) or ("/" in tk[1]) or ("." in tk[1]))
359
+ if (tk[0] in fixeds):
360
+ if (tk[7] == "cc"):
361
+ extpos = "CCONJ"
362
+ elif (tk[7] == "advmod"):
363
+ extpos = "ADV"
364
+ elif (tk[7] == "case"):
365
+ extpos = "ADP"
366
+ elif (tk[7] == "mark"):
367
+ extpos = "SCONJ"
368
+ elif (tk[3] == "PRON"):
369
+ extpos = "PRON"
370
+ else:
371
+ extpos = tk[3]
372
+ else:
373
+ extpos = ""
374
+ if ("PronType" in tk[5]):
375
+ idx = tk[5].index("PronType=")+9
376
+ prontype = tk[5][idx:idx+3]
377
+ elif (tk[3] == "PRON"):
378
+ prontype = "Dem"
379
+ elif (tk[3] == "DET"):
380
+ prontype = "Art"
381
+ if (len(options) == 0): # out of the lex
382
+ pos, lem, feat = tk[3], tk[2].lower(), featsFull(tk[5], abbr, extpos=extpos, prontype=prontype)
383
+ elif (len(options) == 1): # unambiguous in the lex
384
+ pos, lem, feat = tk[3], options[0][0], featsFull(options[0][2], abbr, extpos=extpos, prontype=None)
385
+ else: # ambiguous in the lex - do nothing
386
+ pos = tk[3]
387
+ lem = opLEMMA[0] if (len(opLEMMA) == 1) else tk[2].lower()
388
+ feat = opFEATS[0] if (len(opFEATS) == 1) else tk[5]
389
+ # fix Open tags - ADJ, INTJ, NOUN, NUM
390
  elif (tk[3] in lexOpenTags):
391
+ options = lex.pget(tk[1].lower(), tk[3])
392
+ opLEMMA, opFEATS = sepLEMMA_FEATS(options)
393
+ abbr = ("Abbr=Yes" in tk[5]) and ((tk[1].lower() != tk[2]) or ("/" in tk[1]) or ("." in tk[1]))
394
+ if (tk[0] in fixeds):
395
+ if (tk[7] == "cc"):
396
+ extpos = "CCONJ"
397
+ elif (tk[7] == "advmod"):
398
+ extpos = "ADV"
399
+ elif (tk[7] == "case"):
400
+ extpos = "ADP"
401
+ elif (tk[7] == "mark"):
402
+ extpos = "SCONJ"
403
+ elif (tk[3] == "PRON"):
404
+ extpos = "PRON"
405
+ else:
406
+ extpos = tk[3]
407
+ else:
408
+ extpos = ""
409
+ if ("VerbForm=Part" in tk[5]) and (tk[3] == "ADJ"):
410
+ verbform = "Part"
411
+ else:
412
+ verbform = ""
413
+ if ("NumType=Ord" in tk[5]) and (tk[3] in ["ADJ", "NUM"]):
414
+ numtype = "Ord"
415
+ elif ("NumType=Card" in tk[5]) and (tk[3] == "NUM"):
416
+ numtype = "Card"
417
+ else:
418
+ numtype = ""
419
+ if (len(options) == 0): # out of the lex
420
+ pos, lem, feat = tk[3], tk[2].lower(), featsFull(tk[5], abbr, extpos=extpos, verbform=verbform, numtype=numtype)
421
+ elif (len(options) == 1): # unambiguous in the lex
422
+ pos, lem, feat = tk[3], options[0][0], featsFull(options[0][2], abbr, extpos=extpos, verbform=None, numtype=None)
423
+ else: # ambiguous in the lex - do nothing
424
+ pos = tk[3]
425
+ lem = opLEMMA[0] if (len(opLEMMA) == 1) else tk[2].lower()
426
+ feat = opFEATS[0] if (len(opFEATS) == 1) else tk[5]
427
+ # fix Verb tags - AUX, VERB
 
 
 
 
 
 
 
 
428
  elif (tk[3] in lexVerbTags):
429
+ options = lex.pget(tk[1].lower(), tk[3])
430
+ opLEMMA, opFEATS = sepLEMMA_FEATS(options)
431
+ abbr = ("Abbr=Yes" in tk[5]) and (tk[1].lower() != tk[2])
432
+ if (tk[0] in fixeds):
433
+ if (tk[7] == "cc"):
434
+ extpos = "CCONJ"
435
+ elif (tk[7] == "advmod"):
436
+ extpos = "ADV"
437
+ elif (tk[7] == "case"):
438
+ extpos = "ADP"
439
+ elif (tk[7] == "mark"):
440
+ extpos = "SCONJ"
441
+ elif (tk[3] == "PRON"):
442
+ extpos = "PRON"
443
+ else:
444
+ extpos = tk[3]
445
+ else:
446
+ extpos = ""
447
+ if ("VerbForm=Inf" in tk[5]):
448
+ verbform = "Inf"
449
+ elif ("VerbForm=Ger" in tk[5]):
450
+ verbform = "Ger"
451
+ elif ("VerbForm=Part" in tk[5]):
452
+ verbform = "Part"
453
+ elif ("VerbForm=Fin" in tk[5]):
454
+ verbform = "Fin"
455
+ else:
456
+ if (tk[1][-1].lower() == "r"):
457
+ verbform = "Inf"
458
+ else:
459
+ verbform = "Fin"
460
+ if ("Voice=Pass" in tk[5]):
461
+ voicepass = True
462
+ else:
463
+ voicepass = False
464
+ if (len(options) == 0): # out of the lex
465
+ pos, lem, feat = tk[3], tk[2].lower(), featsFull(tk[5], abbr, extpos=extpos, verbform=verbform, voicepass=voicepass)
466
+ elif (len(options) == 1): # unambiguous in the lex
467
+ pos, lem, feat = tk[3], options[0][0], featsFull(options[0][2], abbr, extpos=extpos, verbform=None, voicepass=voicepass)
468
+ else: # ambiguous in the lex - do nothing
469
+ pos = tk[3]
470
+ lem = opLEMMA[0] if (len(opLEMMA) == 1) else tk[2].lower()
471
+ feat = opFEATS[0] if (len(opFEATS) == 1) else tk[5]
472
+ # do reports and change
473
+ if (pos != tk[3]):
474
+ print(b[0], tk[0], tk[1], tk[3], "UPOS", tk[3], pos, sep="\t", file=repfile)
475
+ acc[accName.index("Pchanged")] += 1
476
+ tk[3] = pos
477
+ if (lem != tk[2]):
478
+ print(b[0], tk[0], tk[1], tk[3], "LEMMA", tk[2], lem, sep="\t", file=repfile)
479
+ acc[accName.index("Lchanged")] += 1
480
+ tk[2] = lem
481
+ if (feat != tk[5]):
482
+ if ("ExtPos=" not in feat):
483
+ print(b[0], tk[0], tk[1], tk[3], "FEATS", tk[5], feat, sep="\t", file=repfile)
484
+ acc[accName.index("Fchanged")] += 1
485
+ tk[5] = feat
486
+ if (not arguments[4]): print_reps(repfile, accName, acc)
487
+ if (not arguments[4]): repfile.close()
488
  base.printNoHeader(outfile)
 
489
  outfile.close()
490
 
491
+ posprocFix()