1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 """reads a set of .po or .pot files to produce a pootle-terminology.pot
21
22 See: http://translate.sourceforge.net/wiki/toolkit/poterminology for examples and
23 usage instructions
24 """
25 import os
26 import re
27 import sys
28 import logging
29
30 from translate.lang import factory as lang_factory
31 from translate.misc import optrecurse
32 from translate.storage import po
33 from translate.storage import factory
34 from translate.misc import file_discovery
35
def create_termunit(term, unit, targets, locations, sourcenotes, transnotes, filecounts):
    """Build a single PO terminology unit for *term*.

    :param term: source text of the terminology entry
    :param unit: best-matching original unit to merge metadata from (may be None)
    :param targets: dict mapping each candidate translation to the list of
        input filenames it was found in
    :param locations: source-code locations to record on the unit
    :param sourcenotes: developer comments to attach
    :param transnotes: translator comments to attach
    :param filecounts: dict mapping input filenames to occurrence counts
    :return: the assembled po.pounit
    """
    termunit = po.pounit(term)
    if unit is not None:
        # Take the translation and metadata of the best unit, but keep our
        # own locations and comments.
        termunit.merge(unit, overwrite=False, comments=False)
    if len(targets) > 1:
        # Multiple competing translations: list them all, each tagged with
        # the files it came from, e.g. "foo {a.po, b.po}; bar {c.po}".
        txt = '; '.join(["%s {%s}" % (target, ', '.join(files))
                         for target, files in targets.items()])
        if termunit.target.find('};') < 0:
            # Target does not already hold an alternatives list: use the
            # list as the translation and flag the unit for human review.
            termunit.target = txt
            termunit.markfuzzy()
        else:
            # Merged target already contains an alternatives list; keep it
            # and record the new alternatives as a translator note instead.
            termunit.addnote(txt, "translator")
    for location in locations:
        termunit.addlocation(location)
    for sourcenote in sourcenotes:
        termunit.addnote(sourcenote, "developer")
    for transnote in transnotes:
        termunit.addnote(transnote, "translator")
    for filename, count in filecounts.items():
        termunit.addnote("(poterminology) %s (%d)\n" % (filename, count), 'translator')
    return termunit
58
        # Case-handling and term-shape options.
        self.foldtitle = foldtitle      # fold "Title Case" words to lowercase
        self.ignorecase = ignorecase    # lowercase every word
        self.accelchars = accelchars    # accelerator-marker chars to strip
        self.termlength = termlength    # maximum number of words per term

        self.sourcelanguage = sourcelanguage
        self.invert = invert            # swap source/target during extraction

        # Stopword state, populated by parse_stopword_file().
        self.stopwords = {}         # word -> frozenset of disabled behaviours
        self.stoprelist = []        # compiled regexes matching stopwords
        self.stopfoldtitle = True   # case mapping applied to stoplist lookups
        self.stopignorecase = False

        if stopfile is None:
            # Fall back to the bundled stoplist for the source language.
            try:
                stopfile = file_discovery.get_abs_data_filename('stoplist-%s' % self.sourcelanguage)
            except:
                # No bundled stoplist for this language; carry on without one.
                # NOTE(review): bare except also hides unexpected errors here.
                pass
        self.stopfile = stopfile
        self.parse_stopword_file()

        # Patterns for text that must never become part of a term:
        # printf-style format specifiers (incl. positional and length mods),
        self.formatpat = re.compile(r"%(?:\([^)]+\)|[0-9]+\$)?[-+#0]*[0-9.*]*(?:[hlLzjt][hl])?[EFGXc-ginoprsux]")
        # XML/HTML elements (tags, comments, CDATA, PIs),
        self.xmlelpat = re.compile(r"<(?:![[-]|[/?]?[A-Za-z_:])[^>]*>")
        # and XML character/named entities.
        self.xmlentpat = re.compile(r"&(?:#(?:[0-9]+|x[0-9a-f]+)|[a-z_:][\w.-:]*);",
                                    flags=re.UNICODE|re.IGNORECASE)

        self.units = 0      # number of units processed so far
        self.glossary = {}  # term -> list of (source, target, unit, filename)
93
95
        # The leading character of each stoplist entry selects which
        # extraction behaviours are disabled for that word:
        #   'skip'   - word cannot start or end a phrase
        #   'word'   - word is not extracted as a single-word term
        #   'phrase' - word terminates phrase accumulation entirely
        actions = { '+': frozenset(), ':': frozenset(['skip']),
                    '<': frozenset(['phrase']), '=': frozenset(['word']),
                    '>': frozenset(['word','skip']),
                    '@': frozenset(['word','phrase']) }

        stopfile = open(self.stopfile, "r")
        line = 0
        try:
            for stopline in stopfile:
                line += 1
                stoptype = stopline[0]
                if stoptype == '#' or stoptype == "\n":
                    # Comment or blank line.
                    continue
                elif stoptype == '!':
                    # Case-mapping directive for subsequent stoplist lookups.
                    if stopline[1] == 'C':
                        # Fully case sensitive.
                        self.stopfoldtitle = False
                        self.stopignorecase = False
                    elif stopline[1] == 'F':
                        # Fold Title Case only.
                        self.stopfoldtitle = True
                        self.stopignorecase = False
                    elif stopline[1] == 'I':
                        # Ignore case entirely.
                        self.stopignorecase = True
                    else:
                        # NOTE(review): the format args are passed as a single
                        # tuple, so the "%s"/"%d" placeholders are not filled
                        # in correctly — should be
                        # logging.warning(fmt, self.stopfile, line).
                        logging.warning("%s line %d - bad case mapping directive", (self.stopfile, line))
                elif stoptype == '/':
                    # Regular-expression stopword; anchor at end of word.
                    self.stoprelist.append(re.compile(stopline[1:-1]+'$'))
                else:
                    # Plain stopword: strip marker char and trailing newline.
                    self.stopwords[stopline[1:-1]] = actions[stoptype]
        except KeyError, character:
            # Unknown leading marker character: abandon the rest of the file.
            # NOTE(review): same tuple-as-single-arg logging issue as above.
            logging.warning("%s line %d - bad stopword entry starts with", (self.stopfile, line))
            logging.warning("%s line %d all lines after error ignored", (self.stopfile, line + 1))
        stopfile.close()
128
        """returns the cleaned string that contains the text to be matched"""
        # Strip accelerator markers (e.g. '&' or '_') so "&File" matches "File".
        for accelerator in self.accelchars:
            string = string.replace(accelerator, "")
        # Blank out format specifiers, XML elements and XML entities so they
        # never become part of an extracted term.
        string = self.formatpat.sub(" ", string)
        string = self.xmlelpat.sub(" ", string)
        string = self.xmlentpat.sub(" ", string)
        string = string.strip()
        return string
138
        """return case-mapped stopword for input word"""
        # Apply the stoplist's own case-mapping mode (set by '!' directives),
        # which is independent of the extractor's ignorecase/foldtitle.
        if self.stopignorecase or (self.stopfoldtitle and word.istitle()):
            word = word.lower()
        return word
144
        """return stoplist frozenset for input word"""
        # Empty frozenset (no behaviours disabled) unless the word is listed.
        return self.stopwords.get(self.stopmap(word), defaultset)
148
        """adds (sub)phrases with non-skipwords and more than one word"""
        # A phrase qualifies only if it has more words than skipwords plus
        # one, and neither its first nor its last word is a skipword.
        if (len(words) > skips + 1 and
            'skip' not in self.stopword(words[0]) and
            'skip' not in self.stopword(words[-1])):
            self.glossary.setdefault(' '.join(words), []).append(translation)
        if partials:
            # Also record shorter prefixes of the phrase (dropping words from
            # the end), applying the same qualification rule to each prefix.
            part = list(words)
            while len(part) > 2:
                if 'skip' in self.stopword(part.pop()):
                    skips -= 1
                if (len(part) > skips + 1 and
                    'skip' not in self.stopword(part[0]) and
                    'skip' not in self.stopword(part[-1])):
                    self.glossary.setdefault(' '.join(part), []).append(translation)
164
        # Feed every translation unit through the term extractor, collecting
        # single-word terms and multi-word phrases into self.glossary.
        sourcelang = lang_factory.getlanguage(self.sourcelanguage)
        # Behaviour set applied to words matched by a regex stopword.
        rematchignore = frozenset(('word','phrase'))
        defaultignore = frozenset()
        for unit in units:
            self.units += 1
            if unit.isheader():
                continue
            if not self.invert:
                source = self.clean(unit.source)
                target = self.clean(unit.target)
            else:
                # Inverted mode: extract terminology from the translations.
                target = self.clean(unit.source)
                source = self.clean(unit.target)
            if len(source) <= 1:
                continue
            for sentence in sourcelang.sentences(source):
                words = []   # phrase accumulator for this sentence
                skips = 0    # number of skipwords currently in `words`
                for word in sourcelang.words(sentence):
                    # Stoplist lookup uses the stoplist's case mapping ...
                    stword = self.stopmap(word)
                    # ... while the glossary entry uses the extractor's own
                    # case options.
                    if self.ignorecase or (self.foldtitle and word.istitle()):
                        word = word.lower()
                    ignore = defaultignore
                    if stword in self.stopwords:
                        ignore = self.stopwords[stword]
                    else:
                        for stopre in self.stoprelist:
                            if stopre.match(stword) != None:
                                ignore = rematchignore
                                break
                    translation = (source, target, unit, fullinputpath)
                    if 'word' not in ignore:
                        # Single-word term; fold trivial plurals so "file"
                        # and "files" share one glossary entry.
                        root = word
                        if len(word) > 3 and word[-1] == 's' and word[0:-1] in self.glossary:
                            root = word[0:-1]
                        elif len(root) > 2 and root + 's' in self.glossary:
                            # Singular seen after plural: rename the entry.
                            self.glossary[root] = self.glossary.pop(root + 's')
                        self.glossary.setdefault(root, []).append(translation)
                    if self.termlength > 1:
                        if 'phrase' in ignore:
                            # Phrase-breaking word: flush accumulated words
                            # as (sub)phrases, then restart accumulation.
                            while len(words) > 2:
                                if 'skip' in self.stopword(words.pop(0)):
                                    skips -= 1
                                self.addphrases(words, skips, translation)
                            words = []
                            skips = 0
                        else:
                            words.append(word)
                            if 'skip' in ignore:
                                skips += 1
                            if len(words) > self.termlength + skips:
                                # Window exceeded maximum term length: slide
                                # it forward, emitting phrases as we go.
                                while len(words) > self.termlength + skips:
                                    if 'skip' in self.stopword(words.pop(0)):
                                        skips -= 1
                                self.addphrases(words, skips, translation)
                            else:
                                self.addphrases(words, skips, translation, partials=False)
            if self.termlength > 1:
                # Flush any words left in the accumulator at end of unit.
                # NOTE(review): uses `words`/`skips`/`translation` from the
                # last sentence processed; raises NameError if the unit
                # yielded no sentences at all — confirm against callers.
                while self.termlength > 1 and len(words) > 2:
                    if 'skip' in self.stopword(words.pop(0)):
                        skips -= 1
                    self.addphrases(words, skips, translation)
232
        # Convert the raw glossary into terminology units, applying the
        # various frequency thresholds; returns term -> (weight, unit).
        terms = {}
        # Strips a trailing ":<lineno>" so locations compare per-file.
        locre = re.compile(r":[0-9]+$")
        print >> sys.stderr, ("%d terms from %d units" %
                              (len(self.glossary), self.units))
        for term, translations in self.glossary.iteritems():
            if len(translations) <= 1:
                # A term seen only once is never interesting.
                continue
            filecounts = {}     # input filename -> occurrence count
            sources = set()     # distinct source messages containing the term
            locations = set()   # distinct source-code files (line numbers stripped)
            sourcenotes = set()
            transnotes = set()
            targets = {}        # candidate translation -> filenames
            fullmsg = False     # did the term occur as a complete message?
            bestunit = None
            for source, target, unit, filename in translations:
                sources.add(source)
                filecounts[filename] = filecounts.setdefault(filename, 0) + 1
                # A unit whose whole (cleaned) source equals the term is the
                # best candidate to supply the translation.
                if term.lower() == self.clean(unit.source).lower():
                    fullmsg = True
                    target = self.clean(unit.target)
                    if self.ignorecase or (self.foldtitle and target.istitle()):
                        target = target.lower()
                    unit.target = target
                    if target != "":
                        targets.setdefault(target, []).append(filename)
                    # Exact-match unit (up to whitespace/case) also donates
                    # its developer and translator notes.
                    if term.lower() == unit.source.strip().lower():
                        sourcenotes.add(unit.getnotes("source code"))
                        transnotes.add(unit.getnotes("translator"))
                    unit.source = term
                    bestunit = unit
                for loc in unit.getlocations():
                    locations.add(locre.sub("", loc))

            # Apply thresholds: minimum input files, minimum locations,
            # and message-count minimums (stricter for substring-only terms).
            numsources = len(sources)
            numfiles = len(filecounts)
            numlocs = len(locations)
            if numfiles < inputmin or 0 < numlocs < locmin:
                continue
            if fullmsg:
                if numsources < fullmsgmin:
                    continue
            elif numsources < substrmin:
                continue

            # Cap the recorded locations at twice the minimum, summarising
            # the overflow in a synthetic location entry.
            locmax = 2 * locmin
            if numlocs > locmax:
                locations = list(locations)[0:locmax]
                locations.append("(poterminology) %d more locations"
                                 % (numlocs - locmax))

            termunit = create_termunit(term, bestunit, targets, locations, sourcenotes, transnotes, filecounts)
            # Weight favours breadth across files, then across messages.
            terms[term] = ((10 * numfiles) + numsources, termunit)
        return terms
291
        """reduce subphrases from extracted terms"""
        # Shortest terms first so that subphrases are seen before the longer
        # phrases that contain them.
        termlist = terms.keys()
        print >> sys.stderr, "%d terms after thresholding" % len(termlist)
        termlist.sort(lambda x, y: cmp(len(x), len(y)))
        for term in termlist:
            words = term.split()
            nonstop = [word for word in words if not self.stopword(word)]
            # Drop terms made up mostly of stopwords.
            if len(nonstop) < nonstopmin and len(nonstop) != len(words):
                del terms[term]
                continue
            if len(words) <= 2:
                continue
            # Remove any prefix subphrase that has exactly the same weight as
            # this term (it carries no extra information) ...
            while len(words) > 2:
                words.pop()
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
            # ... and likewise any suffix subphrase.
            words = term.split()
            while len(words) > 2:
                words.pop(0)
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
        print >> sys.stderr, "%d terms after subphrase reduction" % len(terms.keys())
        # Apply the requested sort orders; the last order popped is applied
        # last and therefore dominates (list.sort is stable).
        termitems = terms.values()
        while len(sortorders) > 0:
            order = sortorders.pop()
            if order == "frequency":
                termitems.sort(lambda x, y: cmp(y[0], x[0]))
            elif order == "dictionary":
                termitems.sort(lambda x, y: cmp(x[1].source.lower(), y[1].source.lower()))
            elif order == "length":
                termitems.sort(lambda x, y: cmp(len(x[1].source), len(y[1].source)))
            else:
                logging.warning("unknown sort order %s", order)
        return termitems
328
329
331 """a specialized Option Parser for the terminology tool..."""
332
        """parses the command line options, handling implicit input/output args"""
        (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values)
        # Treat freestanding arguments as inputs (all but the last, which is
        # taken as output unless -o/-u was given explicitly).
        if args and not options.input:
            if not options.output and not options.update and len(args) > 1:
                options.input = args[:-1]
                args = args[-1:]
            else:
                options.input = args
                args = []

        # Last remaining argument becomes the output, but refuse to silently
        # overwrite an existing file.
        if args and not options.output and not options.update:
            if os.path.lexists(args[-1]) and not os.path.isdir(args[-1]):
                self.error("To overwrite %s, specify it with -o/--output or -u/--update" % (args[-1]))
            options.output = args[-1]
            args = args[:-1]
        if options.output and options.update:
            self.error("You cannot use both -u/--update and -o/--output")
        if args:
            self.error("You have used an invalid combination of -i/--input, -o/--output, -u/--update and freestanding args")
        if not options.input:
            self.error("No input file or directory was specified")
        # Default --inputs-needed: 1 for a single input file, 2 otherwise.
        if isinstance(options.input, list) and len(options.input) == 1:
            options.input = options.input[0]
            if options.inputmin == None:
                options.inputmin = 1
        elif not isinstance(options.input, list) and not os.path.isdir(options.input):
            if options.inputmin == None:
                options.inputmin = 1
        elif options.inputmin == None:
            options.inputmin = 2
        if options.update:
            # Update mode: the update file is both an extra input and the output.
            options.output = options.update
            if isinstance(options.input, list):
                options.input.append(options.update)
            elif options.input:
                options.input = [options.input, options.update]
            else:
                options.input = options.update
        if not options.output:
            options.output = "pootle-terminology.pot"
        return (options, args)
377
        """sets the usage string - if usage not given, uses getusagestring for each option"""
        if usage is None:
            # Build the usage line from the declared options, plus a short
            # explanation of the implicit input/output argument handling.
            self.usage = "%prog " + " ".join([self.getusagestring(option) for option in self.option_list]) + \
                    "\n  input directory is searched for PO files, terminology PO file is output file"
        else:
            super(TerminologyOptionParser, self).set_usage(usage)
385
398
        """recurse through directories and process files"""
        # Build the list of input files, recursing into directories when the
        # input is (or contains) a directory.
        if self.isrecursive(options.input, 'input') and getattr(options, "allowrecursiveinput", True):
            if isinstance(options.input, list):
                inputfiles = self.recurseinputfilelist(options)
            else:
                inputfiles = self.recurseinputfiles(options)
        else:
            if options.input:
                # Single file: split into (dirname, basename) so the common
                # path-joining logic below works unchanged.
                inputfiles = [os.path.basename(options.input)]
                options.input = os.path.dirname(options.input)
            else:
                inputfiles = [options.input]
        if os.path.isdir(options.output):
            options.output = os.path.join(options.output,"pootle-terminology.pot")

        self.initprogressbar(inputfiles, options)
        for inputpath in inputfiles:
            self.files += 1
            fullinputpath = self.getfullinputpath(options, inputpath)
            success = True
            try:
                self.processfile(None, options, fullinputpath)
            except Exception, error:
                # Never swallow a user interrupt; report anything else and
                # keep going with the remaining files.
                if isinstance(error, KeyboardInterrupt):
                    raise
                self.warning("Error processing: input %s" % (fullinputpath), options, sys.exc_info())
                success = False
            self.reportprogress(inputpath, success)
        del self.progressbar
        # All inputs scanned: write the aggregated terminology file.
        self.outputterminology(options)
430
431 - def processfile(self, fileprocessor, options, fullinputpath):
436
        """saves the generated terminology glossary"""
        termfile = po.pofile()
        print >> sys.stderr, ("scanned %d files" % self.files)
        # Threshold, filter and sort the extracted glossary ...
        terms = self.extractor.extract_terms(inputmin=options.inputmin, fullmsgmin=options.fullmsgmin,
                                             substrmin=options.substrmin, locmin=options.locmin)
        termitems = self.extractor.filter_terms(terms, nonstopmin=options.nonstopmin, sortorders=options.sortorders)
        # ... then emit the units (weights are only used for ordering).
        for count, unit in termitems:
            termfile.units.append(unit)
        # NOTE(review): file handle is not explicitly closed; relies on
        # refcounting — a with-block would be safer.
        open(options.output, "w").write(str(termfile))
447
    # optparse callback for -F: fold Title Case but keep other case.
    parser.values.ignorecase = False
    parser.values.foldtitle = True
451
453 parser.values.ignorecase = parser.values.foldtitle = False
454
    # Accepted input formats; output is always a PO/POT terminology file.
    formats = {"po":("po", None), "pot": ("pot", None), None:("po", None)}
    parser = TerminologyOptionParser(formats)

    parser.add_option("-u", "--update", type="string", dest="update",
            metavar="UPDATEFILE", help="update terminology in UPDATEFILE")

    parser.add_option("-S", "--stopword-list", type="string", metavar="STOPFILE", dest="stopfile",
            help="read stopword (term exclusion) list from STOPFILE (default %s)" %
            file_discovery.get_abs_data_filename('stoplist-en'))

    # Case-handling options: -F (default), -C and -I are mutually exclusive;
    # the callbacks keep ignorecase/foldtitle consistent.
    parser.set_defaults(foldtitle = True, ignorecase = False)
    parser.add_option("-F", "--fold-titlecase", callback=fold_case_option,
            action="callback", help="fold \"Title Case\" to lowercase (default)")
    parser.add_option("-C", "--preserve-case", callback=preserve_case_option,
            action="callback", help="preserve all uppercase/lowercase")
    parser.add_option("-I", "--ignore-case", dest="ignorecase",
            action="store_true", help="make all terms lowercase")

    parser.add_option("", "--accelerator", dest="accelchars", default="",
            metavar="ACCELERATORS", help="ignores the given accelerator characters when matching")

    # Extraction thresholds (see TerminologyExtractor.extract_terms).
    parser.add_option("-t", "--term-words", type="int", dest="termlength", default="3",
            help="generate terms of up to LENGTH words (default 3)", metavar="LENGTH")
    parser.add_option("", "--nonstop-needed", type="int", dest="nonstopmin", default="1",
            help="omit terms with less than MIN nonstop words (default 1)", metavar="MIN")
    parser.add_option("", "--inputs-needed", type="int", dest="inputmin",
            help="omit terms appearing in less than MIN input files (default 2, or 1 if only one input file)", metavar="MIN")
    parser.add_option("", "--fullmsg-needed", type="int", dest="fullmsgmin", default="1",
            help="omit full message terms appearing in less than MIN different messages (default 1)", metavar="MIN")
    parser.add_option("", "--substr-needed", type="int", dest="substrmin", default="2",
            help="omit substring-only terms appearing in less than MIN different messages (default 2)", metavar="MIN")
    parser.add_option("", "--locs-needed", type="int", dest="locmin", default="2",
            help="omit terms appearing in less than MIN different original source files (default 2)", metavar="MIN")

    # Multiple --sort options may be given; they are applied via stable
    # sorts, so earlier orders take priority over later ones.
    sortorders_default = [ "frequency", "dictionary", "length" ]
    parser.add_option("", "--sort", dest="sortorders", action="append",
            type="choice", choices=sortorders_default, metavar="ORDER", default=sortorders_default,
            help="output sort order(s): %s (default is all orders in the above priority)" % ', '.join(sortorders_default))

    parser.add_option("", "--source-language", dest="sourcelanguage", default="en",
            help="the source language code (default 'en')", metavar="LANG")
    parser.add_option("-v", "--invert", dest="invert",
            action="store_true", default=False, help="invert the source and target languages for terminology")
    parser.set_usage()
    parser.description = __doc__
    parser.run()
502
503
504 if __name__ == '__main__':
505 main()
506