diff --git a/lang/ca/sieve/fill_doc_date_kde.py b/lang/ca/sieve/fill_doc_date_kde.py
index b34a7580..b639abdf 100644
--- a/lang/ca/sieve/fill_doc_date_kde.py
+++ b/lang/ca/sieve/fill_doc_date_kde.py
@@ -1,83 +1,83 @@
 # -*- coding: UTF-8 -*-

 """
 Reformat documentation update date for Catalan KDE Team.

 Sieve has no options.

 @author: Alexander Potashev
 @license: GPLv3
 """

 from pology import _, n_
 from pology.report import report
 from pology.msgreport import report_msg_content

 import os
 import re


 def setup_sieve (p):

     p.set_desc(_("@info sieve discription",
     "Reformat documentation update date for Catalan KDE Team."
     ))


 class Sieve (object):

     def __init__ (self, params):
         # Some dates have non-standard format, here is the workaround for them:
         self.pretranslated = {
             u'April 8, 2003': u'8 d\'abril de 2003',
             u'Jun 7, 2005': u'7 de juny de 2005',
             u'2007-31-03': u'31 de març de 2007',
             u'June 12, 2005': u'12 de juny de 2005',
             u'2009-11-8': u'08 de novembre de 2009',
             u'May 25, 2005': u'25 de maig de 2005',
             u'28/12/2007': u'28 de desembre de 2007',
             u'28/08/2009': u'28 d\'agost de 2009',
             u'February 1st, 2005': u'1 de febrer de 2005',
             u'June 07, 2005': u'7 de juny de 2005',
             u'May 22, 2011': u'22 de maig de 2011',
             u'August 3 2012': u'22 d\'agost de 2012',
             u'April 7, 2003': u'7 d\'abril de 2003',
         }

         # Other dates should have the following format: (yyyy-mm-dd)
         self.date_re = re.compile("^[0-9][0-9][0-9][0-9]\-[0-9][0-9]\-[0-9][0-9]$")

     def format_date (self, date_en):
         if self.pretranslated.has_key(date_en):
             return self.pretranslated[date_en]
         elif self.date_re.match(date_en):
             date_result = os.popen("date '+%-d m%mm %Y' -d " + date_en).readlines()[0].decode('utf-8').rstrip() + u''
             # Translate name of months into Catalan
             return date_result.\
                 replace('m01m', u'de gener de').\
                 replace('m02m', u'de febrer de').\
                 replace('m03m', u'de març de').\
                 replace('m04m', u'd\'abril de').\
                 replace('m05m', u'de maig de').\
                 replace('m06m', u'de juny de').\
                 replace('m07m', u'de juliol de').\
                 replace('m08m', u'd\'agost de').\
                 replace('m09m', u'de setembre de').\
                 replace('m10m', u'd\'octubre de').\
                 replace('m11m', u'de novembre de').\
                 replace('m12m', u'de desembre de')
         else:
-            print "This is not a valid date: " + date_en
+            print("This is not a valid date: " + date_en)

     def process (self, msg, cat):
         # Detect documentation update date message
         if ("\n".join(msg.auto_comment) == "Tag: date"):
             new_msgstr = self.format_date(msg.msgid)
             if (msg.fuzzy or msg.msgstr[0] != new_msgstr):
                 msg.msgstr[0] = new_msgstr
                 msg.unfuzzy()
                 report_msg_content(msg, cat)

     def finalize (self):
         ""
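An aside on the sieve above: it still shells out to date(1) via os.popen with the raw msgid appended to the command line, and then patches month names through the m%mm placeholder trick. A minimal sketch, not part of the patch, of how the same yyyy-mm-dd formatting could be done with the standard library instead (format_date_ca and MONTHS_CA are hypothetical names, and the sketch assumes the plain "day month year" output seen in the table above):

    import datetime

    # Catalan month phrases keyed by month number, mirroring the
    # replace() chain in the sieve.
    MONTHS_CA = {
        1: "de gener de", 2: "de febrer de", 3: "de març de",
        4: "d'abril de", 5: "de maig de", 6: "de juny de",
        7: "de juliol de", 8: "d'agost de", 9: "de setembre de",
        10: "d'octubre de", 11: "de novembre de", 12: "de desembre de",
    }

    def format_date_ca(date_en):
        # Raises ValueError for anything that is not a yyyy-mm-dd date,
        # instead of silently passing bad input to a shell.
        d = datetime.datetime.strptime(date_en, "%Y-%m-%d").date()
        return "%d %s %d" % (d.day, MONTHS_CA[d.month], d.year)

    assert format_date_ca("2009-11-08") == "8 de novembre de 2009"

Besides dropping the external process, this avoids handing unsanitized catalog text to a shell.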
diff --git a/lang/es/scripts/createProperWordsDict.py b/lang/es/scripts/createProperWordsDict.py
index 096e7d17..6efb0721 100755
--- a/lang/es/scripts/createProperWordsDict.py
+++ b/lang/es/scripts/createProperWordsDict.py
@@ -1,85 +1,85 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-

 # Obtains a list of proper words (that that begins with a capital letter or
 # contains an intermediate capital letter)
 # that are not included yet in the local dictionary.
 # It is a tool that helps to complete the local dictionary.
 # The code is adapted from the Servian team pology scripts by Chusslove Illich.

 import fallback_import_paths

 import sys
 import os
 import re
 import locale
 import enchant

 from pology import version, _, n_
 from pology.catalog import Catalog
 from pology.colors import ColorOptionParser
 from pology.fsops import str_to_unicode, collect_catalogs
 from pology.fsops import collect_paths_cmdline
 from pology.split import proper_words
 # from pology.msgreport import warning_on_msg, report_msg_content
 # from pology.report import report, warning, error, format_item_list
 from pology.stdcmdopt import add_cmdopt_filesfrom


 def _main ():

     locale.setlocale(locale.LC_ALL, "")

     usage= _("@info command usage",
         "%(cmd)s [OPTIONS] VCS [POPATHS...]",
         cmd="%prog")
     desc = _("@info command description",
         "Obtains a list of proper words from the message text ")
     ver = _("@info command version",
         u"%(cmd)s (Pology) %(version)s\n"
         u"Copyright © 2011 "
         u"Javier Viñal <%(email)s>",
         cmd="%prog", version=version(), email="fjvinal@gmail.com")

     opars = ColorOptionParser(usage=usage, description=desc, version=ver)
     add_cmdopt_filesfrom(opars)

     (options, free_args) = opars.parse_args(str_to_unicode(sys.argv[1:]))

     # Collect PO files in given paths.
     popaths = collect_paths_cmdline(rawpaths=free_args,
                                     filesfrom=options.files_from,
                                     elsecwd=True,
                                     respathf=collect_catalogs,
                                     abort=True)

     dict_en = enchant.Dict("en")
     dict_local = enchant.Dict("es")

     for path in popaths:
         extract_proper_words(path, dict_en, dict_local)

     dict_en.close()
     for word in sorted(dict_local.session_dict()):
-        print word
+        print(word)
     dict_local.session_dict(clear=True)
     dict_local.close()


 _ent_proper_word = re.compile("^\w*?[A-Z]\w*$")

 def extract_proper_words (path, dict_en, dict_local):
     cat = Catalog(path)
     for msg in cat:
         words = proper_words(msg.msgstr[0], True, cat.accelerator(), msg.format)
         for word in words:
             if _ent_proper_word.match(word):
                 if not dict_en.check(str(word)) and not dict_local.check(str(word)):
                     #report("%s" %(word))
                     dict_local.session_dict(str(word))

 if __name__ == '__main__':
     _main()
diff --git a/lang/es/sieve/setUbsp.py b/lang/es/sieve/setUbsp.py
index 2a6ecdad..1f775be9 100644
--- a/lang/es/sieve/setUbsp.py
+++ b/lang/es/sieve/setUbsp.py
@@ -1,51 +1,51 @@
 # -*- coding: utf-8 -*-

 """
 @author: Javier Viñal
 @license: GPLv3"""

 import re


 def setup_sieve (p):
     p.set_desc("Replace normal space by non-breaking space where needed.")


 class Sieve (object):
     """Replace normal space by unbreakable space when needed"""

     def __init__ (self, params):
         self.nmatch = 0
         self.percent=re.compile("( %)(?=$| |\.|,)")

     def process (self, msg, cat):
         oldcount=msg.modcount

         for i in range(len(msg.msgstr)):
             msg.msgstr[i]=self.setUbsp(msg.msgstr[i])

         if oldcount<msg.modcount:
             self.nmatch+=1

     def finalize (self):
         if self.nmatch > 0:
-            print "Total messages changed: %d" % (self.nmatch,)
+            print("Total messages changed: %d" % (self.nmatch,))

     def setUbsp(self, text):
         """Set correctly unbreakable spaces"""
         text=text.replace(u"\xa0", u" ")
         text=text.replace(u" :", u"\xc2\xa0:")
         text=text.replace(u" :", u"\xa0:")
         text=text.replace(u" ;", u"\xa0;")
         text=text.replace(u" ?", u"\xa0?")
         text=text.replace(u" !", u"\xa0!")
         text=text.replace(u"« ", u"«\xa0")
         text=text.replace(u" »", u"\xa0»")
         text=text.replace(u" / ", u"\xa0/ ")
         text=self.percent.sub(u"\xa0%", text)
         return text
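A note on this sieve and its French counterpart below: both carry a latent encoding bug that the port preserves. The literal "\xc2\xa0" is not a non-breaking space; it is the two characters "Â" plus NBSP (the UTF-8 byte pair of U+00A0 read as code points), so the first " :" rule inserts a stray "Â". A quick self-contained check, under the assumption that a plain U+00A0 was intended:

    # "\xc2\xa0" is the UTF-8 byte pair for U+00A0 misread as two code
    # points, so this replacement rule inserts a stray "Â" before the NBSP.
    assert "\xc2\xa0" == "\u00c2\u00a0"          # "Â" + non-breaking space
    assert "x :".replace(" :", "\xc2\xa0:") == "x\u00c2\u00a0:"
    # The presumably intended rule:
    assert "x :".replace(" :", "\u00a0:") == "x\u00a0:"

The second " :" rule in setUbsp never fires, because the first one has already consumed every " :" occurrence.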
diff --git a/lang/fr/sieve/setUbsp.py b/lang/fr/sieve/setUbsp.py
index 5d32e166..a52dc56b 100644
--- a/lang/fr/sieve/setUbsp.py
+++ b/lang/fr/sieve/setUbsp.py
@@ -1,63 +1,63 @@
 # -*- coding: UTF-8 -*-

 """
 Replace normal space by non-breaking space where needed.

 Documented in C{doc/user/sieving.docbook}.

 @author: Sébastien Renard
 @license: GPLv3"""

 import re

 from pology import _, n_
 from pology.report import report


 def setup_sieve (p):
     p.set_desc(_("@info sieve description",
     "Replace normal space by non-breaking space where needed."))


 class Sieve (object):
     """Replace normal space by unbreakable space when needed"""

     def __init__ (self, params):
         self.nmatch = 0
         self.percent=re.compile("( %)(?=$| |\.|,)")

     def process (self, msg, cat):
         oldcount=msg.modcount

         for i in range(len(msg.msgstr)):
             msg.msgstr[i]=self.setUbsp(msg.msgstr[i])

         if oldcount<msg.modcount:
             self.nmatch+=1

     def finalize (self):
         if self.nmatch > 0:
             report(n_("@info",
                       "Non-breaking spaces added in %(num)d message.",
                       "Non-breaking spaces added in %(num)d messages.",
                       num=self.nmatch))

     def setUbsp(self, text):
         """Set correctly unbreakable spaces"""
-        text=text.replace(u"\xa0", u" ")
-        text=text.replace(u" :", u"\xc2\xa0:")
-        text=text.replace(u" :", u"\xa0:")
-        text=text.replace(u" ;", u"\xa0;")
-        text=text.replace(u" ?", u"\xa0?")
-        text=text.replace(u" !", u"\xa0!")
-        text=text.replace(u"« ", u"«\xa0")
-        text=text.replace(u" »", u"\xa0»")
-        text=text.replace(u" / ", u"\xa0/ ")
-        text=self.percent.sub(u"\xa0%", text)
+        text=text.replace("\xa0", " ")
+        text=text.replace(" :", "\xc2\xa0:")
+        text=text.replace(" :", "\xa0:")
+        text=text.replace(" ;", "\xa0;")
+        text=text.replace(" ?", "\xa0?")
+        text=text.replace(" !", "\xa0!")
+        text=text.replace("« ", "«\xa0")
+        text=text.replace(" »", "\xa0»")
+        text=text.replace(" / ", "\xa0/ ")
+        text=self.percent.sub("\xa0%", text)
         return text
\ No newline at end of file
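The same substitution list is duplicated in the Spanish and French sieves; a data-driven variant would keep the typographic rules in one table. A sketch only, with illustrative names (NBSP, RULES, set_nbsp) and under the assumption that plain U+00A0 is wanted throughout:

    import re

    NBSP = "\u00a0"
    # (pattern, replacement) pairs for French typography: no-break space
    # before double punctuation, inside guillemets, and before a percent sign.
    RULES = [
        (re.compile(r" ([:;?!])"), NBSP + r"\1"),
        (re.compile(r"« "), "«" + NBSP),
        (re.compile(r" »"), NBSP + "»"),
        (re.compile(r"( %)(?=$| |\.|,)"), NBSP + "%"),
    ]

    def set_nbsp(text):
        for rx, repl in RULES:
            text = rx.sub(repl, text)
        return text

    assert set_nbsp("Attention : 50 %") == "Attention\u00a0: 50\u00a0%"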
diff --git a/lang/ru/sieve/fill_doc_date_kde.py b/lang/ru/sieve/fill_doc_date_kde.py
index 627378cd..1890f438 100644
--- a/lang/ru/sieve/fill_doc_date_kde.py
+++ b/lang/ru/sieve/fill_doc_date_kde.py
@@ -1,82 +1,82 @@
 # -*- coding: UTF-8 -*-

 """
 Reformat documentation update date for Russian KDE Team.

 Sieve has no options.

 @author: Alexander Potashev
 @license: GPLv3
 """

 from pology import _, n_
 from pology.report import report
 from pology.msgreport import report_msg_content

 import os
 import re


 def setup_sieve (p):

     p.set_desc(_("@info sieve discription",
     "Reformat documentation update date for Russian KDE Team."
     ))


 class Sieve (object):

     def __init__ (self, params):
         # Some dates have non-standard format, here is the workaround for them:
         self.pretranslated = {
-            u'April 8, 2003': u'8 апреля 2003 г.',
-            u'April 7, 2003': u'7 апреля 2003 г.',
-            u'28/08/2009': u'28 августа 2009 г.',
-            u'22/05/2009': u'22 мая 2009 г.',
-            u'07 January 2005': u'7 января 2005 г.',
-            u'March 7, 2003': u'7 марта 2003 г.',
-            u'March 8, 2003': u'8 марта 2003 г.',
-            u'April 06, 2003': u'6 апреля 2003 г.',
-            u'April 07, 2003': u'7 апреля 2003 г.',
-            u'Month Daynumber, 4-Digit-Year': u'2 февраля 2005 г.',
-            u'April 2018': u'апрель 2018 г.',
-            u'04/02/2007': u'4 февраля 2007 г.',
+            'April 8, 2003': '8 апреля 2003 г.',
+            'April 7, 2003': '7 апреля 2003 г.',
+            '28/08/2009': '28 августа 2009 г.',
+            '22/05/2009': '22 мая 2009 г.',
+            '07 January 2005': '7 января 2005 г.',
+            'March 7, 2003': '7 марта 2003 г.',
+            'March 8, 2003': '8 марта 2003 г.',
+            'April 06, 2003': '6 апреля 2003 г.',
+            'April 07, 2003': '7 апреля 2003 г.',
+            'Month Daynumber, 4-Digit-Year': '2 февраля 2005 г.',
+            'April 2018': 'апрель 2018 г.',
+            '04/02/2007': '4 февраля 2007 г.',
         }

         # Other dates should have the following format: (yyyy-mm-dd)
         self.date_re = re.compile("^[0-9][0-9][0-9][0-9]\-[0-9][0-9]\-[0-9][0-9]$")

     def format_date (self, date_en):
-        if self.pretranslated.has_key(date_en):
+        if date_en in self.pretranslated:
             return self.pretranslated[date_en]
         elif self.date_re.match(date_en):
-            date_result = os.popen("date '+%-d m%mm %Y' -d " + date_en).readlines()[0].decode('utf-8').rstrip() + u' г.'
+            date_result = os.popen("date '+%-d m%mm %Y' -d " + date_en).readlines()[0].decode('utf-8').rstrip() + ' г.'
             # Translate name of months into Russian
             return date_result.\
-                replace('m01m', u'января').\
-                replace('m02m', u'февраля').\
-                replace('m03m', u'марта').\
-                replace('m04m', u'апреля').\
-                replace('m05m', u'мая').\
-                replace('m06m', u'июня').\
-                replace('m07m', u'июля').\
-                replace('m08m', u'августа').\
-                replace('m09m', u'сентября').\
-                replace('m10m', u'октября').\
-                replace('m11m', u'ноября').\
-                replace('m12m', u'декабря')
+                replace('m01m', 'января').\
+                replace('m02m', 'февраля').\
+                replace('m03m', 'марта').\
+                replace('m04m', 'апреля').\
+                replace('m05m', 'мая').\
+                replace('m06m', 'июня').\
+                replace('m07m', 'июля').\
+                replace('m08m', 'августа').\
+                replace('m09m', 'сентября').\
+                replace('m10m', 'октября').\
+                replace('m11m', 'ноября').\
+                replace('m12m', 'декабря')
         else:
             print("\nThis is not a valid date: %s\n" % date_en)

     def process (self, msg, cat):
         # Detect documentation update date message
         if ("\n".join(msg.auto_comment) == "Tag: date"):
             new_msgstr = self.format_date(msg.msgid)
             if (msg.fuzzy or msg.msgstr[0] != new_msgstr):
                 msg.msgstr[0] = new_msgstr
                 msg.unfuzzy()
                 report_msg_content(msg, cat)

     def finalize (self):
         ""
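One more Python 3 detail that the hunks above leave untouched: the date pattern uses "\-" inside a plain string, which Python 3 flags as an invalid escape sequence (a DeprecationWarning, and a SyntaxWarning in later releases); "-" needs no escaping outside a character class. A minimal equivalent:

    import re

    # Same pattern as self.date_re, as a raw string with \d and no
    # spurious escape.
    date_re = re.compile(r"^\d{4}-\d{2}-\d{2}$")
    assert date_re.match("2007-03-31")
    assert not date_re.match("28/12/2007")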
diff --git a/lang/ru/sieve/fill_kstars_object_names.py b/lang/ru/sieve/fill_kstars_object_names.py
index f4657817..12c508ba 100644
--- a/lang/ru/sieve/fill_kstars_object_names.py
+++ b/lang/ru/sieve/fill_kstars_object_names.py
@@ -1,164 +1,164 @@
 #!/usr/bin/python2
 # coding: utf8

 """
 Fill in some comets' names that follow specific patterns.

 Run against kstars.po.

 Sieve has no options.

 @author: Alexander Potashev
 @license: GPLv3
 """

 from pology import _, n_
 from pology.report import report
 from pology.msgreport import report_msg_content

 import os
 import re


 def setup_sieve (p):

     p.set_desc(_("@info sieve discription",
     "Fill in some comets' names that follow specific patterns."
     ))


 class Sieve (object):

     def __init__ (self, params):
         pass

         # Other dates should have the following format: (yyyy-mm-dd)
         self.date_re = re.compile("^[0-9][0-9][0-9][0-9]\-[0-9][0-9]\-[0-9][0-9]$")

     def translate (self, msg):
         if msg.msgctxt == "Asteroid name (optional)" and re.match(r'\([0-9]{4} [A-Z]{2}[0-9]{1,3}\)', msg.msgid):
             return msg.msgid

         if msg.msgctxt == 'Comet name (optional)':
             m = re.match(r'([CP]/[0-9]* [A-Z0-9\-]{2,5}) \(([a-zA-Z ]*)\)', msg.msgid)
             if m:
                 code = m.group(1)
                 name = m.group(2)
                 tr_name = {
-                    u'Great comet': u'Большая комета',
-                    u'PANSTARRS': u'Pan-STARRS',
-                    u'LINEAR': u'LINEAR',
-                    u'Lemmon': u'обзор Маунт-Леммон',
-                    u'NEOWISE': u'NEOWISE',
-                    u'Catalina': u'обзор Каталина',
-                    u'Borisov': u'Борисов',
-                    u'Messier': u'Мессье',
-                    u'Pons': u'Понс',
-                    u'Гершель': u'Гершель',
-                    u'Olbers': u'Ольберс',
-                    u'Winnecke': u'Виннеке',
-                    u'WISE': u'WISE',
-                    u'Tempel': u'Темпель',
-                    u'STEREO': u'STEREO',
-                    u'SOHO': u'SOHO',
-                    u'Spacewatch': u'Spacewatch',
-                    u'SOLWIND': u'Solwind',
-                    u'Shoemaker': u'Шумейкер',
-                    u'NEAT': u'NEAT',
-                    u'McNaught': u'Макнот',
-                    u'Christensen': u'Кристенсен',
-                    u'Barnard': u'Барнард',
-                    u'LONEOS': u'LONEOS',
-                    u'Lovejoy': u'Лавджой',
-                    u'Machholz': u'Макхольц',
-                    u'Mechain': u'Мешен',
-                    u'SMM': u'SolarMax',
-                    u'Boattini': u'Боаттини',
-                    u'Bradfield': u'Брэдфилд',
-                    u'Mueller': u'Мюллер',
-                    u'Alcock': u'Алкок',
-                    u'Blathwayt': u'Блатуэйт',
-                    u'Borrelly': u'Борелли',
-                    u'Brorsen': u'Брорзен',
-                    u'Burnham': u'Бёрнхем',
-                    u'du Toit': u'Дю Туа',
-                    u'Gambart': u'Гамбар',
-                    u'Giacobini': u'Джакобини',
-                    u'Gibbs': u'Гиббс',
-                    u'Hill': u'Хилл',
-                    u'Honda': u'Хонда',
-                    u'IRAS': u'IRAS',
-                    u'Klinkerfues': u'Клинкерфус',
-                    u'Kohoutek': u'Когоутек',
-                    u'Lovas': u'Ловас',
-                    u'Mauvais': u'Мовэ',
-                    u'Mellish': u'Меллиш',
-                    u'Petersen': u'Петерсон',
-                    u'Siding Spring': u'Сайдинг-Спринг',
-                    u'Skjellerup': u'Скьеллеруп',
-                    u'Swift': u'Свифт',
-                    u'Yanaka': u'Янака',
-                    u'Brooks': u'Брукс',
-                    u'Galle': u'Галле',
-                    u'Skiff': u'Скифф',
-                    u'Wilson': u'Уилсон',
-                    u'Herschel': u'Гершель',
-                    u'Perrine': u'Перрайн',
-                    u'Kowalski': u'Ковальский',
-                    u'Garradd': u'Гаррэдд',
-                    u'Beshore': u'Бешор',
-                    u'Cardinal': u'Кардинал',
-                    u'Denning': u'Деннинг',
-                    u'Finsler': u'Финслер',
-                    u'Harrington': u'Харрингтон',
-                    u'Hartwig': u'Хартвиг',
-                    u'Holvorcem': u'Ольворсем',
-                    u'Humason': u'Хьюмасон',
-                    u'Hyakutake': u'Хякутакэ',
-                    u'Lagerkvist': u'Лагерквист',
-                    u'Larsen': u'Ларсен',
-                    u'Pajdusakova': u'Пайдушакова',
-                    u'Palomar': u'Паломарская обсерватория',
-                    u'Schaumasse': u'Шомасс',
-                    u'Schaeberle': u'Шеберле',
-                    u'Schwartz': u'Шварц',
-                    u'Ikeya': u'Икэя',
-                    u'Austin': u'Остин',
-                    u'de Vico': u'де Вико',
-                    u'Donati': u'Донати',
-                    u'Elenin': u'Еленин',
-                    u'Ferris': u'Феррис',
-                    u'Johnson': u'Джонсон',
-                    u'Metcalf': u'Меткалф',
-                    u'Montani': u'Монтани',
-                    u'Peltier': u'Пельтье',
-                    u'Respighi': u'Респиги',
-                    u'Tabur': u'Табур',
-                    u'Torres': u'Торрес',
-                    u'Wilk': u'Уилк',
-                    u'Bester': u'Бестер',
-                    u'Larson': u'Ларсон',
-                    u'Meier': u'Майер',
-                    u'Schweizer': u'Швейцер',
-                    u'Wirtanen': u'Виртанен',
-                    u'Bruhns': u'Брунс',
-                    u'Coggia': u'Коджа',
-                    u'Levy': u'Леви',
-                    u'La Sagra': u'Ла Сагра',
-                    u'Mrkos': u'Мркос',
-                    u'Skotti': u'Скотти',
-                    u'SWAN': u'SWAN',
+                    'Great comet': 'Большая комета',
+                    'PANSTARRS': 'Pan-STARRS',
+                    'LINEAR': 'LINEAR',
+                    'Lemmon': 'обзор Маунт-Леммон',
+                    'NEOWISE': 'NEOWISE',
+                    'Catalina': 'обзор Каталина',
+                    'Borisov': 'Борисов',
+                    'Messier': 'Мессье',
+                    'Pons': 'Понс',
+                    'Гершель': 'Гершель',
+                    'Olbers': 'Ольберс',
+                    'Winnecke': 'Виннеке',
+                    'WISE': 'WISE',
+                    'Tempel': 'Темпель',
+                    'STEREO': 'STEREO',
+                    'SOHO': 'SOHO',
+                    'Spacewatch': 'Spacewatch',
+                    'SOLWIND': 'Solwind',
+                    'Shoemaker': 'Шумейкер',
+                    'NEAT': 'NEAT',
+                    'McNaught': 'Макнот',
+                    'Christensen': 'Кристенсен',
+                    'Barnard': 'Барнард',
+                    'LONEOS': 'LONEOS',
+                    'Lovejoy': 'Лавджой',
+                    'Machholz': 'Макхольц',
+                    'Mechain': 'Мешен',
+                    'SMM': 'SolarMax',
+                    'Boattini': 'Боаттини',
+                    'Bradfield': 'Брэдфилд',
+                    'Mueller': 'Мюллер',
+                    'Alcock': 'Алкок',
+                    'Blathwayt': 'Блатуэйт',
+                    'Borrelly': 'Борелли',
+                    'Brorsen': 'Брорзен',
+                    'Burnham': 'Бёрнхем',
+                    'du Toit': 'Дю Туа',
+                    'Gambart': 'Гамбар',
+                    'Giacobini': 'Джакобини',
+                    'Gibbs': 'Гиббс',
+                    'Hill': 'Хилл',
+                    'Honda': 'Хонда',
+                    'IRAS': 'IRAS',
+                    'Klinkerfues': 'Клинкерфус',
+                    'Kohoutek': 'Когоутек',
+                    'Lovas': 'Ловас',
+                    'Mauvais': 'Мовэ',
+                    'Mellish': 'Меллиш',
+                    'Petersen': 'Петерсон',
+                    'Siding Spring': 'Сайдинг-Спринг',
+                    'Skjellerup': 'Скьеллеруп',
+                    'Swift': 'Свифт',
+                    'Yanaka': 'Янака',
+                    'Brooks': 'Брукс',
+                    'Galle': 'Галле',
+                    'Skiff': 'Скифф',
+                    'Wilson': 'Уилсон',
+                    'Herschel': 'Гершель',
+                    'Perrine': 'Перрайн',
+                    'Kowalski': 'Ковальский',
+                    'Garradd': 'Гаррэдд',
+                    'Beshore': 'Бешор',
+                    'Cardinal': 'Кардинал',
+                    'Denning': 'Деннинг',
+                    'Finsler': 'Финслер',
+                    'Harrington': 'Харрингтон',
+                    'Hartwig': 'Хартвиг',
+                    'Holvorcem': 'Ольворсем',
+                    'Humason': 'Хьюмасон',
+                    'Hyakutake': 'Хякутакэ',
+                    'Lagerkvist': 'Лагерквист',
+                    'Larsen': 'Ларсен',
+                    'Pajdusakova': 'Пайдушакова',
+                    'Palomar': 'Паломарская обсерватория',
+                    'Schaumasse': 'Шомасс',
+                    'Schaeberle': 'Шеберле',
+                    'Schwartz': 'Шварц',
+                    'Ikeya': 'Икэя',
+                    'Austin': 'Остин',
+                    'de Vico': 'де Вико',
+                    'Donati': 'Донати',
+                    'Elenin': 'Еленин',
+                    'Ferris': 'Феррис',
+                    'Johnson': 'Джонсон',
+                    'Metcalf': 'Меткалф',
+                    'Montani': 'Монтани',
+                    'Peltier': 'Пельтье',
+                    'Respighi': 'Респиги',
+                    'Tabur': 'Табур',
+                    'Torres': 'Торрес',
+                    'Wilk': 'Уилк',
+                    'Bester': 'Бестер',
+                    'Larson': 'Ларсон',
+                    'Meier': 'Майер',
+                    'Schweizer': 'Швейцер',
+                    'Wirtanen': 'Виртанен',
+                    'Bruhns': 'Брунс',
+                    'Coggia': 'Коджа',
+                    'Levy': 'Леви',
+                    'La Sagra': 'Ла Сагра',
+                    'Mrkos': 'Мркос',
+                    'Skotti': 'Скотти',
+                    'SWAN': 'SWAN',
                 }
                 if name in tr_name:
-                    return u'{} ({})'.format(code, tr_name[name])
+                    return '{} ({})'.format(code, tr_name[name])
                 else:
                     print('unknown: %s' % name)
         return None

     def process (self, msg, cat):
         new_msgstr = self.translate(msg)
         if new_msgstr is not None and (msg.fuzzy or msg.msgstr[0] != new_msgstr):
             msg.msgstr[0] = new_msgstr
             msg.unfuzzy()
             report_msg_content(msg, cat)

     def finalize (self):
         ""
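The comet matcher above splits a msgid into a designation code and a discoverer name before the table lookup. A standalone check of the very pattern used in translate(), with illustrative sample strings:

    import re

    comet_re = re.compile(r'([CP]/[0-9]* [A-Z0-9\-]{2,5}) \(([a-zA-Z ]*)\)')

    m = comet_re.match("C/2011 W3 (Lovejoy)")
    assert m and m.group(1) == "C/2011 W3" and m.group(2) == "Lovejoy"
    # A matched name is then looked up in tr_name to build the translation:
    assert '{} ({})'.format(m.group(1), "Лавджой") == "C/2011 W3 (Лавджой)"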
diff --git a/lang/ru/sieve/fill_units.py b/lang/ru/sieve/fill_units.py
index 188c846a..2c27f2d0 100644
--- a/lang/ru/sieve/fill_units.py
+++ b/lang/ru/sieve/fill_units.py
@@ -1,182 +1,182 @@
 # -*- coding: UTF-8 -*-

 """
 Fill in Russian translations for units with metric prefixes.

 Sieve has no options.

 @author: Alexander Potashev
 @license: GPLv3
 """

 from pology import _
 from pology.msgreport import report_msg_content


 def setup_sieve (p):

     p.set_desc(_("@info sieve discription",
     "Fill in Russian translations for units with metric prefixes."
     ))


 class Sieve (object):

     def __init__ (self, params):
         self.prefixes = [
-            ('yotta', u'йотта', 'Y', u'Й'),
-            ('zetta', u'зетта', 'Z', u'З'),
-            ('exa', u'экса', 'E', u'Э'),
-            ('peta', u'пета', 'P', u'П'),
-            ('tera', u'тера', 'T', u'Т'),
-            ('giga', u'гига', 'G', u'Г'),
-            ('mega', u'мега', 'M', u'М'),
-            ('kilo', u'кило', 'k', u'к'),
-            ('hecto', u'гекто', 'h', u'г'),
-            ('deca', u'дека', 'da', u'да'),
-            ('', u'', '', u''),
-            ('deci', u'деци', 'd', u'д'),
-            ('centi', u'санти', 'c', u'с'),
-            ('milli', u'милли', 'm', u'м'),
-            ('micro', u'микро', u'µ', u'мк'),
-            ('nano', u'нано', 'n', u'н'),
-            ('pico', u'пико', 'p', u'п'),
-            ('femto', u'фемто', 'f', u'ф'),
-            ('atto', u'атто', 'a', u'а'),
-            ('zepto', u'зепто', 'z', u'з'),
-            ('yocto', u'йокто', 'y', u'и'),
-
-            ('yobi', u'йоби', 'Yi', u'Йи'),
-            ('zebi', u'зеби', 'Zi', u'Зи'),
-            ('exbi', u'эксби', 'Ei', u'Эи'),
-            ('pebi', u'пеби', 'Pi', u'Пи'),
-            ('tebi', u'теби', 'Ti', u'Ти'),
-            ('gibi', u'гиби', 'Gi', u'Ги'),
-            ('mebi', u'меби', 'Mi', u'Ми'),
-            ('kibi', u'киби', 'Ki', u'Ки'),
+            ('yotta', 'йотта', 'Y', 'Й'),
+            ('zetta', 'зетта', 'Z', 'З'),
+            ('exa', 'экса', 'E', 'Э'),
+            ('peta', 'пета', 'P', 'П'),
+            ('tera', 'тера', 'T', 'Т'),
+            ('giga', 'гига', 'G', 'Г'),
+            ('mega', 'мега', 'M', 'М'),
+            ('kilo', 'кило', 'k', 'к'),
+            ('hecto', 'гекто', 'h', 'г'),
+            ('deca', 'дека', 'da', 'да'),
+            ('', '', '', ''),
+            ('deci', 'деци', 'd', 'д'),
+            ('centi', 'санти', 'c', 'с'),
+            ('milli', 'милли', 'm', 'м'),
+            ('micro', 'микро', 'µ', 'мк'),
+            ('nano', 'нано', 'n', 'н'),
+            ('pico', 'пико', 'p', 'п'),
+            ('femto', 'фемто', 'f', 'ф'),
+            ('atto', 'атто', 'a', 'а'),
+            ('zepto', 'зепто', 'z', 'з'),
+            ('yocto', 'йокто', 'y', 'и'),
+
+            ('yobi', 'йоби', 'Yi', 'Йи'),
+            ('zebi', 'зеби', 'Zi', 'Зи'),
+            ('exbi', 'эксби', 'Ei', 'Эи'),
+            ('pebi', 'пеби', 'Pi', 'Пи'),
+            ('tebi', 'теби', 'Ti', 'Ти'),
+            ('gibi', 'гиби', 'Gi', 'Ги'),
+            ('mebi', 'меби', 'Mi', 'Ми'),
+            ('kibi', 'киби', 'Ki', 'Ки'),
         ]

     # {0} -> "giga", "гига"
     # {1} -> "G", "Г"
     def translate_with_unit_prefix(self, text, msgid_fmt, msgstr_fmt, bytes_exception):
         for prefix in self.prefixes:
             if text == msgid_fmt.format(prefix[0], prefix[2]):
                 if bytes_exception and prefix[0] == 'kilo':
-                    return msgstr_fmt.format(prefix[1], u'К') # килобайт/КБ
+                    return msgstr_fmt.format(prefix[1], 'К') # килобайт/КБ
                 else:
                     return msgstr_fmt.format(prefix[1], prefix[3])
         return None

     def translate_with_unit_prefix_plural(self, texts, msgid_fmts, msgstr_fmts):
         for prefix in self.prefixes:
             if all(texts[i] == msgid_fmts[i].format(prefix[0], prefix[2]) for i in range(len(texts))):
                 return list(msgstr_fmt.format(prefix[1], prefix[3]) for msgstr_fmt in msgstr_fmts)
         return None

     def translate_multiple_with_unit_prefix(self, text, unit_pairs, bytes_exception=False):
         for unit in unit_pairs:
             tr = self.translate_with_unit_prefix(text, unit[0], unit[1], len(unit) > 2 and unit[2])
             if tr is not None:
                 return tr
         return None

     def process_single(self, msg, cat):
         tr = None

         # Example: "gigaamperes" -> "гигаамперы"
-        if msg.msgctxt == u'unit description in lists':
+        if msg.msgctxt == 'unit description in lists':
             units = [
-                ('{0}amperes', u'{0}амперы'),
-                ('{0}ohms', u'{0}омы'),
-                ('{0}volts', u'{0}вольты'),
-                ('{0}bytes', u'{0}байты'),
-                ('{0}bits', u'{0}биты'),
-                ('{0}watts', u'{0}ватты'),
+                ('{0}amperes', '{0}амперы'),
+                ('{0}ohms', '{0}омы'),
+                ('{0}volts', '{0}вольты'),
+                ('{0}bytes', '{0}байты'),
+                ('{0}bits', '{0}биты'),
+                ('{0}watts', '{0}ватты'),
             ]
             tr = self.translate_multiple_with_unit_prefix(msg.msgid, units)

         if msg.msgctxt is not None and msg.msgctxt.endswith(' unit symbol'):
             units = [
-                (u'{1}A', u'{1}А'),
-                (u'{1}V', u'{1}В'),
-                (u'{1}Ω', u'{1}Ом'),
+                ('{1}A', '{1}А'),
+                ('{1}V', '{1}В'),
+                ('{1}Ω', '{1}Ом'),
                 # ('{0}ohms', u'{0}омы'),
                 # ('{0}volts', u'{0}вольты'),
-                (u'{1}b', u'{1}бит'),
-                (u'{1}W', u'{1}Вт'),
+                ('{1}b', '{1}бит'),
+                ('{1}W', '{1}Вт'),
             ]
             tr = self.translate_multiple_with_unit_prefix(msg.msgid, units)

         if msg.msgctxt == 'unit synonyms for matching user input':
             # TODO replace these tuples with a structure
             units = [
-                (u'{0}ampere;{0}amperes;{1}A', u'{0}ампер;{0}ампера;{0}амперов;{0}амперы;{0}амперах;{1}А', False),
-                (u'{0}amp;{0}amps;{0}ampere;{0}amperes;{1}A', u'{0}ампер;{0}ампера;{0}амперов;{0}амперы;{0}амперах;{1}А', False),
-                (u'{0}volt;{0}volts;{1}V', u'{0}вольт;{0}вольта;{0}вольтов;{0}вольты;{0}вольтах;{1}В', False),
-                (u'{0}ohm;{0}ohms;{1}Ω', u'{0}ом;{0}ома;{0}омов;{0}омы;{0}омах;{1}Ом', False),
-                (u'{1}B;{0}byte;{0}bytes', u'{1}Б;{0}байт;{0}байта;{0}байтов;{0}байты;{0}байтах', True),
-                (u'{1}b;{0}bit;{0}bits', u'{1}бит;{0}бит;{0}бита;{0}битов;{0}биты;{0}битах', False),
-                (u'{0}watt;{0}watts;{1}W', u'{0}ватт;{0}ватта;{0}ваттов;{0}ватты;{0}ваттах;{1}Вт', False),
+                ('{0}ampere;{0}amperes;{1}A', '{0}ампер;{0}ампера;{0}амперов;{0}амперы;{0}амперах;{1}А', False),
+                ('{0}amp;{0}amps;{0}ampere;{0}amperes;{1}A', '{0}ампер;{0}ампера;{0}амперов;{0}амперы;{0}амперах;{1}А', False),
+                ('{0}volt;{0}volts;{1}V', '{0}вольт;{0}вольта;{0}вольтов;{0}вольты;{0}вольтах;{1}В', False),
+                ('{0}ohm;{0}ohms;{1}Ω', '{0}ом;{0}ома;{0}омов;{0}омы;{0}омах;{1}Ом', False),
+                ('{1}B;{0}byte;{0}bytes', '{1}Б;{0}байт;{0}байта;{0}байтов;{0}байты;{0}байтах', True),
+                ('{1}b;{0}bit;{0}bits', '{1}бит;{0}бит;{0}бита;{0}битов;{0}биты;{0}битах', False),
+                ('{0}watt;{0}watts;{1}W', '{0}ватт;{0}ватта;{0}ваттов;{0}ватты;{0}ваттах;{1}Вт', False),
             ]
             tr = self.translate_multiple_with_unit_prefix(msg.msgid, units)

-        if msg.msgctxt == u'amount in units (real)':
+        if msg.msgctxt == 'amount in units (real)':
             units = [
-                ('%1 {0}amperes', u'%1 {0}ампера'),
-                ('%1 {0}volts', u'%1 {0}вольта'),
-                ('%1 {0}ohms', u'%1 {0}ома'),
-                ('%1 {0}bytes', u'%1 {0}байт'),
-                ('%1 {0}bits', u'%1 {0}бит'),
-                ('%1 {0}watts', u'%1 {0}ватт'),
+                ('%1 {0}amperes', '%1 {0}ампера'),
+                ('%1 {0}volts', '%1 {0}вольта'),
+                ('%1 {0}ohms', '%1 {0}ома'),
+                ('%1 {0}bytes', '%1 {0}байт'),
+                ('%1 {0}bits', '%1 {0}бит'),
+                ('%1 {0}watts', '%1 {0}ватт'),
             ]
             tr = self.translate_multiple_with_unit_prefix(msg.msgid, units)

         return tr

     def process_plural(self, msg, cat):
-        if msg.msgctxt == u'amount in units (integer)':
+        if msg.msgctxt == 'amount in units (integer)':
             unit_pairs = [
-                ((u'%1 {0}ampere', u'%1 {0}amperes'),
-                 (u'%1 {0}ампер', u'%1 {0}ампера', u'%1 {0}ампер', u'%1 {0}ампер')),
-                ((u'%1 {0}ohm', u'%1 {0}ohms'),
-                 (u'%1 {0}ом', u'%1 {0}ома', u'%1 {0}ом', u'%1 {0}ом')),
-                ((u'%1 {0}volt', u'%1 {0}volts'),
-                 (u'%1 {0}вольт', u'%1 {0}вольта', u'%1 {0}вольт', u'%1 {0}вольт')),
-                ((u'%1 {0}byte', u'%1 {0}bytes'),
-                 (u'%1 {0}байт', u'%1 {0}байта', u'%1 {0}байт', u'%1 {0}байт')),
-                ((u'%1 {0}bit', u'%1 {0}bits'),
-                 (u'%1 {0}бит', u'%1 {0}бита', u'%1 {0}бит', u'%1 {0}бит')),
-                ((u'%1 {0}watt', u'%1 {0}watts'),
-                 (u'%1 {0}ватт', u'%1 {0}ватта', u'%1 {0}ватт', u'%1 {0}ватт')),
+                (('%1 {0}ampere', '%1 {0}amperes'),
+                 ('%1 {0}ампер', '%1 {0}ампера', '%1 {0}ампер', '%1 {0}ампер')),
+                (('%1 {0}ohm', '%1 {0}ohms'),
+                 ('%1 {0}ом', '%1 {0}ома', '%1 {0}ом', '%1 {0}ом')),
+                (('%1 {0}volt', '%1 {0}volts'),
+                 ('%1 {0}вольт', '%1 {0}вольта', '%1 {0}вольт', '%1 {0}вольт')),
+                (('%1 {0}byte', '%1 {0}bytes'),
+                 ('%1 {0}байт', '%1 {0}байта', '%1 {0}байт', '%1 {0}байт')),
+                (('%1 {0}bit', '%1 {0}bits'),
+                 ('%1 {0}бит', '%1 {0}бита', '%1 {0}бит', '%1 {0}бит')),
+                (('%1 {0}watt', '%1 {0}watts'),
+                 ('%1 {0}ватт', '%1 {0}ватта', '%1 {0}ватт', '%1 {0}ватт')),
             ]
             for unit in unit_pairs:
                 tr = self.translate_with_unit_prefix_plural((msg.msgid, msg.msgid_plural), unit[0], unit[1])
                 if tr is not None:
                     return tr
         return None

     def process(self, msg, cat):
         # if msg.translated:
         #     return

         if msg.msgid_plural is None:
             tr = self.process_single(msg, cat)
             if tr is not None:
                 msg.msgstr[0] = tr
                 msg.unfuzzy()
                 report_msg_content(msg, cat)
         else:
             tr = self.process_plural(msg, cat)
             if tr is not None:
                 for i in range(len(tr)):
                     msg.msgstr[i] = tr[i]
                 msg.unfuzzy()
                 report_msg_content(msg, cat)

     def finalize(self):
         pass
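In the tables above, {0} stands for the spelled-out prefix and {1} for its symbol, and the lookup tries every prefix until the formatted English pattern equals the msgid. A reduced, self-contained sketch of that matching scheme (two prefixes only; PREFIXES and translate are illustrative names, not the sieve's API):

    PREFIXES = [
        ("giga", "гига", "G", "Г"),
        ("kilo", "кило", "k", "к"),
    ]

    def translate(text, msgid_fmt, msgstr_fmt):
        # Try each prefix: if the English format instantiated with it
        # reproduces the msgid, instantiate the Russian format the same way.
        for en_name, ru_name, en_sym, ru_sym in PREFIXES:
            if text == msgid_fmt.format(en_name, en_sym):
                return msgstr_fmt.format(ru_name, ru_sym)
        return None

    assert translate("gigaamperes", "{0}amperes", "{0}амперы") == "гигаамперы"
    assert translate("kA", "{1}A", "{1}А") == "кА"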
diff --git a/lang/sr/scripts/pohybdl.py b/lang/sr/scripts/pohybdl.py
index 5bf019eb..b9f73cbc 100755
--- a/lang/sr/scripts/pohybdl.py
+++ b/lang/sr/scripts/pohybdl.py
@@ -1,190 +1,190 @@
 #!/usr/bin/env python
 # -*- coding: UTF-8 -*-

 try:
     import fallback_import_paths
 except:
     pass

 import sys
 import os
 import locale
 from tempfile import NamedTemporaryFile

 from pology import version, _, n_
 from pology.catalog import Catalog
 from pology.message import MessageUnsafe
 from pology.lang.sr.wconv import tohi
 from pology.colors import ColorOptionParser
 from pology.comments import manc_parse_flag_list
 from pology.diff import msg_ediff, msg_ediff_to_new
 from pology.fsops import str_to_unicode, collect_catalogs
 from pology.fsops import collect_paths_cmdline
 from pology.msgreport import warning_on_msg, report_msg_content
 from pology.report import report, warning, error, format_item_list
 from pology.stdcmdopt import add_cmdopt_filesfrom
 from pology.vcs import available_vcs, make_vcs


 def _main ():

     locale.setlocale(locale.LC_ALL, "")

     usage= _("@info command usage",
         "%(cmd)s [OPTIONS] VCS [POPATHS...]",
         cmd="%prog")
     desc = _("@info command description",
         "Compose hybridized Ijekavian-Ekavian translation out of "
         "translation modified from Ekavian to Ijekavian or vice-versa.")
     ver = _("@info command version",
-        u"%(cmd)s (Pology) %(version)s\n"
-        u"Copyright © 2009, 2010 "
-        u"Chusslove Illich (Часлав Илић) <%(email)s>",
+        "%(cmd)s (Pology) %(version)s\n"
+        "Copyright © 2009, 2010 "
+        "Chusslove Illich (Часлав Илић) <%(email)s>",
         cmd="%prog", version=version(), email="caslav.ilic@gmx.net")

     opars = ColorOptionParser(usage=usage, description=desc, version=ver)
     opars.add_option(
         "-a", "--accept-changes",
         action="store_true", dest="accept_changes", default=False,
         help=_("@info command line option description",
                "Accept messages which have some changes between base "
                "and reconstructed base text."))
     opars.add_option(
         "-r", "--base-revision",
         metavar=_("@info command line value placeholder", "REVISION"),
         action="store", dest="base_revision", default=None,
         help=_("@info command line option description",
                "Use the given revision as base for hybridization, "
                "instead of local latest revision."))
     add_cmdopt_filesfrom(opars)

     (options, free_args) = opars.parse_args(str_to_unicode(sys.argv[1:]))

     try:
         import psyco
         psyco.full()
     except ImportError:
         pass

     # Create VCS.
     if len(free_args) < 1:
         showvcs = list(set(available_vcs()).difference(["none"]))
         showvcs.sort()
         error(_("@info",
                 "Version control system not given "
                 "(can be one of: %(vcslist)s).",
                 vcslist=format_item_list(showvcs)))
     vcskey = free_args.pop(0)
     if vcskey not in available_vcs(flat=True):
         error(_("@info",
                 "Unknown version control system '%(vcs)s'.",
                 vcs=vcskey))
     vcs = make_vcs(vcskey)

     # Collect PO files in given paths.
     popaths = collect_paths_cmdline(rawpaths=free_args,
                                     filesfrom=options.files_from,
                                     elsecwd=True,
                                     respathf=collect_catalogs,
                                     abort=True)

     # Catalogs must be under version control.
     for path in popaths:
         if not vcs.is_versioned(path):
             error(_("@info",
                     "Catalog '%(file)s' is not under version control.",
                     file=path))

     # Go by modified PO file and hybridize it.
     for path in popaths:
         # Extract local head counterpart.
         tmpf = NamedTemporaryFile(prefix="pohybdl-export-", suffix=".po")
         if not vcs.export(path, options.base_revision, tmpf.name):
             error(_("@info",
                     "Version control system cannot export file '%(file)s'.",
                     file=path))
         # Hybridize by comparing local head and modified file.
         hybdl(path, tmpf.name, options.accept_changes)


 def hybdl (path, path0, accnohyb=False):

     cat = Catalog(path)
     cat0 = Catalog(path0, monitored=False)

     nhybridized = 0
     nstopped = 0
     for msg in cat:

         if "no-hybdl" in manc_parse_flag_list(msg, "|"):
             continue

         # Unembed diff if message was diffed for review.
         # Replace ediff with manual review flag.
         diffed = False
         for flag in msg.flag:
             if flag.startswith("ediff"):
                 msg.flag.remove(flag)
                 diffed = True
         if diffed:
             msg_ediff_to_new(msg, msg)
-            msg.flag.add(u"reviewed")
+            msg.flag.add("reviewed")

         # Fetch original message.
         msg0 = cat0.get(msg)
         if msg0 is None:
             warning_on_msg(_("@info",
                              "Message does not exist in the original catalog."),
                            msg, cat)
             nstopped += 1
             continue
         if len(msg.msgstr) != len(msg0.msgstr):
             warning_on_msg(_("@info",
                              "Number of translations not same as in "
                              "the original message."),
                            msg, cat)
             nstopped += 1
             continue

         if msg.msgstr == msg0.msgstr:
             # No changes, nothing new to hybridize.
             continue

         # Hybridize translation.
         textsh = []
         textshinv = []
         for text0, text in zip(msg0.msgstr, msg.msgstr):
             texth = tohi(text0, text, parthyb=True)
             textsh.append(texth)
             if not accnohyb:
                 texthinv = tohi(text, text0, parthyb=True)
                 textshinv.append(texthinv)
         if accnohyb or textsh == textshinv:
-            for i, texth in zip(range(len(msg.msgstr)), textsh):
+            for i, texth in zip(list(range(len(msg.msgstr))), textsh):
                 msg.msgstr[i] = texth
             nhybridized += 1
         else:
             nstopped += 1
             msgh = MessageUnsafe(msg)
             msgh.msgstr = textsh
             msghinv = MessageUnsafe(msg)
             msghinv.msgstr = textshinv
             msg_ediff(msghinv, msgh, emsg=msgh, colorize=True)
             report_msg_content(msgh, cat, delim=("-" * 20))
     if nstopped == 0:
         if cat.sync():
             report("! %s (%d)" % (path, nhybridized))
     else:
         warning(n_("@info",
                    "%(num)d message in '%(file)s' cannot be "
                    "cleanly hybridized.",
                    "%(num)d messages in '%(file)s' cannot be "
                    "cleanly hybridized.",
                    num=nstopped, file=path))
         nhybridized = 0

     return nhybridized


 if __name__ == '__main__':
     _main()
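A note on one converted line in hybdl() above: 2to3 mechanically turns zip(range(len(msg.msgstr)), textsh) into zip(list(range(...)), textsh), but the list() wrapper is unnecessary (zip accepts any iterable) and enumerate states the intent directly. A self-contained equivalent, with stand-in data:

    msgstr = ["a", "b"]      # stand-in for msg.msgstr
    textsh = ["a'", "b'"]    # hybridized texts, same length
    # Instead of: for i, texth in zip(list(range(len(msgstr))), textsh): ...
    for i, texth in enumerate(textsh):
        msgstr[i] = texth
    assert msgstr == ["a'", "b'"]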
diff --git a/lang/sr/scripts/traplint.py b/lang/sr/scripts/traplint.py
index bf1b9a7f..70bd2176 100755
--- a/lang/sr/scripts/traplint.py
+++ b/lang/sr/scripts/traplint.py
@@ -1,529 +1,529 @@
 #!/usr/bin/env python
 # -*- coding: UTF-8 -*-

 try:
     import fallback_import_paths
 except:
     pass

 import sys
 import os
 import re
 import locale

 from pology import PologyError, version, _, n_
 from pology.lang.sr.wconv import ctol, hictoall
 from pology.lang.sr.trapnakron import rootdir
 from pology.lang.sr.trapnakron import trapnakron_ui
 from pology.lang.sr.trapnakron import norm_pkey, norm_rtkey
 from pology.lang.sr.trapnakron import _disamb_marker
 from pology.colors import ColorOptionParser
 from pology.fsops import str_to_unicode
 from pology.normalize import identify
 from pology.report import report, warning, format_item_list
 from pology.vcs import VcsSubversion


 def validate (tp, onlysrcs=None, onlykeys=None, demoexp=False, expwkeys=False):

     needed_pkeys = set()

     nom_pkeys = (
-        [u"н"],
-        [u"нм", u"нж", u"нс", u"ну"],
+        ["н"],
+        ["нм", "нж", "нс", "ну"],
     )
     needed_pkeys.update(sum(nom_pkeys, []))

-    gender_pkey = u"_род"
+    gender_pkey = "_род"
     needed_pkeys.add(gender_pkey)

-    known_genders = set((u"м", u"ж", u"с", u"у"))
-    known_genders.update(map(ctol, known_genders))
+    known_genders = set(("м", "ж", "с", "у"))
+    known_genders.update(list(map(ctol, known_genders)))

     known_alts = [
-        ("_s", u"сист"),
-        ("_a", u"алт"),
-        ("_a2", u"алт2"),
-        ("_a3", u"алт3"),
+        ("_s", "сист"),
+        ("_a", "алт"),
+        ("_a2", "алт2"),
+        ("_a3", "алт3"),
     ]
-    base_envs = [u"", u"л", u"иј", u"ијл"]
+    base_envs = ["", "л", "иј", "ијл"]
     all_envs = set(base_envs)
     for aenv in [x[1] for x in known_alts]:
         all_envs.update(x + aenv for x in base_envs)

     if demoexp:
-        demoexp_pkeys = [u"н", u"г", u"д", u"а", u"в", u"и",
-                         u"нк", u"гк", u"дк", u"ак", u"вк",
-                         u"нм", u"нмп"]
+        demoexp_pkeys = ["н", "г", "д", "а", "в", "и",
+                         "нк", "гк", "дк", "ак", "вк",
+                         "нм", "нмп"]
         needed_pkeys.update(demoexp_pkeys)

     dkeys_by_rtkey = {}

     # Sort keys such that derivations are checked by file and position.
     dkeys = tp.dkeys(single=onlykeys is None)
     def sortkey (x):
         path, lno, cno = tp.source_pos(x)
         return path.count(os.path.sep), path, lno, cno
     dkeys = sorted(dkeys, key=sortkey)

     nproblems = 0
     unmatched_srcs = set(onlysrcs) if onlysrcs is not None else None
     unmatched_keys = set(onlykeys) if onlykeys is not None else None
     reported_fmtexps = set()

     for dkey in dkeys:
         srcname = tp.source_name(dkey)
         path, lno, cno = tp.source_pos(dkey)
         cnproblems = 0

         if (   (    onlysrcs is not None
                 and not _match_text(srcname, onlysrcs, unmatched_srcs))
             or (    onlykeys is not None
                 and not _match_text(dkey, onlykeys, unmatched_keys))
         ):
             continue

         try:
             aprops = []
             seenesuffs = set()
             cenvs = tp.envs(dkey)
             for cenv in cenvs:
                 if cenv != "":
                     envmatched = False
                     for ksuff, esuff in known_alts:
                         if cenv in all_envs and cenv.endswith(esuff):
                             envmatched = True
                             break
                 else:
                     envmatched = True
                     ksuff, esuff = "", ""
                 if envmatched and esuff not in seenesuffs:
                     dkeym = dkey + ksuff
                     props = dict([(x, tp.get2(dkeym, norm_pkey(x)))
                                   for x in needed_pkeys])
                     aprops.append((esuff, props))
                     seenesuffs.add(esuff)
                 elif cenv not in all_envs:
                     warning(_("@info",
                               "Derivation at %(file)s:%(line)d:%(col)d "
                               "defines unknown environment '%(env)s'.",
                               file=path, line=lno, col=cno, env=cenv))
                     cnproblems += 1
-        except Exception, e:
+        except Exception as e:
             warning(str_to_unicode(str(e)))
             cnproblems += 1
             continue

         for esuff, props in aprops:
             # Assure all nominative forms are unique.
             for pkeys in nom_pkeys: # select first nominative set by priority
                 pvals = [props.get(x) for x in pkeys]
-                noms = filter(lambda x: x is not None, pvals)
+                noms = [x for x in pvals if x is not None]
                 if noms:
                     break
             if noms:
-                rtkeys = map(norm_rtkey, noms)
+                rtkeys = list(map(norm_rtkey, noms))
                 for rtkey in rtkeys:
                     odkey = dkeys_by_rtkey.get(rtkey)
                     if odkey is not None and tp.props(dkey) != tp.props(odkey):
                         opath, olno, ocno = tp.source_pos(odkey)
                         warning(_("@info",
                                   "Derivation at %(file1)s:%(line1)d:%(col1)d "
                                   "has normalized nominative equal to "
                                   "derivation at %(file2)s:%(line2)d:%(col2)d; "
                                   "consider adding a disambiguation marker "
                                   "(%(dchar)s).",
                                   file1=path, line1=lno, col1=cno,
                                   file2=opath, line2=olno, col2=ocno,
                                   dchar=_disamb_marker))
                         cnproblems += 1
                 for rtkey in rtkeys: # must be in new loop
                     dkeys_by_rtkey[rtkey] = dkey

             # Assure presence of gender on noun derivations.
             if props.get(nom_pkeys[0][0]) is not None:
                 gender = props.get(gender_pkey)
                 if gender is None:
                     warning(_("@info",
                               "Derivation at %(file)s:%(line)d:%(col)d "
                               "does not define gender.",
                               file=path, line=lno, col=cno))
                     cnproblems += 1
                 else:
                     for gender in hictoall(gender):
                         if gender not in known_genders:
                             warning(_("@info",
                                       "Derivation at %(file)s:%(line)d:%(col)d "
                                       "defines unknown gender '%(gen)s'.",
                                       file=path, line=lno, col=cno, gen=gender))
                             cnproblems += 1

             # Show selection of expanded properties if requested.
             if demoexp and not cnproblems:
                 demoprops = [(x, props.get(x)) for x in demoexp_pkeys]
-                demoprops = filter(lambda x: x[1] is not None, demoprops)
+                demoprops = [x for x in demoprops if x[1] is not None]
                 fmtprops = ["%s=%s" % (x[0], _escape_pval(x[1]))
                             for x in demoprops]
                 fmtsyns = ["%s" % _escape_syn(x) for x in tp.syns(dkey)]
                 fmtexp = ", ".join(fmtsyns) + ": " + ", ".join(fmtprops)
                 if expwkeys:
                     fmtdkeys = ", ".join(sorted(tp.altdkeys(dkey)))
                     fmtexp = "# " + fmtdkeys + "\n" + fmtexp
                 if fmtexp not in reported_fmtexps:
                     if not esuff:
                         report(fmtexp)
                         reported_fmtexps.add(fmtexp)
                     else:
                         afmtexp = " @" + esuff + ": " + ", ".join(fmtprops)
                         report(afmtexp)

         nproblems += cnproblems
         tp.empty_pcache()

     if unmatched_srcs:
         fmtsrcs = format_item_list(sorted(getattr(x, "pattern", x)
                                           for x in unmatched_srcs))
         warning(_("@info",
                   "Sources requested by name not found: %(srclist)s.",
                   srclist=fmtsrcs))
     if unmatched_keys:
         fmtkeys = format_item_list(sorted(getattr(x, "pattern", x)
                                           for x in unmatched_keys))
         warning(_("@info",
                   "Derivations requested by key not found: %(keylist)s.",
                   keylist=fmtkeys))

     return nproblems


 class _Wre (object):

     def __init__ (self, pattern):

         self.regex = re.compile(pattern, re.U)
         self.pattern = pattern


 def _match_text (text, tests, unmatched_tests=None):

     match = False
     for test in tests:
-        if isinstance(test, basestring):
+        if isinstance(test, str):
             if test == text:
                 match = True
                 break
         elif isinstance(test, _Wre):
             if test.regex.search(text):
                 match = True
                 break
         elif callable(test):
             if test(text):
                 match = True
                 break
         else:
             raise PologyError(
                 _("@info",
                   "Unknown matcher type '%(type)s'.",
                   type=type(test)))

     if unmatched_tests is not None:
         if match and test in unmatched_tests:
             unmatched_tests.remove(test)

     return match


 def _escape_pval (pval):

     pval = pval.replace(",", "\,")
     return pval


 def _escape_syn (pval):

     pval = pval.replace(",", "\,")
     pval = pval.replace(":", "\:")
     return pval


 def _collect_mod_dkeys (tp, onlysrcs=None, onlykeys=None):

     # Collect the unified diff of trapnakron root.
     vcs = VcsSubversion()
     udiff = vcs.diff(rootdir())
     udiff = _elim_moved_blocks(udiff)

     # Collect key syntagmas related to added lines.
     asyns = set()
     skip_file = True
     prev_syns = None
     for tag, data in udiff:
         if tag == "@":
             continue
         fpath = data
         if tag == ":":
             if not fpath.endswith(".sd"):
                 skip_file = True
             else:
                 srcname = os.path.splitext(os.path.basename(fpath))[0]
                 if onlysrcs is None:
                     skip_file = False
                 else:
                     skip_file = not _match_text(srcname, onlysrcs)
         if skip_file:
             continue
         line = data.strip()
         if line.startswith(("#", ">")) or not line:
             continue
         if tag == " ":
             if not line.startswith("@"):
                 prev_syns = _parse_syns(line)
         elif tag == "+":
             if not line.startswith("@"):
                 syns = _parse_syns(line)
             elif prev_syns:
                 syns = prev_syns
             asyns.update(syns)
             prev_syns = []

     # Collect derivation keys from syntagmas.
     onlykeys_mod = set()
     dkeys_in_tp = set(tp.dkeys(single=True))
     for syn in asyns:
         dkey = identify(syn)
         if (    dkey
             and dkey in dkeys_in_tp
             and (onlykeys is None or _match_text(dkey, onlykeys))
         ):
             onlykeys_mod.add(dkey)

     return None, onlykeys_mod
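The _match_text helper above dispatches on the test's type: a plain string means exact match, a _Wre wrapper means regex search, and anything callable is used as a predicate; the port only swaps basestring for str. A self-contained illustration of the same dispatch (Wre and match_text are simplified stand-ins, without the unmatched-tests bookkeeping):

    import re

    class Wre:
        # Regex wrapper that keeps the original pattern around for
        # reporting, mirroring _Wre above.
        def __init__(self, pattern):
            self.regex = re.compile(pattern, re.U)
            self.pattern = pattern

    def match_text(text, tests):
        for test in tests:
            if isinstance(test, str):
                if test == text:
                    return True
            elif isinstance(test, Wre):
                if test.regex.search(text):
                    return True
            elif callable(test):
                if test(text):
                    return True
        return False

    assert match_text("alpha", ["alpha"])
    assert match_text("alpha", [Wre(r"^al")])
    assert match_text("alpha", [lambda t: t.startswith("al")])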
 # Eliminate difference blocks due to pure moving between and within files.
 def _elim_moved_blocks (udiff):

     segcnt_ad = {}
     segcnt_rm = {}
     ctag = ""
     cseg = []
     for tag, data in udiff + [("@", None)]: # sentry
         if tag == "@":
             if ctag in ("+", "-"):
                 cskey = "".join(cseg)
                 segcnt = segcnt_ad if ctag == "+" else segcnt_rm
                 if cskey not in segcnt:
                     segcnt[cskey] = 0
                 segcnt[cskey] += 1
             ctag = ""
             cseg = []
         elif tag in ("+", "-"):
             if ctag and ctag != tag:
                 ctag = "xxx"
             else:
                 ctag = tag
             cseg.append(data)

     udiff_mod = []
     subdiff = []
     ctag = ""
     cseg = []
     for tag, data in udiff + [("@", None)]:
         if tag in (":", "@"):
             if subdiff:
                 cskey = "".join(cseg)
                 if (   ctag not in ("+", "-")
                     or segcnt_ad.get(cskey, 0) != 1
                     or segcnt_rm.get(cskey, 0) != 1
                 ):
                     udiff_mod.extend(subdiff)
                 subdiff = []
                 cseg = []
                 ctag = ""
             if tag == ":":
                 udiff_mod.append((tag, data))
             else:
                 subdiff = [(tag, data)]
         else:
             subdiff.append((tag, data))
             if tag in ("+", "-"):
                 if ctag and ctag != tag:
                     ctag = "xxx"
                 else:
                     ctag = tag
                 cseg.append(data)

     return udiff_mod


 def _parse_syns (line):

     if line.strip().startswith(("#", ">")):
         return []

     llen = len(line)
     pos = 0
     syns = []
     csyn = ""
     intag = False
     while pos < llen:
         c = line[pos]
         if c == "\\":
             pos += 1
             if pos < llen:
                 csyn += line[pos]
         elif intag:
             if cltag:
                 if c == cltag:
                     intag = False
             else:
                 cn = line[pos + 1:pos + 2]
                 if cn in (",", ":") or cn.isspace():
                     intag = False
         elif c == "~":
             intag = True
             cltag = "}" if line[pos + 1:pos + 2] == "{" else ""
         elif c in (",", ":"):
             csyn = csyn.strip()
             if csyn.startswith("|"):
                 csyn = csyn[1:]
             syns.append(csyn)
             if c == ":":
                 break
             else:
                 csyn = ""
                 spos = pos + 1
         else:
             csyn += line[pos]
         pos += 1

     return syns


 def _statistics (tp, onlysrcs, onlykeys):

     dkeys = set()
     fpaths = {}
     for dkey in tp.dkeys(single=True):
         srcname = tp.source_name(dkey)
         fpath, lno, cno = tp.source_pos(dkey)
         if (   (onlysrcs is not None and not _match_text(srcname, onlysrcs))
             or (onlykeys is not None and not _match_text(dkey, onlykeys))
         ):
             continue
         dkeys.add(dkey)
         if fpath not in fpaths:
             fpaths[fpath] = [srcname, 0]
         fpaths[fpath][1] += 1

     report("-" * 40)
     if onlysrcs is not None or onlykeys is not None:
         report(_("@info statistics; side note stating that not all entries "
                  "have been taken into account, but only some selected",
                  "(Selection active.)"))
     report(_("@info statistics",
              "Total derivations: %(num)d",
              num=len(dkeys)))
     if len(fpaths) > 0:
         report(_("@info statistics",
                  "Total files: %(num)d",
                  num=len(fpaths)))
         report(_("@info statistics",
                  "Average derivations per file: %(num).1f",
                  num=(float(len(dkeys)) / len(fpaths))))
-        bydif = sorted([(v[1], v[0]) for k, v in fpaths.items()])
+        bydif = sorted([(v[1], v[0]) for k, v in list(fpaths.items())])
         report(_("@info statistics",
                  "Most derivations in a file: %(num)d (%(file)s)",
                  num=bydif[-1][0], file=bydif[-1][1]))


 def _main ():

     locale.setlocale(locale.LC_ALL, "")

     usage= _("@info command usage",
         "%(cmd)s [OPTIONS] [DKEY|SRCPATH|:SRCNAME]...",
         cmd="%prog")
     desc = _("@info command description",
         "Check validity and expand derivations from internal trapnakron.")
     ver = _("@info command version",
-        u"%(cmd)s (Pology) %(version)s\n"
-        u"Copyright © 2009, 2010 "
-        u"Chusslove Illich (Часлав Илић) <%(email)s>",
+        "%(cmd)s (Pology) %(version)s\n"
+        "Copyright © 2009, 2010 "
+        "Chusslove Illich (Часлав Илић) <%(email)s>",
         cmd="%prog", version=version(), email="caslav.ilic@gmx.net")

     opars = ColorOptionParser(usage=usage, description=desc, version=ver)
     opars.add_option(
         "-e", "--expansion-sample",
         action="store_true", dest="demoexp", default=False,
         help=_("@info command line option description",
                "Show a sample of expanded properties for each valid derivation."))
     opars.add_option(
         "-k", "--show-keys",
         action="store_true", dest="expwkeys", default=False,
         help=_("@info command line option description",
                "When expanding, also show all derivation keys by derivation."))
     opars.add_option(
         "-m", "--modified",
         action="store_true", dest="modified", default=False,
         help=_("@info command line option description",
                "Validate or expand only modified derivations."))
     opars.add_option(
         "-r", "--regex",
         action="store_true", dest="regex", default=False,
         help=_("@info command line option description",
                "Source names and derivation keys given in command line "
                "are regular expressions."))
     opars.add_option(
         "-s", "--statistics",
         action="store_true", dest="statistics", default=False,
         help=_("@info command line option description",
                "Show statistics."))

     (options, free_args) = opars.parse_args(str_to_unicode(sys.argv[1:]))

     try:
         import psyco
         psyco.full()
     except ImportError:
         pass

     onlysrcs = set()
     onlykeys = set()
     sksep = ":"
     for arg in free_args:
         if os.path.isfile(arg):
             test = os.path.splitext(arg.split(os.path.sep)[-1])[0]
             onlysrcs.add(test)
         elif arg.startswith(sksep):
             test = arg[len(sksep):]
             if options.regex:
                 test = _Wre(test)
             onlysrcs.add(test)
         else:
             if options.regex:
                 arg = _Wre(arg)
             else:
                 arg = identify(arg)
             onlykeys.add(arg)

     onlysrcs = onlysrcs or None
     onlykeys = onlykeys or None

     # Create and validate the trapnakron.
     tp = trapnakron_ui()
     if options.modified:
         onlysrcs, onlykeys = _collect_mod_dkeys(tp, onlysrcs, onlykeys)
     validate(tp, onlysrcs, onlykeys, options.demoexp, options.expwkeys)

     if options.statistics:
         _statistics(tp, onlysrcs, onlykeys)


 if __name__ == '__main__':
     _main()
diff --git a/pology/__init__.py b/pology/__init__.py
index d7076563..ed8f0cbe 100644
--- a/pology/__init__.py
+++ b/pology/__init__.py
@@ -1,320 +1,320 @@
 # -*- coding: UTF-8 -*-
 # pology.__init__

 """
 The Pology Python library is a package for custom processing of PO files
 in field environments. It provides the foundation for Pology end-user tools.

 Core Pology objects -- abstractions of PO catalog and its entries -- are
 designed to allow quick writing of robust scripts. By default, the
 correctness of processed objects is strictly enforced, but such that the
 user may easily switch it off for better performance. Modifications to
 PO files on disk are always explicit, and Pology tries to change as few
 lines as possible to be friendly to version control systems.

 Pology provides utility various modules for typical processing needs of
 different kinds of data in PO files. These include word-splitting,
 markup handling, wrapping, comment parsing, summary reporting,
 validation, etc.

 Pology also contains language-specific and project-specific modules,
 for functionality that is tightly linked to particular languages and
 translation projects.

 @author: Chusslove Illich (Часлав Илић)
 @author: Sébastien Renard
 @author: Nicolas Ternisien
 @author: Goran Rakic (Горан Ракић)
 @author: Nick Shaforostoff (Николай Шафоростов)

 @license: GPLv3
 """

 import gettext
 import locale
 import os
 import re

 from pology.colors import ColorString


 def datadir ():
     """
     Get data directory of Pology installation.

     @return: absolute directory path
     @rtype: string
     """

     lenc = locale.getpreferredencoding()
     datadir = "@CONFIG_DATADIR@".decode(lenc) # configured if installed
     if not os.path.isdir(datadir): # if running from source dir
         srcdir = os.path.dirname(os.path.dirname(__file__))
         datadir = srcdir.decode(lenc)
     return datadir


 def localedir ():
     """
     Get locale directory of Pology installation.

     @return: absolute directory path
     @rtype: string
     """

     lenc = locale.getpreferredencoding()
     localedir = "@CONFIG_LOCALEDIR@".decode(lenc) # configured if installed
     if not os.path.isdir(localedir): # if running from source dir
         srcdir = os.path.dirname(os.path.dirname(__file__))
         localedir = os.path.join(srcdir, "mo").decode(lenc)
     return localedir


 def version ():
     """
     Get Pology version string.

     @return: version string
     @rtype: string
     """

     lenc = locale.getpreferredencoding()
     verstr = "@CONFIG_VERSION@".decode(lenc) # configured if installed
     if verstr.startswith("@"): # if running from source dir
         try:
             verfile = os.path.join(datadir(), "VERSION")
             for line in open(verfile):
                 line = line.decode("UTF-8").strip()
                 if line:
                     verstr = line
                     break
         except:
             pass
     return verstr


 def version_info ():
     """
     Get Pology version information.

     Pology version information consists of three version numbers
     (major, minor, bugfix) and an arbitrary suffix (may be empty).

     @return: version tuple (major, minor, bugfix, suffix)
     @rtype: (int, int, int, string)
     """

     verstr = version()
     verrx = re.compile(r"^(\d+)\.(\d+)\.?(\d+)?(.*)$")
     m = verrx.match(verstr)
-    major, minor, bugfix = map(int, [x or "0" for x in m.groups()[:3]])
+    major, minor, bugfix = list(map(int, [x or "0" for x in m.groups()[:3]]))
     suffix = m.groups()[-1]
     verinfo = (major, minor, bugfix, suffix)
     return verinfo


 # Collect data paths.

 # Setup translations.
 try:
     _tr = gettext.translation("pology", localedir())
 except IOError:
     _tr = gettext.NullTranslations()
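Worth flagging in the functions above: the hunks convert map(), yet datadir(), localedir() and version() still call .decode(lenc) on str literals such as "@CONFIG_DATADIR@", which raises AttributeError on Python 3, where str has no decode method. A sketch, under the assumption that the configured value is already text, of what a Python 3 datadir() could look like:

    import os

    def datadir():
        # In Python 3 the substituted value is already a str; no decode step.
        datadir = "@CONFIG_DATADIR@"        # configured if installed
        if not os.path.isdir(datadir):      # if running from source dir
            datadir = os.path.dirname(os.path.dirname(__file__))
        return datadir

The same applies to the line.decode("UTF-8") call in version(); reading the VERSION file with open(verfile, encoding="UTF-8") would presumably replace it.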
 def _ (_ctxt_, _text_, **kwargs):
     """
     Get translation of the text into user's language.

     If there are any formatting directives in the text,
     they should be named; the arguments which substitute them
     are given as keyword values following the text.

     @param _ctxt_: the context in which the text is used
     @type _ctxt_: string
     @param _text_: the text to translate
     @type _text_: string

     @return: translated text if available, otherwise original
     @rtype: L{ColorString}
     """

     ts = TextTrans()
     ts._init(_ctxt_, _text_, None, kwargs)
     return ts.to_string()


 def n_ (_ctxt_, _stext_, _ptext_, **kwargs):
     """
     Get translation of the singular/plural text into user's language.

     If there are any formatting directives in the text,
     they should be named; the arguments which substitute them
     are given as keyword values following the text.

     The plural deciding number is given by the C{num} keyword argument.
     If no such key exists, or its value is not an integer, an error is raised.

     @param _ctxt_: the context in which the text is used
     @type _ctxt_: string
     @param _stext_: the text to translate for the singular case
     @type _stext_: string
     @param _ptext_: the text to translate for the plural case
     @type _ptext_: string

     @return: translated text if available, otherwise original
     @rtype: L{ColorString}
     """

     ts = TextTrans()
     ts._init(_ctxt_, _stext_, _ptext_, kwargs)
     return ts.to_string()


 def t_ (_ctxt_, _text_, **kwargs):
     """
     Get deferred translation of the text into user's language.

     Like L{_()<_>}, but returns deferred translation object
     instead of translated text as string.
     In this way some or all arguments for named formatting directives
     can be supplied at a later point, using L{with_args} method,
     and then the translated string obtained by L{to_string} method.

     @returns: deferred translation
     @rtype: L{TextTrans}
     """

     ts = TextTrans()
     ts._init(_ctxt_, _text_, None, kwargs)
     return ts


 def tn_ (_ctxt_, _stext_, _ptext_, **kwargs):
     """
     Get deferred translation of the singular/plural text
     into user's language.

     Like L{n_()<_>}, but returns deferred translation object
     instead of translated text as string.
     In this way some or all arguments for named formatting directives
     can be supplied at a later point, using L{with_args} method,
     and then the translated string obtained by L{to_string} method.

     @returns: deferred translation
     @rtype: L{TextTrans}
     """

     ts = TextTrans()
     ts._init(_ctxt_, _stext_, _ptext_, kwargs)
     return ts


 class TextTrans:
     """
     Class for intermediate handling of translated user-visible text.

     Objects of this type are not functional if created manually,
     but only through C{t*_()} translation calls.
     """

     def _init (self, msgctxt, msgid, msgid_plural, kwargs):

         self._msgctxt = msgctxt
         self._msgid = msgid
         self._msgid_plural = msgid_plural
         self._kwargs = kwargs

     def _copy (self):

         # Shallow copy all attributes.
         t = TextTrans()
         t._msgctxt = self._msgctxt
         t._msgid = self._msgid
         t._msgid_plural = self._msgid_plural
         t._kwargs = dict(self._kwargs)
         return t

     def with_args (self, **kwargs):
         """
         Add arguments for substitution in the text, creating new object.

         @returns: new deferred translation
         @rtype: L{TextTrans}
         """

         t = self._copy()
         t._kwargs.update(kwargs)
         return t

     def to_string (self):
         """
         Translate the text to get ordinary string.

         @returns: translated text
         @rtype: L{ColorString}
         """

         if self._msgid_plural is None:
             trf = _tr.ugettext # camouflaged against xgettext
             if self._msgctxt is None:
                 msgstr = trf(self._msgid)
             else:
                 msgstr = trf("%s\x04%s" % (self._msgctxt, self._msgid))
                 if "\x04" in msgstr:
                     msgstr = self._msgid
         else:
             n = self._kwargs.get("num")
             if n is None or not isinstance(n, int):
                 raise PologyError(
                     _("@info",
                       "No '%(arg)s' keyword argument to "
                       "plural translation request.",
                       arg="num"))
             trf = _tr.ungettext # camouflaged against xgettext
             if self._msgctxt is None:
                 msgstr = trf(self._msgid, self._msgid_plural, n)
             else:
                 msgstr = trf("%s\x04%s" % (self._msgctxt, self._msgid),
                              self._msgid_plural, n)
                 if "\x04" in msgstr:
                     msgstr = self._msgid

         msgstr = ColorString(msgstr) # before substituting arguments
         msgstr = msgstr % self._kwargs

         return msgstr


 class PologyError (Exception):
     """
     Base exception class for errors in Pology.
     """

     def __init__ (self, msg):
         """
         Constructor.

         @param msg: a description of what went wrong
         @type msg: string
         """

         self._msg = msg

     def __unicode__ (self):

-        return unicode(self._msg)
+        return str(self._msg)

     def __str__ (self):

         return self.__unicode__().encode(locale.getpreferredencoding())
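Two related leftovers in the hunk above: TextTrans.to_string still calls _tr.ugettext/_tr.ungettext, which do not exist on Python 3 translation objects (gettext/ngettext are the text-returning methods there), and PologyError.__str__ still returns the result of encode(), i.e. bytes, which makes str(exc) raise TypeError on Python 3, where __unicode__ is never consulted. A sketch of a Python 3 shape for the exception class (behavior assumed, not taken from the patch):

    class PologyError(Exception):
        """Base exception class for errors in Pology."""

        def __init__(self, msg):
            self._msg = msg

        def __str__(self):
            # Python 3: __str__ must return str, and __unicode__ is ignored,
            # so the encode() round-trip is dropped entirely.
            return str(self._msg)

    try:
        raise PologyError("something went wrong")
    except PologyError as e:
        assert str(e) == "something went wrong"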
diff --git a/pology/ascript.py b/pology/ascript.py
index 59ef9e3f..f68796f3 100644
--- a/pology/ascript.py
+++ b/pology/ascript.py
@@ -1,2003 +1,2003 @@
 # -*- coding: UTF-8 -*
 """
 Process ascription configurations, catalogs, and histories.

 @note: For the moment, this module is only for internal use within Pology.
 Interfaces may change arbitrarily between any two Pology releases.

 @author: Chusslove Illich (Часлав Илић)
 @license: GPLv3
 """

 import codecs
-from ConfigParser import SafeConfigParser
+from configparser import SafeConfigParser
 import datetime
 import imp
 import os
 import re
 import time

 from pology import PologyError, _, n_, t_
 from pology.header import format_datetime, parse_datetime
 from pology.message import Message, MessageUnsafe
 from pology.comments import parse_summit_branches
 from pology.diff import msg_ediff
 from pology.fsops import join_ncwd, str_to_unicode, unicode_to_str
 from pology.match import make_msg_fmatcher
 from pology.monitored import Monlist
 from pology.msgreport import warning_on_msg
 from pology.report import warning
 from pology.vcs import make_vcs


 # -----------------------------------------------------------------------------
 # Ascription data representations.

 class AscConfig (object):
     """
     Representation of an ascription configuration file.

     The settings are reached through class attributes.
     Some attributes are raw data read from configuration fields,
     while other may be derived based on configuration fields.

     Parameters which have "for header updates" in their description
     are used for creating and updating ascription catalog headers,
     as well as original catalog headers when header update
     on commit is requested.
     They may contain a number of interpolations, see
     L{Catalog.update_header}.

     @ivar path: the path to the ascription configuration file
     @type path: string
     @ivar catroot: the path to root directory of original catalogs
     @type catroot: string
     @ivar ascroot: the path to root directory of ascription catalogs
     @type ascroot: string
     @ivar title: the header title comment
         (for header updates; only for original catalogs)
     @type title: string or None
     @ivar langteam: the language team name (for header updates)
     @type langteam: string or None
     @ivar teamemail: the language team email address (for header updates)
     @type teamemail: string or None
     @ivar langcode: the language code (for header updates)
     @type langcode: string or None
     @ivar plforms: the PO plural forms specification (for header updates)
     @type plforms: string or None
     @ivar vcs: the version control system for catalogs
     @type vcs: L{VcsBase}
     @ivar commitmsg: the automatic commit message
     @type commitmsg: string or None
     @ivar revtags: known review tags (empty string always included)
     @type revtags: set(string*)
     @ivar users: data for ascription users by username
     @type users: {string: L{AscUser}*}
     """

     def __init__ (self, cfgpath):
         """
         Constructor.

         Reads the ascription configuration file to set
         raw and derived ascription settings.
         @param cfgpath: the path to ascription configuration file
         @type cfgpath: string
         """

         config = SafeConfigParser()
         ifl = codecs.open(cfgpath, "r", "UTF-8")
         config.readfp(ifl)
         ifl.close()

         self.path = cfgpath

         gsect = dict(config.items("global"))
         cpathdir = os.path.dirname(cfgpath)
         self.catroot = join_ncwd(cpathdir, gsect.get("catalog-root", ""))
         self.ascroot = join_ncwd(cpathdir, gsect.get("ascript-root", ""))
         if self.catroot == self.ascroot:
             raise PologyError(
                 _("@info",
                   "Catalog root and ascription root for '%(file)s' "
                   "resolve to same path '%(dir)s'.",
                   file=cfgpath, dir=self.catroot))

         self.title = gsect.get("title", None)
         self.langteam = gsect.get("language-team", None)
         self.teamemail = gsect.get("team-email", None)
         self.langcode = gsect.get("language", None)
         self.plforms = gsect.get("plural-header", None)

         self.vcs = make_vcs(gsect.get("version-control", "noop"))

         self.commitmsg = gsect.get("commit-message", None)

         cval = gsect.get("review-tags", None)
         if cval is not None:
             self.revtags = set(cval.split())
         else:
             self.revtags = set()
         self.revtags.add("")

         self.users = {}
         userst = "user-"
         for section in config.sections():
             if section.startswith(userst):
                 user = section[len(userst):]
                 usect = dict(config.items(section))
                 if user in self.users:
                     raise PologyError(
                         _("@info",
                           "Repeated user '%(user)s' in '%(file)s'.",
                           user=user, file=cpath))
                 if "name" not in usect:
                     raise PologyError(
                         _("@info",
                           "The name is missing for "
                           "user '%(user)s' in '%(file)s'.",
                           user=user, file=cpath))
                 udat = AscUser()
                 udat.name = usect.get("name")
                 udat.oname = usect.get("original-name")
                 udat.email = usect.get("email")
                 self.users[user] = udat


 class AscUser (object):
     """
     Representation of an ascription user.

     @ivar name: user's name readable in English
     @type name: string or None
     @ivar oname: user's name in user's native language
     @type oname: string or None
     @ivar email: user's email address
     @type email: string or None
     """

     def __init__ (self, name=None, oname=None, email=None):
         """
         Constructor.

         See attribute documentation for details on parameters.
         """

         self.name = name
         self.oname = oname
         self.email = email
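The import rename above keeps SafeConfigParser, but in Python 3 that name is only a deprecated alias of ConfigParser (removed in 3.12), and readfp() is likewise deprecated in favour of read_file(); codecs.open is also unneeded once open() takes an encoding argument. Note as well that the two error branches in AscConfig.__init__ reference an undefined cpath, presumably meaning cfgpath. A sketch of the equivalent config-reading block (read_ascription_config is an illustrative name):

    from configparser import ConfigParser

    def read_ascription_config(cfgpath):
        # Python 3 equivalent of the SafeConfigParser/codecs.open/readfp trio.
        config = ConfigParser()
        with open(cfgpath, "r", encoding="UTF-8") as ifl:
            config.read_file(ifl)
        return dict(config.items("global"))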
@param apoint: an ascription point @type apoint: L{AscPoint} """ for attr in AscPoint._known_attrs: if isinstance(attr, tuple): attr, dval = attr else: attr, dval = attr, None if apoint is not None: self.__dict__[attr] = apoint.__dict__[attr] else: self.__dict__[attr] = dval # Ascription types. # NOTE: These string are written into and read from ascription files. ATYPE_MOD = "modified" ATYPE_REV = "reviewed" # ----------------------------------------------------------------------------- # Collecting ascription configurations and catalog paths. def collect_ascription_associations (catpaths): """ Build up ascription associations for catalog paths. For each catalog path, the ascription configuration to which it belongs is found and parsed, and the corresponding ascription catalog path assembled. The association is organized as list of two-tuples; the first element is the parsed ascription configuration, and the second element the list of two-tuples of original catalog paths and associated ascription catalog paths (whether the ascription catalog already exists or not). For example, if the input is:: ["foo/alpha.po", "foo/bravo.po", "bar/november.po"] and the files are covered by ascription configurations at C{foo/ascription-config} and C{bar/ascription-config}, the return value is:: [(AscConfig("foo/ascription-config"), [("foo/alpha.po", "foo-ascript/alpha.po"), ("foo/bravo.po", "foo-ascript/bravo.po")]), (AscConfig("bar/ascription-config"), [("bar/november.po", "bar-ascript/november.po")])] (assuming that both ascription configurations set C{*-ascript/} directories as corresponding ascription catalog roots). @param catpaths: a list of catalog paths @type catpaths: [string*] @returns: the ascription association list @rtype: [(AscConfig, [(string, string)*])*] """ aconfs_by_cfgpath = {} catpaths_by_cfgpath = {} for catpath in catpaths: # Look for the first config file up the directory tree. parent = os.path.dirname(os.path.abspath(catpath)) cfgpath = None while True: for cfgname in ("ascription-config", "ascribe"): test_cfgpath = os.path.join(parent, cfgname) if os.path.isfile(test_cfgpath): cfgpath = test_cfgpath break if cfgpath: break pparent = parent parent = os.path.dirname(parent) if parent == pparent: break if not cfgpath: raise PologyError( _("@info", "Cannot find ascription configuration for '%(file)s'.", file=catpath)) cfgpath = join_ncwd(cfgpath) # for nicer message output aconf = aconfs_by_cfgpath.get(cfgpath) if not aconf: # New config, load. aconf = AscConfig(cfgpath) aconfs_by_cfgpath[cfgpath] = aconf catpaths_by_cfgpath[cfgpath] = [] catpaths = catpaths_by_cfgpath.get(cfgpath) # If this catalog is under ascription, # determine path to ascription catalog. # Ignore it otherwise. relcatpath = _relpath(catpath, aconf.catroot) if relcatpath is not None: acatpath = join_ncwd(aconf.ascroot, relcatpath) catpath = join_ncwd(catpath) catpaths.append((catpath, acatpath)) # Link config objects and catalog paths. aconfs_catpaths = [] for cfgpath in sorted(aconfs_by_cfgpath): aconfs_catpaths.append((aconfs_by_cfgpath[cfgpath], catpaths_by_cfgpath[cfgpath])) return aconfs_catpaths def _relpath (path, dirpath): absdirpath = os.path.abspath(dirpath) lenadpath = len(absdirpath) lenadpathws = lenadpath + len(os.path.sep) abspath = os.path.abspath(path) p = abspath.find(absdirpath) if p == 0 and abspath[lenadpath:lenadpathws] == os.path.sep: return abspath[lenadpathws:] else: return None # ----------------------------------------------------------------------------- # Reading ascriptions. 
# FIXME: Factor out into message module. _id_fields = ( "msgctxt", "msgid", ) _nonid_fields = ( "msgid_plural", "msgstr", ) _fields_previous = ( "msgctxt_previous", "msgid_previous", "msgid_plural_previous", ) _fields_current = ( "msgctxt", "msgid", "msgid_plural", ) _fields_comment = ( "manual_comment", "auto_comment", ) _multiple_fields = (() + ("msgstr",) + _fields_comment ) _nonid_fields_eq_nonfuzzy = (() + _nonid_fields + ("manual_comment",) ) _nonid_fields_eq_fuzzy = (() + _nonid_fields_eq_nonfuzzy + _fields_previous ) _translator_parts = ( "manual_comment", "fuzzy", "msgstr", ) # FIXME: ...but this stays here. _nonid_fields_tracked = (() + _nonid_fields + _fields_previous + ("manual_comment",) ) def collect_ascription_history (msg, acat, aconf, nomrg=False, hfilter=None, shallow=False, addrem=None): """ Collect ascription history of a message. The ascription history of C{msg} is collected from the ascription catalog C{acat}, falling under the ascription configuration C{aconf}. The ascription history is a list of L{AscPoint} objects, ordered from the newest to the oldest by date of ascription. Some ascription points may be due to merging with template, when the ascriptions on a catalog were made just after merging. In many cases of examining the history these ascriptions are not useful, so they can be removed by setting C{nomrg} to C{True}. Sometimes it may be convenient to operate on history in which the translations of historical messages have been filtered, and this filter can be specified with C{hfilter}. If under filter two consecutive historical messages become equal, one of them will be eliminated from the history. History normally extends in the past through merging with templates (think of a paragraph-length message in which only one word was changed), so it may contain messages with keys different from the current message from some point and onwards. If only the history up to the earliest message with equal key is desired, C{shallow} can be set to C{True}. Sometimes it may be convenient to operate on I{incremental} history, in which every historical message is actually a partial difference (added, removed or equal segments) from the previous historical message. This can be requested by setting C{addrem} to one of the values as described in L{msg_diff} function. @param msg: the message from the original catalog @type msg: L{Message_base} @param acat: the ascription catalog corresponding to the original catalog @type acat: L{Catalog} @param aconf: the ascription configuration which covers the catalogs @type aconf: L{AscConfig} @param nomrg: whether to eliminate from history pure merge ascriptions @type nomrg: bool @param hfilter: the filter to apply to C{msgstr} fields of historical messages @type hfilter: (string)->string @param shallow: whether to collect history only up to last historical message with same key @type shallow: bool @param addrem: make each historical message an incremental difference from the first earlier historical message; see same-name parameter of L{msg_diff} for possible values @type addrem: string @returns: the ascription history @rtype: [AscPoint*] """ ahist = _collect_ascription_history_w(msg, acat, aconf, None, set(), shallow) # If the message is not ascribed, # add it in front as modified by unknown user. if not ahist or not ascription_equal(msg, ahist[0].msg): a = AscPoint() a.type = AscPoint.ATYPE_MOD a.user = None a.msg = msg ahist.insert(0, a) # Equip ascriptions with position markers, # to be able to see gaps possibly introduced by removals. 
pos = 1 for a in ahist: a.pos = pos pos += 1 # Eliminate clean merges from history. if nomrg: ahist_r = [] for i in range(len(ahist) - 1): a, ao = ahist[i], ahist[i + 1] if ( a.type != AscPoint.ATYPE_MOD or not merge_modified(ao.msg, a.msg) ): ahist_r.append(a) ahist_r.append(ahist[-1]) ahist = ahist_r # Eliminate contiguous chain of modifications equal under the filter, # except for the earliest in the chain. # (After elimination of clean merges.) if hfilter: def flt (msg): msg = MessageUnsafe(msg) - msg.msgstr = map(hfilter, msg.msgstr) + msg.msgstr = list(map(hfilter, msg.msgstr)) return msg ahist_r = [] a_prevmod = None ahist.reverse() for a in ahist: if ( a.type != AscPoint.ATYPE_MOD or not a_prevmod or flt(a.msg).inv != a_prevmod.msg.inv ): ahist_r.append(a) if a.type == AscPoint.ATYPE_MOD: a_prevmod = AscPoint(a) a_prevmod.msg = flt(a.msg) ahist = ahist_r ahist.reverse() # Reduce history to particular segments of diffs between modifications. # (After filtering). if addrem: a_nextmod = None for a in ahist: if a.type == AscPoint.ATYPE_MOD: if a_nextmod is not None: msg_ediff(a.msg, a_nextmod.msg, emsg=a_nextmod.msg, addrem=addrem) a_nextmod = a return ahist def _collect_ascription_history_w (msg, acat, aconf, before, seenmsg, shallow=False): ahist = [] # Avoid circular paths. if msg.key in seenmsg: return ahist seenmsg.add(msg.key) # Collect history from current ascription message. if msg in acat: amsg = acat[msg] for a in collect_ascription_history_segment(amsg, acat, aconf): if not before or a.date <= before.date: ahist.append(a) if shallow: return ahist # Continue into the past by pivoting around earliest message if fuzzy. amsg = ahist[-1].msg if ahist else msg if amsg.fuzzy and amsg.msgid_previous: pmsg = MessageUnsafe() for field in _id_fields: setattr(pmsg, field, amsg.get(field + "_previous")) # All ascriptions beyond the pivot must be older than the oldest so far. after = ahist and ahist[-1] or before ct_ahist = _collect_ascription_history_w(pmsg, acat, aconf, after, seenmsg) ahist.extend(ct_ahist) return ahist def collect_ascription_history_segment (amsg, acat, aconf): """ Collect a segment of an ascription history. C{amsg} is an ascription message from the ascription catalog C{acat}, falling under the ascription configuration C{aconf}, and it contains a part of the ascription history of some message. This function is used to get only that part of the ascription history. The ascription history segment is a list of L{AscPoint} objects, ordered from the newest to the oldest by date of ascription. 
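For example, a segment could be examined like this (a sketch, with C{amsg}, C{acat} and C{aconf} as described below)::

    for a in collect_ascription_history_segment(amsg, acat, aconf):
        print(a.user, a.type, a.tag, a.date)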
@param amsg: the ascription message from the ascription catalog @type amsg: L{Message_base} @param acat: the ascription catalog @type acat: L{Catalog} @param aconf: the ascription configuration which covers the catalogs @type aconf: L{AscConfig} @returns: the ascription history segment @rtype: [AscPoint*] """ ahist = [] spos = dict([(field, [0]) for field in _nonid_fields_tracked]) pvals = dict([(field, [[]]) for field in _nonid_fields_tracked]) for aflds in _parse_ascription_fields(amsg, acat, aconf): a = AscPoint() a.user, a.type, a.tag, a.date, a.slen, a.fuzz, a.obs = aflds if a.slen: # separator existing, reconstruct the fields shead = _field_separator_head(a.slen) pmsg = MessageUnsafe() for field in _id_fields: setattr(pmsg, field, amsg.get(field)) for field in _nonid_fields_tracked: amsg_seq = _get_as_sequence(amsg, field) pmsg_seq = [] for i in range(len(amsg_seq)): aval = amsg_seq[i] - pval = _amsg_step_value(aval, shead, u"\n", + pval = _amsg_step_value(aval, shead, "\n", spos[field], pvals[field], i) # ...do not break if None, has to roll all spos items if pval is not None: while i >= len(pmsg_seq): - pmsg_seq.append(u"") + pmsg_seq.append("") pmsg_seq[i] = pval _set_from_sequence(pmsg_seq, pmsg, field) else: pmsg = MessageUnsafe(ahist[-1].msg) # must exist if a.fuzz: - pmsg.flag.add(u"fuzzy") - elif u"fuzzy" in pmsg.flag: - pmsg.flag.remove(u"fuzzy") + pmsg.flag.add("fuzzy") + elif "fuzzy" in pmsg.flag: + pmsg.flag.remove("fuzzy") pmsg.obsolete = a.obs a.rmsg, a.msg = amsg, pmsg ahist.append(a) # Sort history by date and put it in reverse. # If several ascriptions have same time stamps, preserve their order. - ahist_ord = zip(ahist, range(len(ahist))) + ahist_ord = list(zip(ahist, list(range(len(ahist))))) ahist_ord.sort(key=lambda x: (x[0].date, x[1])) ahist_ord.reverse() ahist = [x[0] for x in ahist_ord] return ahist def _parse_ascription_fields (amsg, acat, aconf): """ Get ascriptions from given ascription message as list of tuples C{(user, type, tag, date, seplen, isfuzzy, isobsolete)}, with date being a real C{datetime} object. """ ascripts = [] for cmnt in amsg.auto_comment: p = cmnt.find(":") if p < 0: warning_on_msg(_("@info", "No type " "in ascription comment '%(cmnt)s'.", cmnt=cmnt), amsg, acat) continue atype = cmnt[:p].strip() atag = "" lst = atype.split(_atag_sep, 1) if len(lst) == 2: atype = lst[0].strip() atag = lst[1].strip() lst = cmnt[p+1:].split("|") if len(lst) < 2 or len(lst) > 3: warning_on_msg(_("@info", "Wrong number of descriptors " "in ascription comment '%(cmnt)s'.", cmnt=cmnt), amsg, acat) continue auser = lst.pop(0).strip() if not auser: warning_on_msg(_("@info", "Malformed user string " "in ascription comment '%(cmnt)s'.", cmnt=cmnt), amsg, acat) continue if auser not in aconf.users: warning_on_msg(_("@info", "Unknown user " "in ascription comment '%(cmnt)s'.", cmnt=cmnt), amsg, acat) continue datestr = lst.pop(0).strip() try: date = parse_datetime(datestr) except: warning_on_msg(_("@info", "Malformed date string " "in ascription comment '%(cmnt)s'.", cmnt=cmnt), amsg, acat) continue # States are reset only on modification ascriptions, # in order to keep them for the following review ascriptions. 
if atype == AscPoint.ATYPE_MOD: isfuzz = False isobs = False seplen = 0 if lst: tmp = lst.pop(0).strip() if _mark_fuzz in tmp: isfuzz = True tmp = tmp.replace(_mark_fuzz, "", 1) if _mark_obs in tmp: isobs = True tmp = tmp.replace(_mark_obs, "", 1) if tmp: try: seplen = int(tmp) except: warning_on_msg(_("@info", "Malformed separator length " "in ascription comment '%(cmnt)s'.", cmnt=cmnt), amsg, acat) continue ascripts.append((auser, atype, atag, date, seplen, isfuzz, isobs)) return ascripts def _amsg_step_value (aval, shead, stail, spos, pvals, i): if i >= len(spos): spos.extend([0] * (i - len(spos) + 1)) if i >= len(pvals): pvals.extend([[] for x in range(i - len(pvals) + 1)]) p0 = spos[i] p1 = aval.find(shead, p0) p2 = aval.find(stail, p1 + 1) if p2 < 0: p2 = len(aval) spos[i] = p2 + len(stail) mods = aval[p1 + len(shead):p2] if _trsep_mod_eq in mods: q1 = mods.find(_trsep_mod_eq) + len(_trsep_mod_eq) q2 = q1 while q2 < len(mods) and mods[q2].isdigit(): q2 += 1 nrev = int(mods[q1:q2]) pval = pvals[i][nrev] else: if _trsep_mod_none in mods: pval = None else: pval = aval[p0:p1] pvals[i].append(pval) return pval -_trsep_head = u"|" -_trsep_head_ext = u"~" -_trsep_mod_none = u"x" -_trsep_mod_eq = u"e" +_trsep_head = "|" +_trsep_head_ext = "~" +_trsep_mod_none = "x" +_trsep_mod_eq = "e" def _field_separator_head (length): return _trsep_head + _trsep_head_ext * length def _needed_separator_length (msg): goodsep = False seplen = 0 while not goodsep: seplen += 1 sephead = _field_separator_head(seplen) goodsep = True for field in _nonid_fields_tracked: values = msg.get(field) if values is None: continue - if isinstance(values, basestring): + if isinstance(values, str): values = [values] for value in values: if sephead in value: goodsep = False break if not goodsep: break return seplen def _get_as_sequence (msg, field, asc=True): if not asc and not msg.fuzzy and field in _fields_previous: # Ignore previous fields on non-ascription messages without fuzzy flag. return [] msg_seq = msg.get(field) if msg_seq is None: msg_seq = [] elif field not in _multiple_fields: msg_seq = [msg_seq] elif field in _fields_comment: # Report comments as a single newline-delimited entry. if msg_seq: - msg_seq = [u"\n".join(msg_seq)] + msg_seq = ["\n".join(msg_seq)] return msg_seq def _set_from_sequence (msg_seq, msg, field): if field not in _multiple_fields: # Single entry; set to given, or to None if no elements. msg_val = None if msg_seq: msg_val = msg_seq[0] multiple = False elif field in _fields_comment: # Comments treated as single newline-delimited entries; split. msg_val = [] if msg_seq: msg_val = msg_seq[0].split("\n") multiple = True else: # Straight sequence. msg_val = msg_seq multiple = True if multiple and isinstance(msg, Message): msg_val = Monlist(msg_val) setattr(msg, field, msg_val) # ----------------------------------------------------------------------------- # Writing ascriptions. def ascribe_modification (msg, user, dt, acat, aconf): """ Ascribe message modification. @param msg: modified message which is being ascribed @type msg: L{Message_base} @param user: user to whom the ascription is made @type user: string @param dt: the time stamp when the ascription is made @type dt: datetime.datetime @param acat: the ascription catalogs @type acat: L{Catalog} @param aconf: the ascription configuration @type aconf: L{AscConfig} """ _ascribe_any(msg, user, acat, AscPoint.ATYPE_MOD, [], aconf, dt) def ascribe_review (msg, user, dt, tags, acat, aconf): """ Ascribe message review. 
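For example, to record that hypothetical user C{"alice"} reviewed the message under review tag C{"ui"} at the current moment (with C{msg}, C{acat} and C{aconf} as described below)::

    ascribe_review(msg, "alice", datetime.datetime.now(), ["ui"], acat, aconf)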
@param msg: reviewed message which is being ascribed @type msg: L{Message_base} @param user: user to whom the ascription is made @type user: string @param dt: the time stamp when the ascription is made @type dt: datetime.datetime @param tags: review tags @type tags: [string*] @param acat: the ascription catalogs @type acat: L{Catalog} @param aconf: the ascription configuration @type aconf: L{AscConfig} """ _ascribe_any(msg, user, acat, AscPoint.ATYPE_REV, tags, aconf, dt) -_atag_sep = u"/" -_mark_fuzz = u"f" -_mark_obs = u"o" +_atag_sep = "/" +_mark_fuzz = "f" +_mark_obs = "o" def _ascribe_any (msg, user, acat, atype, atags, aconf, dt=None): # Create or retrieve ascription message. if msg not in acat: # Copy ID elements of the original message. amsg = Message() for field in _id_fields: setattr(amsg, field, getattr(msg, field)) # Append to the end of catalog. acat.add_last(amsg) else: # Retrieve existing ascription message. amsg = acat[msg] # Reconstruct historical messages, from first to last. rahist = collect_ascription_history_segment(amsg, acat, aconf) rahist.reverse() # Do any of non-ID elements differ to last historical message? if rahist: hasdiff_state = rahist[-1].msg.state() != msg.state() hasdiff_nonid = _has_nonid_diff(rahist[-1].msg, msg) else: hasdiff_nonid = True hasdiff_state = True hasdiff = hasdiff_nonid or hasdiff_state # Add ascription comment. modstr = user + " | " + format_datetime(dt, wsec=True) modstr_wsep = modstr if hasdiff: wsep = "" if hasdiff_nonid: seplen = _needed_separator_length(msg) wsep += str(seplen) if msg.obsolete: wsep += _mark_obs if msg.fuzzy: wsep += _mark_fuzz if wsep: modstr_wsep += " | " + wsep first = True for atag in atags or [""]: field = atype if atag != "": field += _atag_sep + atag if first: _asc_append_field(amsg, field, modstr_wsep) first = False else: _asc_append_field(amsg, field, modstr) # Add non-ID fields. if hasdiff_nonid: _add_nonid(amsg, msg, seplen, rahist) # Update state. if msg.fuzzy: - amsg.flag.add(u"fuzzy") + amsg.flag.add("fuzzy") else: - amsg.flag.remove(u"fuzzy") + amsg.flag.remove("fuzzy") if msg.obsolete: amsg.obsolete = True else: amsg.obsolete = False def _has_nonid_diff (pmsg, msg): for field in _nonid_fields_tracked: msg_value = msg.get(field) if not msg.fuzzy and field in _fields_previous: # Ignore previous values in messages with no fuzzy flag. msg_value = None pmsg_value = pmsg.get(field) if msg_value != pmsg_value: return True return False def _add_nonid (amsg, msg, slen, rahist): shead = _field_separator_head(slen) nones = [_field_separator_head(x.slen) + _trsep_mod_none for x in rahist if x.slen] - padnone = u"\n".join(nones) + padnone = "\n".join(nones) for field in _nonid_fields_tracked: msg_seq = _get_as_sequence(msg, field, asc=False) amsg_seq = _get_as_sequence(amsg, field) # Expand items to length in new message. for i in range(len(amsg_seq), len(msg_seq)): amsg_seq.append(padnone) # Add to items. for i in range(len(amsg_seq)): if i < len(msg_seq): nmod = 0 i_eq = None for a in rahist: if not a.slen: # no modification in this ascription continue if i_eq is None: msg_seq_p = _get_as_sequence(a.msg, field) if i < len(msg_seq_p) and msg_seq[i] == msg_seq_p[i]: i_eq = nmod # ...no break, need number of modifications. 
nmod += 1 if i_eq is None: add = msg_seq[i] + shead else: add = shead + _trsep_mod_eq + str(i_eq) else: add = shead + _trsep_mod_none if amsg_seq[i]: - amsg_seq[i] += u"\n" + amsg_seq[i] += "\n" amsg_seq[i] += add _set_from_sequence(amsg_seq, amsg, field) fld_sep = ":" def _asc_append_field (msg, field, value): - stext = u"".join([field, fld_sep, " ", str(value)]) + stext = "".join([field, fld_sep, " ", str(value)]) msg.auto_comment.append(stext) # ----------------------------------------------------------------------------- # Utilities for comparing and selecting ascriptions. def ascription_equal (msg1, msg2): """ Whether two messages are equal from the ascription viewpoint. @param msg1: first message @type msg1: L{Message_base} @param msg2: second message @type msg2: L{Message_base} @returns: C{True} if messages are equal, C{False} otherwise @rtype: bool """ if msg1.state() != msg2.state(): return False if msg1.fuzzy: check_fields = _nonid_fields_eq_fuzzy else: check_fields = _nonid_fields_eq_nonfuzzy for field in check_fields: if msg1.get(field) != msg2.get(field): return False return True def merge_modified (msg1, msg2): """ Whether second message may have been derived from first by merging with templates. @param msg1: first message @type msg1: L{Message_base} @param msg2: second message @type msg2: L{Message_base} @returns: C{True} if C{msg2} is derived by merging from C{msg1}, C{False} otherwise @rtype: bool """ # Anything can happen on merge when going from obsolete to current. if msg1.obsolete and not msg2.obsolete: return True # Manual comments do not change on merge. if msg1.manual_comment != msg2.manual_comment: return False # Current and previous original fields may have changed on merge, # depending on whether both messages are fuzzy, or only one, and which. if msg1.fuzzy == msg2.fuzzy: fields = msg1.fuzzy and _fields_previous or _fields_current for field in fields: if msg1.get(field) != msg2.get(field): return False else: - fields = (msg1.fuzzy and zip(_fields_previous, _fields_current) - or zip(_fields_current, _fields_previous)) + fields = (msg1.fuzzy and list(zip(_fields_previous, _fields_current)) + or list(zip(_fields_current, _fields_previous))) for field1, field2 in fields: if msg1.get(field1) != msg2.get(field2): return False # Translation does not change on merge, except # on multiplication/reduction when plurality differs. if (msg1.msgid_plural is None) != (msg2.msgid_plural is None): if not msg1.fuzzy and not msg2.fuzzy: # Plurality cannot change between two non-fuzzy messages. return False if msg1.msgid_plural is not None: # Reduction to non-plural. if msg1.msgstr[0] != msg2.msgstr[0]: return False else: # Multiplication to plural. for msgstr in msg2.msgstr: if msgstr != msg1.msgstr[0]: return False else: if msg1.msgstr != msg2.msgstr: return False return True def first_non_fuzzy (ahist, start=0): """ Find first non fuzzy message in the ascription history. @param ahist: the ascription history @type ahist: [AscPoint*] @param start: position in history to start searching from @type start: int @returns: index of first non-fuzzy message, or None if there is none such @rtype: int """ for i in range(start, len(ahist)): hmsg = ahist[i].msg if hmsg and not hmsg.fuzzy: return i return None def has_tracked_parts (msg): """ Check whether the message has any parts which are tracked for ascription. For example, a pristine untranslated message is considered to have no tracked parts. 
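For example (a sketch)::

    m = MessageUnsafe()
    m.msgid = "Open file"
    has_tracked_parts(m)           # False: nothing tracked was set
    m.msgstr = ["Obre el fitxer"]
    has_tracked_parts(m)           # True: a translation is present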
@returns: C{True} if there are any tracked parts, C{False} otherwise @rtype: bool """ for part in _nonid_fields_tracked: pval = msg.get(part) if part not in _multiple_fields: if pval is not None and part != "msgid_plural": return True else: if part == "msgstr": for pval1 in pval: if pval1: return True elif pval: return True return False # ----------------------------------------------------------------------------- # Argument parsing for selectors. def parse_users (userspec, aconf): """ Parse ascription user specification. The user specification is a comma-separated list of user names. If the list starts with tilde (~), all users defined in the ascription configuration but for those listed will be selected (inverted selection). If an undefined user (according to ascription configuration) is mentioned, an exception is raised. @param userspec: the user specification @type userspec: string @param aconf: the ascription configuration @type aconf: L{AscConfig} @returns: selected user names @rtype: set(string*) """ return _parse_fixed_set(userspec, aconf, aconf.users, t_("@info", "User '%(name)s' not defined in '%(file)s'.")) def parse_review_tags (tagspec, aconf): """ Parse review tag specification. The tag specification is a comma-separated list of tags. If the list starts with tilde (~), all review tags defined in the ascription configuration but for those listed will be selected (inverted selection). If an undefined tag (according to ascription configuration) is mentioned, an exception is raised. @param tagspec: the review tag specification @type tagspec: string @param aconf: the ascription configuration @type aconf: L{AscConfig} @returns: selected review tags @rtype: set(string*) """ tags = _parse_fixed_set(tagspec, aconf, aconf.revtags, t_("@info", "Review tag '%(name)s' " "not defined in '%(file)s'.")) if not tags: tags = set([""]) return tags def _parse_fixed_set (elstr, aconf, knownels, errfmt): if not elstr: return set() elstr = elstr.replace(" ", "") inverted = False if elstr.startswith("~"): inverted = True elstr = elstr[1:] els = set(elstr.split(",")) for el in els: if el not in knownels: raise PologyError( errfmt.with_args(name=el, file=aconf.path).to_string()) if inverted: els = set(knownels).difference(els) return els # ----------------------------------------------------------------------------- # Caching for selectors. _cache = {} def cached_matcher (expr): """ Fetch a cached message matcher for the given expression, for use in ascription selectors. When this function is called for the first time on a new expression, the matcher function is created and cached. On subsequent invocations with the same expression, the matcher is fetched from the cache rather than created anew. @param expr: the matching expression; see L{make_msg_matcher} for details @type expr: string @returns: the matcher function @rtype: (L{Message_base}, L{Catalog})->bool """ key = ("matcher", expr) if key not in _cache: _cache[key] = make_msg_fmatcher(expr, abort=True) return _cache[key] def cached_users (userspec, aconf, utype=None): """ Fetch a cached set of users for the given user specification, for use in ascription selectors. When this function is called for the first time on a new combination of user specification C{userspec}, ascription configuration C{aconf}, and "user type" C{utype}, the specification is parsed and users collected. On subsequent invocations with the same combination, the user set is fetched from the cache rather than created anew. 
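For example (user names hypothetical)::

    musers = cached_users("alice,bob", aconf, utype="m")
    rusers = cached_users("~alice", aconf, utype="r")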
C{utype} is actually just an arbitrary string, for when you need to cache users by different categories. @param userspec: the user specification; see L{parse_users} for details @type userspec: string @param aconf: the ascription configuration @type aconf: L{AscConfig} @param utype: user type @type utype: string @returns: the set of users @rtype: set(string*) """ key = ("users", userspec, aconf, utype) if key not in _cache: _cache[key] = parse_users(userspec, aconf) return _cache[key] def cached_review_tags (tagspec, aconf): """ Fetch a cached set of review tags for the given tag specification, for use in ascription selectors. When this function is called for the first time on a new combination of tag specification C{tagspec} and ascription configuration C{aconf}, the specification is parsed and tags collected. On subsequent invocations with the same combination, the tag set is fetched from the cache rather than created anew. @param tagspec: the tag specification; see L{parse_review_tags} for details @type tagspec: string @param aconf: the ascription configuration @type aconf: L{AscConfig} @returns: the set of tags @rtype: set(string*) """ key = ("tags", tagspec, aconf) if key not in _cache: _cache[key] = parse_review_tags(tagspec, aconf) return _cache[key] # ----------------------------------------------------------------------------- # Making selectors. # Build compound selector out of list of specifications. # Selector specification is a string in format NAME:ARG1:ARG2:... # (instead of colon, separator can be any non-alphanumeric excluding # underscore and hyphen) def make_ascription_selector (selspecs, hist=False): """ Build compound ascription selector out of string specifications of basic selectors. Selector specification string has the format NAME:ARG1:ARG2:... Instead of colon, separator can be any non-alphanumeric character used consistently, except for underscore and hyphen. The compound selector is obtained by constructing each basic selector according to the specification in turn, and linking them with AND-boolean semantics. Parameter C{hist} determines whether the compound selector should be a shallow selector (C{True}) or a history selector (C{False}). If a history selector is required but cannot be made from the given composition of basic selectors, an exception is raised. @param selspecs: specifications of basic selectors @type selspecs: [string*] @param hist: C{True} if the compound selector should be history selector, C{False} if it should be shallow selector @type hist: bool @returns: the compound selector @rtype: (L{Message_base}, L{Catalog}, [AscPoint*], L{AscConfig})->bool (shallow), (...)->int/None (history) """ # Component selectors. 
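# For example (user and tag names hypothetical), the specification list
# ["active", "modar:alice"] yields a compound selector matching active
# messages whose first modification after the last review was made by
# user "alice", while ["nrev::ui"] yields a shallow selector matching
# messages with no review under tag "ui".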
selectors = [] for selspec in selspecs: argsep = ":" for c in selspec: if not (c.isalpha() or c.isdigit() or c in ("_", "-")): argsep = c break lst = selspec.split(argsep) sname, sargs = lst[0], lst[1:] negated = False if sname.startswith("n"): sname = sname[1:] negated = True sfactory, can_hist = _selector_factories.get(sname, (None, False)) if not sfactory: raise PologyError( _("@info", "Unknown selector '%(sel)s'.", sel=sname)) if hist: if not can_hist: raise PologyError( _("@info", "Selector '%(sel)s' cannot be used " "as history selector.", sel=sname)) if negated: raise PologyError( _("@info", "Negated selectors (here '%(sel)s') cannot be used " "as history selectors.", sel=sname)) try: selector = sfactory(sargs) - except PologyError, e: + except PologyError as e: raise PologyError( _("@info", "Selector '%(sel)s' not created due to " "the following error:\n" "%(msg)s", sel=selspec, msg=str_to_unicode(str(e)))) if negated: selector = _negate_selector(selector) selectors.append((selector, selspec)) # Compound selector. if hist: res0 = None else: res0 = False def cselector (msg, cat, ahist, aconf): res = res0 for selector, selspec in selectors: try: res = selector(msg, cat, ahist, aconf) - except PologyError, e: + except PologyError as e: raise PologyError( _("@info", "Selector '%(sel)s' failed on message " "%(file)s:%(line)d:(#%(entry)d) " "with the following error:\n" "%(msg)s", sel=selspec, file=cat.filename, line=msg.refline, entry=msg.refentry, msg=str_to_unicode(str(e)))) if not res: return res return res return cselector def _negate_selector (selector): def negative_selector (*args): return not selector(*args) return negative_selector _external_mods = {} def import_ascription_extensions (modpath): """ Import extensions to ascription functionality from a Python module. Additional selector factories can be introduced by defining the C{asc_selector_factories} dictionary, in which the key is the selector name, and the value a tuple of the selector factory function and the indicator of whether the selector can be used as a history selector or not. For example:: asc_selector_factories = { # key: (function, can_be_used_as_history_selector), "specsel1": (selector_specsel1, True), "specsel2": (selector_specsel2, False), ... } @param modpath: path to Python file @type modpath: string """ # Load external module. try: modfile = open(unicode_to_str(modpath)) # ...unicode_to_str because of exec below. except IOError: raise PologyError( _("@info", "Cannot load external module '%(file)s'.", file=modpath)) # Load file into new module. modname = "mod" + str(len(_external_mods)) xmod = imp.new_module(modname) - exec modfile in xmod.__dict__ + exec(modfile.read(), xmod.__dict__) modfile.close() _external_mods[modname] = xmod # to avoid garbage collection # Collect everything collectable from the module. xms = [] xms.append("asc_selector_factories") selector_factories = getattr(xmod, xms[-1], None) if selector_factories is not None: _selector_factories.update(selector_factories) # Warn of unknown externals. known_xms = set(xms) - for xm in filter(lambda x: x.startswith("asc_"), dir(xmod)): + for xm in [x for x in dir(xmod) if x.startswith("asc_")]: if xm not in known_xms: warning(_("@info", "Unknown external resource '%(res)s' " "in module '%(file)s'.", res=xm, file=modpath)) # Registry of basic selector factories. _selector_factories = { # key: (function, can_be_used_as_history_selector), } # ----------------------------------------------------------------------------- # Internal selector factories.
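# Each factory below receives the list of argument strings parsed from the
# selector specification and returns the selector function proper; factories
# supplied by external modules through import_ascription_extensions() follow
# the same convention. A minimal factory could look like this (a sketch,
# with a hypothetical name):
#
#   def _selector_translated (args):
#       if len(args) != 0:
#           raise PologyError(_("@info", "Wrong number of arguments."))
#       def selector (msg, cat, ahist, aconf):
#           return msg.translated
#       return selector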
# Use make_ascription_selector() to create selectors. # NOTE: # Plain selectors should return True or False. # History selectors should return 1-based index into ascription history # when the appropriate historical message is found, and 0 otherwise. # In this way, when it is only necessary to test if a message is selected, # returns from both types of selectors can be tested for simple falsity/truth, # and non-zero integer return always indicates history selection. def _selector_any (args): if len(args) != 0: raise PologyError(_("@info", "Wrong number of arguments.")) def selector (msg, cat, ahist, aconf): return True return selector _selector_factories["any"] = (_selector_any, False) def _selector_active (args): if len(args) != 0: raise PologyError(_("@info", "Wrong number of arguments.")) def selector (msg, cat, ahist, aconf): return msg.translated and not msg.obsolete return selector _selector_factories["active"] = (_selector_active, False) def _selector_current (args): if len(args) != 0: raise PologyError(_("@info", "Wrong number of arguments.")) def selector (msg, cat, ahist, aconf): return not msg.obsolete return selector _selector_factories["current"] = (_selector_current, False) def _selector_branch (args): if len(args) != 1: raise PologyError(_("@info", "Wrong number of arguments.")) branch = args[0] if not branch: raise PologyError(_("@info", "Branch ID must not be empty.")) branches = set(branch.split(",")) def selector (msg, cat, ahist, aconf): return bool(branches.intersection(parse_summit_branches(msg))) return selector _selector_factories["branch"] = (_selector_branch, False) def _selector_unasc (args): if len(args) != 0: raise PologyError(_("@info", "Wrong number of arguments.")) def selector (msg, cat, ahist, aconf): # Do not consider pristine messages as unascribed. return ahist[0].user is None and has_tracked_parts(msg) return selector _selector_factories["unasc"] = (_selector_unasc, False) def _selector_fexpr (args): if len(args) != 1: raise PologyError(_("@info", "Wrong number of arguments.")) expr = args[0] if not expr: raise PologyError(_("@info", "Match expression must not be empty.")) def selector (msg, cat, ahist, aconf): matcher = cached_matcher(expr) return bool(matcher(msg, cat)) return selector _selector_factories["fexpr"] = (_selector_fexpr, False) def _selector_e (args): if len(args) != 1: raise PologyError(_("@info", "Wrong number of arguments.")) entry = args[0] if not entry or not entry.isdigit(): raise PologyError( _("@info", "Message entry number must be a positive integer.")) refentry = int(entry) def selector (msg, cat, ahist, aconf): return msg.refentry == refentry return selector _selector_factories["e"] = (_selector_e, False) def _selector_l (args): if len(args) != 1: raise PologyError(_("@info", "Wrong number of arguments.")) line = args[0] if not line or not line.isdigit(): raise PologyError( _("@info", "Message line number must be a positive integer.")) refline = int(line) def selector (msg, cat, ahist, aconf): return abs(msg.refline - refline) <= 1 return selector _selector_factories["l"] = (_selector_l, False) # Select messages between and including first and last reference by entry. # If first entry is not given, all messages to the last entry are selected. # If last entry is not given, all messages from the first entry are selected. 
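# For example, "espan:10:25" selects entries 10 through 25 inclusive, and
# "espan:10" all entries from 10 onwards (numbers hypothetical).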
def _selector_espan (args): if not 1 <= len(args) <= 2: raise PologyError(_("@info", "Wrong number of arguments.")) first = args[0] last = args[1] if len(args) > 1 else "" if not first and not last: raise PologyError( _("@info", "At least one of the first and last message entry numbers " "must be given.")) if first and not first.isdigit(): raise PologyError( _("@info", "First message entry number must be a positive integer.")) if last and not last.isdigit(): raise PologyError( _("@info", "Last message entry number must be a positive integer.")) first_entry = (first and [int(first)] or [None])[0] last_entry = (last and [int(last)] or [None])[0] def selector (msg, cat, ahist, aconf): if first_entry is not None and msg.refentry < first_entry: return False if last_entry is not None and msg.refentry > last_entry: return False return True return selector _selector_factories["espan"] = (_selector_espan, False) # Select messages between and including first and last reference by line. # If first line is not given, all messages to the last line are selected. # If last line is not given, all messages from the first line are selected. def _selector_lspan (args): if not 1 <= len(args) <= 2: raise PologyError(_("@info", "Wrong number of arguments.")) first = args[0] last = args[1] if len(args) > 1 else "" if not first and not last: raise PologyError( _("@info", "At least one of the first and last message line numbers " "must be given.")) if first and not first.isdigit(): raise PologyError( _("@info", "First message line number must be a positive integer.")) if last and not last.isdigit(): raise PologyError( _("@info", "Last message line number must be a positive integer.")) first_line = (first and [int(first)] or [None])[0] last_line = (last and [int(last)] or [None])[0] def selector (msg, cat, ahist, aconf): if first_line is not None and msg.refline < first_line: return False if last_line is not None and msg.refline > last_line: return False return True return selector _selector_factories["lspan"] = (_selector_lspan, False) def _selector_hexpr (args): if not 1 <= len(args) <= 3: raise PologyError(_("@info", "Wrong number of arguments.")) expr = args[0] user_spec = args[1] if len(args) > 1 else "" addrem = args[2] if len(args) > 2 else "" if not expr: raise PologyError( _("@info", "Match expression cannot be empty.")) def selector (msg, cat, ahist, aconf): if ahist[0].user is None: return 0 matcher = cached_matcher(expr) users = cached_users(user_spec, aconf) if not addrem: i = 0 else: i = first_non_fuzzy(ahist, 0) if i is None: return 0 while i < len(ahist): a = ahist[i] if users and a.user not in users: i += 1 continue if not addrem: amsg = a.msg i_next = i + 1 else: i_next = first_non_fuzzy(ahist, i + 1) if i_next is not None: amsg2 = ahist[i_next].msg else: amsg2 = MessageUnsafe(a.msg) for field in _nonid_fields_tracked: amsg2_value = amsg2.get(field) if amsg2_value is None: pass - elif isinstance(amsg2_value, basestring): + elif isinstance(amsg2_value, str): setattr(amsg2, field, None) else: - amsg2_value = [u""] * len(amsg2_value) + amsg2_value = [""] * len(amsg2_value) i_next = len(ahist) amsg = MessageUnsafe(a.msg) msg_ediff(amsg2, amsg, emsg=amsg, addrem=addrem) if matcher(amsg, cat): return i + 1 i = i_next return 0 return selector _selector_factories["hexpr"] = (_selector_hexpr, True) # Select last ascription (any, or by users). 
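# For example, "asc" matches the last ascription by anyone, while
# "asc:alice,bob" restricts it to the hypothetical users listed.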
def _selector_asc (args): if not 0 <= len(args) <= 1: raise PologyError(_("@info", "Wrong number of arguments.")) user_spec = args[0] if len(args) > 0 else "" def selector (msg, cat, ahist, aconf): if ahist[0].user is None: return 0 users = cached_users(user_spec, aconf) hi_sel = 0 for i in range(len(ahist)): a = ahist[i] if not users or a.user in users: hi_sel = i + 1 break return hi_sel return selector _selector_factories["asc"] = (_selector_asc, True) # Select last modification (any or by users). def _selector_mod (args): if not 0 <= len(args) <= 1: raise PologyError(_("@info", "Wrong number of arguments.")) user_spec = args[0] if len(args) > 0 else "" def selector (msg, cat, ahist, aconf): if ahist[0].user is None: return 0 users = cached_users(user_spec, aconf) hi_sel = 0 for i in range(len(ahist)): a = ahist[i] if not a.user: continue if a.type == AscPoint.ATYPE_MOD and (not users or a.user in users): hi_sel = i + 1 break return hi_sel return selector _selector_factories["mod"] = (_selector_mod, True) # Select first modification (any or by m-users, and not by r-users) # after last review (any or by r-users, and not by m-users). def _selector_modar (args): return _w_selector_modax(False, True, args, 3) _selector_factories["modar"] = (_selector_modar, True) # Select first modification (any or by m-users, and not by mm-users) # after last modification (any or by mm-users, and not by m-users). def _selector_modam (args): return _w_selector_modax(True, False, args, 2) _selector_factories["modam"] = (_selector_modam, True) # Select first modification (any or by m-users, and not by rm-users) # after last review or modification (any or by m-users, and not by rm-users). def _selector_modarm (args): return _w_selector_modax(True, True, args, 3) _selector_factories["modarm"] = (_selector_modarm, True) # Select first modification of translation # (any or by m-users, and not by r-users) # after last review (any or by r-users, and not by m-users). def _selector_tmodar (args): return _w_selector_modax(False, True, args, 3, True) _selector_factories["tmodar"] = (_selector_tmodar, True) # Worker for builders of *moda* selectors. def _w_selector_modax (amod, arev, args, maxnarg, tronly=False): if not 0 <= len(args) <= maxnarg: raise PologyError(_("@info", "Wrong number of arguments.")) muser_spec = args[0] if len(args) > 0 else "" rmuser_spec = args[1] if len(args) > 1 else "" atag_spec = args[2] if len(args) > 2 else "" def selector (msg, cat, ahist, aconf): if ahist[0].user is None: return 0 musers = cached_users(muser_spec, aconf, utype="m") rmusers = cached_users(rmuser_spec, aconf, utype="rm") atags = cached_review_tags(atag_spec, aconf) hi_sel = 0 for i in range(len(ahist)): a = ahist[i] # Check if this message cancels further modifications. if ( ( (amod and a.type == AscPoint.ATYPE_MOD) or (arev and a.type == AscPoint.ATYPE_REV and a.tag in atags)) and (not rmusers or a.user in rmusers) and (not musers or a.user not in musers) ): break # Check if this message is admissible modification. if ( a.type == AscPoint.ATYPE_MOD and (not musers or a.user in musers) and (not rmusers or a.user not in rmusers) ): # Cannot be a candidate if in translation-only mode and # there is no difference in translation to earlier message. ae = ahist[i + 1] if i + 1 < len(ahist) else None if not (tronly and ae and ae.msg.msgstr == a.msg.msgstr): hi_sel = i + 1 return hi_sel return selector # Select last review (any or by users). 
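# For example, "rev" matches the last review by anyone, and "rev:alice:ui"
# the last review by hypothetical user "alice" under review tag "ui".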
def _selector_rev (args): if not 0 <= len(args) <= 2: raise PologyError(_("@info", "Wrong number of arguments.")) user_spec = args[0] if len(args) > 0 else "" atag_spec = args[1] if len(args) > 1 else "" def selector (msg, cat, ahist, aconf): if ahist[0].user is None: return 0 users = cached_users(user_spec, aconf) atags = cached_review_tags(atag_spec, aconf) hi_sel = 0 for i in range(len(ahist)): a = ahist[i] if ( a.type == AscPoint.ATYPE_REV and a.tag in atags and (not users or a.user in users) ): hi_sel = i + 1 break return hi_sel return selector _selector_factories["rev"] = (_selector_rev, True) # Select first review (any or by r-users, and not by m-users) # before last modification (any or by m-users, and not by r-users). def _selector_revbm (args): if not 0 <= len(args) <= 3: raise PologyError(_("@info", "Wrong number of arguments.")) ruser_spec = args[0] if len(args) > 0 else "" muser_spec = args[1] if len(args) > 1 else "" atag_spec = args[2] if len(args) > 2 else "" def selector (msg, cat, ahist, aconf): if ahist[0].user is None: return 0 rusers = cached_users(ruser_spec, aconf, utype="r") musers = cached_users(muser_spec, aconf, utype="m") atags = cached_review_tags(atag_spec, aconf) hi_sel = 0 can_select = False for i in range(len(ahist)): a = ahist[i] if ( a.type == AscPoint.ATYPE_MOD and (not musers or a.user in musers) and (not rusers or a.user not in rusers) ): # Modification found, enable selection of review. can_select = True if ( a.type == AscPoint.ATYPE_REV and a.tag in atags and (not rusers or a.user in rusers) and (not musers or a.user not in musers) ): # Review found, select it if enabled, and stop anyway. if can_select: hi_sel = i + 1 break return hi_sel return selector _selector_factories["revbm"] = (_selector_revbm, True) # Select first modification (any or by users) at or after given time. def _selector_modafter (args): if not 0 <= len(args) <= 2: raise PologyError(_("@info", "Wrong number of arguments.")) time_spec = args[0] if len(args) > 0 else "" user_spec = args[1] if len(args) > 1 else "" if not time_spec: raise PologyError( _("@info", "Time specification cannot be empty.")) date = parse_datetime(time_spec) def selector (msg, cat, ahist, aconf): if ahist[0].user is None: return 0 users = cached_users(user_spec, aconf) hi_sel = 0 for i in range(len(ahist) - 1, -1, -1): a = ahist[i] if ( a.type == AscPoint.ATYPE_MOD and (not users or a.user in users) and a.date >= date ): hi_sel = i + 1 break return hi_sel return selector _selector_factories["modafter"] = (_selector_modafter, True) diff --git a/pology/bpatterns.py b/pology/bpatterns.py index c9fa31a1..58f05331 100644 --- a/pology/bpatterns.py +++ b/pology/bpatterns.py @@ -1,207 +1,207 @@ # -*- coding: UTF-8 -*- """ Detect unwanted patterns in translation. @note: This module is deprecated. Use L{rules} through C{check-rules} sieve instead. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import re import codecs from pology import _, n_ from pology.comments import manc_parse_flag_list from pology.msgreport import report_on_msg, report_msg_content def bad_patterns (rxmatch=False, casesens=True, patterns=None, fromfiles=None): """ Detect unwanted patterns in text [hook factory]. Patterns can be given both as list of strings, and as a list of file paths containing patterns (in each file: one pattern per line, strip leading and trailing whitespace, skip empty lines, #-comments). Detected patterns are reported to stdout. 
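A patterns file could look like this (entries purely illustrative)::

    # Frequent misspellings.
    recieve
    informations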
If C{rxmatch} is C{False}, patterns are matched by plain substring search, otherwise as regular expressions. If C{casesens} is C{True}, matching is case-sensitive. If the message has pipe flag C{no-bad-patterns}, check is skipped. @param rxmatch: whether to take pattern as regular expression @type rxmatch: bool @param casesens: whether the match should be case-sensitive @type casesens: bool @param patterns: patterns to match the text @type patterns: list of strings @param fromfiles: file paths from which to read patterns @type fromfiles: list of strings @return: type S3A hook @rtype: C{(text, msg, cat)->numerr} """ patterns_str = list(patterns or []) for file in fromfiles or []: patterns_str.extend(_load_patterns(file)) patterns_cmp = _process_patterns(rxmatch=rxmatch, casesens=casesens, patterns=patterns_str) def hook (text, msg, cat): if _flag_no_bad_patterns in manc_parse_flag_list(msg, "|"): return 0 indspans = _match_patterns(text, patterns_cmp) for pind, span in indspans: pstr = patterns_str[pind] report_on_msg(_("@info", "Bad pattern '%(pattern)s' detected.", pattern=pstr), msg, cat) return len(indspans) return hook def bad_patterns_msg (rxmatch=False, casesens=True, patterns=None, fromfiles=None): """ Detect unwanted patterns in translation [hook factory]. Like L{bad_patterns}, but checks and reports on all C{msgstr} fields in the message. @return: type S4A hook @rtype: C{(msg, cat)->numerr} """ return _bad_patterns_msg_w(rxmatch, casesens, patterns, fromfiles, False) def bad_patterns_msg_sp (rxmatch=False, casesens=True, patterns=None, fromfiles=None): """ Detect unwanted patterns in translation, report spans [hook factory]. Like L{bad_patterns_msg}, but reports parts instead of writing to stdout. @return: type V4A hook @rtype: C{(msg, cat)->parts} """ return _bad_patterns_msg_w(rxmatch, casesens, patterns, fromfiles, True) # Worker for bad_patterns_msg* hooks. def _bad_patterns_msg_w (rxmatch, casesens, patterns, fromfiles, partrep): patterns_str = list(patterns or []) for file in fromfiles or []: patterns_str.extend(_load_patterns(file)) patterns_cmp = _process_patterns(rxmatch=rxmatch, casesens=casesens, patterns=patterns_str) def hook (msg, cat): if _flag_no_bad_patterns in manc_parse_flag_list(msg, "|"): return 0 parts = [] nbad = 0 for i in range(len(msg.msgstr)): indspans = _match_patterns(msg.msgstr[i], patterns_cmp) spans = [] for pind, span in indspans: emsg = _("@info", "Bad pattern '%(pattern)s' detected.", pattern=patterns_str[pind]) spans.append(span + (emsg,)) nbad += 1 if spans: parts.append(("msgstr", i, spans)) if partrep: return parts else: if parts: report_msg_content(msg, cat, highlight=parts, delim=("-" * 20)) return nbad return hook # Pipe flag used to manually prevent matching for a particular message. _flag_no_bad_patterns = "no-bad-patterns" # Load pattern strings from a file: # one pattern per non-empty line in the file, # leading and trailing whitespace stripped, # #-comments possible. def _load_patterns (filepath): ifl = codecs.open(filepath, "r", "UTF-8") rem_cmnt_rx = re.compile(r"#.*") patterns = [] for line in ifl.readlines(): line = rem_cmnt_rx.sub("", line).strip() if line: patterns.append(line) return patterns # Process given list of pattern strings. # If rxmatch is True, patterns are compiled into regexes. # If casesens is False, re.I flag is used in regex compilation, or # if regex patterns are not requested, patterns are lower-cased.
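# For example (patterns hypothetical):
#   _process_patterns(["Foo", "BAR"], casesens=False)  -> ["foo", "bar"]
#   _process_patterns(["fo+"], rxmatch=True)  -> [compiled regex for "fo+"]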
def _process_patterns (patterns, rxmatch=False, casesens=True): patterns_cmp = [] if rxmatch: rx_flags = re.U if not casesens: rx_flags |= re.I for pattern in patterns: patterns_cmp.append(re.compile(pattern, rx_flags)) else: for pattern in patterns: if not casesens: patterns_cmp.append(pattern.lower()) else: patterns_cmp.append(pattern) return patterns_cmp # Try to match the text by all patterns in the list. # A pattern can be either a plain string for substring search, # or a compiled regular expression. # Returns a list of (pattern_index, span) tuples for patterns that matched. def _match_patterns (text, patterns): matched_patterns = [] for i in range(len(patterns)): pattern = patterns[i] span = None - if isinstance(pattern, basestring): + if isinstance(pattern, str): p = text.find(pattern) if p >= 0: span = (p, p + len(pattern)) else: m = pattern.search(text) if m: span = m.span() if span: matched_patterns.append((i, span)) return matched_patterns diff --git a/pology/catalog.py b/pology/catalog.py index 57acf354..b15792e6 100644 --- a/pology/catalog.py +++ b/pology/catalog.py @@ -1,2531 +1,2531 @@ # -*- coding: UTF-8 -*- """ Collection of PO entries. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import copy import difflib import os import re import tempfile import time import types from pology import PologyError, _, n_ from pology.header import Header, format_datetime from pology.message import Message as MessageMonitored from pology.message import MessageUnsafe as MessageUnsafe from pology.escape import escape_c as escape from pology.escape import unescape_c as unescape from pology.fsops import mkdirpath from pology.monitored import Monitored from pology.resolve import expand_vars from pology.wrap import select_field_wrapper class CatalogSyntaxError (PologyError): """ Exception for errors in catalog syntax. This exception is normally raised when parsing a catalog, e.g. on invalid syntax or non-decodable characters. """ pass def _parse_quoted (s): sp = s[s.index("\"") + 1:s.rindex("\"")] sp = unescape(sp); return sp class _MessageDict: def __init__ (self, lcache=True): self.manual_comment = [] self.auto_comment = [] self.source = [] self.flag = [] self.obsolete = False self.msgctxt_previous = [] self.msgid_previous = [] self.msgid_plural_previous = [] self.msgctxt = [] self.msgid = [] self.msgid_plural = [] self.msgstr = [] self.refline = -1 self.refentry = -1 if lcache: self._lines_all = [] self._lines_manual_comment = [] self._lines_auto_comment = [] self._lines_source = [] self._lines_flag = [] self._lines_msgctxt_previous = [] self._lines_msgid_previous = [] self._lines_msgid_plural_previous = [] self._lines_msgctxt = [] self._lines_msgid = [] self._lines_msgid_plural = [] self._lines_msgstr = [] def _read_lines_and_encoding (file, filename): fstr = file.read() # Determine line ending. 
maxlno = 0 for clend in ("\r\n", "\n", "\r"): # "\r\n" should be checked first lno = len(fstr.split(clend)) if maxlno < lno: maxlno = lno lend = clend lines = [x + "\n" for x in fstr.split(lend)] if lines[-1] == "\n": lines.pop() enc = None enc_rx = re.compile(r"Content-Type:.*charset=(.+?)\\n", re.I) for line in lines: if line.strip().startswith("#:"): break m = enc_rx.search(line) if m: enc = m.group(1).strip() if not enc or enc == "CHARSET": # no encoding given enc = None break if enc is None: enc = "UTF-8" # fall back to UTF-8 if encoding not found enclines = [] lno = 0 for line in lines: lno += 1 try: encline = line.decode(enc) - except UnicodeDecodeError, e: + except UnicodeDecodeError as e: raise CatalogSyntaxError( _("@info", "Text decoding failure at %(file)s:%(line)d:%(col)d " "under assumed encoding '%(enc)s'.", file=filename, line=lno, col=e.start, enc=enc)) enclines.append(encline) return enclines, enc def _parse_po_file (file, MessageType=MessageMonitored, headonly=False, lcache=True): - if isinstance(file, basestring): + if isinstance(file, str): filename = file file = open(filename, "rb") close_later = True else: if hasattr(file, "name"): filename = file.name else: filename = _("@item generic name for the source or destination " "of data being read or written", "<stream>").resolve("none") close_later = False lines, fenc = _read_lines_and_encoding(file, filename) if close_later: file.close() ctx_modern, ctx_obsolete, \ ctx_previous, ctx_current, \ - ctx_none, ctx_msgctxt, ctx_msgid, ctx_msgid_plural, ctx_msgstr = range(9) + ctx_none, ctx_msgctxt, ctx_msgid, ctx_msgid_plural, ctx_msgstr = list(range(9)) messages1 = list() lno = 0 eno = 0 class Namespace: pass loc = Namespace() loc.lno = 0 loc.tail = None loc.msg = _MessageDict(lcache) loc.life_context = ctx_modern loc.field_context = ctx_none loc.age_context = ctx_current # The message has been completed by the previous line if the context just # switched away from ctx_msgstr; # call whenever context switch happens, *before* assigning new context. nlines = len(lines) def try_finish (): if loc.field_context == ctx_msgstr: messages1.append(loc.msg) loc.msg = _MessageDict(lcache) loc.field_context = ctx_none # In header-only mode, the first message read is the header. # Compose the tail of this and rest of the lines, and # set lno to nlines for exit. if headonly: # If not at end of file, current line is part of # first message and should be retained in the tail. 
offset = loc.lno < nlines and 1 or 0 loc.tail = "".join(lines[loc.lno - offset:]) loc.lno = nlines while loc.lno < nlines: # sentry for last entry line_raw = lines[lno] loc.lno += 1 lno = loc.lno # shortcut line = line_raw.strip() if not line: continue string_follows = True loc.life_context = ctx_modern loc.age_context = ctx_current if line.startswith("#"): if 0: pass elif line.startswith("#~|"): line = line[3:].lstrip() loc.age_context = ctx_previous elif line.startswith("#~"): line = line[2:].lstrip() loc.life_context = ctx_obsolete elif line.startswith("#|"): line = line[2:].lstrip() loc.age_context = ctx_previous elif line.startswith("#:"): try_finish() string_follows = False for srcref in line[2:].split(" "): srcref = srcref.strip() if srcref: lst = srcref.split(":", 1) if len(lst) == 2: file = lst[0] try: line = int(lst[1]) assert line > 0 except: file = srcref line = -1 loc.msg.source.append((file, line)) else: loc.msg.source.append((srcref, -1)) elif line.startswith("#,"): try_finish() string_follows = False for flag in line[2:].split(","): flag = flag.strip() if flag: loc.msg.flag.append(flag) elif line.startswith("#."): try_finish() string_follows = False loc.msg.auto_comment.append(line[2:].lstrip()) elif line.startswith("#"): try_finish() string_follows = False loc.msg.manual_comment.append(line[2:].lstrip()) else: # Cannot reach, all unknown comments treated as manual above. raise CatalogSyntaxError( _("@info", "Unknown comment type at %(file)s:%(line)d.", file=filename, line=lno)) if line and string_follows: # for starting fields if 0: pass elif line.startswith("msgctxt"): # TODO: Assert context. try_finish() loc.field_context = ctx_msgctxt line = line[7:].lstrip() elif line.startswith("msgid_plural"): # TODO: Assert context. # No need for try_finish(), msgid_plural cannot start message. loc.field_context = ctx_msgid_plural line = line[12:].lstrip() elif line.startswith("msgid"): # TODO: Assert context. try_finish() if loc.life_context == ctx_obsolete: loc.msg.obsolete = True loc.field_context = ctx_msgid if loc.age_context == ctx_current: loc.msg.refline = lno loc.msg.refentry = eno eno += 1 line = line[5:].lstrip() elif line.startswith("msgstr"): # TODO: Assert context. loc.field_context = ctx_msgstr line = line[6:].lstrip() msgstr_i = 0 if line.startswith("["): line = line[1:].lstrip() llen = len(line) p = 0 while p < llen and line[p].isdigit(): p += 1 if p == 0: raise CatalogSyntaxError( _("@info", "Malformed '%(field)s' ordinal " "at %(file)s:%(line)d.", file=filename, line=lno, field="msgstr")) msgstr_i = int(line[:p]) line = line[p:].lstrip() if line.startswith("]"): line = line[1:].lstrip() else: raise CatalogSyntaxError( _("@info", "Malformed '%(field)s' ordinal " "at %(file)s:%(line)d.", file=filename, line=lno, field="msgstr")) # Add missing msgstr entries. 
for i in range(len(loc.msg.msgstr), msgstr_i + 1): loc.msg.msgstr.append([]) elif not line.startswith("\""): raise CatalogSyntaxError( _("@info", "Unknown field name at %(file)s:%(line)d.", file=filename, line=lno)) if line and string_follows: # for continuing fields if line.startswith("\""): s = _parse_quoted(line) if loc.age_context == ctx_previous: if loc.field_context == ctx_msgctxt: loc.msg.msgctxt_previous.append(s) elif loc.field_context == ctx_msgid: loc.msg.msgid_previous.append(s) elif loc.field_context == ctx_msgid_plural: loc.msg.msgid_plural_previous.append(s) else: if loc.field_context == ctx_msgctxt: loc.msg.msgctxt.append(s) elif loc.field_context == ctx_msgid: loc.msg.msgid.append(s) elif loc.field_context == ctx_msgid_plural: loc.msg.msgid_plural.append(s) elif loc.field_context == ctx_msgstr: loc.msg.msgstr[msgstr_i].append(s) else: raise CatalogSyntaxError( _("@info", "Expected string continuation at %(file)s:%(line)d.", file=filename, line=lno)) # Update line caches. if lcache: loc.msg._lines_all.append(line_raw) if 0: pass elif line_raw.startswith("#:"): loc.msg._lines_source.append(line_raw) elif line_raw.startswith("#,"): loc.msg._lines_flag.append(line_raw) elif line_raw.startswith("#."): loc.msg._lines_auto_comment.append(line_raw) elif line_raw.startswith("#") and line_raw[1:2] not in ("~", "|"): loc.msg._lines_manual_comment.append(line_raw) elif loc.age_context == ctx_previous: if loc.field_context == ctx_msgctxt: loc.msg._lines_msgctxt_previous.append(line_raw) elif loc.field_context == ctx_msgid: loc.msg._lines_msgid_previous.append(line_raw) elif loc.field_context == ctx_msgid_plural: loc.msg._lines_msgid_plural_previous.append(line_raw) else: raise PologyError( _("@info", "Internal problem (%(id)d) at %(file)s:%(line)d.", id=11, file=filename, line=lno)) elif loc.age_context == ctx_current: if loc.field_context == ctx_msgctxt: loc.msg._lines_msgctxt.append(line_raw) elif loc.field_context == ctx_msgid: loc.msg._lines_msgid.append(line_raw) elif loc.field_context == ctx_msgid_plural: loc.msg._lines_msgid_plural.append(line_raw) elif loc.field_context == ctx_msgstr: loc.msg._lines_msgstr.append(line_raw) else: raise PologyError( _("@info", "Internal problem (%(id)d) at %(file)s:%(line)d.", id=12, file=filename, line=lno)) else: raise PologyError( _("@info", "Internal problem (%(id)d) at %(file)s:%(line)d.", id=10, file=filename, line=lno)) try_finish() # the last message if len(messages1) == 0: raise CatalogSyntaxError( _("@info", "No header at %(file)s:%(line)d.", file=filename, line=lno)) # Join fields. join_or_none = lambda x: "".join(x) if x else None for i, msg in enumerate(messages1): msg.msgctxt_previous = join_or_none(msg.msgctxt_previous) msg.msgid_previous = join_or_none(msg.msgid_previous) msg.msgid_plural_previous = join_or_none(msg.msgid_plural_previous) msg.msgctxt = join_or_none(msg.msgctxt) msg.msgid = join_or_none(msg.msgid) msg.msgid_plural = join_or_none(msg.msgid_plural) msg.msgstr = [join_or_none(x) for x in msg.msgstr] if i > 0 and msg.msgid == "" and msg.msgctxt is None: raise CatalogSyntaxError( _("@info", "Empty message at %(file)s:%(line)d.", file=filename, line=msg.refline)) # Repack raw dictionaries as message objects. 
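    # (_MessageDict objects keep their fields as plain attributes, so the
    # message constructor below can take over msg1.__dict__ directly,
    # including any line caches collected while lcache was set.)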
messages2 = [] for msg1 in messages1: messages2.append(MessageType(msg1.__dict__)) return (messages2, fenc, loc.tail) def _srcref_repack (srcrefs): srcdict = {} for file, line in srcrefs: if not file in srcdict: srcdict[file] = [line] else: srcdict[file].append(line) srcdict[file].sort() return srcdict _Catalog_spec = { # Data. "header" : {"type" : Header}, - "filename" : {"type" : types.StringTypes}, - "name" : {"type" : types.StringTypes, "derived" : True}, + "filename" : {"type" : (str,)}, + "name" : {"type" : (str,), "derived" : True}, "*" : {}, # messages sequence: the type is assigned at construction } class Catalog (Monitored): """ Class for access and operations on PO catalogs. Catalog behaves as an ordered sequence of messages. The typical way of iterating over the messages from a PO file on disk would be:: cat = Catalog("relative/path/foo.po") for msg in cat: ... (do something with msg) ... cat.sync() where L{sync()} method is used to write any modifications back to the disk. The header entry of the catalog is not part of the message sequence, but is provided by the L{header} attribute, an object of type different from an ordinary message entry. The catalog is a I{monitored} class. Catalog message entries themeselves may also be monitored (default), but need not, depending on the mode of creation. @ivar header: the header entry @type header: L{Header} @ivar filename: the file name which the catalog was created with @type filename: string @ivar name: (read-only) the name of the catalog Determined as base of the filename, without extension. @type name: string @see: L{Monitored} @see: L{Message}, L{MessageUnsafe} @see: L{Header} """ def __init__ (self, filename, create=False, truncate=False, wrapping=None, monitored=True, headonly=False, readfh=None, single_entry=0): """ Build a message catalog by reading from a PO file or creating anew. The message entries in the catalog may be monitored themselves or not. That is, when monitoring is requested, entries are represented by the L{Message} class, otherwise with L{MessageUnsafe}. Monitored messages are usually appropriate when the application is expected to modify them. Non-monitored messages should provide better performance, so use them whenever the catalog is opened for read-only purposes (such as checks). Catalog can also be opened in header-only mode, for better performance when only the header data is needed. This mode provides L{header} attribute as usual, but the rest of entries are unavailable. If any of the operations dealing with message entries are invoked, an error is signaled. Instead of opening and reading from catalog's filename, catalog can be read from a file-like object provided by C{readfh} parameter. Same as when reading from file on disk, text will be decoded using catalog's encoding after reading it from C{readfh}. If a problem which prevents construction of a valid catalog is detected while parsing a PO file, L{CatalogSyntaxError} is raised. @param filename: name of the PO catalog on disk, or new catalog @type filename: string @param create: whether a blank catalog can be created when the PO file does not already exist, or signal an error @type create: bool @param truncate: whether catalog should be empty (and with uninitialized header) regardless of whether it is opened or created @type truncate: bool @param wrapping: sequence of keywords specifying wrapping policy for message text fields (C{msgid}, C{msgstr}, etc.). See L{select_field_wrapper} function for possible keywords and their effects on wrapping. 
If given as C{None}, it will be deduced from the catalog (see L{wrapping} method). @type wrapping: sequence of strings @param monitored: whether the message entries are monitored @type monitored: bool @param headonly: whether to open in header-only mode @type headonly: bool @param readfh: file to read the catalog from @type readfh: file-like object """ self._monitored = monitored # Select type of message object to use. if monitored: message_type = MessageMonitored else: message_type = MessageUnsafe # Signal if catalog should exist on disk but does not. if not create and not (os.path.exists(filename) or readfh): raise PologyError( _("@info", "File '%(file)s' does not exist.", file=filename)) # Read messages or create empty catalog. if not truncate and (os.path.exists(filename) or readfh): file = readfh or filename m, e, t = _parse_po_file(file, message_type, headonly, monitored) self._encoding = e self._created_from_scratch = False if not m[0].msgctxt and not m[0].msgid: # Proper PO, containing the header. self._header = Header(m[0]) self._header._committed = True # status for sync if (single_entry > 0): self.__dict__["*"] = [m[single_entry]] else: self.__dict__["*"] = m[1:] else: # Improper PO, missing the header. self._header = Header() self._header._committed = False # status for sync if (single_entry > 0): self.__dict__["*"] = [m[single_entry-1]] else: self.__dict__["*"] = m self._tail = t else: self._encoding = "UTF-8" self._created_from_scratch = True self._header = Header() self._header._committed = False # status for sync self.__dict__["*"] = [] self._tail = None self._filename = filename self._messages = self.__dict__["*"] # nicer name for the sequence # Fill in the message key-position links. # Set committed and remove-on-sync status. self._msgpos = {} for i in range(len(self._messages)): self._msgpos[self._messages[i].key] = i self._messages[i]._committed = True self._messages[i]._remove_on_sync = False # Initialize monitoring. final_spec = copy.deepcopy(_Catalog_spec) final_spec["*"]["type"] = message_type self.assert_spec_init(final_spec) # Inverse map (by msgstr) will be computed on first use. self._invmap = None # Cached plural definition from the header. self._plustr = "" # Cached language of the translation. # None means the language has not been determined. self._lang = None self._lang_determined = False # Cached environments. self._envs = None self._envs_determined = False # Cached accelerator markers. self._accels = None self._accels_determined = False # Cached markup types. self._mtypes = None self._mtypes_determined = False # Cached wrapping policy. if wrapping is None: self._wrap_determined = False self._wrapf = None self._wrapkw = None else: self._wrap_determined = True self._wrapf = select_field_wrapper(wrapping) self._wrapkw = tuple(wrapping) def _assert_headonly (self): if self._tail: raise PologyError( _("@info", "Trying to access catalog messages in header-only mode.")) def __getattr__ (self, att): """ Attribute getter. Processes read-only attributes, and sends others to the base class. @param att: name of the attribute to get @returns: attribute value """ if 0: pass elif att == "name": basename = os.path.basename(self._filename) p = basename.rfind(".") if p >= 0: return basename[:p] else: return basename else: return Monitored.__getattr__(self, att) def __len__ (self): """ The number of messages in the catalog. The number includes obsolete entries, and excludes header entry. 
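        For example, a count of only the non-obsolete messages could be
        obtained with (a sketch; C{cat} is an open catalog)::

            nactive = len([msg for msg in cat if not msg.obsolete])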
@returns: the number of messages @rtype: int """ self._assert_headonly() return len(self._messages) def __getitem__ (self, ident): """ Get message by position or another message. If the position is out of range, or the lookup message does not have a counterpart in this catalog with the same key, an error is signaled. Runtime complexity O(1), regardless of the C{ident} type. @param ident: position index or another message @type ident: int or L{Message_base} @returns: reference to the message in catalog @rtype: L{Message_base} """ self._assert_headonly() self.assert_spec_getitem() if not isinstance(ident, int): ident = self._msgpos[ident.key] return self._messages[ident] def __setitem__ (self, ident, msg): """ Set message by position or another message. If the position is out of range, or the lookup message does not have a counterpart in this catalog with the same key, an error is signaled. Runtime complexity O(1), regardless of the C{ident} type. @param ident: position index or another message @type ident: int or L{Message_base} @returns: reference to the message in catalog @rtype: L{Message_base} """ self._assert_headonly() self.assert_spec_setitem(msg) if not isinstance(ident, int): ident = self._msgpos[ident.key] self._messages[ident] = msg if self._messages[ident] is not msg: self.__dict__["#"]["*"] += 1 return self._messages[ident] def __contains__ (self, msg): """ Whether the message with the same key exists in the catalog. Runtime complexity O(1). @param msg: message to look for @type msg: L{Message_base} @returns: C{True} if the message exists @rtype: bool """ self._assert_headonly() return msg.key in self._msgpos def __eq__ (self, ocat): """ Whether two catalogs are equal in all apparent parts. Catalogs are considered equal if they are of the same length, their headers are equal, and each two messages with the same position are equal. Runtime complexity O(n). @returns: C{True} if catalogs are equal @rtype: bool """ if len(self) != len(ocat): return False if self.header != ocat.header: return False for i in range(len(ocat)): if self[i] != ocat[i]: return False return True def __ne__ (self, ocat): """ Whether two catalogs are equal in all apparent parts. Equivalent to C{not (self == ocat)}. @returns: C{False} if catalogs are equal @rtype: bool """ return not self.__eq__(ocat) def find (self, msg, wobs=True): """ Position of the message in the catalog. Runtime complexity O(1). @param msg: message to look for @type msg: L{Message_base} @param wobs: obsolete messages considered non-existant if C{False} @type wobs: bool @returns: position index if the message exists, -1 otherwise @rtype: int """ self._assert_headonly() if msg.key in self._msgpos: if wobs or not msg.obsolete: return self._msgpos[msg.key] return -1 def get (self, msg, defmsg=None): """ Get message by key of another message, with default fallback. If the lookup message C{msg} does not have a counterpart in this catalog with the same key, C{defmsg} is returned. C{msg} can also be C{None}, when C{defmsg} is returned. Runtime complexity O(1). @param msg: message for the lookup by key @type msg: L{Message_base} or None @param defmsg: fallback in case lookup failed @type defmsg: any @returns: reference to the message in catalog, or default @rtype: L{Message_base} or type(defmsg) """ if msg is None: return defmsg pos = self.find(msg) if pos >= 0: return self._messages[pos] else: return defmsg def add (self, msg, pos=None, srefsyn={}): """ Add a message to the catalog. 
If the message with the same key already exists in the catalog, it will be replaced with the new message, ignoring position. The return value will be C{None}. If the message does not exist in the catalog, when the position is C{None}, the insertion will be attempted such as that the messages be near according to the source references; if the position is not C{None}, the message is inserted at the given position. The return value will be the true insertion position. Negative position can be given as well. It counts backward from the first non-obsolete message if the message to be added is not obsolete, or from last message otherwise. When the message is inserted according to source references, a dictionary of file paths to consider synonymous can be given by the C{srefsyn}. The key is the file path for which the synonyms are being given, and the value the list of synonymous file paths. The mapping is not symmetric; if B is in the list of synonyms to A, A will not be automatically considered to be among synonyms of B, unless explicitly given in the list of synonyms to B. Runtime complexity O(1) if the message is present in the catalog; O(n - pos) if the position is given and the message is not present; O(n) if the position is not given and the message is not present. @param msg: message to insert @type msg: L{Message_base} @param pos: position index to insert at @type pos: int or None @param srefsyn: synonymous names to some of the source files @type srefsyn: {string: [string*]*} @returns: if inserted, the position where inserted @rtype: int or None """ return self.add_more([(msg, pos)], srefsyn=srefsyn)[0] def add_more (self, msgpos, cumulative=False, srefsyn={}): """ Add more than one message to the catalog. Like L{add}, except that several messages are added in one call. This significantly speeds up insertion when insertion positions of all messages are known beforehand. Insertion positions can be given relative to state before the call, or cumulative to earlier insertions in the list. For example, if insertions are given as C{[(msg1, 2), (msg2, 5)]} and not cumulative, then the resulting position for C{msg1} will be 2, and for C{msg2} 6 (assuming that both messages actually got inserted). This behavior can be toggled by the C{cumulative} parameter. @param msgpos: messages with target insertion positions @type msgpos: [(L{Message_base}, int), ...] @param cumulative: whether input positions are cumulative @type cumulative: bool @param srefsyn: synonymous names to some of the source files @type srefsyn: {string: [string*]*} @returns: positions where inserted, or None where replaced @rtype: [int or None, ...] """ self._assert_headonly() for msg, pos in msgpos: self.assert_spec_setitem(msg) if not msg.msgid and msg.msgctxt is None: raise PologyError( _("@info", "Trying to insert message with empty key into catalog.")) # Resolve backward positions, set aside automatic positions, # set aside replacements. msgpos_ins = [] msgs_auto = [] msgs_repl = [] for msg, pos in msgpos: if msg.key not in self._msgpos: if pos is not None: if pos < 0: pos = len(self._messages) + pos if pos < 0 or pos > len(self._messages): raise PologyError( _("@info", "Trying to insert message into catalog by " "position out of range.")) msgpos_ins.append((msg, pos)) else: msgs_auto.append(msg) else: msgs_repl.append(msg) # Sort messages to be inserted by resolved positions. msgpos_ins = sorted(msgpos_ins, key=lambda x: x[1]) # Resolve messages to be inserted by automatic positions. 
        for msg in msgs_auto:
            pos, d1 = self._pick_insertion_point(msg, srefsyn)
            i = 0
            while i < len(msgpos_ins):
                omsg, opos = msgpos_ins[i]
                if pos < opos:
                    break
                elif cumulative:
                    pos += 1
                i += 1
            msgpos_ins.insert(i, (msg, pos))

        # Accumulate insertion positions if not cumulative.
        if not cumulative and len(msgpos_ins) > 1:
            off = 0
            msgpos_tmp = []
            for msg, pos in msgpos_ins:
                msgpos_tmp.append((msg, pos + off))
                off += 1
            msgpos_ins = msgpos_tmp

        # Update key-position links for the index to be added.
        off = 0
        for i in range(len(msgpos_ins)):
            pos1 = msgpos_ins[i][1] - off
            if i + 1 < len(msgpos_ins):
                pos2 = msgpos_ins[i + 1][1] - (off + 1)
            else:
                pos2 = len(self._messages)
            for j in range(pos1, pos2):
                ckey = self._messages[j].key
                self._msgpos[ckey] = j + (off + 1)
            off += 1

        # Insert messages at computed positions.
        for msg, pos in msgpos_ins:
            self._messages.insert(pos, msg)
            self._messages[pos]._remove_on_sync = False # no pending removal
            self._messages[pos]._committed = False # write it on sync
            self._msgpos[msg.key] = pos # store new key-position link
            self.__dict__["#"]["*"] += 1 # indicate sequence change

        # Replace existing messages.
        for msg in msgs_repl:
            pos = self._msgpos[msg.key]
            self._messages[pos] = msg

        # Recover insertion/replacement positions.
        pos_res = []
        msgpos_ins_d = dict(msgpos_ins)
        for msg, pos in msgpos:
            ipos = msgpos_ins_d.get(msg)
            if ipos is not None:
                pos_res.append(ipos)
            else:
                pos_res.append(None)

        return pos_res


    def obspos (self):
        """
        Get canonical position of the first obsolete message.

        I{Canonical} position of the first obsolete message is the position
        of first of the contiguous obsolete messages at the end of
        the catalog. Normally this should be the same as the position of
        the very first obsolete message, as all obsolete messages should
        be contiguously grouped at the end. But there is no enforcement of
        such grouping, therefore the stricter definition.

        If there are no messages in the catalog, or the last message
        is not obsolete, the position is reported as number of messages
        (i.e. one position after the last message).

        Runtime complexity O(number of contiguous trailing obsolete
        messages).

        @return: canonical position of first obsolete message
        @rtype: int
        """

        op = len(self._messages)
        while op > 0 and self._messages[op - 1].obsolete:
            op -= 1

        return op


    def add_last (self, msg):
        """
        Add a message to the selected end of catalog, if not already in it.

        Synonym to C{cat.add(msg, cat.obspos())} if the message is not
        obsolete (i.e. tries to add the message after all non-obsolete),
        or to C{cat.add(msg, len(cat))} (tries to add at the very end).
        If the message already exists in the catalog (by key), same
        behavior as for L{add} applies.

        @see: L{add}
        """

        if not msg.obsolete:
            return self.add(msg, self.obspos())
        else:
            return self.add(msg, len(self._messages))


    def remove (self, ident):
        """
        Remove a message from the catalog, by position or another message.

        If the position is out of range, or the lookup message does not have
        a counterpart in this catalog with the same key, an error is signaled.

        Runtime complexity O(n), regardless of C{ident} type.
        Use L{remove_on_sync()} method for O(1) complexity,
        when the logic allows the removal to be delayed to syncing time.

        @param ident: position index or another message
        @type ident: int or L{Message_base}

        @returns: C{None}
        """

        self._assert_headonly()

        # Determine position and key by given ident.
        if isinstance(ident, int):
            ip = ident
            key = self._messages[ip].key
        else:
            key = ident.key
            ip = self._msgpos[key]

        # Update key-position links for the removed index.
for i in range(ip + 1, len(self._messages)): ckey = self._messages[i].key self._msgpos[ckey] = i - 1 # Remove from messages and key-position links. self._messages.pop(ip) self._msgpos.pop(key) self.__dict__["#"]["*"] += 1 # indicate sequence change def remove_on_sync (self, ident): """ Remove a message from the catalog, by position or another message, on the next sync. If the position is out of range, or the lookup message does not have a counterpart in this catalog with the same key, an error is signaled. Suited for for-in iterations over a catalog with a sync afterwards, so that the indices are not confused by removal, and good performance. Runtime complexity O(1). @param ident: position index or another message @type ident: int or L{Message_base} @returns: C{None} """ self._assert_headonly() # Determine position and key by given ident. if isinstance(ident, int): ip = ident else: ip = self._msgpos[ident.key] # Indicate removal on sync for this message. self._messages[ip]._remove_on_sync = True self.__dict__["#"]["*"] += 1 # indicate sequence change (pending) def sync (self, force=False, noobsend=False, writefh=None, fitplural=False): """ Write catalog file to disk if any message has been modified. All activities scheduled for sync-time are performed, such as delayed message removal. If catalog is monitored, unmodified messages (and message parts) are not reformatted unless forced. Instead of opening and writing into catalog's filename, catalog can be written to a file-like object provided by C{writefh} parameter. Same as when writing to file on disk, text will be encoded using catalog's encoding before writing it to C{writefh}. If in a plural message the number of C{msgstr} fields is not equal to the number specified in the plural header, the C{fitplural} parameter can be set to C{True} to correct this on syncing. However, this fitting will be performed only on clean plural messages, i.e. those in which all existing C{msgstr} fields are empty, as otherwise it is unclear how to adapt them to plural header. @param force: whether to reformat unmodified messages @type force: bool @param noobsend: do not reorder messages to group all obsolete at end @type noobsend: bool @param writefh: file to write the catalog to @type writefh: file-like object @param fitplural: whether to fit the number of msgstr fields in clean plural messages to plural header specification @type fitplural: bool @returns: C{True} if the file was modified, C{False} otherwise @rtype: bool """ # Cannot sync catalogs which have been given no path # (usually temporary catalogs). if not self._filename.strip(): raise PologyError( _("@info", "Trying to sync unnamed catalog.")) # Fit the number of msgstr entries in plural messages if requested. # Must be done before the modification test below. if fitplural: n = self.nplurals() for msg in self._messages: if ( msg.msgid_plural is not None and len(msg.msgstr) != n and all(len(s) == 0 for s in msg.msgstr) ): - msg.msgstr[:] = [u""] * n + msg.msgstr[:] = [""] * n # If catalog is not monitored, force syncing. if not self._monitored: force = True # If no modifications throughout and sync not forced, return. if not force and not self.modcount: return False # No need to indicate sequence changes here, as after sync the # catalog is set to unmodified throughout. # Temporarily insert header, for homogeneous iteration. 
        self._messages.insert(0, self._header)
        self._messages[0]._remove_on_sync = False # never remove header
        nmsgs = len(self._messages)

        # Starting position for reinserting obsolete messages.
        obstop = len(self._messages)
        while obstop > 0 and self._messages[obstop - 1].obsolete:
            obstop -= 1
        obsins = obstop

        # NOTE: Key-position links may be invalidated from this point onwards,
        # by reorderings/removals. To make sure it is not used before the
        # rebuild at the end, delete now.
        del self._msgpos

        if not self._wrap_determined:
            self.wrapping()

        flines = []
        i = 0
        while i < nmsgs:
            msg = self._messages[i]
            if msg.get("_remove_on_sync", False):
                # Removal on sync requested, just skip.
                i += 1
            elif not noobsend and msg.obsolete and i < obstop:
                # Obsolete message out of order, reinsert and repeat the index.
                # Reinsertion is such that the relative ordering of obsolete
                # messages is preserved.
                msg = self._messages.pop(i)
                self._messages.insert(obsins - 1, msg) # -1 due to popping
                obstop -= 1
            else:
                # Normal message, append formatted lines to rest.
                committed = msg.get("_committed", False)
                flines.extend(msg.to_lines(self._wrapf,
                                           force or not committed))
                # Message should finish with one empty line.
                if flines[-1] != "\n":
                    flines.append("\n")
                i += 1
        if not self._tail:
            # Remove trailing empty lines.
            while flines and flines[-1] == "\n":
                flines.pop(-1)
        else:
            # Tail has to be converted to separate lines,
            # so that possibly new encoding is applied to it too
            # while being able to report line/column on error.
            flines.extend(x + "\n" for x in self._tail.split("\n"))
            if self._tail.endswith("\n"):
                flines.pop(-1)

        # Remove temporarily inserted header.
        self._messages.pop(0)

        # Update message map.
        self.sync_map()

        # Reset modification state throughout.
        self.modcount = 0

        # Encode lines and write file.
        enclines = []
        for i, line in enumerate(flines):
            try:
                encline = line.encode(self._encoding)
-            except UnicodeEncodeError, e:
+            except UnicodeEncodeError as e:
                raise CatalogSyntaxError(
                    _("@info",
                      "Text encoding failure at %(file)s:%(line)d:%(col)d "
                      "under assumed encoding '%(enc)s'.",
-                      file=self._filename, line=(i + 1), col=e[2],
+                      file=self._filename, line=(i + 1), col=e.start,
                      enc=self._encoding))
            enclines.append(encline)

        if not writefh:
            # Create the parent directory if it does not exist.
            pdirpath = os.path.dirname(self._filename)
            mkdirpath(pdirpath)
            # Write to file atomically: directly write to temporary file,
            # then rename it to destination file.
            #ofl = tempfile.NamedTemporaryFile(delete=False, dir=pdirpath)
            #tmpfname = ofl.name
            # ...needs Python 2.6
            tmpfname = os.path.join(pdirpath,
                                    os.path.basename(self._filename) + "~tmpw")
-            ofl = open(tmpfname, "w")
+            ofl = open(tmpfname, "wb") # encoded lines are bytes
        else:
            ofl = writefh
        ofl.writelines(enclines)
        if not writefh:
            ofl.close()
            if os.name == "nt" and os.path.exists(self._filename):
                # NT does not allow to overwrite on rename.
                tmpfname2 = self._filename + "~tmpo"
                os.rename(self._filename, tmpfname2)
                os.rename(tmpfname, self._filename)
                os.remove(tmpfname2)
            else:
                os.rename(tmpfname, self._filename)

        # Indicate the catalog is no longer created from scratch, if it was.
        self._created_from_scratch = False

        # Indicate header has been committed.
        self._header._committed = True

        # Indicate for each message that it has been committed.
        for msg in self._messages:
            msg._committed = True

        return True


    def sync_map (self):
        """
        Update message map.

        In case there were any modifications to message keys, or any
        pending removals issued, this function will update the sequence
        of messages such that membership operations work properly again.
        Obsolete messages will be moved to end of catalog.
Referent line and entry numbers will remain invalid, as catalog will not be written out. This is a less expensive alternative to syncing the catalog, when it is only necessary to continue using it in synced state, rather than actually writing it out. """ # Execute pending removals. # Separate messages into current and obsolete. newlst = [] newlst_obs = [] for msg in self._messages: if not msg.get("_remove_on_sync", False): if not msg.obsolete: newlst.append(msg) else: newlst_obs.append(msg) newlst.extend(newlst_obs) self.__dict__["*"] = newlst self._messages = self.__dict__["*"] # Rebuild key-position links. self._msgpos = {} for i in range(len(self._messages)): self._msgpos[self._messages[i].key] = i # Set inverse map to non-computed. self._invmap = None def _make_invmap (self): # Map for inverse lookup (by translation) has as key the msgstr[0], # and the value the list of messages having the same msgstr[0]. self._invmap = {} for msg in self._messages: ikey = msg.msgstr[0] msgs = self._invmap.get(ikey) if msgs is None: msgs = [] self._invmap[ikey] = msgs msgs.append(msg) def insertion_inquiry (self, msg, srefsyn={}): """ Compute the tentative insertion of the message into the catalog. The tentative insertion is a tuple of position of a message when it would be inserted into the catalog, and the I{weight} indicating the quality of positioning. The weight is computed by analyzing the source references. Runtime complexity O(n). @param msg: message to compute the tentative insertion for @type msg: L{Message_base} @param srefsyn: synonymous names to some of the source files @type srefsyn: {string: [string*]*} @returns: the insertion position and its weight @rtype: int, float """ self._assert_headonly() return self._pick_insertion_point(msg, srefsyn) def created (self): """ Whether the catalog has been newly created (no existing PO file). A catalog is no longer considered newly created after the first sync. @returns: C{True} if newly created, C{False} otherwise @rtype: bool """ return self._created_from_scratch def _pick_insertion_point (self, msg, srefsyn={}): # Return the best insertion position with associated weight. # Assume the existing messages in the catalog are properly ordered. if not msg.obsolete: last = self.obspos() else: last = len(self._messages) # Insert at the last position if the candidate message has # no source references. if not msg.source: return last, 0.0 ins_pos = -1 # Try to find insertion position by comparing the source references # of the candidate the source references of the existing messages. # The order of matching must be very specific for logical insertion. # If the matching source files are found, insert according to # the line number. for src, lno in msg.source: src_pos = 0 src_match = False curr_prim_esrc = "" for i in range(last): emsg = self._messages[i] if not emsg.source: continue same_prim_esrc = False for esrc, elno in emsg.source: if curr_prim_esrc in [esrc] + srefsyn.get(esrc, []): same_prim_esrc = True break if not same_prim_esrc: curr_prim_esrc, elno = emsg.source[0] if src in [curr_prim_esrc] + srefsyn.get(curr_prim_esrc, []): # The source file names match. # Insert at this position if the candidate's line # number preceeds that of the current message. src_match = True if lno < elno: ins_pos = i break elif src_match: # The sources no longer match, but were matched # before. This means the candidate line number is # after all existing, so insert at this position. 
ins_pos = i break if ins_pos >= 0: break if ins_pos >= 0: break if ins_pos >= 0: return ins_pos, 1.0 else: return last, 0.0 def nplurals (self): """ Number of msgstr fields expected for plural messages. Determined by the Plural-Forms header field; if this field is absent from the header, defaults to 1. @returns: number of plurals @rtype: int """ # Get nplurals string from the header. plforms = self._header.get_field_value("Plural-Forms") if not plforms: # no plural definition return 1 nplustr = plforms.split(";")[0] # Get the number of forms from the string. m = re.search(r"\d+", nplustr) if not m: # malformed nplurals return 1 return int(m.group(0)) def plural_index (self, number): """ Msgstr field index in plural messages for given number. Determined by the Plural-Forms header field; if this field is absent from the header, defaults to 0. @param number: the number to determine the plural form for @type number: int @returns: index of msgstr field @rtype: int """ # Get plural definition from the header. plforms = self._header.get_field_value("Plural-Forms") if not plforms: # no plural definition, assume 0 return 0 plustr = plforms.split(";")[1] # Rebuild evaluation string only if changed to last invocation. if plustr != self._plustr: # Record raw plural definition for check on next call. self._plustr = plustr # Prepare Python-evaluable string out of the raw definition. plustr = plustr[plustr.find("=") + 1:] # remove plural= part p = -1 evalstr = "" while 1: p = plustr.find("?") if p < 0: evalstr += " " + plustr break cond = plustr[:p] plustr = plustr[p + 1:] cond = cond.replace("&&", " and ") cond = cond.replace("||", " or ") evalstr += "(" + cond + ") and " p = plustr.find(":") body = plustr[:p] plustr = plustr[p + 1:] evalstr += "\"" + body + "\" or " if not evalstr.strip(): evalstr = "0" # Record the current evaluable definition. self._plustr_eval = evalstr # Evaluate the definition. n = number # set eval context (plural definition uses n as variable) form = int(eval(self._plustr_eval)) return form def plural_indices_single (self): """ Indices of the msgstr fields which are used for single number only. @returns: msgstr indices used for single numbers @rtype: [int*] """ # Get plural definition from the header. plforms = self._header.get_field_value("Plural-Forms") if not plforms: # no plural definition, assume 0 return [0] plustr = plforms.split(";")[1] lst = re.findall(r"\bn\s*==\s*\d+\s*\)?\s*\?\s*(\d+)", plustr) if not lst and re.search(r"\bn\s*(!=|>|<)\s*\d+\s*([^?]|$)", plustr): lst = ["0"] return [int(x) for x in lst] def select_by_key (self, msgctxt, msgid, wobs=False): """ Select message from the catalog by the fields that define its key. If matched, the message is returned as a single-element list, or an empty list when there is no match. This is so that the result of this method is in line with other C{select_*} methods. Runtime complexity as that of L{find}. @param msgctxt: the text of C{msgctxt} field @type msgctxt: string or C{None} @param msgid: the text of C{msgid} field @type msgid: string @param wobs: whether to include obsolete messages in selection @type wobs: bool @returns: selected messages @rtype: [L{Message_base}*] """ m = MessageUnsafe({"msgctxt" : msgctxt, "msgid" : msgid}) p = self.find(m, wobs) if p >= 0: return [self._messages[p]] else: return [] def select_by_key_match (self, msgctxt, msgid, exctxt=False, exid=True, case=True, wobs=False): """ Select messages from the catalog by matching key-defining fields. 
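        For example, all messages whose C{msgid} starts with "Error",
        in any context, could be selected with (patterns illustrative)::

            msgs = cat.select_by_key_match(".*", "^Error",
                                           exctxt=False, exid=False)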
Parameters C{msgctxt} and C{msgid} are either exact values, to be matched by equality against message fields, or regular expression strings. Parameters C{exctxt} and C{exid} control which kind of match it is, respectively. Runtime complexity O(n), unless all matches are exact, when as that of L{find}. @param msgctxt: the text or regex string of C{msgctxt} field @type msgctxt: string or C{None} @param msgid: the text or regex string of C{msgid} field @type msgid: string @param exctxt: C{msgctxt} is exact value if C{True}, regex if C{False} @type exctxt: bool @param exid: C{msgid} is exact value if C{True}, regex if C{False} @type exid: bool @param case: whether regex matching is case-sensitive @type case: bool @param wobs: whether to include obsolete messages in selection @type wobs: bool @returns: selected messages @rtype: [L{Message_base}*] """ if exctxt and exid: return self.select_by_key(msgctxt, msgid, wobs=wobs) rxflags = re.U if not case: rxflags |= re.I if not exctxt: if msgctxt is not None: msgctxt_rx = re.compile(msgctxt, rxflags) else: # Force exact match if actually no context required. exctxt = True if not exid: msgid_rx = re.compile(msgid, rxflags) selected_msgs = [] for msg in self._messages: if ( (wobs or not msg.obsolete) and ( (exid and msg.msgid == msgid) or (not exid and msgid_rx.search(msg.msgid))) and ( (exctxt and msg.msgctxt == msgctxt) - or (not exctxt and msgctxt_rx.search(msg.msgctxt or u""))) + or (not exctxt and msgctxt_rx.search(msg.msgctxt or ""))) ): selected_msgs.append(msg) return selected_msgs def select_by_msgid (self, msgid, wobs=False): """ Select messages from the catalog by matching C{msgid} field. Several messages may have the same C{msgid} field, due to different C{msgctxt} fields. Empty list is returned when there is no match. Runtime complexity O(n). @param msgid: the text of C{msgid} field @type msgid: string @param wobs: whether to include obsolete messages in selection @type wobs: bool @returns: selected messages @rtype: [L{Message_base}*] """ selected_msgs = [] for msg in self._messages: if (wobs or not msg.obsolete) and msg.msgid == msgid: selected_msgs.append(msg) return selected_msgs def select_by_msgid_fuzzy (self, msgid, cutoff=0.6, wobs=False): """ Select messages from the catalog by near-matching C{msgid} field. The C{cutoff} parameter determines the minimal admissible similarity (1.0 fo exact match). The messages are returned ordered by decreasing similarity. Runtime complexity O(n) * O(length(msgid)*avg(length(msgids))) (probably). @param msgid: the text of C{msgid} field @type msgid: string @param cutoff: minimal similarity @type cutoff: float @param wobs: whether to include obsolete messages in selection @type wobs: bool @returns: selected messages @rtype: [L{Message_base}*] """ # Build dictionary of message keys by msgid; # there can be several keys per msgid, pack in a list. msgkeys = {} for msg in self._messages: if msg.obsolete and not wobs: # Skip obsolete messages if not explicitly included. continue if msg.msgid not in msgkeys: msgkeys[msg.msgid] = [] msgkeys[msg.msgid].append(msg.key) # Get near-match msgids. near_msgids = difflib.get_close_matches(msgid, msgkeys, cutoff=cutoff) # Collect messages per selected msgids. selected_msgs = [] for near_msgid in near_msgids: for msgkey in msgkeys[near_msgid]: selected_msgs.append(self._messages[self._msgpos[msgkey]]) return selected_msgs def select_by_msgstr (self, msgstr0, wobs=False, lazy=False): """ Select messages from the catalog inversely, by their msgstr[0]. 
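        For example, all messages currently translated as C{"Open"}
        could be fetched with (a sketch)::

            msgs = cat.select_by_msgstr("Open")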
        Several messages may have the same C{msgstr[0]} field, so the
        return value is always a list of messages. Empty list is returned
        when there is no match.

        Runtime complexity is O(n) if C{lazy} is C{False}.
        If C{lazy} is C{True}, complexity is O(n) for the first search,
        and then O(1) until next syncing of the catalog; if msgstr fields
        of some messages change in between, or messages are added or
        removed from the catalog, this is not seen until next syncing.

        @param msgstr0: the text of C{msgstr[0]} field
        @type msgstr0: string
        @param wobs: whether to include obsolete messages in selection
        @type wobs: bool
        @param lazy: whether to assume msgstr are not modified between
            syncings
        @type lazy: bool

        @returns: selected messages
        @rtype: [L{Message_base}*]
        """

        if not lazy:
-            selected_msgs = {}
+            selected_msgs = []
            for msg in self._messages:
                if (wobs or not msg.obsolete) and msg.msgstr[0] == msgstr0:
                    selected_msgs.append(msg)
        else:
            if self._invmap is None:
                self._make_invmap()
            selected_msgs = self._invmap.get(msgstr0, [])
            if not wobs:
                selected_msgs = [x for x in selected_msgs
                                 if not x.obsolete]

        return selected_msgs


    def encoding (self):
        """
        Report encoding used when syncing the catalog.

        Encoding is determined from C{Content-Type} header field.

        It is not defined when the header will be examined,
        or if it will be reexamined when it changes.
        If you want to set encoding after the catalog has been
        opened, use L{set_encoding}.

        @returns: the encoding name
        @rtype: string
        """

        return self._encoding


    def set_encoding (self, encoding):
        """
        Set encoding used when syncing the catalog.

        Encoding set by this method will later be readable by
        the L{encoding} method. This will also modify the catalog
        header C{Content-Type} field.

        @param encoding: the encoding name
        @type encoding: string
        """

        self._encoding = encoding

-        ctval = u"text/plain; charset=%s" % encoding
-        self.header.set_field(u"Content-Type", ctval)
+        ctval = "text/plain; charset=%s" % encoding
+        self.header.set_field("Content-Type", ctval)


    def accelerator (self):
        """
        Report characters used as accelerator markers in GUI messages.

        Accelerator characters are determined by looking for certain
        header fields, in this order: C{Accelerator-Marker},
        C{X-Accelerator-Marker}.
        In each field, several accelerator markers can be stated as
        comma-separated list, or there may be several fields;
        the union of all parsed markers is reported.

        If empty set is returned, it was determined that there are
        no accelerator markers in the catalog;
        if C{None}, that there is no determination about markers.

        It is not defined when the header will be examined,
        or if it will be reexamined when it changes.
        If you want to set accelerator markers after the catalog has been
        opened, use L{set_accelerator}.

        @returns: accelerator markers
        @rtype: set(string*) or C{None}
        """

        if self._accels_determined:
            return self._accels

        accels = None
        self._accels_determined = True

        for fname in (
            "Accelerator-Marker",
            "X-Accelerator-Marker",
        ):
            fields = self._header.select_fields(fname)
            for fname, fval in fields:
                if accels is None:
                    accels = set()
                accels.update([x.strip() for x in fval.split(",")])
        if accels:
            accels.discard("")

        self._accels = accels
        return accels


    def set_accelerator (self, accels):
        """
        Set accelerator markers that can be expected in messages.

        Accelerator markers set by this method will later be readable by
        the L{accelerator} method. This will not modify the catalog header
        in any way; if that is desired, it must be done manually by
        manipulating the header fields.
If C{accels} is given as C{None}, it means the accelerator markers are undetermined; if empty, that there are no markers in messages. @param accels: accelerator markers @type accels: sequence of strings or C{None} """ if accels is not None: self._accels = set(accels) self._accels.discard("") else: self._accels = None self._accels_determined = True def markup (self): """ Report what types of markup can be expected in messages. Markup types are determined by looking for some header fields, which state markup types as short symbolic names, e.g. "html", "docbook", "mediawiki", etc. The header fields are tried in this order: C{Text-Markup}, C{X-Text-Markup}. In each field, several markup types can be stated as comma-separated list. If there are several fields, it is undefined from which one markup names are collected. Markup names are always reported in lower-case, regardless of the original casing used in the header. See L{set_markup} for list of markup types currently observed by various Pology modules to influence processing behavior. If empty set is returned, it was determined that there is no markup in the catalog; if C{None}, that there is no determination about markup. It is not defined when the header will be examined, or if it will be reexamined when it changes. If you want to set markup types after the catalog has been opened, use L{set_markup} method. @returns: markup names @rtype: set(string*) or C{None} """ if self._mtypes_determined: return self._mtypes mtypes = None self._mtypes_determined = True for fname in ( "Text-Markup", "X-Text-Markup", ): fval = self._header.get_field_value(fname) if fval is not None: mtypes = set([x.strip().lower() for x in fval.split(",")]) mtypes.discard("") self._mtypes = mtypes return mtypes def set_markup (self, mtypes): """ Set markup types that can be expected in messages. Markup types set by this method will later be readable by the L{markup} method. This will not modify the catalog header in any way; if that is desired, it must be done manually by manipulating the header fields. If C{mtypes} is given as C{None}, it means the markup types are undetermined; if empty, that there is no markup in messages. The following markup types are currently used by various parts of Pology to influence behavior on processing: - C{html}: HTML 4.01 - C{qtrich}: Qt rich-text, (almost) a subset of HTML - C{kuit}: UI semantic markup in KDE4 - C{kde4}: markup in KDE4 UI POs, a mix of Qt rich-text and KUIT - C{docbook4}: Docbook 4.x markup, in documentation POs - C{xmlents}: only XML-like entities, no other formal markup @param mtypes: markup types @type mtypes: sequence of strings or C{None} """ if mtypes is not None: self._mtypes = set([x.lower() for x in mtypes]) else: self._mtypes = None self._mtypes_determined = True def language (self): """ Report language of the translation. Language is determined by looking for the C{Language} header field. If this field is present, it should contain the language code in line with GNU C library locales, e.g. C{pt} for Portuguese, or C{pt_BR} for Brazilian Portuguese. If the field is not present, language is considered undetermined, and C{None} is returned. It is not defined when the header will be examined, or if it will be reexamined when it changes (most probably not). If you want to set language after the catalog has been opened, use L{set_language} method. 
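        For example (file path illustrative)::

            cat = Catalog("ui/foo.po")
            lang = cat.language() or "(undetermined)"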
@returns: language code @rtype: string or C{None} """ if self._lang_determined: return self._lang lang = None self._lang_determined = True fval = self._header.get_field_value("Language") if fval: lang = fval.strip() self._lang = lang return lang def set_language (self, lang): """ Set language of the translation. Language set by this method will later be readable by the L{language} method. This will not modify the catalog header in any way; if that is desired, it must be done manually by manipulating the header fields. If C{lang} is given as C{None}, it means the language is undetermined. If it is given as empty string, it means the language is deliberately considered unknown. @param lang: language code @type lang: string or C{None} """ if lang is not None: - self._lang = unicode(lang) + self._lang = str(lang) else: self._lang = None self._lang_determined = True def environment (self): """ Report environments which the catalog is part of. Sometimes the language alone is not enough to determine all the non-technical aspects of translation. For example, in a given language but different translation domains, one translator may decide to use one of the two synonyms naming a concept, and the other translator the other synonym. I{Environments} are a way to specify such sets of choices, so that automatic tools (e.g. terminology checker) can detect how to process a given catalog. An environment can represent anything. It may be a single translator, who applies own set of choices to all the catalogs under own maintenance; it may be a translation project, with many cooperating translators; and so on. Each environment is named by an alphanumeric keyword (such as normalized project name, translator's name, etc.), and should be unique within a given language. Environments are read from one of the following header fieldsE{:} C{Environment}, C{X-Environment}. The value the field should be comma-separated list of environment keywords. If there are several environment fields, it is undefined from which the environments are read. If more than one environment is stated, then wherever the conventions of two environments conflict, the environment mentioned later in the list should take precedence. For example, environment list such as C{"footp, jdoe"} would mean to apply conventions of FOO translation project, ammended by that of translator Johnas Doemann. It there is no environment header field, C{None} is reported. Empty list is reported if such field exists, but its value is empty. It is not defined when the header will be examined, or if it will be reexamined when it changes (most probably not). if you want to set environments after the catalog has been opened, use L{set_environment} method. @returns: environment keywords @rtype: [string*] or C{None} """ if self._envs_determined: return self._envs envs = None self._envs_determined = True for fname in ( "Environment", "X-Environment", ): fval = self._header.get_field_value(fname) if fval is not None: envs = [x.strip().lower() for x in fval.split(",")] while "" in envs: envs.remove("") break self._envs = envs return envs def set_environment (self, envs): """ Set environments which the catalog is part of. Environments set by this method will later be readable by the L{environment} method. This will not modify the catalog header in any way; if that is desired, it must be done manually by manipulating the header fields. If C{envs} is given as C{None}, it means that the environments are undetermined; if empty, the catalog belongs to no environment. 
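        For example (environment keywords illustrative)::

            cat.set_environment(["kde", "jdoe"])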
@param envs: environment keywords @type envs: sequence of strings or C{None} """ if envs is not None: self._envs = set([x.lower() for x in envs]) else: self._envs = None self._envs_determined = True def wrapping (self): """ Report wrapping policy for message fields. Long text fields in messages (C{msgid}, C{msgstr}, etc.) may be wrapped in different ways, as wrapping does not influence their semantics. (This is unlike translator and extracted comments, which are never wrapped, because division into lines may be significant.) PO processing tools will typically offer wrapping options, but it may be more convenient to have wrapping policy bound to the catalog, which tools respect unless overridden. The following header fields are checked for wrapping policy, in given order: C{Wrapping}, C{X-Wrapping}. Wrapping policy (i.e. value of these header fields) is an unordered comma-separated list of wrapping keywords. See L{select_field_wrapper} for possible keywords. If no wrapping policy field is found in the header, C{None} is returned. If several wrapping policy fields are present, it is undefined which one is taken into account. It is not defined when the header will be examined, or if it will be reexamined when it changes (most probably not). If you want to set wrapping after the catalog has been opened, use L{set_wrapping} method. @returns: wrapping keywords @rtype: (string...) or C{None} """ if self._wrap_determined: return self._wrapkw wrapkw = None self._wrap_determined = True for fname in ( "Wrapping", "X-Wrapping", ): fval = self._header.get_field_value(fname) if fval is not None: wrapkw = [x.strip().lower() for x in fval.split(",")] wrapkw = tuple(sorted(wrapkw)) break self._wrapkw = wrapkw self._wrapf = select_field_wrapper(wrapkw) return self._wrapkw def set_wrapping (self, wrapkw): """ Set wrapping policy for message fields. Wrapping policy set by this method will later be readable by the L{wrapping} method. This will not modify the catalog header in any way; if that is desired, it must be done manually by manipulating the header fields. Wrapping policy is a sequence of keywords. See L{select_field_wrapper} for possible keywords. If C{None} is given instead, it is passed directly to L{select_field_wrapper}, which will construct default wrapper. @param wrapkw: wrapping policy @type wrapkw: [string...] or C{None} """ self._wrapkw = tuple(sorted(wrapkw)) if wrapkw is not None else None self._wrapf = select_field_wrapper(wrapkw) self._wrap_determined = True def wrapf (self): """ Get wrapping function used for message fields. Wrapping function is determined based on wrapping policy (see L{wrapping}, L{set_wrapping}). Wrapping function returned by this method is suitable as C{wrapf} parameter in methods of C{Message} objects. @returns: wrapping function @rtype: (string, string, string?)->[string] @see: L{wrap_field} """ self.wrapping() return self._wrapf def messages_by_source (self): """ Get messages grouped as lists by source. All messages sharing the same primary source file (their first source reference) are grouped and filed under that source file path. Grouping is represented by list of tuples of (source, list of messages), with both sources and messages within partial lists ordered by appearance. 
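        For example, a per-file message count could be reported with
        (a sketch)::

            for srcpath, msgs in cat.messages_by_source():
                print("%s: %d" % (srcpath, len(msgs)))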
@return: messages grouped by sources @rtype: [(string, [L{Message_base}])] """ msgs_by_src = {} sources = [] for msg in self._messages: src = msg.source and msg.source[0][0] or "" if src not in msgs_by_src: msgs_by_src[src] = [] sources.append(src) msgs_by_src[src].append(msg) return [(x, msgs_by_src[x]) for x in sources] def sort_by_source (self): """ Sort messages in catalog by source references. Source references within each message are sorted too, before messages are sorted by source references. If any message changed its position due to sorting, L{sync_map} is called at the end. """ # Sort source references within messages. for msg in self._messages: sorted_source = sorted(msg.source, key=lambda s: (s[0].lower(), s[1])) if self._monitored: - msg.source = Monlist(map(Monpair, sorted_source)) + msg.source = Monlist(list(map(Monpair, sorted_source))) else: msg.source = sorted_source sorted_messages = sorted(self._messages, key=lambda m: [(s[0].lower(), s[1]) for s in m.source[:1]]) any_moved = False for i in range(len(self._messages)): if sorted_messages[i] is not self._messages[i]: any_moved = True break if any_moved: self._messages = sorted_messages self.sync_map() def update_header (self, project=None, title=None, copyright=None, license=None, name=None, email=None, teamemail=None, langname=None, langcode=None, encoding=None, ctenc=None, plforms=None, poeditor=None): """ Update catalog header. If a piece of information is not given (i.e. C{None}), the corresponding header field is left unmodified. If it is given as empty string, the corresponding header field is removed. PO revision date is updated always, to current date. Some fields (as noted in parameter descriptions) are expanded on variables by applying the L{expand_vars} function. For example:: title="Translation of %project into %langname." 
The following variables are available: - C{%basename}: PO file base name - C{%poname}: PO file base name without .po extension - C{%project}: value of C{project} parameter (if not C{None}/empty) - C{%langname}: value of C{langname} parameter (if not C{None}/empty) - C{%langcode}: value of C{langcode} parameter (if not C{None}/empty) @param project: project name @type project: string @param title: translation title (expanded on variables) @type title: string @param copyright: copyright notice (expanded on variables) @type copyright: string @param license: license notice (expanded on variables) @type license: string @param name: translator's name @type name: string @param email: translator's email address @type email: string @param teamemail: language team's email address @type teamemail: string @param langname: full language name @type langname: string @param langcode: language code @type langcode: string @param encoding: text encoding @type encoding: string @param ctenc: content transfer encoding @type ctenc: string @param plforms: plural forms expression @type plforms: string @param poeditor: translator's PO editor @type poeditor: string @returns: reference to header """ varmap = {} varmap["basename"] = os.path.basename(self.filename) varmap["poname"] = self.name if project: varmap["project"] = project if langname: varmap["langname"] = langname if langcode: varmap["langcode"] = langcode varhead="%" hdr = self.header if title: title = expand_vars(title, varmap, varhead) - hdr.title[:] = [unicode(title)] + hdr.title[:] = [str(title)] elif title == "": hdr.title[:] = [] if copyright: copyright = expand_vars(copyright, varmap, varhead) - hdr.copyright = unicode(copyright) + hdr.copyright = str(copyright) elif copyright == "": hdr.copyright = None if license: license = expand_vars(license, varmap, varhead) - hdr.license = unicode(license) + hdr.license = str(license) elif license == "": hdr.license = None if project: - hdr.set_field(u"Project-Id-Version", unicode(project)) + hdr.set_field("Project-Id-Version", str(project)) elif project == "": - hdr.remove_field(u"Project-Id-Version") + hdr.remove_field("Project-Id-Version") - hdr.set_field(u"PO-Revision-Date", format_datetime()) + hdr.set_field("PO-Revision-Date", format_datetime()) if name or email: if name and email: tr_ident = "%s <%s>" % (name, email) elif name: tr_ident = "%s" % name else: tr_ident = "<%s>" % email # Remove author placeholder. for i in range(len(hdr.author)): - if u"FIRST AUTHOR" in hdr.author[i]: + if "FIRST AUTHOR" in hdr.author[i]: hdr.author.pop(i) break # Look for current author in the comments, # to update only years if present. cyear = time.strftime("%Y") - acfmt = u"%s, %s." + acfmt = "%s, %s." new_author = True for i in range(len(hdr.author)): if tr_ident in hdr.author[i]: # Parse the current list of years. 
years = re.findall(r"\b(\d{2,4})\s*[,.]", hdr.author[i]) if cyear not in years: years.append(cyear) years.sort() hdr.author[i] = acfmt % (tr_ident, ", ".join(years)) new_author = False break if new_author: hdr.author.append(acfmt % (tr_ident, cyear)) - hdr.set_field(u"Last-Translator", unicode(tr_ident)) + hdr.set_field("Last-Translator", str(tr_ident)) elif name == "" or email == "": - hdr.remove_field(u"Last-Translator") + hdr.remove_field("Last-Translator") if langname: tm_ident = None if langname and teamemail: tm_ident = "%s <%s>" % (langname, teamemail) elif langname: tm_ident = langname - hdr.set_field(u"Language-Team", unicode(tm_ident)) + hdr.set_field("Language-Team", str(tm_ident)) elif langname == "": - hdr.remove_field(u"Language-Team") + hdr.remove_field("Language-Team") if langcode: - hdr.set_field(u"Language", unicode(langcode), after="Language-Team") + hdr.set_field("Language", str(langcode), after="Language-Team") elif langcode == "": - hdr.remove_field(u"Language") + hdr.remove_field("Language") if encoding: - ctval = u"text/plain; charset=%s" % encoding - hdr.set_field(u"Content-Type", ctval) + ctval = "text/plain; charset=%s" % encoding + hdr.set_field("Content-Type", ctval) elif encoding == "": - hdr.remove_field(u"Content-Type") + hdr.remove_field("Content-Type") if ctenc: - hdr.set_field(u"Content-Transfer-Encoding", unicode(ctenc)) + hdr.set_field("Content-Transfer-Encoding", str(ctenc)) elif ctenc == "": - hdr.remove_field(u"Content-Transfer-Encoding") + hdr.remove_field("Content-Transfer-Encoding") if plforms: - hdr.set_field(u"Plural-Forms", unicode(plforms)) + hdr.set_field("Plural-Forms", str(plforms)) elif plforms == "": - hdr.remove_field(u"Plural-Forms") + hdr.remove_field("Plural-Forms") if poeditor: - hdr.set_field(u"X-Generator", unicode(poeditor)) + hdr.set_field("X-Generator", str(poeditor)) elif poeditor == "": - hdr.remove_field(u"X-Generator") + hdr.remove_field("X-Generator") return hdr def detect_renamed_sources (self, cat, minshare=0.7): """ Heuristically determine possible renamings of source files from this catalog based on source files in the other catalog. To determine the possibility that the source file A from this catalog has been renamed into source file B in the other catalog C{cat}, primarily the share of common messages to A and B is considered. The minimum needed commonality can be given by C{minshare} parameter. When a source file from this catalog is directly mentioned in the other catalog, it is immediatelly considered to have no possible renamings. The return value is a dictionary in which the key is the source file and the value is the list of its possible renamed counterparts. The renaming list is never empty, i.e. if no renamings were detected for a given source file, that source file will not be present in the dictionary. The dictionary is fully symmetric: if source file B is in the renaming list of file A, then there will be an entry for file B with A in its renaming list (even when B is comming from the other catalog). Instead of a single other catalog to test against, a sequence of several other catalogs can be given. @param cat: catalog against which to test for renamings @type cat: Catalog or [Catalog*] @param minshare: the minimum commonality between two source files to consider them as possible renaming pair (0.0-1.0) @type minshare: float @returns: the renaming dictionary @rtype: {string: [string*]*} """ renamings = {} # Collect all own sources, to avoid matching for them. 
ownfs = set() for msg in self._messages: for src, lno in msg.source: ownfs.add(src) if isinstance(cat, Catalog): cats = [cat] else: cats = cat for ocat in cats: if self is ocat: continue fcnts = {} ccnts = {} for msg in self._messages: omsg = ocat.get(msg) if omsg is None: continue for src, lno in msg.source: if src not in fcnts: fcnts[src] = 0.0 ccnts[src] = {} # Weigh each message disproportionally to the number of # files it appears in (i.e. the sum of counts == 1). fcnts[src] += 1.0 / len(msg.source) counted = {} for osrc, olno in omsg.source: if osrc not in ownfs and osrc not in counted: if osrc not in ccnts[src]: ccnts[src][osrc] = 0.0 ccnts[src][osrc] += 1.0 / len(omsg.source) counted[osrc] = True # Select match groups. fuzzies = {} - for src, fcnt in fcnts.iteritems(): + for src, fcnt in fcnts.items(): shares = [] - for osrc, ccnt in ccnts[src].iteritems(): + for osrc, ccnt in ccnts[src].items(): share = ccnt / (fcnt + 1.0) # tip a bit to avoid fcnt of 0.x if share >= minshare: shares.append((osrc, share)) if shares: shares.sort(key=lambda x: x[1]) # not necessary atm fuzzies[src] = [f for f, s in shares] # Update the dictionary of renamings. - for src, fuzzsrcs in fuzzies.iteritems(): + for src, fuzzsrcs in fuzzies.items(): group = [src] + fuzzsrcs for src in group: if src not in renamings: renamings[src] = [] for osrc in group: if src != osrc and osrc not in renamings[src]: renamings[src].append(osrc) if not renamings[src]: renamings.pop(src) return renamings diff --git a/pology/colors.py b/pology/colors.py index de1d54fa..1c846a4a 100644 --- a/pology/colors.py +++ b/pology/colors.py @@ -1,492 +1,492 @@ # -*- coding: UTF-8 -*- """ Standard codes for terminal colors. @author: Chusslove Illich @author: Sébastien Renard @license: GPLv3 """ from optparse import OptionParser import re import sys # NOTE: Must not import anything from Pology, as top __init__ includes this. _xml_entities = { - u"lt": u"<", - u"gt": u">", - u"apos": u"'", - u"quot": u"\"", - u"amp": u"&", + "lt": "<", + "gt": ">", + "apos": "'", + "quot": "\"", + "amp": "&", } def _resolve_xml_ents (text): segs = [] p = 0 while True: p1 = p p = text.find("&", p1) if p < 0: segs.append(text[p1:]) break segs.append(text[p1:p]) p2 = p p = text.find(";", p2) if p < 0: segs.append(text[p2:]) break ent = text[p2 + 1:p] val = _xml_entities.get(ent) if val is None: segs.append(text[p2]) p = p2 + 1 else: segs.append(val) p += 1 rtext = "".join(segs) return rtext def _escape_xml_ents (text): rtext = text.replace("&", "&") - for ent, val in _xml_entities.items(): + for ent, val in list(_xml_entities.items()): if val != "&": rtext = rtext.replace(val, "&" + ent + ";") return rtext -class ColorString (unicode): +class ColorString (str): """ Class for strings with color markup. This class provides automatic resolution of color XML markup in strings for various output formats. It automatically escapes any raw strings combined with it (e.g. when using the C{%} or C{+} operators) and returns objects of its own type from methods (e.g. from C{split()} or C{strip()}). Otherwise it should behave like a normal string. Note that usage of this class is expensive, given that arguments are constantly checked and strings escaped. It should be used only for user-visible output, i.e. where human reaction time is the limiting factor. 
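A minimal illustration (hypothetical markup, not from the test suite)::

    >>> cs = ColorString("<green>%s</green>") % "done"
    >>> cs.resolve("none")
    'done'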
""" def _escape (self, v): - if isinstance(v, basestring) and not isinstance(v, ColorString): - v = unicode(v) + if isinstance(v, str) and not isinstance(v, ColorString): + v = str(v) v = _escape_xml_ents(v) return v def __add__ (self, other): - return ColorString(unicode.__add__(self, self._escape(other))) + return ColorString(str.__add__(self, self._escape(other))) def __radd__ (self, other): - return ColorString(unicode.__add__(self._escape(other), self)) + return ColorString(str.__add__(self._escape(other), self)) def __mod__ (self, args): if isinstance(args, dict): - rargs = dict((k, self._escape(v)) for k, v in args.items()) + rargs = dict((k, self._escape(v)) for k, v in list(args.items())) elif isinstance(args, tuple): rargs = tuple(self._escape(v) for v in args) else: rargs = self._escape(args) - return ColorString(unicode.__mod__(self, rargs)) + return ColorString(str.__mod__(self, rargs)) def __repr__ (self): - return "%s(%s)" % (self.__class__.__name__, unicode.__repr__(self)) + return "%s(%s)" % (self.__class__.__name__, str.__repr__(self)) def join (self, strings): rstrings = [self._escape(s) for s in strings] - return ColorString(unicode.join(self, rstrings)) + return ColorString(str.join(self, rstrings)) def resolve (self, ctype=None, dest=None): """ Resolve color markup according to given type and destination. Currently available coloring types (values of C{ctype} parameter): - C{"none"}: no coloring - C{"term"}: ANSI color escape sequences (for terminal output) - C{"html"}: HTML markup (for integration into HTML pages) If C{ctype} is C{None}, it is taken from global coloring options. Some coloring types may be applied conditionally, based on whether the intended output destination is a file or terminal. If this is desired, the file descriptor of the destination can be given by the C{dest} parameter. @param ctype: type of coloring @type ctype: string @param dest: destination file descriptor @type dest: file @returns: plain string with resolved markup @rtype: string """ # Resolve coloring type, considering all things. if ctype is None: ctype = _cglobals.ctype if ctype in (None, "term"): if not _cglobals.outdep or (dest and dest.isatty()): ctype = "term" else: ctype = "none" color_pack = _color_packs.get(ctype) if color_pack is None: color_pack = _color_packs.get("none") colorf, escapef, finalf = color_pack - text = unicode(self) + text = str(self) rtext, epos = self._resolve_markup_w(text, len(text), 0, None, None, colorf, escapef) rtext = finalf(rtext) return rtext def _resolve_markup_w (self, text, tlen, pos, tag, ptag, colorf, escapef): rsegs = [] p = pos valid = True closed = False while p < tlen: pp = p p = text.find("<", p) if p < 0: p = tlen seg = text[pp:p] rsegs.append(escapef(_resolve_xml_ents(seg))) if p == tlen: break pp = p stag, closed, p = self._parse_tag(text, tlen, p) if stag is not None: if not closed: rseg, p = self._resolve_markup_w(text, tlen, p, stag, tag, colorf, escapef) rsegs.append(rseg) else: if tag != stag: # Wrong closed tag, declare this span not valid # and reposition at the tag start. valid = False p = pp break else: # Not a proper tag start, just take literal < and go on. rsegs.append("<") p = pp + 1 if tag and not closed: valid = False rtext = "".join(rsegs) if tag: if valid: rtext = colorf(tag, rtext, ptag) else: # Not proper span, put back opening tag. rtext = "<%s>%s" % (tag, rtext) return rtext, p def _parse_tag (self, text, tlen, pos): # FIXME: No possibility of attributes at the moment. 
if tlen is None: tlen = len(text) p = pos tag = None closed = False if p < tlen and text[p] == "<": p += 1 while p < tlen and text[p].isspace(): p += 1 if p < tlen: if text[p] == "/": p += 1 closed = True pp = p p = text.find(">", p) if p < 0: p = tlen else: tag = text[pp:p].strip() p += 1 return tag, closed, p def visual_segment (self, pos): """ Get visual representation of raw segment starting from position. This function checks whether the segment of the string starting at given position has the raw or another visual value, accounting for markup. If the visual and raw segments differ, the visual representation and length of the raw segment are returned. Otherwise, empty string and zero length are returned. @param pos: position where to check for visual segment @type pos: int @returns: visual segment and length of underlying raw segment @rtype: string, int """ vis, rlen = "", 0 c = self[pos:pos + 1] if c == "<": pos2 = self.find(">", pos) if pos2 > 0: - vis, rlen = u"", pos2 + 1 - pos + vis, rlen = "", pos2 + 1 - pos elif c == "&": pos2 = self.find(";", pos) if pos2 > 0: ent = self[pos + 1:pos2] val = _xml_entities.get(ent) if val is not None: vis, rlen = val, pos2 + 1 - pos return vis, rlen def _fill_color_string_class (): def wrap_return_type (method): def wmethod (self, *args, **kwargs): res = method(self, *args, **kwargs) - if isinstance(res, basestring): + if isinstance(res, str): res = ColorString(res) elif isinstance(res, (tuple, list)): res2 = [] for el in res: - if isinstance(el, basestring): + if isinstance(el, str): el = ColorString(el) res2.append(el) res = type(res)(res2) return res return wmethod for attrname in ( "__getitem__", "__getslice__", "__mul__", "__rmul__", "capitalize", "center", "expandtabs", "ljust", "lower", "lstrip", "replace", "rjust", "rsplit", "rstrip", "split", "strip", "swapcase", "title", "translate", "upper", "zfill", ): - method = getattr(unicode, attrname) + method = getattr(str, attrname) setattr(ColorString, attrname, wrap_return_type(method)) _fill_color_string_class() -def cjoin (strings, joiner=u""): +def cjoin (strings, joiner=""): """ Join strings into a L{ColorString} if any of them are L{ColorString}, otherwise into type of joiner. @param strings: strings to join @type strings: sequence of strings @param joiner: string to be inserted between each two strings @type joiner: string @returns: concatenation by joiner of all strings @rtype: type(joiner)/L{ColorString} """ if not isinstance(joiner, ColorString): for s in strings: if isinstance(s, ColorString): joiner = ColorString(joiner) break return joiner.join(strings) def cinterp (format, *args, **kwargs): """ Interpolate arguments into the format string, producing L{ColorString} if any of the arguments is L{ColorString}, otherwise type of format string. The format string can use either positional format directives, in which case positional arguments are supplied after it, or it can use named format directives, in which case keyword arguments are supplied after it. If both positional and keyword arguments are following the format string, the behavior is undefined. 
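Example (illustrative)::

    >>> cinterp("%(n)d warnings in %(file)s", n=3, file="foo.po")
    '3 warnings in foo.po'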
@param format: string with formatting directives @type format: string @returns: interpolated strings @rtype: type(format)/L{ColorString} """ iargs = args or kwargs if not isinstance(format, ColorString): - for v in (iargs.values() if isinstance(iargs, dict) else iargs): + for v in (list(iargs.values()) if isinstance(iargs, dict) else iargs): if isinstance(v, ColorString): format = ColorString(format) break return format % iargs class ColorOptionParser (OptionParser): """ Lightweight wrapper for C{OptionParser} from standard library C{optparse}, to gracefully handle L{ColorString} arguments supplied to its methods. """ def _cv (self, val): if isinstance(val, ColorString): val = val.resolve("term", sys.stdout) elif isinstance(val, (list, tuple)): - val = map(self._cv, val) + val = list(map(self._cv, val)) elif isinstance(val, dict): - val = dict((k, self._cv(v)) for k, v in val.items()) + val = dict((k, self._cv(v)) for k, v in list(val.items())) return val def __init__ (self, *args, **kwargs): OptionParser.__init__(self, *self._cv(args), **self._cv(kwargs)) def add_option (self, *args, **kwargs): OptionParser.add_option(self, *self._cv(args), **self._cv(kwargs)) # FIXME: More overrides. def get_coloring_types (): """ List of keywords of all available coloring types. """ - return _color_packs.keys() + return list(_color_packs.keys()) def set_coloring_globals (ctype="term", outdep=True): """ Set global options for coloring. L{ColorString.resolve} will use the type of coloring given by C{ctype} here whenever its own C{ctype} is set to C{None}. If C{outdep} is set to C{False}, L{ColorString.resolve} will not check the file descriptor given to it, and will always use the coloring type according to C{ctype}. @param ctype: type of coloring @type ctype: string @param outdep: whether coloring depends on output file descriptor @type outdep: bool """ _cglobals.outdep = outdep _cglobals.ctype = ctype class _Data: pass _cglobals = _Data() set_coloring_globals() # ======================================================================== _color_packs = {} # ---------------------------------------- # No coloring, all markup elements are just removed. _color_packs["none"] = (lambda c, s, p: s, lambda s: s, lambda r: r) # ---------------------------------------- # ANSI terminal coloring. _term_head = "\033[" _term_reset = "0;0m" _term_colors = { "bold": "01m", "underline": "04m", "black": "30m", "red": "31m", "green": "32m", "orange": "33m", "blue": "34m", "purple": "35m", "cyan": "36m", "grey": "37m", } def _color_term (col, seg, pcol): eseq = _term_colors.get(col) if eseq is not None: # If this segment is within another colored section, # repeat the outer color sequence at end, otherwise reset. # If outer and current colors match, do nothing. eseq2 = _term_reset peseq = _term_colors.get(pcol) if peseq: eseq2 = _term_head + peseq if eseq != eseq2: seg = _term_head + eseq + seg + _term_head + eseq2 return seg _color_packs["term"] = (_color_term, lambda s: s, lambda r: r) # ---------------------------------------- # HTML coloring. _html_colors = { "black": "#000000", "red": "#ff0000", "green": "#228b22", "orange": "#ff8040", "blue": "#0000ff", "purple": "#ff0080", "cyan": "#52f3ff", "grey": "#808080", } def _color_term (col, seg, pcol): if col == "bold": seg = "<b>%s</b>" % seg elif col == "underline": seg = "<u>%s</u>" % seg else: rgb = _html_colors.get(col) if rgb is not None: seg = "<font color='%s'>%s</font>" % (rgb, seg) return seg def _finalize_html (line): return line.replace("\n", "<br/>\n") + "<br/>
" _color_packs["html"] = (_color_term, _escape_xml_ents, _finalize_html) diff --git a/pology/config.py b/pology/config.py index 6fb4dca8..7cfe67f0 100644 --- a/pology/config.py +++ b/pology/config.py @@ -1,277 +1,277 @@ # -*- coding: UTF-8 -*- """ Access the user configuration of Pology. The location and the syntax of the configuration file is described in the user manual, at C{doc/user/common.docbook#sec-cmconfig}. At every place where the user configuration is sourced, the API documentation should state which sections and fields (with their types) are accessed, how they are used, and what is the behavior when they are not set. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import codecs import os -from ConfigParser import SafeConfigParser +from configparser import SafeConfigParser from pology import _, n_ from pology.report import error _config = SafeConfigParser() def _parse_config (): # Try to correctly resolve the config file location across systems. cfgbase = "pologyrc" if os.name=="nt": cfgpath = os.path.join(os.environ.get("APPDATA", ""), cfgbase) else: cfgbase = "." + cfgbase cfgpath = os.path.join(os.environ.get("HOME", ""), cfgbase) # Parse the config if available. if os.path.isfile(cfgpath): ifl = codecs.open(cfgpath, "r", "UTF-8") _config.readfp(ifl) ifl.close() # Parse configuration on first import. _parse_config() def has_section (name): """ Check if the section of the configuration exists already. @param name: name of the section @type name: string @returns: C{True} if the section exists, C{False} otherwise @rtype: bool """ return _config.has_section(name) class section: """ Section of the configuration. All getter methods take the field name and the default value, which is returned when the field is not set. If the configuration field is set but cannot be converted into a value of requested type, execution aborts with an error message. @ivar name: name of the section @type name: string """ def __init__ (self, name): """ Retrieve a section of the configuration. Constructed section object is valid even when the configuration does not contain the requested section (in that case, all field queries on the section are going to return default values). @param name: name of the section @type name: string """ self.name = name def fields (self): """ Get all configuration field names in this section. @rtype: set(string) """ if not _config.has_section(self.name): return set() return set(_config.options(self.name)) def _value (self, typ, name, default=None, typename=None): if not _config.has_option(self.name, name): return default value = _config.get(self.name, name) if typ is not bool: try: cvalue = typ(value) except: cvalue = None else: cvalue = strbool(value) if cvalue is None: if typename: error(_("@info", "User configuration: value '%(val)s' " "of field '%(field)s' in section '%(sec)s' " "cannot be converted into '%(type)s' type.", val=value, field=name, sec=self.name, type=typename)) else: error(_("@info", "User configuration: value '%(val)s' " "of field '%(field)s' in section '%(sec)s' " "cannot be converted into requested type.", val=value, field=name, sec=self.name)) return cvalue def string (self, name, default=None): """ Get a configuration field as a string. @rtype: unicode or as C{default} """ - return self._value(unicode, name, default, "string") + return self._value(str, name, default, "string") def integer (self, name, default=None): """ Get a configuration field as an integer number. 
@rtype: int or as C{default} """ return self._value(int, name, default, "integer") def real (self, name, default=None): """ Get a configuration field as a real number. @rtype: float or as C{default} """ return self._value(float, name, default, "real") def boolean (self, name, default=None): """ Get a configuration field as a boolean. @rtype: bool or as C{default} """ return self._value(bool, name, default, "boolean") def strslist (self, name, default=None, sep=","): """ Get a configuration field as a list of separated strings. Separator character or string is used to split the field value into substrings:: afield = foo, bar, baz Leading and trailing whitespace in list elements is stripped. If list elements should be able to contain any characters or whitespace is significant, use delimited list instead (L{strdlist}). @param sep: the separator @type sep: string @rtype: unicode or as C{default} """ - value = self._value(unicode, name, None, "string") + value = self._value(str, name, None, "string") if value is None: return default lst = value.split(sep) lst = [x.strip() for x in lst] return lst def strdlist (self, name, default=None): """ Get a configuration field as a list of delimited strings. Delimiter is taken to be the non-alphanumeric character with which the field value starts. In this example:: afield = /foo/bar/baz/ the delimiter is C{/}. If the field value does not start with a non-alphanumeric, or it does not end with the delimiter, error is signalled. @rtype: unicode or as C{default} """ - value = self._value(unicode, name, None, "string") + value = self._value(str, name, None, "string") if value is None: return default value = value.strip() if len(value) < 2: error(_("@info", "User configuration: value '%(val)s' of field '%(field)s' " "in section '%(sec)s' is too short for a delimited list.", val=value, field=name, sec=self.name)) if value[0].isalnum(): error(_("@info", "User configuration: value '%(val)s' of field '%(field)s' " "in section '%(sec)s' does not start with " "a non-alphanumeric delimiter character.", val=value, field=name, sec=self.name)) delim = value[0] if value[-1] != delim: error(_("@info", "User configuration: value '%(val)s' of field '%(field)s' " "in section '%(sec)s' does not end with " "the delimiter character with which it starts.", val=value, field=name, sec=self.name)) lst = value[1:-1].split(delim) return lst def strbool (value): """ Parse the string specification of a boolean value. Values considered C{false} are: C{"0"}, C{"no"}, C{"false"}, C{"off"}; and C{True}: C{1}, C{"yes"}, C{"true"}, C{"on"}. String is stripped of leading and trailing whitespace and lowercased before matching. If the string matches none of the expected logical specifiers, C{None} is returned. @param value: string to parse @type value: string @return: parsed boolean @rtype: bool """ value = value.strip().lower() if value in ("0", "no", "false", "off"): return False elif value in ("1", "yes", "true", "on"): return True else: return None diff --git a/pology/diff.py b/pology/diff.py index 65664b5f..69899db1 100644 --- a/pology/diff.py +++ b/pology/diff.py @@ -1,1810 +1,1810 @@ # -*- coding: UTF-8 -* """ Produce special diffs between strings and other interesting objects. 
@author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ from difflib import SequenceMatcher import random import re from pology import PologyError, _, n_ from pology.colors import ColorString, cjoin from pology.message import MessageUnsafe from pology.report import error from pology.split import split_text _new_tag = "+" _new_vtag = "+" _new_opnc = "{" _new_clsc = "}" _old_tag = "-" _old_vtag = "-" _old_opnc = "{" _old_clsc = "}" _equ_tag = " " _tagext_none = "~" _tagext_none_len = len(_tagext_none) _new_opn = _new_opnc + _new_vtag _new_cls = _new_vtag + _new_clsc _old_opn = _old_opnc + _old_vtag _old_cls = _old_vtag + _old_clsc _all_wrappers = set((_new_opn, _new_cls, _old_opn, _old_cls)) _tmp_wr = (_new_vtag, _new_opnc, _new_clsc, _old_vtag, _old_opnc, _old_clsc) -_tmp_wrlen = map(len, _tmp_wr) +_tmp_wrlen = list(map(len, _tmp_wr)) if max(_tmp_wrlen) != 1 or min(_tmp_wrlen) != 1: error(_("@info \"ediff\" is shorthand for \"embedded difference\"", "All ediff wrapper elements must be of unit length.")) class _Sequence_diff_wrapper: def __init__ (self, obj, reductf=None): self.obj = obj self._robj = (reductf or (lambda x: x))(obj) def __hash__ (self): return hash(self._robj) def __iter__ (self): return iter(self._robj) def __eq__ (self, other): return self._robj == other._robj def tdiff (seq_old, seq_new, reductf=None, diffr=False): """ Create tagged difference of two sequences. Difference is presented as a list of tuples, with each tuple composed of a difference tag and a sequence element. Difference tag is string C{"+"}, C{"-"}, or C{" "}, for elements which belong to the old, the new, or to both sequences, respectively. The list is ordered such that collecting all elements not tagged as old will reconstruct the new sequence, and collecting all not tagged as new will reconstruct the old sequence. If requested by the C{diffr} parameter, also reported is the I{difference ratio}, a heuristic measure of difference between two texts. 0.0 means no difference, and 1.0 that sequences are completely different. Examples:: >>> s1 = "A type of foo".split() >>> s2 = "A kind of foo".split() >>> tdiff(s1, s2) [(' ', 'A'), ('-', 'type'), ('+', 'kind'), (' ', 'of'), (' ', 'foo')] >>> tdiff(s1, s2, diffr=True) ([(' ', 'A'), ('-', 'type'), ('+', 'kind'), (' ', 'of'), (' ', 'foo')], 0.25) To be able to diff them, sequence elements only need to be hashable. However, for compound elements it may be better to diff them only by some subset of data, e.g. by one of their string attributes. Parameter C{reductf} can be used to specify a reduction function, which will be called on each element to produce its diffing representative. @param seq_old: sequence to diff from @type seq_old: sequence with hashable elements @param seq_new: sequence to diff to @type seq_new: sequence with hashable elements @param reductf: function to produce diffing representatives @type reductf: (sequence element) -> diffing representative @param diffr: whether to report difference ratio @type diffr: bool @returns: difference list and possibly difference ratio @rtype: [(string, element)...] 
or ([(string, element)...], float) """ if reductf is not None: seq_old = [_Sequence_diff_wrapper(x, reductf) for x in seq_old] seq_new = [_Sequence_diff_wrapper(x, reductf) for x in seq_new] dlist = [] seqmatch = SequenceMatcher(None, seq_old, seq_new) opcodes = seqmatch.get_opcodes() if diffr: dr = 1.0 - seqmatch.ratio() for opcode, i1, i2, j1, j2 in opcodes: if opcode == "equal": dlist.extend([(_equ_tag, el) for el in seq_old[i1:i2]]) elif opcode == "replace": dlist.extend([(_old_tag, el) for el in seq_old[i1:i2]]) dlist.extend([(_new_tag, el) for el in seq_new[j1:j2]]) elif opcode == "delete": dlist.extend([(_old_tag, el) for el in seq_old[i1:i2]]) elif opcode == "insert": dlist.extend([(_new_tag, el) for el in seq_new[j1:j2]]) else: raise PologyError( _("@info \"opcode\" is shorthand for \"operation code\"", "Unknown opcode '%(code)s' from sequence matcher.", code=opcode)) if reductf is not None: dlist = [(tag, el.obj) for tag, el in dlist] return diffr and (dlist, dr) or dlist def itdiff (seq_old, seq_new, reductf=None, cutoff=0.6, diffr=False): """ Create interleaved tagged difference of two sequences. Similar to L{tdiff}, except that blocks of added/removed elements are further heuristically interleaved by similarity, such that each removed element may be followed by a similar added element, if such has been determined. This is useful e.g. to be able to afterwards make inner difference of each two paired similar elements (e.g. word diff within line diff). Example:: >>> s1 = "Two blue airplanes".split() >>> s2 = "Two bluish ships".split() >>> tdiff(s1, s2) [(' ', 'Two'), ('-', 'blue'), ('-', 'airplanes'), ('+', 'bluish'), ('+', 'ships')] >>> itdiff(s1, s2) [(' ', 'Two'), ('-', 'blue'), ('+', 'bluish'), ('-', 'airplanes'), ('+', 'ships')] To be able to interleave blocks, each element in turn must be a sequence in its own. This means that function supplied by C{reductf}, otherwise of same semantics as in L{tdiff}, here must also produce a sequence as diffing representative (e.g. a string). Parameter C{cutoff} states the minimal similarity between two elements needed for them to be considered similar at all. @param seq_old: sequence to diff from @type seq_old: sequence with hashable elements @param seq_new: sequence to diff to @type seq_new: sequence with hashable elements @param reductf: function to produce diffing representatives @type reductf: (sequence element) -> representative sequence @param cutoff: minimal similarity to consider elements similar @type cutoff: float [0, 1] @param diffr: whether to report difference ratio @type diffr: bool @returns: interleaved difference list and possibly difference ratio @rtype: [(string, element)...] 
or ([(string, element)...], float) """ dres = tdiff(seq_old, seq_new, reductf=reductf, diffr=diffr) if diffr: dlist, dr = dres else: dlist = dres lendl = len(dlist) idlist = [] i = 0 while True: while i < lendl and dlist[i][0] == _equ_tag: idlist.append(dlist[i]) i += 1 if i >= lendl: break els_old = [] els_new = [] while i < lendl and dlist[i][0] != _equ_tag: if dlist[i][0] == _old_tag: els_old.append(dlist[i][1]) else: els_new.append(dlist[i][1]) i += 1 if els_old and els_new: idlist.extend(_dinterleave(els_old, els_new, reductf, cutoff)) else: idlist.extend([(_old_tag, x) for x in els_old]) idlist.extend([(_new_tag, x) for x in els_new]) return diffr and (idlist, dr) or idlist def _dinterleave (els_old, els_new, reductf, cutoff): reductf = reductf or (lambda x: x) #plf = _plinds_full # too expensive plf = _plinds_cont pls_old = plf(len(els_old), len(els_old) + len(els_new), 0) pls_new = plf(len(els_new), len(els_old) + len(els_new), 0) pls_old.reverse() # so that last old-new pair is top-bottom maxsim = 0.0 opt_pairs = (pls_old[-1], pls_new[-1]) i = 0 for pl_old in pls_old: for pl_new in pls_new: i += 1 sim = 0.0 - pairs = zip(pl_old, pl_new) + pairs = list(zip(pl_old, pl_new)) for i_old, i_new in pairs: if i_old is None or i_new is None: continue seq_old = reductf(els_old[i_old]) seq_new = reductf(els_new[i_new]) r = SequenceMatcher(None, seq_old, seq_new).ratio() if r < cutoff: r = 0.0 sim += r if sim >= maxsim: # >= so that last equal wins maxsim = sim opt_pairs = pairs dlist = [] for i_old, i_new in opt_pairs: if i_old is not None: dlist.append((_old_tag, els_old[i_old])) if i_new is not None: dlist.append((_new_tag, els_new[i_new])) return dlist def _plinds_full (ninds, nplaces, baseind): if nplaces < ninds: return [] if ninds <= 0: return [(None,) * nplaces] else: return ( [(baseind,) + x for x in _plinds_full(ninds - 1, nplaces - 1, baseind + 1)] + [(None,) + x for x in _plinds_full(ninds, nplaces - 1, baseind)]) def _plinds_cont (ninds, nplaces, baseind): pls = [] insinds = tuple(range(ninds)) for i in range(nplaces - ninds + 1): pls.append((None,) * i + insinds + (None,) * (nplaces - ninds - i)) return pls def word_diff (text_old, text_new, markup=False, format=None, diffr=False): """ Create word-level difference between old and new text. The difference is computed by looking at texts as collections of words and intersegments. Difference is presented as a list of tuples, with each tuple composed of a difference tag and a text segment. Difference tag is string C{"+"}, C{"-"}, or C{" "}, for text segments which are new, old, or present in both texts, respectively. If one of the texts is C{None}, as opposed to empty string, a tilde is appended to the base difference tag. The list is ordered such that joining all text segments not marked as old will reconstruct the new text, and joining all not marked as new will reconstruct the old text. If requested by the C{diffr} parameter, also reported is the I{difference ratio}, a heuristic measure of difference between two texts. 0.0 means no difference, and 1.0 that the texts are completely different. Differencing may take into account when the texts are expected to have XML-like markup, or when they are of certain format defined by Gettext. Examples:: >>> s1 = "A new type of foo." >>> s2 = "A new kind of foo." 
>>> word_diff(s1, s2) [(' ', 'A new '), ('+', 'kind'), ('-', 'type'), (' ', ' of foo.')] >>> word_diff(s1, s2, diffr=True) ([(' ', 'A new '), ('+', 'kind'), ('-', 'type'), (' ', ' of foo.')], 0.36363636363636365) >>> word_diff(s1, None, diffr=True) ([('-~', 'A new type of foo.')], 1.0) >>> word_diff(None, s2, diffr=True) ([('+~', 'A new kind of foo.')], 1.0) @param text_old: the old text @type text_old: string or None @param text_new: the new text @type text_new: string or None @param markup: whether C{<...>} markup can be expected in the texts @type markup: bool @param format: Gettext format flag (e.g. C{"c-format"}, etc.) @type format: string @param diffr: whether to report difference ratio @type diffr: bool @returns: difference list and possibly difference ratio @rtype: [(string, string)...] or ([(string, string)...], float) """ # Special cases, when one or both texts are None, or both are empty. specdlist = None if text_old is None and text_new is None: specdlist = [] specdr = 0.0 elif text_old is None: specdlist = [(_new_tag + _tagext_none, text_new)] specdr = 1.0 elif text_new is None: specdlist = [(_old_tag + _tagext_none, text_old)] specdr = 1.0 elif text_new == "" and text_old == "": specdlist = [(_equ_tag, "")] specdr = 0.0 if specdlist is not None: return diffr and (specdlist, specdr) or specdlist # Split text into segments: words and intersections, combined into # single lists for old and new text. Use words as is, but split # intersections further into single characters. segments = [] segment_isintr = [] def add_segment (intr, word): segments[-1].extend(list(intr) + [word]) segment_isintr[-1].extend([True] * len(intr) + [False]) for text in (text_old, text_new): lw, li = split_text(text, markup, format) segments.append([]) segment_isintr.append([]) - map(add_segment, li, lw + ['']) + list(map(add_segment, li, lw + [''])) # Create the tagged difference. dlist = tdiff(segments[0], segments[1]) # Recompute which elements of the difference are intersections. dlist_isintr = [] i_old = 0 i_new = 0 for tag, seg in dlist: if tag == _old_tag: dlist_isintr.append(segment_isintr[0][i_old]) else: dlist_isintr.append(segment_isintr[1][i_new]) if tag != _new_tag: i_old += 1 if tag != _old_tag: i_new += 1 # Reshuffle so that all old-new elements consecutive but for the # intersections are grouped into all old followed by all new, # with intersections included in both. ndlist = [] i = 0 while i < len(dlist): while i < len(dlist) and dlist[i][0] == _equ_tag: ndlist.append(dlist[i]) i += 1 seq_new = [] seq_old = [] i_first_diff = i i_last_diff = i while i < len(dlist) and (dlist[i][0] != _equ_tag or dlist_isintr[i]): if dlist[i][0] != _new_tag: seq_old.append(dlist[i][1]) if dlist[i][0] != _old_tag: seq_new.append(dlist[i][1]) if dlist[i][0] != _equ_tag: i_last_diff = i i += 1 for iex in range(i_last_diff, i - 1): seq_new.pop() seq_old.pop() i = i_last_diff + 1 if seq_old: ndlist.append((_old_tag, "".join(seq_old))) if seq_new: ndlist.append((_new_tag, "".join(seq_new))) dlist = ndlist # Join contiguous new/old/both segments, make tagged tuples. ndlist = [] - S_EQU, S_NEW, S_OLD = range(3) + S_EQU, S_NEW, S_OLD = list(range(3)) state = S_EQU cseg = [] len_equ, len_old, len_new = 0, 0, 0 _sen_tag = "."
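# The sentry entry below forces the state machine to flush its last accumulated segment.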
dlist.append((_sen_tag, "")) # sentry for tag, seg in dlist: if state == S_EQU and tag in (_new_tag, _old_tag, _sen_tag): if cseg: ndlist.append((_equ_tag, "".join(cseg))) cseg = [] if tag == _new_tag: state = S_NEW else: state = S_OLD elif state == S_OLD and tag in (_equ_tag, _new_tag, _sen_tag): if cseg: ndlist.append((_old_tag, "".join(cseg))) cseg = [] if tag == _equ_tag: state = S_EQU else: state = S_NEW elif state == S_NEW and tag in (_equ_tag, _old_tag, _sen_tag): if cseg: ndlist.append((_new_tag, "".join(cseg))) cseg = [] if tag == _equ_tag: state = S_EQU else: state = S_OLD if tag == _old_tag: len_old += len(seg) elif tag == _new_tag: len_new += len(seg) else: len_equ += len(seg) if seg: cseg.append(seg) dlist = ndlist len_all = len_new + len_old + len_equ if len_all > 0: diff_ratio = 1.0 - float(len_equ) / float(len_all) else: diff_ratio = 0.0 return diffr and (dlist, diff_ratio) or dlist def word_ediff (text_old, text_new, markup=False, format=None, colorize=False, diffr=False): """ Create word-level embedded difference between old and new texts. Same as L{word_diff}, but the difference is returned as text in which the new segments are wrapped as C{{+...+}}, and the old segments as C{{-...-}}. If a difference wrapper is already contained in the text, it will be escaped by inserting a tilde, e.g. C{"{+...+}"} -> C{"{~+...+~}"}. If even an escaped wrapper is contained in the text, another tilde is inserted, and so on. If one of the texts is C{None}, then the whole other text is wrapped as suitable difference, and a tilde added to its end to indicate that the other text was C{None}. If neither of the texts is C{None}, but after differencing the tilde appears in the end of embedded difference, it is escaped by another tilde. If both texts are C{None}, C{None} is returned as the difference. The C{colorize} parameter can be used to additionally highlight embedded difference by using color markup provided by L{ColorString}. If colorizing is enabled, the return value is a C{ColorString}. See L{word_diff} for description of other parameters. @param colorize: whether to colorize differences @type colorize: bool @returns: string with embedded differences and possibly difference ratio @rtype: string/ColorString/None or (string/ColorString/None, float) @see: L{word_diff} """ dlist, dr = word_diff(text_old, text_new, markup, format, diffr=True) if not dlist: return diffr and (None, 0.0) or None dtext = _assemble_ediff(dlist, colorize) return diffr and (dtext, dr) or dtext _capt_old_rx = re.compile( "\\" + _old_opnc + "\\" + _old_vtag + "(.*?)" \ + "\\" + _old_vtag + "\\" + _old_clsc, re.U|re.S) _capt_new_rx = re.compile( "\\" + _new_opnc + "\\" + _new_vtag + "(.*?)" \ + "\\" + _new_vtag + "\\" + _new_clsc, re.U|re.S) def word_ediff_to_old (dtext): """ Recover old version (-) from text with embedded differences. In case there was no old text, C{None} is returned. @param dtext: text with embedded differences @type dtext: string @returns: old version of the text @rtype: string or None @see: L{word_ediff} """ return _word_ediff_to_oldnew(dtext, _capt_old_rx, _capt_new_rx) def word_ediff_to_new (dtext): """ Recover new version (+) from text with embedded differences. In case there was no new text, C{None} is returned. 
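Example (illustrative)::

    >>> word_ediff_to_new("A new {-type-}{+kind+} of foo.")
    'A new kind of foo.'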
@param dtext: text with embedded differences @type dtext: string @returns: new version of the text @rtype: string or None @see: L{word_ediff} """ return _word_ediff_to_oldnew(dtext, _capt_new_rx, _capt_old_rx) def _word_ediff_to_oldnew (dtext, capt_this_rx, capt_other_rx): if dtext is None: return None if isinstance(dtext, ColorString): dtext = dtext.resolve("none") text = dtext text = capt_this_rx.sub(r"\1", text) text = capt_other_rx.sub(r"", text) text = _unescape_ewraps(text) if text.endswith(_tagext_none): text = text[:-_tagext_none_len] if not text and capt_other_rx.search(dtext): text = None return text def word_ediff_to_rem (dtext, sep=" "): """ Recover removed segments (-) from text with embedded differences. If separator is not C{None}, the joined string of selected segments is returned. Otherwise, the list of selected segments is returned. In either case, if there was no old text, C{None} is returned. @param dtext: text with embedded differences @type dtext: string @param sep: separator with which to join selected segments @type sep: string or None @returns: text with only the removed segments @rtype: string or list or None @see: L{word_ediff} """ return _word_ediff_to_addrem(dtext, _capt_old_rx, sep) def word_ediff_to_add (dtext, sep=" "): """ Recover added segments (+) from text with embedded differences. If separator is not C{None}, the joined string of selected segments is returned. Otherwise, the list of selected segments is returned. In either case, if there was no new text, C{None} is returned. @param dtext: text with embedded differences @type dtext: string @param sep: separator with which to join selected segments @type sep: string or None @returns: text with only the added segments @rtype: string or list or None @see: L{word_ediff} """ return _word_ediff_to_addrem(dtext, _capt_new_rx, sep) def _word_ediff_to_addrem (dtext, capt_this_rx, sep): if dtext is None: return None if isinstance(dtext, ColorString): dtext = dtext.resolve("none") segs = capt_this_rx.findall(dtext) if sep is not None: segs = sep.join(segs) if ( not segs and dtext.endswith((_old_clsc + _tagext_none, _new_clsc + _tagext_none)) ): segs = None return segs def line_diff (lines_old, lines_new, markup=False, format=None, diffr=False): """ Create word-level difference between old and new lines of text. First makes a difference on a line-level, and then for each set of differing lines a difference on word-level, using L{word_diff}. Difference is presented as a list of tuples of word diffs and ratios as constructed by L{word_diff}. See L{word_diff} for description of keyword parameters. The difference ratio is computed as line-length weighted average of word difference ratios per line. @param lines_old: old lines of text @type lines_old: string @param lines_new: new lines of text @type lines_new: string @returns: difference list and possibly difference ratios @rtype: [[(string, string)...]...] or ([([(string, string)...], float)...], float) """ # Create the difference. dlist = tdiff(lines_old, lines_new) # Reshuffle so that all consecutive old-new lines are grouped into # all old followed by all new. # For each old-new set, compute word-diffs and weigh diff-ratios. 
wdiffs = [] sumwdrs = 0.0 sumws = 0.0 i = 0 while i < len(dlist): while i < len(dlist) and dlist[i][0] == _equ_tag: seg = dlist[i][1] wdiffs.append(([(_equ_tag, seg)], 0.0)) w = len(seg) sumwdrs += 0.0 * w sumws += w i += 1 seq_new = [] seq_old = [] while i < len(dlist) and dlist[i][0] != _equ_tag: seg = dlist[i][1] if dlist[i][0] != _new_tag: seq_old.append(seg) if dlist[i][0] != _old_tag: seq_new.append(seg) i += 1 if seq_old and seq_new: # Decide which line to pair with which by minimal local diff ratio. # FIXME: Now it tries to place best first line, then second, etc. # For higher precision, test all combinations. lold = len(seq_old) lnew = len(seq_new) lmax = max(lold, lnew) lmin = min(lold, lnew) if lold <= lnew: s1, s2, tag2, rev = seq_old, seq_new, _new_tag, False else: s1, s2, tag2, rev = seq_new, seq_old, _old_tag, True i1 = 0 i2 = 0 while i1 < lmin: mindr = 1.1 mwdiff = [] mj2 = -1 for j2 in range(i2, lmax - lmin + i1 + 1): if not rev: t1, t2 = s1[i1], s2[j2] else: t1, t2 = s2[j2], s1[i1] wdiff, dr = word_diff(t1, t2, markup, format, diffr=True) if mindr > dr: mindr = dr mwdiff = wdiff mj2 = j2 for j2 in range(i2, mj2): wdiffs.append(([(tag2 + _tagext_none, s2[j2])], 1.0)) w = len(s2[j2]) sumwdrs += 1.0 * w sumws += w i2 = mj2 wdiffs.append((mwdiff, mindr)) w = len(s2[i2]) sumwdrs += mindr * w sumws += w i1 += 1 i2 += 1 for j2 in range(i2, lmax): wdiffs.append(([(tag2 + _tagext_none, s2[j2])], 1.0)) w = len(s2[j2]) sumwdrs += 1.0 * w sumws += w elif seq_old: wdiffs.extend([([(_old_tag + _tagext_none, x)], 1.0) for x in seq_old]) w = sum(map(len, seq_old)) sumwdrs += 1.0 * w sumws += w elif seq_new: wdiffs.extend([([(_new_tag + _tagext_none, x)], 1.0) for x in seq_new]) w = sum(map(len, seq_new)) sumwdrs += 1.0 * w sumws += w # Weighted-averaged diff-ratio. dr = sumws > 0.0 and sumwdrs / sumws or 0.0 return diffr and (wdiffs, dr) or [x[0] for x in wdiffs] def line_ediff (lines_old, lines_new, markup=False, format=None, colorize=False, diffr=False): """ Create word-level embedded difference between old and new lines of text. Same as L{line_diff}, but the difference is returned as list of tuples of line of text (in which the new segments are wrapped as C{{+...+}}, and the old segments as C{{-...-}}) and difference ratio for the line. See L{word_diff} and L{word_ediff} for description of keyword parameters. @returns: lines with embedded differences and possibly difference ratios @rtype: [string...] or ([(string, float)...], float) @see: L{line_diff} """ dlists, dr = line_diff(lines_old, lines_new, markup, format, diffr=True) dlines = [(_assemble_ediff(x[0], colorize), x[1]) for x in dlists] return diffr and (dlines, dr) or [x[0] for x in dlines] def line_ediff_to_old (dlines): """ Recover old version (-) from lines of text with embedded differences. @param dlines: lines of text with embedded differences @type dlines: list of strings @returns: old version of the lines @rtype: list of strings @see: L{line_ediff} """ return _line_ediff_to_oldnew(dlines, word_ediff_to_old) def line_ediff_to_new (dlines): """ Recover new version (+) from lines of text with embedded differences. 
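For instance (illustrative)::

    >>> line_ediff_to_new(["A {-fine-}{+fancy+} foo."])
    ['A fancy foo.']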
@param dlines: lines of text with embedded differences @type dlines: list of strings @returns: new version of the lines @rtype: list of strings @see: L{line_ediff} """ return _line_ediff_to_oldnew(dlines, word_ediff_to_new) def _line_ediff_to_oldnew (dlines, word_ediff_to_x): lines = [] for dline in dlines: line = word_ediff_to_x(dline) if line is not None: lines.append(line) return lines def _assemble_ediff (dlist, colorize): if not dlist: return None dtext = [] other_none = False for segtag, segtext in dlist: wext = "" if segtag.endswith(_tagext_none): # Can happen only if there is a single difference segment. segtag = segtag[:-_tagext_none_len] other_none = True segtext = _escape_ewraps(segtext) if segtag == _new_tag: d = _new_opn + segtext + _new_cls + wext if colorize: d = ColorString("%s") % d dtext.append(d) elif segtag == _old_tag: d = _old_opn + segtext + _old_cls + wext if colorize: d = ColorString("%s") % d dtext.append(d) else: dtext.append(segtext) haseqseg = True dtext = cjoin(dtext) if other_none: # Indicate the other string was none. dtext += _tagext_none elif dtext.endswith(_tagext_none): # Escape any trailing other-none markers. dtext += _tagext_none return dtext def _escape_ewraps (text): return _escunesc_ewraps(text, False) def _unescape_ewraps (text): return _escunesc_ewraps(text, True) _ediff_esc = _tagext_none _ediff_esc_len = len(_ediff_esc) def _escunesc_ewraps (text, unescape): for wstart, wend in ( (_old_opnc, _old_vtag), (_old_vtag, _old_clsc), (_new_opnc, _new_vtag), (_new_vtag, _new_clsc), ): segs = [] p = 0 tlen = len(text) lwstart = len(wstart) lwend = len(wend) while True: pp = p p = text.find(wstart, p) if p < 0: segs.append(text[pp:]) break segs.append(text[pp:p]) pp = p p += lwstart nesc = 0 while p < tlen and text[p:p + _ediff_esc_len] == _ediff_esc: p += _ediff_esc_len nesc += 1 if p == tlen or text[p:p + lwend] != wend or (unescape and nesc < 1): segs.append(text[pp:p]) else: if not unescape: segs.append(text[pp:p] + _ediff_esc + wend) else: segs.append(text[pp:p - _ediff_esc_len] + wend) p += lwend - text = u"".join(segs) + text = "".join(segs) return text def adapt_spans (otext, ftext, spans, merge=True): """ Adapt matched spans in filtered text to original text. Sometimes text gets filtered before being matched, and when a match is found in the filtered text, it needs to be reported relative to the original text. This function will heuristically adapt matched spans relative to the filtered text back to the original text. Spans are given as list of index tuples C{[(start1, end1), ...]} where start and end index have standard Python semantics (may be negative too). If C{merge} is C{True}, any spans that overlap or abut after adaptation will be merged into a single span, ordered by increasing start index, and empty spans removed; otherwise each adapted span will strictly correspond to the input span at that position. Span tuples may have more elements past the start and end indices. They will be ignored, but preserved; if merging is in effect, extra elements will be preserved for only the frontmost of the overlapping spans (undefined for which if there are several). If an input span is invalid in any way, it is carried over verbatim into result. 
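Example (illustrative; the filter here simply stripped the markup)::

    >>> adapt_spans("<b>foo</b> bar", "foo bar", [(4, 7)])
    [(11, 14)]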
@param otext: original text @type otext: string @param ftext: filtered text @type ftext: string @param spans: matched spans @type spans: list of index tuples @param merge: whether to merge overlapping spans @type merge: bool @returns: adapted spans @rtype: list of index tuples """ if not spans: return spans # Resolve negative spans. # Select out spans with invalid start or end. flen = len(ftext) fspans = [] invalid_spans = [] for span in spans: start, end = span[:2] valid = True if isinstance(start, int): if start < 0: start = flen + start else: valid = False if isinstance(end, int): if end < 0: end = flen + end else: valid = False if valid and start > end: valid = False if valid: fspans.append((start, end) + span[2:]) else: invalid_spans.append(span) # Create character-level difference from original to filtered text. dlist = tdiff(otext, ftext) # For each span, go through the difference and... do some magic. aspans = [] for fspan in fspans: aspan = [] for filtered_index, first in zip(fspan[:2], (True, False)): original_index = 0 original_index_atdiff = 0 track_index = 0 adapted_index = None stop_at_next_eq = False for dtag, dseg in dlist: slen = len(dseg) if dtag == _new_tag: track_index += slen elif dtag == _old_tag: original_index += slen else: original_index += slen track_index += slen original_index_atdiff = original_index if stop_at_next_eq: break if track_index >= filtered_index: exlen = track_index - filtered_index # 0 if char-level diff if dtag == _equ_tag: adapted_index = original_index - exlen break else: # dtag must be _new_tag if first: adapted_index = original_index_atdiff break else: stop_at_next_eq = True if stop_at_next_eq: adapted_index = original_index if adapted_index is None: break aspan.append(adapted_index) if adapted_index is not None: aspan.extend(fspan[2:]) aspans.append(tuple(aspan)) # Merge spans if requested. if merge: # Sort by start index immediately, for priority of extra elements. aspans.sort(key=lambda x: x[0]) maspans = [] while len(aspans) > 0: cstart, cend = aspans[0][:2] extras = aspans[0][2:] if cstart >= cend: aspans.pop(0) # remove empty spans continue i = 0 while i < len(aspans): start, end = aspans[i][:2] if cend >= start and cstart <= end: cstart = min(cstart, start) cend = max(cend, end) aspans.pop(i) else: i += 1 maspans.append((cstart, cend) + extras) # Sort by start index. maspans.sort(key=lambda x: x[0]) aspans = maspans # Put invalid spans back. aspans.extend(invalid_spans) return aspans -_dt_state, _dt_single, _dt_list = range(3) +_dt_state, _dt_single, _dt_list = list(range(3)) _msg_diff_parts = ( ("obsolete", _dt_state), ("fuzzy", _dt_state), ("manual_comment", _dt_list), ("msgctxt_previous", _dt_single), ("msgid_previous", _dt_single), ("msgid_plural_previous", _dt_single), ("msgctxt", _dt_single), ("msgid", _dt_single), ("msgid_plural", _dt_single), ("msgstr", _dt_list), ) _msg_dpart_types = dict(_msg_diff_parts) _msg_curr_fields = ( "msgctxt", "msgid", "msgid_plural", ) _msg_currprev_fields = [(x, x + "_previous") for x in _msg_curr_fields] def msg_diff (msg1, msg2, pfilter=None, addrem=None, diffr=False): """ Create word-level difference between extraction-invariant parts of messages. For which parts of a message are considered extraction-invariant, see description of L{inv} instance variable of message objects. There are two return modes, depending on the value of C{diffr} parameter. If C{diffr} is C{False}, the difference is returned as list of 3-tuples of differences by message part: (part name, part item, word difference). 
The part name can be used to fetch the part value from the message, using the L{get()} method of message objects. The part item is C{None} for singular message parts (e.g. C{msgid}), and an index for list parts (e.g. C{msgstr}). See L{word_diff} for the format of word-level difference. If C{diffr} is C{True}, then each part difference has a fourth element, the difference ratio; see L{word_diff} for its semantics. Additionally, the total difference ratio is computed, based on partial ones (also counting the zero difference of parts which were equal). The return value is now a 2-tuple of the list of part differences (as 4-tuples) and the total difference ratio. Either of the messages can be given as C{None}. In case only one of the messages is C{None}, the difference of the C{msgid} field will show that this field does not exist in the non-existent message (according to the format of non-existent counterparts of L{word_diff}). If both messages are C{None}, the difference is an empty list, as the messages are the same, even if non-existent. Every C{msgstr} field can be passed through a filter before differencing, using the C{pfilter} parameter. Instead of constructing the full difference, the C{addrem} parameter can be used to report only equal, added, or removed segments. The value of this parameter is a string, such that the first character selects the type of partial difference: one of ('=', 'e') for equal, ('+', 'a') for added, and ('-', 'r') for removed segments, and the rest of the string is used as the separator to join the selected segments (if the separator is empty, space is used instead). @param msg1: the message from which to make the difference @type msg1: L{Message_base} or None @param msg2: the message to which to make the difference @type msg2: L{Message_base} or None @param pfilter: filter to be applied to translation prior to differencing @type pfilter: callable @param addrem: report equal, added or removed segments instead of full difference, joined by what follows the selection character @type addrem: string @param diffr: whether to report difference ratio @type diffr: bool @return: difference list @rtype: [(string, int/None, [(string, string)...])...] or ([(string, int/None, [(string, string)...], float)...], float) """ # Create thoroughly empty dummy messages in place of null messages. mod_msgs = [] for msg in (msg1, msg2): if msg is None: msg = MessageUnsafe() msg.msgid = None msg.msgstr = [] mod_msgs.append(msg) msg1, msg2 = mod_msgs # For partial differencing, decide upon which part of diffs to take. ar_dtyp = None if addrem: mode = addrem[0] - ar_sep = unicode(addrem[1:] or " ") + ar_sep = str(addrem[1:] or " ") if mode in ("=", "e"): ar_dtyp = _equ_tag elif mode in ("+", "a"): ar_dtyp = _new_tag elif mode in ("-", "r"): ar_dtyp = _old_tag else: raise PologyError( _("@info", "Unknown selection mode '%(mode)s' for partial differencing.", mode=mode)) # Diff two texts under the given diffing options.
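# (Single fields go through word_diff and msgstr lists through line_diff; under addrem only the selected segment type is kept.)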
def _twdiff (text1, text2, islines=False, cpfilter=None): f_diff = islines and line_diff or word_diff if cpfilter: if not islines: text1 = cpfilter(text1) text2 = cpfilter(text2) else: text1 = [cpfilter(x) for x in text1] text2 = [cpfilter(x) for x in text2] format = (msg2 or msg1).format wdiff, dr = f_diff(text1, text2, markup=True, format=format, diffr=True) if addrem: if not islines: wdiff_part = None ar_segs = [x for t, x in wdiff if t == ar_dtyp] if text1 is not None or text2 is not None: wdiff_part = ar_sep.join(ar_segs) else: wdiff_part = [] for wdiff1, dr1 in wdiff: ar_segs = [x for t, x in wdiff1 if t == ar_dtyp] dr1 = 1.0 - dr1 if text1 or text2: wdiff_part += [(ar_sep.join(ar_segs), dr1)] wdiff = wdiff_part dr = 1.0 - dr return wdiff, dr # Create diffs of relevant parts. part_diffs = [] sumdr = 0.0 sumw = 0.0 # ...unless something cleverer comes up, weigh each part the same. for part, typ in _msg_diff_parts: if typ == _dt_single: val1 = msg1.get(part) val2 = msg2.get(part) wdiff, dr = _twdiff(val1, val2) part_diffs.append((part, None, wdiff, dr)) sumdr += dr * 1.0 sumw += 1.0 elif typ == _dt_list: lst1 = msg1.get(part) lst2 = msg2.get(part) cpf = part == "msgstr" and pfilter or None wdiffs, totdr = _twdiff(lst1, lst2, islines=True, cpfilter=cpf) item = 0 for wdiff, dr in wdiffs: part_diffs.append((part, item, wdiff, dr)) item += 1 sumdr += dr * 1.0 sumw += 1.0 elif typ == _dt_state: st1 = msg1.get(part) and part or "" st2 = msg2.get(part) and part or "" wdiff, dr = word_diff(st1, st2, diffr=True) part_diffs.append((part, None, wdiff, dr)) sumdr += dr * 1.0 sumw += 1.0 else: raise PologyError( _("@info", "Unhandled message part '%(part)s' encountered " "while differencing.", part=part)) if diffr: dr = sumw and sumdr / sumw or 0.0 return part_diffs, dr else: return [x[:3] for x in part_diffs] _dcmnt_field = "auto_comment" # to use manual_comment would be a bad idea -_dcmnt_head = u"ediff:" -_dcmnt_head_esc = u"~" # must be single character -_dcmnt_sep = u", " -_dcmnt_asep = u" " -_dcmnt_ind_state = u"state" -_dcmnt_ind_ctxtpad = u"ctxtpad" -_dcmnt_ind_infsep = u"infsep" +_dcmnt_head = "ediff:" +_dcmnt_head_esc = "~" # must be single character +_dcmnt_sep = ", " +_dcmnt_asep = " " +_dcmnt_ind_state = "state" +_dcmnt_ind_ctxtpad = "ctxtpad" +_dcmnt_ind_infsep = "infsep" _dcmnt_all_inds = ( # ordered _dcmnt_ind_state, _dcmnt_ind_ctxtpad, _dcmnt_ind_infsep, ) -_ctxtpad_sep = u"|" -_ctxtpad_noctxt = u"~" -_ctxtpad_alnums = u"abcdefghijklmnopqrstuvwxyz0123456789" -_infsep_blk = u"~=" +_ctxtpad_sep = "|" +_ctxtpad_noctxt = "~" +_ctxtpad_alnums = "abcdefghijklmnopqrstuvwxyz0123456789" +_infsep_blk = "~=" _infsep_minlen = 20 def msg_ediff (msg1, msg2, pfilter=None, addrem=None, emsg=None, ecat=None, eokpos=None, enoctxt=None, emptydc=False, colorize=False, diffr=False): """ Create word-level embedded difference between extraction-invariant parts of messages. Like L{msg_diff}, but instead of a difference list the result is a message with embedded differences, of the kind produced by L{word_ediff}. See L{msg_diff} for a description of the C{pfilter} and C{addrem} parameters, and L{word_ediff} for the format of embedded differences. Additionally, if C{pfilter} is given, C{msgstr} fields will be diffed both with and without the filter, and if the two diffs are not equal, both embeddings are going to be presented in the field, suitably visually separated. By default, a new message with embedded difference will be constructed, of the type of the first non-None of C{msg2} and C{msg1}.
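For a message whose translation changed, the result may look like (an illustrative sketch)::

    msgid "Foo"
    msgstr "{-Fu-}{+Fuu+}"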
Alternatively, the difference can be embedded into the message supplied by the C{emsg} parameter. If resulting messages with embedded differences are to be inserted into a catalog, that catalog can be given by the C{ecat} parameter. Then, if the key of the resulting message would conflict with one of those already in the catalog, its context will be appropriately padded to avoid the conflict. This is done by adding a pipe character and an unspecified number of alphanumerics (generally junk-looking) to the end of the C{msgctxt}. In case the conflict with a particular message in the catalog is acceptable (e.g. when the resulting message is to be inserted in its place), the position of this message can be given by the C{eokpos} parameter. In case a certain value of C{msgctxt} should be padded regardless of whether there is a conflict or not, this value can be given by the C{enoctxt} parameter. An additional automatic comment starting with C{ediff:} may be added to the message, possibly followed by some indicators necessary to complete the difference specification. These include: - C{state ...}: changes in message state, like C{obsolete} and C{fuzzy}; e.g. C{state {+obsolete+}} means that the message has been obsoleted from C{msg1} to C{msg2}, while C{state {-obsolete-}} means that it has been revived. - C{ctxtpad <padding>}: padding alphanumerics added to the C{msgctxt} field to avoid key collision with one of the messages from C{ecat}. - C{infsep <block> <length>}: if C{pfilter} was used, this indicator states the building block and length in blocks of in-field separators. By default the difference comment is not added if there are no indicators, but it may be forced by setting the C{emptydc} parameter to C{True}. Embedded differences can be additionally colorized (e.g. for the terminal) by setting the C{colorize} parameter to C{True}. If C{diffr} is C{True}, aside from the message with embedded differences, the total difference ratio is returned (see L{msg_diff}). If C{pfilter} is given, the ratio refers to the difference under the filter. @param msg1: the message from which to make the difference @type msg1: L{Message_base} or None @param msg2: the message to which to make the difference @type msg2: L{Message_base} or None @param pfilter: filter to be applied to translation prior to differencing @type pfilter: callable @param addrem: report equal, added or removed segments instead of full difference, joined by what follows the selection character @type addrem: string @param emsg: message to embed the difference into @type emsg: L{Message_base} @param ecat: catalog of messages to avoid key conflict with @type ecat: L{Catalog} @param eokpos: position into C{ecat} where key conflict is ignored @type eokpos: int @param enoctxt: C{msgctxt} string that should be padded unconditionally @type enoctxt: string @param emptydc: whether to add difference comment even if empty @type emptydc: bool @param colorize: whether to colorize the difference @type colorize: bool @param diffr: whether to report difference ratio @type diffr: bool @return: message with embedded differences (or None) and possibly difference ratio @rtype: type(emsg or msg2 or msg1 or None) or (type(~), float) """ if msg1 is None and msg2 is None: return diffr and (None, 0.0) or None # Compute the difference. wdiffs, totdr = msg_diff(msg1, msg2, addrem=addrem, diffr=True) wdiffs_pf = [] if pfilter: wdiffs_pf, totdr = msg_diff(msg1, msg2, pfilter=pfilter, addrem=addrem, diffr=True) # Construct list of embedded diffs out of original difference list.
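# (Each tagged segment list is flattened into a single string with {+...+}/{-...-} wrapping by _assemble_ediff; under addrem the parts are already plain strings.)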
if not addrem: mtoe = lambda x: (x[0], x[1], _assemble_ediff(x[2], colorize), x[3]) - ediffs = map(mtoe, wdiffs) - ediffs_pf = map(mtoe, wdiffs_pf) + ediffs = list(map(mtoe, wdiffs)) + ediffs_pf = list(map(mtoe, wdiffs_pf)) else: ediffs = wdiffs ediffs_pf = wdiffs_pf # Construct the message to embed differences into. if emsg is None: tmsg = msg2 or msg1 emsg = type(tmsg)() for part, typ in _msg_diff_parts: tval = tmsg.get(part) if tval is not None: setattr(emsg, part, type(tval)(tval)) # Indicators for the difference comment. indargs = {} # Determine field separator for raw/filtered differences. if ediffs_pf: infseplen = _infsep_minlen infsepinc = 5 infseplen_p = infseplen - 1 while infseplen_p < infseplen: infsep = _infsep_blk * infseplen infseplen_p = infseplen for part, item, ediff, dr in ediffs + ediffs_pf: if ediff and infsep in ediff: infseplen += infsepinc break indargs[_dcmnt_ind_infsep] = [_infsep_blk, str(infseplen)] # Embed differences. for i in range(len(ediffs)): part, item, ediff, dr = ediffs[i] typ = _msg_dpart_types[part] if typ == _dt_single: setattr(emsg, part, ediff) elif typ == _dt_list: lst = emsg.get(part) - lst.extend([u""] * (item + 1 - len(lst))) + lst.extend([""] * (item + 1 - len(lst))) if ediffs_pf: ediff_pf = ediffs_pf[i][2] if ediff_pf and ediff_pf != ediff: ediff += "\n" + infsep + "\n" + ediff_pf lst[item] = ediff elif typ == _dt_state: stag, spart = wdiffs[i][2][0] if stag != _equ_tag: if _dcmnt_ind_state not in indargs: indargs[_dcmnt_ind_state] = [] indargs[_dcmnt_ind_state].append(ediff) sval = bool(stag in (_new_tag, _equ_tag) and spart) setattr(emsg, part, sval) else: raise PologyError( _("@info", "Unhandled message part '%(part)s' encountered " "while differencing.", part=part)) # Pad context to avoid conflicts. if ( (ecat is not None and emsg in ecat and ecat.find(emsg) != eokpos) or (enoctxt is not None and emsg.msgctxt == enoctxt) ): noctxtind = emsg.msgctxt is None and _ctxtpad_noctxt or "" - octxt = emsg.msgctxt or u"" + octxt = emsg.msgctxt or "" while True: padding = "".join([random.choice(_ctxtpad_alnums) for x in range(5)]) emsg.msgctxt = octxt + _ctxtpad_sep + padding + noctxtind if ( emsg not in ecat and (enoctxt is None or emsg.msgctxt != enoctxt) ): break indargs[_dcmnt_ind_ctxtpad] = [padding] # If any of the existing comments looks like diff comment, escape it. ecomments = emsg.get(_dcmnt_field) for i in range(len(ecomments)): scmnt = ecomments[i].strip() p = scmnt.find(_dcmnt_head) if p >= 0 and scmnt[:p] == _dcmnt_head_esc * p: nwp = 0 while scmnt[nwp].isspace(): nwp += 1 ecomments[i] = scmnt[:nwp] + _dcmnt_head_esc + scmnt[nwp:] # Add diff comment. if indargs or emptydc: inds = [] for ind in _dcmnt_all_inds: # to have deterministic ordering alst = indargs.get(ind) if alst is not None: inds.append(cjoin([ind] + alst, _dcmnt_asep)) dcmnt = _dcmnt_head if inds: dcmnt += " " + cjoin(inds, _dcmnt_sep) ecomments.insert(0, dcmnt) return diffr and (emsg, totdr) or emsg def msg_ediff_to_new (emsg, rmsg=None): """ Resolve message with embedded difference to the newer message. Message cannot be properly resolved if the C{addrem} parameter to L{msg_ediff} was used on embedding. If this function is called on such a message, the result is undefined. By default a new message object is created, but using the C{rmsg} parameter, an existing message can be given to be filled with all the resolved parts (keeping its own, ignored parts). This message can be the C{emsg} itself.
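For example, a difference embedded by L{msg_ediff} can be resolved back into either endpoint (a sketch, assuming C{msg1} and C{msg2} are existing messages and C{addrem} was not used)::

    emsg = msg_ediff(msg1, msg2)
    newmsg = msg_ediff_to_new(emsg)  # equivalent to msg2
    oldmsg = msg_ediff_to_old(emsg)  # equivalent to msg1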
If the resolved message evaluates to no message, the function returns C{None}, and C{rmsg} is not touched if it was given. Any states indicated as added by the difference comment are ignored in favor of the actual states of embedded difference message. The two sets should normally be equal, but if they are not, the actual state in effect overrides the indicated added state. @param emsg: resolvable message with embedded differences @type emsg: L{Message_base} or None @param rmsg: message to fill in the resolved parts @type rmsg: L{Message_base} @return: resolved message (or None) @rtype: type of first non-None of rmsg, emsg, or None """ return _msg_ediff_to_x(emsg, rmsg, new=True) def msg_ediff_to_old (emsg, rmsg=None): """ Resolve message with embedded difference to the older message. Like L{msg_ediff_to_new}, only constructing the opposite message (except that states indicated as removed by difference comment are never ignored, i.e. they always override actual states). See L{msg_ediff_to_new} for parameters and return values. """ return _msg_ediff_to_x(emsg, rmsg, new=False) def _msg_ediff_to_x (emsg, rmsg, new): if new: word_ediff_to_x = word_ediff_to_new word_ediff_to_o = word_ediff_to_old line_ediff_to_x = line_ediff_to_new ignore_state_diff = True else: word_ediff_to_x = word_ediff_to_old word_ediff_to_o = word_ediff_to_new line_ediff_to_x = line_ediff_to_old ignore_state_diff = False # Work on copy if target message not given. if rmsg is None: rmsg = type(emsg)(emsg) # Since rmsg can be emsg itself, collect all attributes to set, # and set them in the end. atts_vals = [] # Parse everything out of diff comment, # unescape comments which looked like diff comment and were escaped. states = {} ctxtpad = None infsep = None cmnts = [] for cmnt in list(emsg.get(_dcmnt_field)): scmnt = cmnt.strip() p = scmnt.find(_dcmnt_head) if p == 0: dcmnt = scmnt[len(_dcmnt_head):] # FIXME: Checks for unknown indicators and bad arguments. for indargs in dcmnt.split(_dcmnt_sep.strip()): lst = indargs.strip().split(_dcmnt_asep) ind, args = lst[0], [word_ediff_to_x(x) for x in lst[1:]] if 0: pass elif ind == _dcmnt_ind_state: for arg in args: if _msg_dpart_types.get(arg) == _dt_state: states[arg] = True args_o = [word_ediff_to_o(x) for x in lst[1:]] for arg in args_o: if _msg_dpart_types.get(arg) == _dt_state: states[arg] = False elif ind == _dcmnt_ind_ctxtpad: ctxtpad = args[0] elif ind == _dcmnt_ind_infsep: infsep = args[0] * int(args[1]) else: if p > 0 and scmnt[:p] == _dcmnt_head_esc * p: nwp = 0 while cmnt[nwp].isspace(): nwp += 1 cmnt = cmnt[:nwp] + cmnt[nwp + 1:] cmnts.append(cmnt) listtype = type(rmsg.msgstr) # Put back cleaned comments. atts_vals.append((_dcmnt_field, listtype(cmnts))) # Remove context padding. if ctxtpad: val = emsg.get("msgctxt") - p = val.rfind(ctxtpad or u"") + p = val.rfind(ctxtpad or "") if ( p < 0 or val[p - len(_ctxtpad_sep):p] != _ctxtpad_sep or val[p + len(ctxtpad):] not in (_ctxtpad_noctxt, "") ): raise PologyError(_("@info", "Malformed padded context.")) if val[p + len(ctxtpad):] != _ctxtpad_noctxt: val = val[:p - len(_ctxtpad_sep)] else: val = None msgctxt_nopad = val # Resolve parts. 
for part, typ in _msg_diff_parts: if ctxtpad and part == "msgctxt": val = msgctxt_nopad else: val = emsg.get(part) if typ == _dt_single: nval = word_ediff_to_x(val) if nval == None and part == "msgid": return None atts_vals.append((part, nval)) elif typ == _dt_list: lst = [] for el in val: if infsep: p = el.find(infsep) if p >= 0: # strip filtered difference el = el[:p - 1] # -1 to remove newline lst.append(el) nlst = listtype(line_ediff_to_x(lst)) if nlst == [] and part == "msgstr": return None atts_vals.append((part, nlst)) elif typ == _dt_state: if not ignore_state_diff: val = states.get(part) if val is not None: atts_vals.append((part, val)) else: raise PologyError( _("@info", "Unhandled message part '%(part)s' encountered " "while resolving difference.", part=part)) # Set resolved parts for real. for att, val in atts_vals: setattr(rmsg, att, val) return rmsg def editprob (oldtext, newtext): """ Compute the probability that a human would rather edit the old text to obtain the new text, than write it from scratch. Classical algorithms to compute similarity ratio between two texts sometimes produce high ratios for texts which a human would be unlikely to consider similar enough to make one text by editing the other, and vice versa. This function uses some heuristics to derive the probability that one text was really edited by a human into the other. Not commutative in general. If one of the texts is given as C{None}, the result is 0.0; if both are C{None}, the result is 1.0. @param oldtext: candidate for initial text @type oldtext: string @param newtext: current text @type newtext: string @returns: the probability of editing the old into the new text [0, 1] @rtype: float """ if oldtext == newtext: return 1.0 if not oldtext or not newtext: return 0.0 # Consider always the case of editing from longer to shorter text. if len(oldtext) < len(newtext): shorttext, longtext = oldtext, newtext else: shorttext, longtext = newtext, oldtext # Construct diff. sm = SequenceMatcher(None, longtext, shorttext) mblocks = sm.get_matching_blocks() mblocks = sorted(mblocks, key=lambda x: x[1]) mblocks.insert(0, (0, 0, 0)) # Accumulate probability. ep = 0.0 for i in range(1, len(mblocks) - 1): lm = mblocks[i][2] ld1 = mblocks[i][1] - (mblocks[i - 1][1] + mblocks[i - 1][2]) ld2 = mblocks[i + 1][1] - (mblocks[i][1] + mblocks[i][2]) cf = (float(lm) / (lm + ld1 + ld2))**2 # ...if cf would be set to 1, probability would be equal # to ordinary similarity ratio. ep += lm * cf ep /= len(shorttext) # Correct for different lengths of texts. rl = float(len(shorttext)) / len(longtext) ep *= 1 - (rl - 1)**4 return ep def descprob (descpath, ancpath, cutoff=None, getcsz=False): """ Compute the probability that one PO file is a descendant of another. Sometimes PO files are renamed, split into two, joined into one, also with possible small changes in messages between old and new set. This function uses some heuristics to derive the probability that the PO file given by C{descpath} is a descendant of the PO file given by C{ancpath}. If the probability cannot be determined (for whatever reason, e.g. if the file contains syntax errors), C{None} is returned. By default, only equality versus non-equality of messages is taken into consideration. If C{cutoff} is set to a number 0.0-1.0, then fuzzy matching is performed, and partial similarities greater than the cutoff are counted into the final probability. However, this reduces performance by orders of magnitude (the lower the cutoff, the more so; 0.7-0.8 may be a reasonable tradeoff).
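For example (a sketch with hypothetical paths)::

    prob = descprob("ui/knewname.po", "ui/koldname.po", cutoff=0.8)
    if prob is not None and prob > 0.5:
        pass  # likely a renamed or otherwise derived catalog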
@param descpath: path to possible descendant PO file @type descpath: string @param ancpath: path to possible ancestor PO file @type ancpath: string @param cutoff: the cutoff for fuzzy matching @type cutoff: float @param getcsz: also report the referent character sizes of the first and second file @type getcsz: bool @returns: the probability of ancestry [0, 1], the referent character sizes if requested @rtype: C{None} or float or (float, int, int) """ # Read representative texts of messages. # Ignore non-unique texts (contexts may have been stripped). dtexts = set(_read_msg_texts(descpath)) atexts = set(_read_msg_texts(ancpath)) # Make the computation commutative, by always taking # the file with less text as possible descendant. dtotchar = sum(len(t) for t in dtexts) atotchar = sum(len(t) for t in atexts) if getcsz: dtotchar_orig = dtotchar atotchar_orig = atotchar if dtotchar > atotchar: dtexts, atexts = atexts, dtexts dtotchar, atotchar = atotchar, dtotchar # Count how many texts from descendant are in ancestor too. # This gives basic probability. neq = len(dtexts.intersection(atexts)) prob = float(neq) / len(dtexts) # For each text in descendant not found in ancestor, # sum similarity ratios to nearest text in ancestor, # and add to the probability. if cutoff is not None: sumsim = 0.0 for dt in dtexts.difference(atexts): seqm = SequenceMatcher() seqm.set_seq2(dt) maxsim = 0.0 for at in atexts: seqm.set_seq1(at) sim = seqm.real_quick_ratio() if sim > cutoff: sim = seqm.quick_ratio() if sim > cutoff: sim = seqm.ratio() if sim > cutoff: maxsim = max(maxsim, sim) sumsim += maxsim prob += sumsim / len(dtexts) # Correct probability for small files. # This is necessary due to enforced commutativity above. limtotchar = 100 # e.g. 10 messages with 2 words (10 characters) each if dtotchar < limtotchar: prob *= (float(dtotchar) / atotchar)**0.5 if getcsz: return prob, dtotchar_orig, atotchar_orig else: return prob def _read_msg_texts (path): # NOTE: This function needs to be as fast as possible, # so instead of using file.Catalog, the file is manually parsed # to the necessary minimum. # It is more important to be fast than correct, # so parsing ignores some valid but highly unusual PO formatting. # NOTE: Intentionally ignoring: file encoding, escaping, msgctxt. try: lines = open(path).readlines() except: raise PologyError( _("@info", "Cannot read file '%(file)s'.", file=path)) msgids = [] inmsgid = False for line in lines: line = line.strip() if line.startswith("msgid "): segs = [] line = line[5:].strip() inmsgid = True elif not line.startswith("\""): if inmsgid: msgid = "".join(segs) msgids.append(msgid) inmsgid = False if inmsgid: segs.append(line[1:-1]) # strip quotes return msgids diff --git a/pology/entities.py b/pology/entities.py index 5c96838b..8c44c09b 100644 --- a/pology/entities.py +++ b/pology/entities.py @@ -1,202 +1,202 @@ # -*- coding: UTF-8 -*- """ Handle entity definitions. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import os import xml.parsers.expat from pology import PologyError, _, n_ from pology.fsops import collect_files_by_ext from pology.report import warning def parse_entities (defstr, src=None): """ Parse XML entity definitions from given string. The string should contain only entity definitions in DTD form, without any prolog or epilogue:: ... <!ENTITY foo 'Foo'> ... If the same entity is defined several times, the last read definition is taken as final.
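For example (a sketch of the expected behaviour)::

    >>> parse_entities("<!ENTITY foo 'Foo'> <!ENTITY bar 'Bar'>")
    {'foo': 'Foo', 'bar': 'Bar'}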
@param defstr: entity-defining string @type defstr: string @param src: name of the source, for problem reporting @type src: C{None} or string @returns: name-value pairs of parsed entities @rtype: dict """ # Equip with prolog and epilogue (the root element name is arbitrary, only the internal DTD subset matters to the entity handler). defstr = "<?xml version='1.0' encoding='UTF-8'?>\n" \ "<!DOCTYPE entities [" + defstr + "]><entities/>" # Parse entities. entities = {} def handler (name, is_parameter_entity, value, base, systemId, publicId, notationName): entities[name] = value p = xml.parsers.expat.ParserCreate() p.EntityDeclHandler = handler try: p.Parse(defstr, True) - except xml.parsers.expat.ExpatError, inst: + except xml.parsers.expat.ExpatError as inst: if src: raise PologyError( _("@info error report for a named source", "%(src)s: %(msg)s", src=src, msg=inst)) else: raise PologyError( _("@info error report for a string", "<string>: %(msg)s", msg=inst)) return entities def read_entities (filepath, fcap=False): """ Read XML entity definitions from given file path. Input argument can be a single file path, or a sequence of paths. Content of each file is parsed by L{parse_entities}. For each read entity, another entity may be added which has the first letter converted to upper-case, both in the entity name and value. See L{fcap_entities} for more details. @param filepath: path or paths of entity-defining file @type filepath: string or sequence of strings @param fcap: whether to add paired first-caps entities @type fcap: bool @returns: (name, value) dictionary of parsed entities @rtype: dict @see: L{parse_entities} """ - if isinstance(filepath, basestring): + if isinstance(filepath, str): fnames = [filepath] else: fnames = filepath entities = {} for fname in fnames: # Scoop up file contents (UTF-8 expected). ifs = open(fname, "r") defstr = "".join(ifs.readlines()) ifs.close() # Parse entities. entities.update(parse_entities(defstr, src=fname)) if fcap: fcap_entities(entities, update=True) return entities def read_entities_by_env (entpathenv, recurse=True, fcap=False): """ Read XML entity definitions from directory paths given by an environment variable. Directory paths given by environment variable are searched for files with C{.entities} extension, and all found files are sent to L{read_entities}. Search through directories can be recursive or non-recursive. See L{fcap_entities} for use of C{fcap} parameter. If the environment variable is not set, a warning is output and an empty collection of entities is returned. @param entpathenv: environment variable that holds directory paths @type entpathenv: string @param recurse: whether to search directories recursively @type recurse: bool @param fcap: whether to add paired first-caps entities @type fcap: bool @returns: (name, value) dictionary of parsed entities @rtype: dict """ entities = {} entpath = os.getenv(entpathenv) if entpath is None: warning(_("@info", "Environment variable with paths to entity definitions " "'%(envar)s' is not set.", envar=entpathenv)) return entities entfilepaths = collect_files_by_ext(entpath.split(":"), "entities") entities.update(read_entities(entfilepaths, fcap)) return entities def fcap_entities (entities, update=False): """ Create paired set of entities with first letters in upper-case. For each given entity, another entity may be created which has the first letter converted to upper-case, both in the entity name and value. Such entity is created only if the original entity has at least one letter in the name, and the first letter in the name is lower-case. New entities are either returned in a new dictionary, or are inserted into the original dictionary, which is then returned.
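For example::

    >>> fcap_entities({"pology": "pology framework"})
    {'Pology': 'Pology framework'}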
@param entities: (name, value) dictionary of entities @type entities: dict @param update: whether to insert new entities into C{entities} itself @type update: bool @returns: (name, value) dictionary of upper-case entities @rtype: dict """ if update: fcaps = entities - iterents = entities.items() + iterents = list(entities.items()) else: fcaps = {} - iterents = entities.iteritems() + iterents = iter(entities.items()) for name, value in iterents: # Upper-case entity name. p = 0 while p < len(name) and not name[p].isalpha(): p += 1 if p >= len(name): # nothing to upper-case, skip continue if not name[p].islower(): # first letter is not lower-case, skip continue name = name[:p] + name[p].upper() + name[p + 1:] # Upper-case entity value, if possible. p = 0 while p < len(value) and not value[p].isalpha(): p += 1 if p < len(value): value = value[:p] + value[p].upper() + value[p + 1:] fcaps[name] = value return fcaps diff --git a/pology/escape.py b/pology/escape.py index 65c2d668..14112808 100644 --- a/pology/escape.py +++ b/pology/escape.py @@ -1,145 +1,145 @@ # -*- coding: UTF-8 -*- """ Escaping texts in various contexts. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import re from pology import PologyError, _, n_ from pology.report import warning _escapes_c = { "\a" : "a", "\b" : "b", "\f" : "f", "\n" : "n", "\r" : "r", "\t" : "t", "\v" : "v", "\"" : "\"", "\\" : "\\", } -_unescapes_c = dict([(y, x) for x, y in _escapes_c.items()]) +_unescapes_c = dict([(y, x) for x, y in list(_escapes_c.items())]) def unescape_c (s): """ Unescape text for C-style quoted strings. Octal and hex sequences (C{\\0OO}, C{\\xHH}) are converted into the corresponding ASCII characters if less than 128, or else thrown out (with a warning). Invalid escape sequences raise exception. @param s: text to unescape (without wrapping quotes) @type s: string @returns: unescaped text @rtype: string @see: L{escape_c} """ segs = [] p = 0 while True: pp = p p = s.find("\\", p) if p < 0: segs.append(s[pp:]) break segs.append(s[pp:p]) p += 1 c = s[p:p + 1] ec = None if c in ("x", "0"): dd = s[p + 1:p + 3] if len(dd) == 2: try: ec = chr(int(dd, c == "x" and 16 or 8)) p += 3 except: pass else: ec = _unescapes_c.get(c) if ec is not None: p += 1 if ec is None: raise PologyError( _("@info \"C\" is the C programming language", "Invalid C escape sequence after '%(snippet)s'.", snippet=s[:p])) segs.append(ec) return type(s)().join(segs) -_escapes_c_wpref = dict([(x, "\\" + y) for x, y in _escapes_c.items()]) +_escapes_c_wpref = dict([(x, "\\" + y) for x, y in list(_escapes_c.items())]) def escape_c (s): """ Escape text for C-style quoted strings. @param s: text to escape @type s: string @returns: escaped text (without wrapping quotes) @rtype: string @see: L{unescape_c} """ return type(s)().join([_escapes_c_wpref.get(c, c) for c in s]) _special_chars_sh = set(r" ~`#$&*()\|[]{};'\"<>?!") def escape_sh (s): """ Escape text for Unix sh-like shell. Escaped text may be used as a fixed argument in command line, i.e. the shell will not interpret any part of it in a special way. It is undefined which of the possible ways to escape are used (single quotes, double quotes, backslashes). @param s: text to escape @type s: string @returns: escaped text @rtype: string """ if bool(set(s).intersection(_special_chars_sh)): quote = "'" if "'" not in s else '"' s = s.replace(quote, "\\" + quote) s = quote + s + quote return s def split_escaped (text, sep): """ Like C{split()}, but double-separator is treated as an escape of itself. 
@param text: the text to split @type text: string @param sep: the separator @type sep: string @returns: parsed elements @rtype: list of strings """ - alakazoom = u"\u0004" + alakazoom = "\u0004" tmp = text.replace(sep + sep, alakazoom).split(sep) return [x.replace(alakazoom, sep) for x in tmp] diff --git a/pology/external/pyaspell.py b/pology/external/pyaspell.py index a33a46af..0d8bd900 100644 --- a/pology/external/pyaspell.py +++ b/pology/external/pyaspell.py @@ -1,367 +1,367 @@ # -*- coding: iso-8859-2 -*- # Aspell interface using ctypes. # $Date: 2007-04-07 14:27:33 $, $Revision: 1.3 $ # # This is a straightforward translation of my # aspell-python, C extension. # # License: BSD # # author: Wojciech Muła # e-mail: wojciech_mula@poczta.onet.pl # www : http://wmula.republika.pl/proj/aspell-python/ # # TODO: add method to get/change **current** speller's config try: import ctypes import ctypes.util except ImportError: raise ImportError("ctypes library is needed") class AspellError(Exception): pass class AspellConfigError(AspellError): pass class AspellSpellerError(AspellError): pass class AspellLinux(object): """ Aspell speller object. Allows one to check spelling, get suggested spelling lists, manage user dictionaries, and more. Must be closed with the 'close' method, or one may experience problems, like segfaults. """ def __init__(self, configkeys=None, libname=None): """ Parameters: * configkeys - list of configuration parameters; each element is a pair key & value (both strings) if None, then default configuration is used * libname - explicitly set aspell library name; if None then default name is used """ if libname is None: libname = ctypes.util.find_library('aspell') self.__lib = ctypes.CDLL(libname) # Initialize speller # 1. create configuration config = self.__lib.new_aspell_config() if config == None: raise AspellError("Can't create aspell config object") # 2. parse configkeys arg. if configkeys is not None: assert type(configkeys) in [tuple, list], "Tuple or list expected" if len(configkeys) == 2 and \ type(configkeys[0]) is str and \ type(configkeys[1]) is str: configkeys = [configkeys] for key, value in configkeys: assert type(key) is str, "Key must be string" assert type(value) is str, "Value must be string" if not self.__lib.aspell_config_replace(config, key, value): raise self._aspell_config_error(config) # 3. create speller possible_error = self.__lib.new_aspell_speller(config) self.__lib.delete_aspell_config(config) if self.__lib.aspell_error_number(possible_error) != 0: self.__lib.delete_aspell_can_have_error(possible_error) raise AspellError("Can't create speller object") self.__speller = self.__lib.to_aspell_speller(possible_error) def check(self, word): """ Check if word is present in main, personal or session dictionary. Boolean value is returned """ if type(word) is str: return bool( self.__lib.aspell_speller_check( self.__speller, word, len(word) )) else: raise TypeError("String expected") def suggest(self, word): """ Return list of spelling suggestions for given word. Works even if word is correct. """ if type(word) is str: return self._aspellwordlist( self.__lib.aspell_speller_suggest( self.__speller, word, len(word) )) else: raise TypeError("String expected") def personal_dict(self, word=None): """ Aspell's personal dictionary is a user-defined, persistent list of words (saved in a certain file). If 'word' is not given, then the method returns the list of words stored in the dict. If 'word' is given, then it is added to the personal dict.
New words are not saved automatically; the 'save_all' method has to be called. """ if word is not None: # add new word assert type(word) is str, "String expected" self.__lib.aspell_speller_add_to_personal( self.__speller, word, len(word) ) self._aspell_check_error() else: # return list of words from personal dictionary return self._aspellwordlist( self.__lib.aspell_speller_personal_word_list(self.__speller) ) def session_dict(self, word=None, clear=False): """ Aspell's session dictionary is a user-defined, volatile list of words that is destroyed with the aspell object. If 'word' is None, then the list of words from the session dictionary is returned. If 'word' is present, then it is added to the dict. If 'clear' is True, then the session dictionary is cleared. """ if clear: self.__lib.aspell_speller_clear_session(self.__speller) self._aspell_check_error() return if word is not None: # add new word assert type(word) is str, "String expected" self.__lib.aspell_speller_add_to_session( self.__speller, word, len(word) ) self._aspell_check_error() else: # return list of words from session dictionary return self._aspellwordlist( self.__lib.aspell_speller_session_word_list(self.__speller) ) def add_replacement_pair(self, misspelled, correct): """ Add replacement pair, i.e. a pair of misspelled and correct word. It affects the order in which words appear in the list returned by the 'suggest' method. """ assert type(misspelled) is str, "String is required" assert type(correct) is str, "String is required" self.__lib.aspell_speller_store_replacement( self.__speller, misspelled, len(misspelled), correct, len(correct) ) self._aspell_check_error() def save_all(self): """ Saves all words added to the personal or session dictionary to aspell's defined file. """ self.__lib.aspell_speller_save_all_word_lists(self.__speller) self._aspell_check_error() def configkeys(self): """ Returns list of all available config keys that can be passed to the constructor. List contains 3-tuples: 1. key name 2. default value of type: * bool * int * string * list of string 3. short description if None, then this key is undocumented and should not be used, unless one knows what it really does """ config = self.__lib.aspell_speller_config(self.__speller) if config is None: raise AspellConfigError("Can't get speller's config") keys_enum = self.__lib.aspell_config_possible_elements(config, 1) if keys_enum is None: raise AspellError("Can't get list of config keys") class KeyInfo(ctypes.Structure): _fields_ = [ ("name", ctypes.c_char_p), ("type", ctypes.c_int), ("default", ctypes.c_char_p), ("desc", ctypes.c_char_p), ("flags", ctypes.c_int), ("other_data", ctypes.c_int), ] key_next = self.__lib.aspell_key_info_enumeration_next key_next.restype = ctypes.POINTER(KeyInfo) list = [] while True: key_info = key_next(keys_enum) if not key_info: break else: key_info = key_info.contents if key_info.type == 0: # string list.append(( key_info.name, key_info.default, key_info.desc, )) elif key_info.type == 1: # integer list.append(( key_info.name, int(key_info.default), key_info.desc, )) elif key_info.type == 2: # boolean if key_info.default.lower() == 'true': list.append(( key_info.name, True, key_info.desc, )) else: list.append(( key_info.name, False, key_info.desc, )) elif key_info.type == 3: # list list.append(( key_info.name, key_info.default.split(), key_info.desc, )) self.__lib.delete_aspell_key_info_enumeration(keys_enum) return list def close(self): """ Close aspell speller object.
""" self.__lib.delete_aspell_speller(self.__speller) # XXX: internal function, do not call directly def _aspellwordlist(self, wordlist_id): """ XXX: internal function Converts aspell list into python list. """ elements = self.__lib.aspell_word_list_elements(wordlist_id) list = [] while True: wordptr = self.__lib.aspell_string_enumeration_next(elements) if not wordptr: break else: word = ctypes.c_char_p(wordptr) list.append(word.value) self.__lib.delete_aspell_string_enumeration(elements) return list def _aspell_config_error(self, config): """ XXX: internal function Raise excpetion if operation of speller config caused an error. Additionaly destroy config object. """ # make exception object & copy error msg exc = AspellConfigError( ctypes.c_char_p( self.__lib.aspell_config_error_message(config) ).value ) # then destroy config objcet self.__lib.delete_aspell_config(config) # and then raise exception raise exc def _aspell_check_error(self): """ XXX: internal function Raise exception if previous speller operation caused an error. """ if self.__lib.aspell_speller_error(self.__speller) != 0: msg = self.__lib.aspell_speller_error_message(self.__speller) raise AspellSpellerError(msg) #class Aspell = AspellLinux if __name__ == '__main__': # TODO: more test cases a = Aspell(("lang", "en")) - print a.check("when") - print a.suggest("wehn") + print(a.check("when")) + print(a.suggest("wehn")) a.add_replacement_pair("wehn", "ween") - print a.suggest("wehn") + print(a.suggest("wehn")) - print a.session_dict() - print a.check("pyaspell") + print(a.session_dict()) + print(a.check("pyaspell")) a.session_dict("pyaspell") - print a.session_dict() - print a.check("pyaspell") + print(a.session_dict()) + print(a.check("pyaspell")) a.session_dict(clear=True) - print a.session_dict() + print(a.session_dict()) a.close() # vim: ts=4 sw=4 diff --git a/pology/fsops.py b/pology/fsops.py index 1f4ef285..d97d9dc2 100644 --- a/pology/fsops.py +++ b/pology/fsops.py @@ -1,1013 +1,1013 @@ # -*- coding: UTF-8 -*- """ Operations with environment, file system and external commands. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import codecs import locale import os import re import subprocess import sys from pology import PologyError, _, n_ import pology.config from pology.escape import escape_sh from pology.report import report, error, warning def collect_files (paths, recurse=True, sort=True, unique=True, relcwd=True, selectf=None): """ Collect list of files from given directory and file paths. C{paths} can be any sequence of strings, or a single string. Directories can be searched for files recursively or non-resursively, as requested by the C{recurse} parameter. Parameters C{sort} and C{unique} determine if the resulting paths are sorted alphabetically increasing and if duplicate paths are removed. If C{relcwd} is set to C{True}, absolute file paths which point to files within the current working directory are made relative to it. Only selected files may be collected by supplying a selection function through C{selectf} parameter. It takes a file path as argument and returns a boolean, C{True} to select the file or C{False} to discard it. 
@param paths: paths to search for files @type paths: string or iter(string*) @param recurse: whether to search for files recursively @type recurse: bool @param sort: whether to sort collected paths @type sort: bool @param unique: whether to eliminate duplicate collected paths @type unique: bool @param relcwd: whether to make collected absolute paths within current working directory relative to it @param relcwd: bool @param selectf: test to select or discard a file path @type selectf: (string)->bool @returns: collected file paths @rtype: [string...] """ - if isinstance(paths, basestring): + if isinstance(paths, str): paths = [paths] filepaths = [] for path in paths: if os.path.isdir(path): for root, dirs, files in os.walk(path): for file in files: filepath = os.path.normpath(os.path.join(root, file)) if not selectf or selectf(filepath): filepaths.append(filepath) if not recurse: dirs[:] = [] elif os.path.isfile(path): if not selectf or selectf(path): filepaths.append(path) elif not os.path.exists(path): raise PologyError( _("@info", "Path '%(path)s' does not exist.", path=path)) else: raise PologyError( _("@info", "Path '%(path)s' is neither a file nor a directory.", path=path)) if sort: if unique: filepaths = list(set(filepaths)) filepaths.sort() elif unique: # To preserve the order, reinsert paths avoiding duplicates. seen = {} ufilepaths = [] for filepath in filepaths: if filepath not in seen: seen[filepath] = True ufilepaths.append(filepath) filepaths = ufilepaths if relcwd: - filepaths = map(join_ncwd, filepaths) + filepaths = list(map(join_ncwd, filepaths)) return filepaths def collect_files_by_ext (paths, extension, recurse=True, sort=True, unique=True, relcwd=True, selectf=None): """ Collect list of files having given extension from given paths. The C{extension} parameter can be a single extension or a sequence of extensions, without the leading dot. Files with empty extension (i.e. dot at the end of path) are collected by supplying empty string for C{extension}, and files with no extension by supplying another empty sequence. Other parameters behave in the same way as in L{collect_files}. @param extension: extension of files to collect @type extension: string or sequence of strings @see: L{collect_files} """ - if isinstance(extension, basestring): + if isinstance(extension, str): extensions = [extension] else: extensions = extension def selectf_mod (fpath): ext = os.path.splitext(fpath)[1] if ext not in ("", "."): hasext = ext[1:] in extensions elif ext == ".": hasext = extensions == "" else: # ext == "" hasext = not extensions if selectf and hasext: return selectf(fpath) else: return hasext return collect_files(paths, recurse, sort, unique, relcwd, selectf_mod) def collect_catalogs (paths, recurse=True, sort=True, unique=True, relcwd=True, selectf=None): """ Collect list of catalog file paths from given paths. Applies C{collect_files_by_ext} with extensions set to C{("po", "pot")}. """ catexts = ("po", "pot") return collect_files_by_ext(paths, catexts, recurse, sort, unique, relcwd, selectf) def collect_catalogs_by_env (catpathenv, recurse=True, sort=True, unique=True, relcwd=True, selectf=None): """ Collect list of catalog file paths from directories given by an environment variable. Other parameters behave in the same way as in L{collect_catalogs}. 
@param catpathenv: environment variable name @type catpathenv: string """ catpath = os.getenv(catpathenv) if catpath is None: return [] catdirs = catpath.split(":") return collect_catalogs(catdirs, recurse, sort, unique, relcwd, selectf) def mkdirpath (dirpath): """ Make all the directories in the path which do not exist yet. Like shell's C{mkdir -p}. @param dirpath: the directory path to create @type dirpath: string @returns: the path of topmost created directory, if any @rtype: string or C{None} """ toppath = None incpath = "" for subdir in os.path.normpath(dirpath).split(os.path.sep): if not subdir: subdir = os.path.sep incpath = os.path.join(incpath, subdir) if not os.path.isdir(incpath): os.mkdir(incpath) if toppath is None: toppath = incpath return toppath def system_wd (cmdline, wdir): """ Execute command line in a specific working directory. Like C{os.system}, only switching CWD during execution. @param cmdline: command line to execute @type cmdline: string @param wdir: working directory for the command (CWD if none given) @type wdir: path @returns: exit code from the command @rtype: int """ cwd = getucwd() try: os.chdir(wdir) ret = os.system(cmdline) except: os.chdir(cwd) raise return ret def assert_system (cmdline, echo=False, wdir=None): """ Execute command line and assert success. If the command exits with non-zero state, the program aborts. C{cmdline} can be either a monolithic string, in which case it is executed through a shell, or a list of argument strings, when the process is started directly with these arguments. C{cmdline} is processed with L{unicode_to_str} to convert any unicode strings to raw byte strings in expected system encoding. @param cmdline: command line to execute @type cmdline: string @param echo: whether to echo the supplied command line @type echo: bool @param wdir: working directory for the command (CWD if none given) @type wdir: path """ if echo: - if isinstance(cmdline, basestring): + if isinstance(cmdline, str): cmdstr = cmdline else: cmdstr = " ".join(map(escape_sh, cmdline)) report(cmdstr) if wdir is not None: cwd = getucwd() os.chdir(wdir) - if isinstance(cmdline, basestring): + if isinstance(cmdline, str): cmdline = unicode_to_str(cmdline) shell = True else: - cmdline = map(unicode_to_str, cmdline) + cmdline = list(map(unicode_to_str, cmdline)) shell = False ret = subprocess.call(cmdline, shell=shell) if wdir is not None: os.chdir(cwd) if ret: if echo: error(_("@info", "Non-zero exit from the previous command.")) else: error(_("@info", "Non-zero exit from the command:\n%(cmdline)s", cmdline=cmdline)) def collect_system (cmdline, echo=False, wdir=None, env=None, instr=None): """ Execute command line and collect stdout, stderr, and exit code. C{cmdline} can be either a monolithic string, in which case it is executed through a shell, or a list of argument strings, when the process is started directly with these arguments. C{cmdline} is processed with L{unicode_to_str} to convert any unicode strings to raw byte strings in expected system encoding.
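For example (a sketch; the actual output depends on the system)::

    strout, strerr, ret = collect_system(["msgfmt", "--statistics", "ca.po"])
    if ret != 0:
        warning(strerr)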
@param cmdline: command line to execute @type cmdline: string or [string*] @param echo: whether to echo the command line, as well as stdout/stderr @type echo: bool @param wdir: working directory for the command (CWD if none given) @type wdir: path @param env: environment for the execution (variable name-value pairs) @type env: {string: string} @param instr: string to pass to the command stdin @type instr: string @returns: stdout, stderr, and exit code @rtype: (string, string, int) """ if echo: - if isinstance(cmdline, basestring): + if isinstance(cmdline, str): cmdstr = cmdline else: cmdstr = " ".join(map(escape_sh, cmdline)) report(cmdstr) if wdir is not None: cwd = getucwd() os.chdir(wdir) stdin = instr is not None and subprocess.PIPE or None - if isinstance(cmdline, basestring): + if isinstance(cmdline, str): cmdline = unicode_to_str(cmdline) shell = True else: - cmdline = map(unicode_to_str, cmdline) + cmdline = list(map(unicode_to_str, cmdline)) shell = False p = subprocess.Popen(cmdline, shell=shell, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=stdin) if instr is not None: p.stdin.write(instr.encode(locale.getpreferredencoding())) - strout, strerr = map(str_to_unicode, p.communicate()) + strout, strerr = list(map(str_to_unicode, p.communicate())) ret = p.returncode if wdir is not None: os.chdir(cwd) if echo: if strout: sys.stdout.write( _("@info ^^^ points to the earlier output in the terminal", "===== stdout from the command above =====") + "\n") sys.stdout.write(strout) if strerr: sys.stderr.write( _("@info ^^^ points to the earlier output in the terminal", "***** stderr from the command ^^^ *****") + "\n") sys.stderr.write(strerr) return (strout, strerr, ret) def lines_from_file (filepath, encoding=None): """ Read content of a text file into list of lines. Only CR, LF, and CR+LF are treated as line breaks. If the given file path is not readable, or text cannot be decoded using given encoding, exceptions are raised. If encoding is not given, the encoding specified by the environment is used. @param filepath: path of the file to read @type filepath: string @param encoding: text encoding for the file @param encoding: string @returns: lines @rtype: [string...] """ if encoding is None: encoding = locale.getpreferredencoding() try: ifl = codecs.open(filepath, "r", encoding) except: warning(_("@info", "Cannot open '%(file)s' for reading.", file=filepath)) raise try: content = ifl.read() except: warning(_("@info", "Cannot read content of '%(file)s' using %(enc)s encoding.", file=filepath, enc=encoding)) raise ifl.close() lines = [x + "\n" for x in re.split(r"\r\n|\r|\n", content)] # ...no file.readlines(), it treats some other characters as line breaks. if lines[-1] == "\n": # If the file ended properly in a line break, the last line will be # phony, from the empty element splitted out by the last line break. lines.pop() return lines def join_ncwd (*elements): """ Join path and normalize it with respect to current working directory. Path elements are joined with C{os.path.join} and the joined path normalized by C{os.path.normpath}. The normalized path is then made relative to current working directory if it points to a location within current working directory. 
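For example, if the current working directory is C{/home/user}::

    >>> join_ncwd("/home/user/po", "..", "mo")
    'mo'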
@param elements: path elements @type elements: varlist @returns: normalized joined path @rtype: string """ path = os.path.join(*elements) cwd = getucwd() + os.path.sep apath = os.path.abspath(path) if apath.startswith(cwd): path = apath[len(cwd):] else: path = os.path.normpath(path) return path def str_to_unicode (strarg): """ Convert a raw string value or sequence of values into Unicode. Strings coming in from the environment are frequently raw byte sequences, and need to be converted into Unicode strings according to system locale (e.g. command-line arguments). This function will take either a single raw string or any sequence of raw strings and convert it into a Unicode string or list thereof. If the input value is not a single raw or unicode string, it is assumed to be a sequence of values. In case there are values in the input which are not raw strings, they will be carried over into the result as-is. @param strarg: input string or sequence @type strarg: string, unicode, or sequence of objects @returns: unicode string or sequence of objects @rtype: unicode string or list of objects """ - if isinstance(strarg, unicode): + if isinstance(strarg, str): return strarg lenc = locale.getpreferredencoding() - if isinstance(strarg, str): + if isinstance(strarg, bytes): return strarg.decode(lenc, "replace") else: uargs = [] for val in strarg: - if isinstance(val, str): + if isinstance(val, bytes): val = val.decode(lenc, "replace") uargs.append(val) return uargs def unicode_to_str (strarg): """ Convert a unicode string into raw byte sequence. Strings going to the environment should frequently be raw byte sequences, and need to be converted from Unicode strings according to system locale (e.g. command-line arguments). This function will take either a single Unicode string or any sequence of Unicode strings and convert it into a raw string or list thereof. If the input value is not a single raw or unicode string, it is assumed to be a sequence of values. In case there are values in the input which are not Unicode strings, they will be carried over into the result as-is. @param strarg: input string or sequence @type strarg: string, unicode, or sequence of objects @returns: raw string or sequence of objects @rtype: raw string or list of objects """ - if isinstance(strarg, str): + if isinstance(strarg, bytes): return strarg lenc = locale.getpreferredencoding() - if isinstance(strarg, unicode): + if isinstance(strarg, str): return strarg.encode(lenc) else: uargs = [] for val in strarg: - if isinstance(val, unicode): + if isinstance(val, str): val = val.encode(lenc) uargs.append(val) return uargs def get_env_langs (): """ Guess user's preferred languages from the environment. Various environment variables are examined to collect the list of languages in which the user may be wanting to read or write in the environment. The list is ordered from most to least preferred language, and may be empty. Languages are given by their ISO-639 codes. @returns: preferred languages @rtype: [string...] """ langs = [] # Variables which contain colon-separated language strings.
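# E.g. LANGUAGE=de:en_GB:en contributes ['de', 'en_GB', 'en'] here.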
for lenv in ["LANGUAGE"]: langs.extend((os.getenv(lenv, "")).split(":")) # Variables which contain locale string: # split into parts, and assemble possible language codes from least to for lenv in ["LC_ALL", "LANG"]: lval = os.getenv(lenv, "") lsplit = [] for sep in ("@", ".", "_"): # order is important p = lval.rfind(sep) if p >= 0: el, lval = lval[p + len(sep):], lval[:p] else: el = None lsplit.insert(0, el) lsplit.insert(0, lval) lng, ctr, enc, mod = lsplit if lng and ctr and mod: langs.append("%s_%s@%s" % (lng, ctr, mod)) if lng and ctr: langs.append("%s_%s" % (lng, ctr)) if lng and mod: langs.append("%s@%s" % (lng, mod)) if lng: langs.append(lng) # Normalize codes, remove empty and any duplicates (but keep order). langs2 = [x.strip() for x in langs] langs2 = [x for x in langs2 if x] seen = set() langs = [] for lang in langs2: if lang not in seen: seen.add(lang) langs.append(lang) return langs def term_width (stream=sys.stdout, default=None): """ Get number of columns in the terminal of output stream. If the output stream is not linked to the terminal, 0 is returned. If the output stream is linked to the terminal, but the number of columns cannot be determined, the supplied default value is returned instead. @param stream: output stream for which the terminal is looked up @type stream: file @param default: value to return if width cannot be determined @type default: int @returns: width of the terminal in columns @rtype: int """ if not stream.isatty(): return 0 try: import curses curses.setupterm() except: return default ncols = curses.tigetnum("cols") return ncols if ncols >= 0 else default def build_path_selector (incnames=None, incpaths=None, excnames=None, excpaths=None, ormatch=False): """ Build a path selection function based on inclusion-exclusion condition. Frequently a collection of paths needs to be filtered, to pass only specific paths (inclusion), or to block only specific paths (exclusion), or both. Filtering conditions are normally posed on full paths, but frequently file base names without extensions are really tested. This function builds a selector function which takes a path and returns C{True} to select the path or C{False} to discard it, based on four sets of conditions: inclusions by base name without extension (C{incnames}), inclusion by full path (C{incpaths}), exclusions by base name without extension (C{excnames}), and exclusions by full path (C{excpaths}). Each condition in each of the sets can be a regular expression string, an object with C{search(string)} method returning true or false value (e.g. compiled regular expression), or a general function taking string and returning true or false value. If C{ormatch} is C{False}, the path is included if there are no inclusion conditions or all inclusion conditions match; the path is excluded if there is at least one exclusion condition and all exclusion conditions match. If C{ormatch} is C{True}, the path is included if there are no inclusion conditions or at least one of them matches; the path is excluded if at least one exclusion condition match. 
@param incnames: conditions for inclusion by base name without extension @type incnames: sequence (see description) @param incpaths: conditions for inclusion by full path @type incpaths: sequence (see description) @param excnames: conditions for exclusion by base name without extension @type excnames: sequence (see description) @param excpaths: conditions for exclusion by full path @type excpaths: sequence (see description) @param ormatch: whether conditions are linked with OR @type ormatch: bool @returns: path selection function @rtype: (string)->bool """ # Shortcut to avoid complicated selector function. if not incnames and not incpaths and not excnames and not excpaths: return lambda x: x incnames_tf = _build_path_selector_type(incnames) incpaths_tf = _build_path_selector_type(incpaths) excnames_tf = _build_path_selector_type(excnames) excpaths_tf = _build_path_selector_type(excpaths) sumf = any if ormatch else all def selector (path): path = os.path.abspath(path) name = None if incnames_tf or excnames_tf: name = os.path.basename(os.path.normpath(path)) p = name.rfind(".") if p > 0: name = name[:p] - incargs = ( zip(incnames_tf, [name] * len(incnames_tf)) - + zip(incpaths_tf, [path] * len(incpaths_tf))) + incargs = ( list(zip(incnames_tf, [name] * len(incnames_tf))) + + list(zip(incpaths_tf, [path] * len(incpaths_tf)))) incress = [x(y) for x, y in incargs] - excargs = ( zip(excnames_tf, [name] * len(excnames_tf)) - + zip(excpaths_tf, [path] * len(excpaths_tf))) + excargs = ( list(zip(excnames_tf, [name] * len(excnames_tf))) + + list(zip(excpaths_tf, [path] * len(excpaths_tf)))) excress = [x(y) for x, y in excargs] return ( (not incress or sumf(incress)) and (not excress or not sumf(excress))) return selector def _build_path_selector_type (sels): sels_tf = [] if not sels: return sels_tf def tofunc (sel): if hasattr(sel, "search"): return lambda x: bool(sel.search(x)) - elif isinstance(sel, basestring): + elif isinstance(sel, str): sel_rx = re.compile(sel, re.U) return lambda x: bool(sel_rx.search(x)) elif callable(sel): return sel else: raise PologyError( _("@info", "Cannot convert object '%(obj)s' into a string matcher.", obj=sel)) - sels_tf = map(tofunc, sels) + sels_tf = list(map(tofunc, sels)) return sels_tf _dhead = ":" _dincname = "+" _dincpath = "/+" _dexcname = "-" _dexcpath = "/-" def collect_paths_from_file (fpath, cmnts=True, incexc=True, respathf=None, getsel=False, abort=False): """ Collect list of paths from the file. In general, non-empty lines in the file are taken to be paths, and empty lines are skipped. If C{cmnts} is C{True}, then also the lines starting with C{'#'} are skipped as comments. The C{respathf} parameter provides a function to be applied to each path and return a list of paths, which then substitute the original path. This function can be used, for example, to recursively collect files from listed directories, or to exclude paths by an external condition. If C{incexc} is C{True}, then the lines starting with C{':'} define directives by which files and directories are included or excluded from the final list. Inclusion-exclusion directives are mostly useful when some of the paths are directories, and C{respathf} parameter is used to provide a function to collect subpaths from listed directories; the inclusion-exclusion directives are applied to those subpaths too. 
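For example, a paths file may combine comments, paths, and the inclusion-exclusion directives described next (an illustrative sketch)::

    # UI catalogs, without drafts.
    po/ui
    po/extras.po
    :-draft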
The directives are as follows: - C{:-REGEX}: excludes path if its base name without extension matches the regular expression - C{:/-REGEX}: excludes path if it matches the regular expression - C{:+REGEX}: includes path only if its base name without extension matches the regular expression - C{:/+REGEX}: includes path only if it matches the regular expression The path is included if there are no inclusion directives, or it matches at least one inclusion directive; the path is excluded if it matches at least one exclusion directive. Inclusion-exclusion directives are given to L{build_path_selector} to create the path selection function (with C{ormatch} set to C{True}), which is then used to filter collected paths (after application of C{respathf}, if given). If C{getsel} is set to C{True}, the selection function is returned instead of being applied to read paths immediately. This is useful in case the C{respathf} parameter is not sufficient to resolve paths, but more complex processing is required (e.g. when subpaths are collected from directories externally, instead of with C{respathf}). If there were no inclusion-exclusion directives in the file, the resulting selection function will return C{True} for any path. @param fpath: the path to file which contains paths @type fpath: string @param cmnts: whether the file can contain comments @type cmnts: bool @param incexc: whether the file can contain inclusion-exclusion directives @type incexc: bool @param respathf: function to resolve collected paths @type respathf: (string)->[string...] @param getsel: whether to return constructed path selection function instead of applying it @type getsel: bool @param abort: whether to abort the execution on exceptions from path resolution or selection functions @type abort: bool @returns: collected paths, possibly with path selection function @rtype: [string...]
or ([string...], (string)->bool) """ if abort: def abort_or_raise (e): error(str_to_unicode(str(e))) else: def abort_or_raise (e): raise paths = [] incnames = [] incpaths = [] excnames = [] excpaths = [] lines = open(fpath).read().split("\n") lno = 0 for line in lines: lno += 1 if not line or (cmnts and line.startswith("#")): continue if incexc and line.startswith(_dhead): line = line[len(_dhead):] dstr = None for sels, shead in ( (incnames, _dincname), (incpaths, _dincpath), (excnames, _dexcname), (excpaths, _dexcpath), ): if line.startswith(shead): dstr = line[len(shead):] try: rx = re.compile(dstr, re.U) except: raise PologyError( _("@info", "Invalid regular expression in inclusion/" "exclusion directive at %(file)s:%(line)d.", file=fpath, line=lno)) sels.append(rx) break if dstr is None: raise PologyError( _("@info", "Unknown inclusion/exclusion directive " "at %(file)s:%(line)d.", file=fpath, line=lno)) else: paths.append(line) if respathf: try: - paths = sum(map(respathf, paths), []) - except Exception, e: + paths = sum(list(map(respathf, paths)), []) + except Exception as e: abort_or_raise(e) selectf = build_path_selector(incnames=incnames, incpaths=incpaths, excnames=excnames, excpaths=excpaths, ormatch=True) if getsel: return paths, selectf else: try: - paths = filter(selectf, paths) - except Exception, e: + paths = list(filter(selectf, paths)) + except Exception as e: abort_or_raise(e) return paths def collect_paths_cmdline (rawpaths=None, incnames=None, incpaths=None, excnames=None, excpaths=None, ormatch=False, filesfrom=None, cmnts=True, incexc=True, elsecwd=False, respathf=None, getsel=False, abort=False): """ Collect list of paths from usual sources given on command line. Scripts that process paths will in general get paths directly (as free command line arguments or on standard input), or indirectly from files containing lists of paths (usually given by a command line option). Sometimes input directory paths will be searched for paths of all files in them, possibly of certain type. Especially when searching directory paths, the script may take options to exclude or include only paths that match something. This function conveniently wraps up these possibilities, to fetch all possible paths in single statement. The C{rawpaths} parameter provides a list of directly supplied paths, e.g. from command line arguments. C{incnames}, C{incpaths}, C{excnames}, and C{excpaths} are lists of inclusion and exclusion conditions out of which single path selection function is constructed, with C{ormatch} determining how conditions are linked, see L{build_path_selector} for details. C{filesfrom} is a list of files containing lists of paths, C{cmnts} and C{incexc} are options for the file format, see L{collect_paths_from_file} for details. If both C{rawpaths} and C{filesfrom} are not given or empty, C{elsecwd} determines if current working directory is added to list of paths (C{True}) or not (C{False}). C{respathf} is a function which takes a path and returns list of paths, see description of the same parameter in L{collect_paths_from_file}. The order of path collection is as follows. First all paths from C{rawpaths} are added, applying C{respathf}. Then all paths from all files given by C{fromfiles} are added, by applying L{collect_paths_from_file} on each file (C{respathf} is applied by sending it to L{collect_paths_from_file}). If both C{rawpaths} and C{fromfiles} were C{None} or empty, current working directory is added, possibly applying C{respathf}. 
Finally, all paths are filtered through inclusion-exclusion tests; if no inclusion tests are given, then all files are included unless excluded by an exclusion test. If C{getsel} is set to C{True}, the path selection function is returned instead of being applied to collected paths. This function will also include path selection functions constructed from inclusion-exclusion directives found in C{filesfrom}, linked with the top conditions according to C{ormatch}. @param respathf: function to resolve collected paths @type respathf: (string)->[string...] @param getsel: whether to return constructed path selection function instead of applying it @type getsel: bool @param abort: whether to abort the execution on exceptions from path resolution or selection functions @type abort: bool @returns: collected paths, possibly with path selection function @rtype: [string...] or ([string...], (string)->bool) """ paths = [] if abort: def abort_or_raise (e): error(str_to_unicode(str(e))) else: def abort_or_raise (e): raise # First add paths given directly, then add paths read from files. if rawpaths: rawpaths2 = rawpaths if respathf: try: - rawpaths2 = sum(map(respathf, rawpaths), []) - except Exception, e: + rawpaths2 = sum(list(map(respathf, rawpaths)), []) + except Exception as e: abort_or_raise(e) paths.extend(rawpaths2) ffselfs = [] if filesfrom: for ffpath in filesfrom: res = collect_paths_from_file(ffpath, cmnts, incexc, respathf, getsel=getsel, abort=abort) if getsel: cpaths, cself = res paths.extend(cpaths) ffselfs.append(cself) else: paths.extend(res) # If neither direct paths nor files to read paths from were given, # add current working directory if requested. if elsecwd and not rawpaths and not filesfrom: cwd = getucwd() if respathf: try: paths.extend(respathf(cwd)) - except Exception, e: + except Exception as e: abort_or_raise(e) else: paths.append(cwd) selectf = build_path_selector(incnames=incnames, incpaths=incpaths, excnames=excnames, excpaths=excpaths, ormatch=ormatch) if ffselfs: if ormatch: selftot = lambda p: selectf(p) or any([x(p) for x in ffselfs]) else: selftot = lambda p: selectf(p) and all([x(p) for x in ffselfs]) else: selftot = selectf if getsel: return paths, selftot else: try: - paths = filter(selftot, paths) - except Exception, e: + paths = list(filter(selftot, paths)) + except Exception as e: abort_or_raise(e) return paths def getucwd (): """ Get path of current working directory as Unicode string. C{os.getcwd()} returns a raw byte sequence, to which the L{str_to_unicode} function is applied to make best guess at decoding it into a unicode string. @returns: path of current working directory @rtype: string """ rawcwd = os.getcwd() cwd = str_to_unicode(rawcwd) return cwd def exit_on_exception (func, cleanup=None): """ Gracefully exit a Pology script when an exception is received. Any error message will be printed, any progress lines will be cleared, and a keyboard interrupt will exit silently. The backtrace can be shown instead (on non-keyboard interrupt exceptions) by setting C{[global]/show-backtrace} user configuration field to true.
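For example, a script's entry point can be wrapped as follows (assuming a zero-argument C{main} function)::

    if __name__ == "__main__":
        exit_on_exception(main)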
@param func: a zero-argument function @type func: () -> any @param cleanup: a zero-argument function to execute before exiting @type cleanup: () -> any """ try: func() except KeyboardInterrupt: report("", newline=False) if cleanup: cleanup() exit(100) - except Exception, e: + except Exception as e: report("", newline=False) if cleanup: cleanup() if pology.config.section("global").boolean("show-backtrace"): raise else: error(str_to_unicode(str(e)), code=1) diff --git a/pology/getfunc.py b/pology/getfunc.py index b85694e3..63bb6f5e 100644 --- a/pology/getfunc.py +++ b/pology/getfunc.py @@ -1,345 +1,345 @@ # -*- coding: UTF-8 -*- """ Fetch Pology modules, functions, data, etc. by various handles. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import sys import os import re from pology import PologyError, _, n_ from pology.report import error, warning def get_module (modpath, lang=None, proj=None, abort=False, wpath=False): """ Import a Pology module. Module is specified by its dotted path in Pology's package structure relative to (optional) language and project. For example:: get_module("remove") will try to import the L{pology.remove} module, while:: get_module("wconv", lang="sr") will try to import the C{pology.lang.sr.wconv} module, and:: get_module("header", proj="kde") will try to import the C{pology.proj.kde.header} module. Elements of the relative path can also contain hyphens, which will be converted into underscores when looking for the module. If the module cannot be imported and C{abort} is C{True}, the execution will abort with an error message; otherwise an exception is raised. If C{wpath} is C{True}, the resolved module path is returned as well. @param modpath: relative module location @type modpath: string @param lang: language code @type lang: string @param proj: project code @type proj: string @param abort: whether to abort execution if the module cannot be imported @type abort: bool @param wpath: whether to also return the resolved module path @type wpath: bool @returns: imported module, possibly with its path @rtype: module or (module, string) """ modpath = modpath.replace("-", "_") if lang and proj: modpath = "pology.lang.%s.proj.%s.%s" % (lang, proj, modpath) elif lang: modpath = "pology.lang.%s.%s" % (lang, modpath) elif proj: modpath = "pology.proj.%s.%s" % (proj, modpath) else: modpath = "pology.%s" % (modpath) try: module = __import__(modpath, globals(), locals(), [""]) except ImportError: _raise_or_abort(_("@info", "Cannot import module '%(mod)s'.", mod=modpath), abort) # TODO: Make more detailed analysis why importing fails: # is there such a language or project, is there such a file, etc. return module if not wpath else (module, modpath) _valid_lang_rx = re.compile(r"^[a-z]{2,3}(_[A-Z]{2})?(@\w+)?$") _valid_proj_rx = re.compile(r"^[a-z_]+$") _valid_path_rx = re.compile(r"^([a-z][\w-]*(\.|$))+", re.I) _valid_item_rx = re.compile(r"^[a-z][\w-]+$", re.I) def split_ireq (ireq, abort=False): """ Split item request string into distinct elements. The item request is a string of the form C{[lang:][proj%]path[/item][~args]} (or C{[proj%][lang:]...}), which this function parses into C{(path, lang, proj, item, args)} tuple. If language, project, item or argument strings are not stated, their value in the tuple will be C{None}.
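For illustration (the module and item names here are hypothetical)::

    split_ireq("sr:foo/bar~'x', 2")
    # -> ("foo", "sr", None, "bar", "'x', 2")
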
The language should be a proper language code, the project an identifier-like string, the path a sequence of identifier-like strings connected by dots (though hyphens are accepted and taken as synonymous to underscores), the item an identifier-like string, and arguments can be an arbitrary string. If the item request cannot be parsed, either the execution is aborted with an error message, or an exception is raised, depending on the value of C{abort}. @param ireq: item request @type ireq: string @param abort: whether to abort execution if the request cannot be parsed @type abort: bool @returns: parsed request elements @rtype: (string, string or C{None}, string or C{None}, string or C{None}, string or C{None}) """ rest = ireq lst = rest.split("~", 1) if len(lst) == 1: rest, args = lst + [None] else: rest, args = lst lst = rest.split("/", 1) if len(lst) == 1: rest, item = lst + [None] else: rest, item = lst lang = None proj = None plang = rest.find(":") pproj = rest.find("%") if plang >= 0 and pproj >= 0: p1, p2 = min(plang, pproj), max(plang, pproj) c1, c2, rest = rest[:p1], rest[p1 + 1:p2], rest[p2 + 1:] if plang < pproj: lang, proj = c1, c2 else: lang, proj = c2, c1 elif plang >= 0: lang, rest = rest[:plang], rest[plang + 1:] elif pproj >= 0: proj, rest = rest[:pproj], rest[pproj + 1:] path = rest if not _valid_path_rx.search(path): _raise_or_abort(_("@info", "Invalid path '%(path)s' in item request '%(req)s'.", path=path, req=ireq), abort) if lang is not None and not _valid_lang_rx.search(lang): _raise_or_abort(_("@info", "Invalid language code '%(code)s' " "in item request '%(req)s'.", code=lang, req=ireq), abort) if proj is not None and not _valid_proj_rx.search(proj): _raise_or_abort(_("@info", "Invalid project code '%(code)s' " "in item request '%(req)s'.", code=proj, req=ireq), abort) if item is not None and not _valid_item_rx.search(item): _raise_or_abort(_("@info", "Invalid item '%(item)s' in item request '%(req)s'.", item=item, req=ireq), abort) path = path.replace("-", "_") if item: item = item.replace("-", "_") return path, lang, proj, item, args def get_hook (modpath, lang=None, proj=None, func=None, args=None, abort=False): """ Fetch a hook function. Loads a hook function from a module obtained by applying L{get_module} to C{modpath}, C{lang}, and C{proj} parameters. If C{func} is C{None}, the function name defaults to module name; if C{func} is not C{None}, but function of that name is not found, then the function named C{<modname>_<func>} is additionally tried (where C{<modname>} is the last element in C{modpath}). If C{args} is not C{None}, then the loaded function is considered a hook factory, and the hook is created by calling it with C{args} string as argument list (it should have no surrounding parentheses).
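For illustration, a hook factory could be instantiated as (a sketch; the module and function names are hypothetical)::

    hook = get_hook("foo", lang="sr", func="bar", args="'x', level=2")
    # roughly equivalent to: pology.lang.sr.foo.bar('x', level=2)
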
@param modpath: hook module @type modpath: string @param lang: language code @type lang: string @param proj: project code @type proj: string @param func: function name of hook or hook factory @type func: string @param args: argument string to hook factory @type args: string @param abort: whether to abort execution or raise exception if the hook cannot be loaded @type abort: bool @returns: the hook """ lmod, modpath = get_module(modpath, lang, proj, abort, wpath=True) modname = modpath.rsplit(".", 1)[-1] if func is None: func = modname func2 = "\0" else: func2 = "%s_%s" % (modname, func) call = getattr(lmod, func, None) or getattr(lmod, func2, None) if call is None: _raise_or_abort(_("@info", "Module '%(mod)s' does not define " "'%(func)s' function.", mod=modpath, func=func), abort) if args is not None: try: call = eval("call(%s)" % args) - except Exception, e: + except Exception as e: fspec = "%s/%s" % (modpath, func) _raise_or_abort(_("@info", "Cannot create hook by applying function " "'%(func)s' to argument list %(args)s; " "reported error:\n%(msg)s", func=fspec, args=repr(args), msg=e), abort) return call def get_hook_ireq (ireq, abort=False): """ Like L{get_hook}, but the hook is specified by L{item request}. For a module C{pology.FOO} which defines the C{FOO()} hook function, the hook specification is simply C{FOO}. If the hook function is named C{BAR()} instead of C{FOO()}, the hook specification is given as C{FOO/BAR}; if the hook function is named C{FOO_BAR()}, i.e. the specification would be C{FOO/FOO_BAR}, it can be folded to C{FOO/BAR}. Language-specific hooks (C{pology.lang.LANG.FOO}) are additionally preceded by the language code and colon, as C{LANG:FOO} or C{LANG:FOO/BAR}. Project-specific hooks (C{pology.proj.PROJ.FOO}) are additionally preceded by the project code and percent, as C{PROJ%FOO} or C{PROJ%FOO/BAR}. If the hook is both language- and project-specific, language and project qualifiers can both be added: C{LANG:PROJ%FOO} or C{LANG:PROJ%FOO/BAR}; ordering, C{LANG:PROJ%...} or C{PROJ%LANG:...}, is not significant. If the hook is not a plain hook, but a hook factory function, the factory arguments are supplied after the basic hook specification, separated by tilde: C{LANG:PROJ%FOO/BAR~ARGLIST} (where C{LANG:}, C{PROJ%} and C{/BAR} may be omitted under previously listed conditions). Argument list is formatted just like it would be passed in Python code to the factory function, omitting the surrounding parentheses. """ return _by_ireq(ireq, get_hook, abort=abort) def _by_ireq (ireq, getter, abort=False): """ Get item using C{getter(path, lang, proj, item, args, abort)} method, by applying it to parsed item request string. """ path, lang, proj, item, args = split_ireq(ireq, abort) return getter(path, lang, proj, item, args, abort) def _raise_or_abort (errmsg, abort, exc=PologyError): """ Raise an exception or abort execution with given error message, based on the value of C{abort}. """ if abort: error(errmsg) else: raise exc(errmsg) def get_result (modpath, lang=None, proj=None, func=None, args="", abort=False): """ Fetch the result of a function evaluation. Executes function from the module loaded by applying L{get_module} to C{modpath}, C{lang}, and C{proj} parameters. If C{func} is not given, the function name defaults to module name. C{args} is the string representing the argument list to the function call (without surrounding parentheses).
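For illustration (a sketch with hypothetical names)::

    val = get_result("foo", proj="kde", func="bar", args="'x', 2")
    # imports pology.proj.kde.foo and evaluates bar('x', 2)
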
@param modpath: function module @type modpath: string @param lang: language code @type lang: string @param proj: project code @type proj: string @param func: function name within the module @type func: string @param args: argument string to function call @type args: string @param abort: whether to abort execution or raise an exception if the function is not found @type abort: bool @returns: the value returned by the function call """ fmod, modpath = get_module(modpath, lang, proj, abort, wpath=True) modname = modpath.rsplit(".", 1)[-1] if func is None: func = modname call = getattr(fmod, func, None) if call is None: _raise_or_abort(_("@info", "Module '%(mod)s' does not define " "function '%(func)s'.", mod=modpath, func=func), abort) try: res = eval("call(%s)" % args) - except Exception, e: + except Exception as e: fspec = "%s/%s" % (modpath, func) _raise_or_abort(_("@info", "Evaluating function '%(func)s' " "with argument list %(args)s failed; " "reported error:\n%(msg)s", func=fspec, args=repr(args), msg=e), abort) return res def get_result_ireq (ireq, abort=False): """ Like L{get_result}, but the function is specified by L{item request}. """ return _by_ireq(ireq, get_result, abort=abort) diff --git a/pology/gtxtools.py b/pology/gtxtools.py index f87dea33..3fc29096 100644 --- a/pology/gtxtools.py +++ b/pology/gtxtools.py @@ -1,110 +1,110 @@ # -*- coding: UTF-8 -*- """ Wrappers for commands from Gettext tools. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import os import subprocess from pology import _, n_ from pology.report import warning from pology.fsops import unicode_to_str def msgfilter (filtr, options=[]): """ Pass PO file through C{msgfilter(1)} [hook factory]. Wrappers modify PO files in place; the executed command is:: msgfilter <options> -i <filepath> -o <filepath> <filtr> where C{options} parameter may be used to pass any extra options to C{msgfilter}. Both C{filtr} and C{options} are lists of command line arguments rather than monolithic strings, to avoid shell quoting problems. For example, to rewrap the PO file at 70 columns:: msgfilter(["cat"], ["-w", "70"]) or to replace every C{foo} with C{bar} in translation:: msgfilter(["sed", "s/foo/bar/g"]) @param filtr: filter to use @type filtr: [string*] @param options: additional options to pass to C{msgfilter} @type options: [string*] @return: type F6A hook @rtype: C{(filepath) -> numerr} @note: In case C{msgfilter} does not finish without errors, wrapper always reports number of errors as 1. """ # FIXME: Check availability and version of msgfilter. base_cmdargs = ["msgfilter"] + options def wrapper (filepath): cmdargs = base_cmdargs + ["-i", filepath, "-o", filepath] + filtr - cmdargs = map(unicode_to_str, cmdargs) + cmdargs = list(map(unicode_to_str, cmdargs)) ret = subprocess.call(cmdargs) if ret: warning(_("@info", "%(file)s: %(cmd)s failed with exit code %(num)d " "(filter: '%(filter)s', options: '%(options)s')", file=filepath, cmd="msgfilter", num=ret, filter=filtr, options=options)) return 1 return 0 return wrapper def msgfmt (options=[]): """ Pass PO file through C{msgfmt(1)} [hook factory]. The file is not modified; the executed command is:: msgfmt <options> -o /dev/null <filepath> where C{options} parameter may be used to pass any extra options to C{msgfmt}. C{options} is a list of command line arguments rather than a monolithic string, to avoid shell quoting problems.
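For example, to validate catalogs with C{msgfmt}'s additional checks (a sketch; C{--check} is a standard C{msgfmt} option)::

    check = msgfmt(["--check"])
    nerr = check("alpha.po")   # 0 if the file passed, 1 otherwise
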
@param options: additional options to pass to C{msgfmt} @type options: [string*] @return: type S6A hook @rtype: C{(filepath) -> numerr} @note: In case C{msgfmt} does not finish without errors, wrapper always reports number of errors as 1. """ # FIXME: Check availability and version of msgfmt. base_cmdargs = ["msgfmt"] + options + ["-o", "/dev/null"] def wrapper (filepath): cmdargs = base_cmdargs + [filepath] - cmdargs = map(unicode_to_str, cmdargs) + cmdargs = list(map(unicode_to_str, cmdargs)) ret = subprocess.call(cmdargs) if ret: warning(_("@info", "%(file)s: %(cmd)s failed with exit code %(num)d " "(options: '%(options)s')", file=filepath, cmd="msgfmt", num=ret, options=options)) return 1 return 0 return wrapper diff --git a/pology/header.py b/pology/header.py index 47f7bb6d..8ee4687d 100644 --- a/pology/header.py +++ b/pology/header.py @@ -1,619 +1,619 @@ # -*- coding: UTF-8 -*- """ Header entry in PO catalogs. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ from pology import PologyError from pology.wrap import wrap_field from pology.monitored import Monitored, Monlist, Monpair -from message import Message +from .message import Message import datetime import time import re _Header_spec = { "title" : {"type" : Monlist, - "spec" : {"*" : {"type" : unicode}}}, - "copyright" : {"type" : (unicode, type(None))}, - "license" : {"type" : (unicode, type(None))}, + "spec" : {"*" : {"type" : str}}}, + "copyright" : {"type" : (str, type(None))}, + "license" : {"type" : (str, type(None))}, "author" : {"type" : Monlist, - "spec" : {"*" : {"type" : unicode}}}, + "spec" : {"*" : {"type" : str}}}, "comment" : {"type" : Monlist, - "spec" : {"*" : {"type" : unicode}}}, + "spec" : {"*" : {"type" : str}}}, "field" : {"type" : Monlist, "spec" : {"*" : {"type" : Monpair, - "spec" : {"first" : {"type" : unicode}, - "second" : {"type" : unicode}}}}}, + "spec" : {"first" : {"type" : str}, + "second" : {"type" : str}}}}}, "initialized" : {"type" : bool, "derived" : True}, # Dummies for summary iteration in catalog: "obsolete" : {"type" : bool, "derived" : True}, "key" : {"type" : bool, "derived" : True}, } class Header (Monitored): """ Header entry in PO catalogs. The PO header is syntactically just another entry in the catalog, but with different semantics. Therefore, instead of operating on it using L{Message}, this class provides a different set of interface attributes and methods. Like L{Message}, this class implements monitoring; the starred-types (e.g. C{list*}) are according to the same convention as for messages, and also the strings are assumed unicode unless otherwise noted. There is no lightweight alternative to the monitored header, like that of L{MessageUnsafe} for messages, because no performance demand is expected for the headers only. @ivar title: comment lines giving the title @type title: list* of strings @ivar copyright: comment line with the copyright statement @type copyright: string @ivar license: comment line with the license statement @type license: string @ivar author: comment lines stating translators who worked on this catalog @type author: list* of strings @ivar comment: the free comment lines, being none of the specific ones @type comment: list* of strings @ivar field: parsed header fields as key-value string pairs @type field: list* of pairs* @ivar initialized: (read-only) whether the header is fully initialized @type initialized: bool @see: L{Message} """ def __init__ (self, init=None): """ Initializes the header by the given message or header.
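For illustration (a sketch)::

    hdr = Header()      # template header with default fields, reported fuzzy
    hdr2 = Header(hdr)  # a deep copy of another header
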
@param init: the PO entry containing the header, or another header @type init: subclass of L{Message_base}, or L{Header} """ if isinstance(init, Header): # copy header fields hdr = init self._title = Monlist(hdr._title) self._copyright = hdr._copyright self._license = hdr._license self._author = Monlist(hdr._author) self._comment = Monlist(hdr._comment) - self._field = Monlist(map(Monpair, hdr._field)) + self._field = Monlist(list(map(Monpair, hdr._field))) # Create the message. self._message = hdr.to_msg() elif init: # parse header message msg = init # Comments. self._title = Monlist() - self._copyright = u"" - self._license = u"" + self._copyright = "" + self._license = "" self._author = Monlist() self._comment = Monlist() intitle = True for c in msg.manual_comment: if 0: pass elif ( not self._copyright - and re.search(ur"copyright|\(C\)|©", c, re.I|re.U) + and re.search(r"copyright|\(C\)|©", c, re.I|re.U) ): self._copyright = c intitle = False elif ( not self._license and ( re.search("license", c, re.I) and not re.search("^translation *of.* to", c, re.I)) ): self._license = c intitle = False elif re.search("<.*@.*>", c): self._author.append(c) intitle = False elif intitle: self._title.append(c) else: self._comment.append(c) # Header fields. self._field = Monlist() for field in msg.msgstr[0].split("\n"): m = re.match(r"(.*?): ?(.*)", field) if m: self._field.append(Monpair(m.groups())) # Copy the message. self._message = Message(msg) else: # create default fields - self._title = Monlist([u"SOME DESCRIPTIVE TITLE."]); - self._copyright = u"Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER" - self._license = u"This file is distributed under the same license as the PACKAGE package." - self._author = Monlist([u"FIRST AUTHOR <EMAIL@ADDRESS>, YEAR."]) - self._comment = Monlist([u""]) + self._title = Monlist(["SOME DESCRIPTIVE TITLE."]); + self._copyright = "Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER" + self._license = "This file is distributed under the same license as the PACKAGE package." + self._author = Monlist(["FIRST AUTHOR <EMAIL@ADDRESS>, YEAR."]) + self._comment = Monlist([""]) self._field = Monlist([ - Monpair((u"Project-Id-Version", u"PACKAGE VERSION")), - Monpair((u"Report-Msgid-Bugs-To", u"")), - Monpair((u"POT-Creation-Date", format_datetime())), - Monpair((u"PO-Revision-Date", u"YEAR-MO-DA HO:MI+ZONE")), - Monpair((u"Last-Translator", u"FULL NAME <EMAIL@ADDRESS>")), - Monpair((u"Language-Team", u"LANGUAGE <LL@li.org>")), - Monpair((u"Language", u"")), - Monpair((u"MIME-Version", u"1.0")), - Monpair((u"Content-Type", u"text/plain; charset=CHARSET")), - Monpair((u"Content-Transfer-Encoding", u"8bit")), - Monpair((u"Plural-Forms", u"nplurals=INTEGER; plural=EXPRESSION;")), + Monpair(("Project-Id-Version", "PACKAGE VERSION")), + Monpair(("Report-Msgid-Bugs-To", "")), + Monpair(("POT-Creation-Date", format_datetime())), + Monpair(("PO-Revision-Date", "YEAR-MO-DA HO:MI+ZONE")), + Monpair(("Last-Translator", "FULL NAME <EMAIL@ADDRESS>")), + Monpair(("Language-Team", "LANGUAGE <LL@li.org>")), + Monpair(("Language", "")), + Monpair(("MIME-Version", "1.0")), + Monpair(("Content-Type", "text/plain; charset=CHARSET")), + Monpair(("Content-Transfer-Encoding", "8bit")), + Monpair(("Plural-Forms", "nplurals=INTEGER; plural=EXPRESSION;")), ]) # Create the message. self._message = Message() self._remake_msg(force=True) self.assert_spec_init(_Header_spec) # Unmodify all monitored members. self.modcount = 0 def __getattr__ (self, att): """ Attribute getter. Processes read-only attributes, and sends others to the base class.
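For example, the derived C{initialized} attribute can be used to detect template-like headers (a sketch; C{cat} is an open catalog object)::

    if not cat.header.initialized:
        # header still contains template placeholders
        ...
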
@param att: name of the attribute to get @returns: attribute value """ if att == "obsolete": return False elif att == "key": return Message().key # key of an empty-msgid message elif att == "initialized": # Check if all necessary fields have been initialized. gfv = self.get_field_value return not (False or "PACKAGE VERSION" in gfv("Project-Id-Version", "") or "YEAR-MO-DA" in gfv("PO-Revision-Date", "") or "FULL NAME" in gfv("Last-Translator", "") or "LANGUAGE" in gfv("Language-Team", "") or "CHARSET" in gfv("Content-Type", "") or "ENCODING" in gfv("Content-Transfer-Encoding", "") or "INTEGER" in gfv("Plural-Forms", "") or "EXPRESSION" in gfv("Plural-Forms", "") ) else: return Monitored.__getattr__(self, att) def get (self, att, default=None): """ Get attribute value. Allows accessing the header like a dictionary. @param att: name of the attribute to get @type att: string @param default: value to return if the attribute does not exist @returns: value of the attribute or the default value """ if hasattr(self, att): return getattr(self, att) else: return default def _remake_msg (self, force=False): m = self._message if (force or self.title_modcount or self.title.modcount or self.copyright_modcount or self.license_modcount or self.author_modcount or self.author.modcount or self.comment_modcount or self.comment.modcount ): m.manual_comment = Monlist() for t in self.title: m.manual_comment.append(t) if self.copyright: m.manual_comment.append(self.copyright) if self.license: m.manual_comment.append(self.license) for a in self.author: m.manual_comment.append(a) for c in self.comment: m.manual_comment.append(c) if force or self.field_modcount or self.field.modcount: - m.msgstr = Monlist([u""]) + m.msgstr = Monlist([""]) for field in self.field: m.msgstr[0] += "%s: %s\n" % tuple(field) if force or self.modcount: m.fuzzy = not self.initialized def __eq__ (self, ohdr): """ Reports whether headers are equal in all apparent parts. "Apparent" parts include all those which are visible in the PO file. I.e. the check will ignore internal states, like line caches, etc. @returns: C{True} if headers are equal in apparent parts @rtype: bool """ return self.to_msg() == ohdr.to_msg() def __ne__ (self, ohdr): """ Reports whether headers are not equal in some apparent parts. Equivalent to C{not (self == ohdr)}. @returns: C{False} if headers are equal in all apparent parts @rtype: bool """ return not self.__eq__(ohdr) def to_msg (self, force=False): """ Convert the header into ordinary message object. The message object returned may be the modification of the one passed to the constructor. In that case, and if the message object has monitoring features, the force parameter will tell whether to modify all message elements, or to try to keep the changes minimal. @param force: whether to recreate all message elements @type force: bool @returns: header as message @rtype: the type that initialized the object """ self._remake_msg(force) return self._message def to_lines (self, wrapf=wrap_field, force=False, colorize=0): """ The line-representation of the header. Equivalent to the same-named method of message classes. @see: L{Message_base} """ return self.to_msg(force).to_lines(wrapf, force, colorize) def to_string (self, wrapf=wrap_field, force=False, colorize=0): """ The string-representation of the header. Equivalent to the same-named method of message classes. @see: L{Message_base} """ return self.to_msg(force).to_string(wrapf, force, colorize) def select_fields (self, name): """ Find header fields with the given name.
Header fields need not be unique. @param name: look for the fields with this name @type name: string @returns: references to name-value pairs matching the field name @rtype: list of pairs* """ fields = [] for pair in self.field: if pair.first == name: fields.append(pair) return fields def get_field_value (self, name, default=None): """ Get the value of the given header field. If there are several fields with the same name, it is undefined which of them will supply the value; this method should be used only for fields which are expected to be unique. If there are no fields named as requested, C{default} is returned. @param name: field name @type name: string @param default: value returned if there is no such field @type default: as given @returns: field value @rtype: string or C{default} """ for pair in self.field: if pair.first == name: return pair.second return default def replace_field_value (self, name, new_value, nth=0): """ Replace the value of the n-th occurrence of the named header field. Header fields need not be unique, hence the n-th qualification. @param name: name of the header field @type name: string @param new_value: new value for the field @type new_value: string @param nth: replace the value of this field among same-named fields @type nth: int @returns: True if the requested field was found, False otherwise @rtype: bool """ nfound = 0 for i in range(len(self._field)): if self.field[i][0] == name: nfound += 1 if nfound - 1 == nth: - self.field[i] = Monpair((unicode(name), new_value)) + self.field[i] = Monpair((str(name), new_value)) break return nfound - 1 == nth def set_field (self, name, value, after=None, before=None, reorder=False): """ Set a header field to a value. If the field already exists, its value is replaced with the given one. If there are several same-named fields, it is undefined which one and how many of them are going to have their values replaced; this method should be used only for fields expected to be unique. If there is no such field yet, it is inserted into the header; after the field C{after} or before the field C{before} if given and existing, or appended to the end otherwise. If the field already exists, but not in the position according to C{after} or C{before}, reordering can be requested too. @param name: name of the header field @type name: string @param value: new value for the field @type value: string @param after: the field to insert after @type after: string @param before: the field to insert before @type before: string @param reorder: whether to move an existing field into better position @type reorder: bool @returns: position where the field was modified or inserted @rtype: int """ ins_pos = -1 rpl_pos = -1 for i in range(len(self._field)): if self.field[i][0] == name: rpl_pos = i if not reorder: break if ( (after and i > 0 and self.field[i - 1][0] == after) or (before and self.field[i][0] == before) ): ins_pos = i # Do not break, must try all fields for value replacement. if reorder and ins_pos >= 0 and rpl_pos >= 0 and ins_pos != rpl_pos: self._field.pop(rpl_pos) if ins_pos > rpl_pos: ins_pos -= 1 rpl_pos = -1 pair = Monpair((name, value)) if rpl_pos >= 0: self._field[rpl_pos] = pair pos = rpl_pos elif ins_pos >= 0: self._field.insert(ins_pos, pair) pos = ins_pos else: self._field.append(pair) pos = len(self._field) return pos def remove_field (self, name): """ Remove header fields with the given name, if any exist.
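For illustration (a sketch; the C{X-Generator} field is just an example)::

    hdr.set_field("X-Generator", "Pology", after="Plural-Forms")
    hdr.remove_field("X-Generator")  # -> 1, one field removed
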
@param name: remove fields with this name @type name: string @return: number of removed fields @rtype: int """ i = 0 nrem = 0 while i < len(self.field): if self.field[i][0] == name: self.field.pop(i) nrem += 1 else: i += 1 return nrem _dt_fmt = "%Y-%m-%d %H:%M:%S%z" _dt_fmt_nosec = "%Y-%m-%d %H:%M%z" def format_datetime (dt=None, wsec=False): """ Format datetime as found in PO header fields. If a particular datetime object C{dt} is not given, current datetime is used instead. If C{wsec} is C{False}, the formatted string will not contain the seconds component, which is usual for PO header datetimes. If seconds accuracy is desired, C{wsec} can be set to C{True}. @param dt: datetime @type dt: datetime.datetime @param wsec: whether to add seconds component @type wsec: bool @return: formatted datetime @rtype: string """ if dt is not None: if wsec: dtstr = dt.strftime(_dt_fmt) else: dtstr = dt.strftime(_dt_fmt_nosec) # If timezone is not present, assume UTC. if dt.tzinfo is None: dtstr += "+0000" else: if wsec: dtstr = time.strftime(_dt_fmt) else: dtstr = time.strftime(_dt_fmt_nosec) - return unicode(dtstr) + return str(dtstr) _parse_date_rxs = [re.compile(x) for x in ( r"^ *(\d+)-(\d+)-(\d+) *(\d+):(\d+):(\d+) *([+-]\d+) *$", r"^ *(\d+)-(\d+)-(\d+) *(\d+):(\d+)() *([+-]\d+) *$", # ...needs empty group to differentiate from the next case. r"^ *(\d+)-(\d+)-(\d+) *(\d+):(\d+):(\d+) *$", r"^ *(\d+)-(\d+)-(\d+) *(\d+):(\d+) *$", r"^ *(\d+)-(\d+)-(\d+) *$", r"^ *(\d+)-(\d+) *$", r"^ *(\d+) *$", )] def parse_datetime (dstr): """ Parse formatted datetime from a PO header field into a datetime object. The formatted datetime may also have a seconds component, which is typically not present in PO headers. It may also lack a contiguous number of components from the back, e.g. having no time zone offset, or no time at all. @param dstr: formatted datetime @type dstr: string @return: datetime object @rtype: datetime.datetime """ for parse_date_rx in _parse_date_rxs: m = parse_date_rx.search(dstr) if m: break if not m: raise PologyError(_("@info", "Cannot parse datetime string '%(str)s'.", str=dstr)) pgroups = list([int(x or 0) for x in m.groups()]) pgroups.extend([1] * (3 - len(pgroups))) pgroups.extend([0] * (7 - len(pgroups))) year, month, day, hour, minute, second, off = pgroups offhr = off // 100 offmin = off % 100 dt = datetime.datetime(year=year, month=month, day=day, hour=hour, minute=minute, second=second, tzinfo=TZInfo(hours=offhr, minutes=offmin)) return dt class TZInfo (datetime.tzinfo): """ A simple derived time zone info for use in datetime objects. """ def __init__ (self, hours=None, minutes=None): """ Create a time zone with given offset in hours and minutes. The offset given by C{minutes} is added to that given by C{hours}, e.g. C{hours=2} and C{minutes=30} means two and a half hours offset. If C{minutes} is given but C{hours} is not, C{hours} is considered zero. If neither C{hours} nor C{minutes} are given, the offset is read from system time zone. 
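For illustration (a sketch)::

    tz = TZInfo(hours=2, minutes=30)  # fixed +0230 offset
    tzsys = TZInfo()                  # offset taken from the system time zone
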
@param hours: the time zone offset in hours @type hours: int @param minutes: additional offset in minutes @type minutes: int """ self._isdst = time.localtime()[-1] if hours is None and minutes is None: tzoff_sec = -(time.altzone if self._isdst else time.timezone) tzoff_hr = tzoff_sec // 3600 tzoff_min = (tzoff_sec - tzoff_hr * 3600) // 60 else: tzoff_hr = hours or 0 tzoff_min = minutes or 0 self._dst = datetime.timedelta(0) self._utcoffset = datetime.timedelta(hours=tzoff_hr, minutes=tzoff_min) def utcoffset (self, dt): return self._utcoffset def dst (self, dt): return self._dst def tzname (self, dt): return time.tzname[self._isdst] diff --git a/pology/internal/poediffpatch.py b/pology/internal/poediffpatch.py index 2e8854b3..1a31a70d 100644 --- a/pology/internal/poediffpatch.py +++ b/pology/internal/poediffpatch.py @@ -1,508 +1,508 @@ # -*- coding: UTF-8 -*- """ Common functionality for poediff and poepatch scripts. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 @warning: Non-public module. """ import os import time from pology import PologyError, _ from pology.catalog import Catalog import pology.config as pology_config from pology.diff import msg_ediff, tdiff from pology.merge import merge_pofile from pology.message import MessageUnsafe def _raise_no_inst (clssname): raise PologyError( _("@info", "Class '%(clss)s' only provides static attributes, " "objects of this type cannot be constructed.", clss=clssname)) # FIXME: Define message part categories in message module. # Message part categories. class MPC: curr_fields = [ "msgctxt", "msgid", "msgid_plural", ] prev_fields = [x + "_previous" for x in curr_fields] - currprev_fields = zip(curr_fields, prev_fields) - prevcurr_fields = zip(prev_fields, curr_fields) + currprev_fields = list(zip(curr_fields, prev_fields)) + prevcurr_fields = list(zip(prev_fields, curr_fields)) def __init__ (self): _raise_no_inst(self.__class__.__name__) # Syntax tokens in embedded diff catalogs. class EDST: - hmsgctxt_field = u"X-Ediff-Header-Context" # by spec - hmsgctxt_el = u"~" # by spec - filerev_sep = u" <<< " # by spec + hmsgctxt_field = "X-Ediff-Header-Context" # by spec + hmsgctxt_el = "~" # by spec + filerev_sep = " <<< " # by spec def __init__ (self): _raise_no_inst(self.__class__.__name__) def msg_eq_fields (m1, m2, fields): if (m1 is None) != (m2 is None): return False elif m1 is None and m2 is None: return True for field in fields: if not isinstance(field, tuple): field = (field, field) if m1.get(field[0]) != m2.get(field[1]): return False return True def msg_copy_fields (m1, m2, fields): if m1 is None: m1 = MessageUnsafe() for field in fields: if not isinstance(field, tuple): field = (field, field) setattr(m2, field[1], m1.get(field[0])) def msg_clear_prev_fields (m): for field in MPC.prev_fields: setattr(m, field, None) # Remove previous fields if inconsistent with the message in total. def msg_cleanup (msg): # Non-fuzzy messages should have no previous fields. # msgid_previous must be present, or there must be no previous fields. if not msg.fuzzy or msg.msgid_previous is None: for field in MPC.prev_fields: if msg.get(field) is not None: setattr(msg, field, None) def diff_cats (cat1, cat2, ecat, merge=True, colorize=False, wrem=True, wadd=True, noobs=False, upprogf=None): upprogf = upprogf or (lambda: None) dpairs = _pair_msgs(cat1, cat2, merge, wrem, wadd, noobs, upprogf) # Order pairings such that they follow order of messages in # the new catalog wherever the new message exists. 
# For unpaired old messages, do heuristic analysis of any # renamings of source files, and then insert diffed messages # according to source references of old messages. dpairs_by2 = [x for x in dpairs if x[1]] dpairs_by2.sort(key=lambda x: x[1].refentry) dpairs_by1 = [x for x in dpairs if not x[1]] fnsyn = None if dpairs_by1: fnsyn = cat2.detect_renamed_sources(cat1) # Make the diffs. # Must not add diffed messages directly to global ediff catalog, # because then heuristic insertion would throw them all over. # Instead add to local ediff catalog, then copy in order to global. ndiffed = 0 lecat = Catalog("", create=True, monitored=False) for cdpairs, cfnsyn in ((dpairs_by2, None), (dpairs_by1, fnsyn)): for msg1, msg2 in cdpairs: upprogf() ndiffed += _add_msg_diff(msg1, msg2, lecat, colorize, cfnsyn) for emsg in lecat: ecat.add(emsg, len(ecat)) return ndiffed def cats_update_effort (cat1, cat2, upprogf=None): upprogf = upprogf or (lambda: None) dpairs = _pair_msgs(cat1, cat2, merge=True, wrem=False, wadd=True, noobs=False, upprogf=upprogf) nntw_total = 0 for msg1, msg2 in dpairs: upprogf() if not msg2.active: continue if msg1 is None: msg1 = MessageUnsafe() # The update effort of the given old-new message pair is equal # to "nominal number of newly translated words" (NNTW), # which is defined as follows: # - nominal length of a word in msgid is set to 6 characters (WL). # - number of characters in new msgid is divided by WL # to give nominal number of words in new msgid (NWO) # - number of equal characters in old and new msgid is divided by WL # to give nominal number of equal words in msgid (NEWO) # - number of characters in new msgstr is divided by number of # characters in new msgid to give translation expansion factor (EF) # - number of equal characters in old and new msgstr is divided # by WL*EF to give nominal number of equal words in msgstr (NEWT) # - character-based similarity ratio of old and new msgid # (from 0.0 for no similarity to 1.0 for equality) is computed (SRO) # - character-based similarity ratio of old and new msgstr # is computed (SRT) # - similarity ratio threshold is set to 0.5 (SRB) # - reduction due to similarity factor is computed as # RSF = (min(SRO, SRT) - SRB) / (1 - SRB) # - nominal number of newly translated words is computed as, # clamped to be non-negative, # NNTW = max(min(NWO - max(NEWO, NEWT) * RSF, NWO), 0) # # Only those pairs where the new message is active are counted in. # # On plural messages, for the moment only msgid and msgstr[0] # are considered, and the above procedure applied to them. # This underestimates the effort of updating a new plural message # when old message was ordinary. wl = 6.0 nwo = len(msg2.msgid) / wl diffo, dro = tdiff(msg1.msgid, msg2.msgid, diffr=True) newo = len([c for t, c in diffo if t == " "]) / wl ef = float(len(msg2.msgstr[0])) / len(msg2.msgid) difft, drt = tdiff(msg1.msgstr[0], msg2.msgstr[0], diffr=True) newt = len([c for t, c in difft if t == " "]) / (wl * ef) sro = 1.0 - dro srt = 1.0 - drt srb = 0.5 rsf = (min(sro, srt) - srb) / (1.0 - srb) nntw = max(min(nwo - max(newo, newt) * rsf, nwo), 0.0) nntw_total += nntw return nntw_total def _calc_text_update_effort (text1, text2): dr1 = 0.5 ediff, dr = word_ediff(text1, text2, markup=True, diffr=True) def _pair_msgs (cat1, cat2, merge=True, wrem=True, wadd=True, noobs=False, upprogf=None): upprogf = upprogf or (lambda: None) # Remove obsolete messages if they are not to be diffed. if noobs: for cat in (cat1, cat2): _rmobs_no_sync(cat) # Clean up inconsistencies in messages.
for cat in (cat1, cat2): for msg in cat: upprogf() msg_cleanup(msg) # Delay inverting of catalogs until necessary. def icat_w (cat, icat_pack): if icat_pack[0] is None: - #print "===> inverting: %s" % cat.filename + #print("===> inverting: %s" % cat.filename) icat = Catalog("", create=True, monitored=False) for msg in cat: upprogf() imsg = _msg_invert_cp(msg) if imsg not in icat: icat.add_last(imsg) icat_pack[0] = icat return icat_pack[0] icat1_pack = [None] icat1 = lambda: icat_w(cat1, icat1_pack) icat2_pack = [None] icat2 = lambda: icat_w(cat2, icat2_pack) # Delay merging of catalogs until necessary. def mcat_w (cat1, cat2, mcat_pack): if mcat_pack[0] is None: - #print "===> merging: %s -> %s" % (cat1.filename, cat2.filename) + #print("===> merging: %s -> %s" % (cat1.filename, cat2.filename)) # Merge is done if requested and both catalogs exist. if merge and not cat1.created() and not cat2.created(): mcat_pack[0] = merge_pofile(cat1.filename, cat2.filename, getcat=True, monitored=False, quiet=True, abort=True) if noobs: _rmobs_no_sync(mcat_pack[0]) else: mcat_pack[0] = {} # only tested for membership return mcat_pack[0] mcat12_pack = [None] mcat12 = lambda: mcat_w(cat1, cat2, mcat12_pack) mcat21_pack = [None] mcat21 = lambda: mcat_w(cat2, cat1, mcat21_pack) # Pair messages: # - first try to find an old message for each new # - then try to find a new message for each unpaired old # - finally add remaining unpaired messages to be diffed with None msgs1_paired = set() msgs2_paired = set() dpairs = [] for msg2 in cat2: upprogf() msg1 = _get_msg_pair(msg2, cat1, icat1, mcat12) if msg1 and msg1 not in msgs1_paired: # Record pairing. msgs1_paired.add(msg1) msgs2_paired.add(msg2) dpairs.append((msg1, msg2)) for msg1 in cat1: upprogf() if msg1 in msgs1_paired: continue msg2 = _get_msg_pair(msg1, cat2, icat2, mcat21) if msg2 and msg2 not in msgs2_paired: # Record pairing. msgs1_paired.add(msg1) msgs2_paired.add(msg2) dpairs.append((msg1, msg2)) for msg2 in (wadd and cat2 or []): upprogf() if msg2 not in msgs2_paired: dpairs.append((None, msg2)) for msg1 in (wrem and cat1 or []): upprogf() if msg1 not in msgs1_paired: dpairs.append((msg1, None)) return dpairs def _rmobs_no_sync (cat): for msg in cat: if msg.obsolete: cat.remove_on_sync(msg) cat.sync_map() # Determine the pair of the message in the catalog, if any. def _get_msg_pair (msg, ocat, icat, mcat): # If no direct match, try pivoting around any previous fields. # Iterate through test catalogs in this order, # to delay construction of those which are not necessary. for tcat in (ocat, icat, mcat): if callable(tcat): tcat = tcat() omsg = tcat.get(msg) if not omsg and msg.fuzzy: omsg = tcat.get(_msg_invert_cp(msg)) if tcat is not ocat: # tcat is one of pivot catalogs omsg = ocat.get(_msg_invert_cp(omsg)) if omsg: break return omsg # Out of a message with previous fields, # construct a lightweight message with previous and current fields exchanged. # If there are no previous fields, return None. # To be used only for lookups def _msg_invert_cp (msg): if msg is None: return None lmsg = MessageUnsafe() if msg.key_previous is not None: # Need to invert only key fields, but whadda hell. for fcurr, fprev in MPC.currprev_fields: setattr(lmsg, fcurr, msg.get(fprev)) setattr(lmsg, fprev, msg.get(fcurr)) else: return lmsg.set_key(msg) return lmsg def _add_msg_diff (msg1, msg2, ecat, colorize, fnsyn=None): # Skip diffing if old and new messages are "same". if msg1 and msg2 and msg1.inv == msg2.inv: return 0 # Create messages for special pairings. 
msg1_s, msg2_s = _create_special_diff_pair(msg1, msg2) # Create the diff. tmsg = msg2 or msg1 emsg = msg2_s or msg1_s if emsg is tmsg: emsg = MessageUnsafe(tmsg) emsg = msg_ediff(msg1_s, msg2_s, emsg=emsg, ecat=ecat, colorize=colorize) # Add to the diff catalog. if fnsyn is None: ecat.add(emsg, len(ecat)) else: ecat.add(emsg, srefsyn=fnsyn) return 1 def _create_special_diff_pair (msg1, msg2): msg1_s, msg2_s = msg1, msg2 if not msg1 or not msg2: # No special cases if either message non-existant. pass # Cases f-nf-*. elif msg1.fuzzy and msg1.key_previous is not None and not msg2.fuzzy: # Case f-nf-ecc. if msg_eq_fields(msg1, msg2, MPC.curr_fields): msg1_s = MessageUnsafe(msg1) msg_copy_fields(msg1, msg1_s, MPC.prevcurr_fields) msg_clear_prev_fields(msg1_s) # Case f-nf-necc. else: msg1_s = MessageUnsafe(msg1) msg2_s = MessageUnsafe(msg2) msg_copy_fields(msg1, msg1_s, MPC.prevcurr_fields) msg_copy_fields(msg1, msg2_s, MPC.currprev_fields) # Cases nf-f-*. elif not msg1.fuzzy and msg2.fuzzy and msg2.key_previous is not None: # Case nf-f-ecp. if msg_eq_fields(msg1, msg2, MPC.currprev_fields): msg2_s = MessageUnsafe(msg2) msg_clear_prev_fields(msg2_s) # Case nf-f-necp. else: msg1_s = MessageUnsafe(msg1) msg2_s = MessageUnsafe(msg2) msg_copy_fields(msg2, msg1_s, MPC.prev_fields) msg_copy_fields(msg2, msg2_s, MPC.currprev_fields) return msg1_s, msg2_s def diff_hdrs (hdr1, hdr2, vpath1, vpath2, hmsgctxt, ecat, colorize): hmsg1, hmsg2 = [x and MessageUnsafe(x.to_msg()) or None for x in (hdr1, hdr2)] ehmsg = hmsg2 and MessageUnsafe(hmsg2) or None ehmsg, dr = msg_ediff(hmsg1, hmsg2, emsg=ehmsg, ecat=ecat, colorize=colorize, diffr=True) if dr == 0.0: # Revert to empty message if no difference between headers. ehmsg = MessageUnsafe() # Add visual paths as old/new segments into msgid. vpaths = [vpath1, vpath2] # Always use slashes as path separator, for portability of ediffs. vpaths = [x.replace(os.path.sep, "/") for x in vpaths] - ehmsg.msgid = u"- %s\n+ %s" % tuple(vpaths) + ehmsg.msgid = "- %s\n+ %s" % tuple(vpaths) # Add trailing newline if msgstr has it, again to appease msgfmt. if ehmsg.msgstr[0].endswith("\n"): ehmsg.msgid += "\n" # Add context identifying the diffed message as header. ehmsg.msgctxt = hmsgctxt # Add conspicuous separator at the top of the header. - ehmsg.manual_comment.insert(0, u"=" * 76) + ehmsg.manual_comment.insert(0, "=" * 76) return ehmsg, dr > 0.0 def init_ediff_header (ehdr, hmsgctxt=EDST.hmsgctxt_el, extitle=None): cfgsec = pology_config.section("user") user = cfgsec.string("name", "J. Random Translator") email = cfgsec.string("email", None) listtype = type(ehdr.title) if extitle is not None: - title = u"+- ediff (%s) -+" % extitle + title = "+- ediff (%s) -+" % extitle else: - title = u"+- ediff -+" + title = "+- ediff -+" ehdr.title = listtype([title]) year = time.strftime("%Y") if email: - author = u"%s <%s>, %s." % (user, email, year) + author = "%s <%s>, %s." % (user, email, year) else: - author = u"%s, %s." % (user, year) + author = "%s, %s." 
% (user, year) #ehdr.author = listtype([author]) ehdr.author = listtype([]) - ehdr.copyright = u"" - ehdr.license = u"" + ehdr.copyright = "" + ehdr.license = "" ehdr.comment = listtype() rfv = ehdr.replace_field_value # shortcut - rfv("Project-Id-Version", u"ediff") + rfv("Project-Id-Version", "ediff") ehdr.remove_field("Report-Msgid-Bugs-To") ehdr.remove_field("POT-Creation-Date") - rfv("PO-Revision-Date", unicode(time.strftime("%Y-%m-%d %H:%M%z"))) + rfv("PO-Revision-Date", str(time.strftime("%Y-%m-%d %H:%M%z"))) enc = "UTF-8" # strictly, input catalogs may have different encodings - rfv("Content-Type", u"text/plain; charset=%s" % enc) - rfv("Content-Transfer-Encoding", u"8bit") + rfv("Content-Type", "text/plain; charset=%s" % enc) + rfv("Content-Transfer-Encoding", "8bit") if email: - translator = u"%s <%s>" % (user, email) + translator = "%s <%s>" % (user, email) else: - translator = u"%s" % user + translator = "%s" % user rfv("Last-Translator", translator) - rfv("Language-Team", u"Differs") + rfv("Language-Team", "Differs") # FIXME: Something smarter? (Not trivial.) ehdr.remove_field("Plural-Forms") # Context of header messages in the catalog. ehdr.set_field(EDST.hmsgctxt_field, hmsgctxt) def get_msgctxt_for_headers (cat): - hmsgctxt = u"" + hmsgctxt = "" good = False while not good: hmsgctxt += EDST.hmsgctxt_el good = True for msg in cat: if hmsgctxt == msg.msgctxt: good = False break return hmsgctxt diff --git a/pology/lang/es/compare_with_original.py b/pology/lang/es/compare_with_original.py index ac7a082a..e966d640 100644 --- a/pology/lang/es/compare_with_original.py +++ b/pology/lang/es/compare_with_original.py @@ -1,430 +1,430 @@ # -*- coding: utf-8 -*- """ Make some comparisons between the translation and the original text. @author: Javier Viñal @license: GPLv3 """ import re import string import enchant from pology import _, n_, split def test_if_empty_translation (msg, cat): """ Compare the translation with the original text, testing if the translation is empty. [type V4A hook]. @return: parts """ for i in range(len(msg.msgstr)): if i > 0: lm = len(msg.msgid_plural) else: lm = len(msg.msgid) if lm > 0 and len(msg.msgstr[i]) == 0: - return [("msgstr", 0, [(0, 0, u'La traducción parece estar vacía')])] + return [("msgstr", 0, [(0, 0, 'La traducción parece estar vacía')])] return [] _purepunc = re.compile("^\W+$", re.U) def test_if_purepunc (msg, cat): """ Compare the translation with the original text, testing if the translation was changed although the original text contains no alphanumeric characters. [type V4A hook].
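Like the other checks in this module, this is an ordinary V4A hook, so it can be fetched through the item request syntax (a sketch)::

    from pology.getfunc import get_hook_ireq
    hook = get_hook_ireq("es:compare_with_original/test_if_purepunc")
    parts = hook(msg, cat)   # empty list if the message passes
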
@return: parts """ if msg.msgctxt in ("EMAIL OF TRANSLATORS", "NAME OF TRANSLATORS", "ROLES OF TRANSLATORS"): return [] if msg.msgid in ("Your emails", "Your names", "CREDIT_FOR_TRANSLATORS", "ROLES_OF_TRANSLATORS"): return [] for i in range(len(msg.msgstr)): msgstr = msg.msgstr[i] if i > 0: msgid = msg.msgid_plural else: msgid = msg.msgid if _purepunc.match(msgid): msgid = msgid.replace('"', '') msgid = msgid.replace("'", "") msgid = msgid.replace(" ", "") msgstr = msgstr.replace('"', '') msgstr = msgstr.replace("'", "") - msgstr = msgstr.replace(u"«", "") - msgstr = msgstr.replace(u"»", "") + msgstr = msgstr.replace("«", "") + msgstr = msgstr.replace("»", "") msgstr = msgstr.replace(" ", "") msgstr = msgstr.replace("\"", "") if msgid != msgstr: - return [("msgstr", 0, [(0, 0, u'Se ha traducido un texto no alfanumérico')])] + return [("msgstr", 0, [(0, 0, 'Se ha traducido un texto no alfanumérico')])] return [] def test_if_non_printable_characters (msg, cat): """ Compare the translation with the original text, testing if the translation contains non-printable characters or punctuation signs that do not appear in the original text. [type V4A hook]. @return: parts """ if msg.msgctxt in ("EMAIL OF TRANSLATORS", "NAME OF TRANSLATORS", "ROLES OF TRANSLATORS"): return [] if msg.msgid in ("Your emails", "Your names", "CREDIT_FOR_TRANSLATORS", "ROLES_OF_TRANSLATORS"): return [] for i in range(len(msg.msgstr)): msgstr = msg.msgstr[i] if i > 0: msgid = msg.msgid_plural else: msgid = msg.msgid for c in msgstr: - if (c not in string.printable) and (c not in msgid) and (c not in u"áéíóúüñçÁÉÍÓÚÜÑÇ¿¡|«»©ºª€/"): - return [("msgstr", 0, [(0, 0, u'La traducción contiene caracteres no imprimibles')])] - elif (c in string.punctuation) and (c not in msgid) and (c not in u"¿¡|«»©ºª€/.,;:()_-"): - return [("msgstr", 0, [(0, 0, u'La traducción contiene signos de puntuación no incluidos en el original')])] + if (c not in string.printable) and (c not in msgid) and (c not in "áéíóúüñçÁÉÍÓÚÜÑÇ¿¡|«»©ºª€/"): + return [("msgstr", 0, [(0, 0, 'La traducción contiene caracteres no imprimibles')])] + elif (c in string.punctuation) and (c not in msgid) and (c not in "¿¡|«»©ºª€/.,;:()_-"): + return [("msgstr", 0, [(0, 0, 'La traducción contiene signos de puntuación no incluidos en el original')])] return [] def test_if_very_long_translation (msg, cat): """ Compare the translation with the original text, testing if the translated text is much longer than the original (roughly more than twice as long, with an allowance for short texts). [type V4A hook]. @return: parts """ if msg.msgctxt in ("EMAIL OF TRANSLATORS", "NAME OF TRANSLATORS", "ROLES OF TRANSLATORS"): return [] if msg.msgid in ("Your emails", "Your names", "CREDIT_FOR_TRANSLATORS", "ROLES_OF_TRANSLATORS"): return [] for i in range(len(msg.msgstr)): if i > 0: lm = len(msg.msgid_plural.split()) else: lm = len(msg.msgid.split()) if lm > 0 and len(msg.msgstr[i].split()) > (1.6 * lm + 5): - return [("msgstr", 0, [(0, 0, u'La traducción parece demasiado larga')])] + return [("msgstr", 0, [(0, 0, 'La traducción parece demasiado larga')])] return [] def test_if_very_short_translation (msg, cat): """ Compare the translation with the original text, testing if the translated text is much shorter than the original. [type V4A hook].
@return: parts """ if msg.msgctxt in ("EMAIL OF TRANSLATORS", "NAME OF TRANSLATORS", "ROLES OF TRANSLATORS"): return [] if msg.msgid in ("Your emails", "Your names", "CREDIT_FOR_TRANSLATORS", "ROLES_OF_TRANSLATORS"): return [] for i in range(len(msg.msgstr)): if len(msg.msgstr[i]) > 0: if i > 0: lm = len(msg.msgid_plural.split()) else: lm = len(msg.msgid.split()) if lm > (1.6 * len(msg.msgstr[i].split()) + 5): - return [("msgstr", 0, [(0, 0, u'La traducción parece demasiado corta')])] + return [("msgstr", 0, [(0, 0, 'La traducción parece demasiado corta')])] return [] _valid_word = re.compile("^\w+$", re.U) -_capital_word = re.compile(u"^[A-Z0-9ÑÇÁÉÍÓÚÁÉÍÓÚÂÊÎÔÛÄËÏÖÜĀ]+$", re.U) -_proper_name = re.compile(u"^\W*?[A-Z0-9ÑÇÁÉÍÓÚÁÉÍÓÚÂÊÎÔÛÄËÏÖÜĀ]\w+(\W+?[A-Z0-9ÑÇÁÉÍÓÚÁÉÍÓÚÂÊÎÔÛÄËÏÖÜĀ]\w+)+\W*$", re.U) +_capital_word = re.compile("^[A-Z0-9ÑÇÁÉÍÓÚÁÉÍÓÚÂÊÎÔÛÄËÏÖÜĀ]+$", re.U) +_proper_name = re.compile("^\W*?[A-Z0-9ÑÇÁÉÍÓÚÁÉÍÓÚÂÊÎÔÛÄËÏÖÜĀ]\w+(\W+?[A-Z0-9ÑÇÁÉÍÓÚÁÉÍÓÚÂÊÎÔÛÄËÏÖÜĀ]\w+)+\W*$", re.U) def test_if_not_translated (msg, cat): """ Compare the translation with the original text, testing if the paragraph is not translated. [type V4A hook]. @return: parts """ if msg.msgctxt in ("EMAIL OF TRANSLATORS", "NAME OF TRANSLATORS", "ROLES OF TRANSLATORS"): return [] if msg.msgid in ("Your emails", "Your names", "CREDIT_FOR_TRANSLATORS", "ROLES_OF_TRANSLATORS"): return [] for i in range(len(msg.msgstr)): if i > 0: msgid = msg.msgid_plural else: msgid = msg.msgid if _proper_name.match(msg.msgstr[i]) or _purepunc.match(msgid): continue e = None l = None if len(msgid) > 0 and msgid == msg.msgstr[i]: for word in split.proper_words(msgid, markup=True, accels=['&']): if _valid_word.match(word) and not _capital_word.match(word): word = str(word) if e is None: e = enchant.Dict("en") if l is None: l = enchant.Dict("es") if e.check(word) and not l.check(word): - return [("msgstr", 0, [(0, 0, u'El párrafo parece no estar traducido')])] + return [("msgstr", 0, [(0, 0, 'El párrafo parece no estar traducido')])] return [] -_ent_accel = re.compile(u"&[A-Za-z0-9ÑñÇç](?!\w+;)", re.U) +_ent_accel = re.compile("&[A-Za-z0-9ÑñÇç](?!\w+;)", re.U) def test_paired_accelerators (msg, cat): """ Compare the number of accelerators (&) between the original and the translated text. [type V4A hook]. @return: parts """ if msg.msgctxt in ("EMAIL OF TRANSLATORS", "NAME OF TRANSLATORS", "ROLES OF TRANSLATORS"): return [] if msg.msgid in ("Your emails", "Your names", "CREDIT_FOR_TRANSLATORS", "ROLES_OF_TRANSLATORS"): return [] for i in range(len(msg.msgstr)): if i > 0: msgid = msg.msgid_plural else: msgid = msg.msgid cont_orig = len(_ent_accel.findall(msgid)) cont_tran = len(_ent_accel.findall(msg.msgstr[i])) if cont_orig < cont_tran: - return [("msgstr", 0, [(0, 0, u"Sobran aceleradores «&» en la traducción")])] + return [("msgstr", 0, [(0, 0, "Sobran aceleradores «&» en la traducción")])] elif cont_orig > cont_tran: - return [("msgstr", 0, [(0, 0, u"Faltan aceleradores «&» en la traducción")])] + return [("msgstr", 0, [(0, 0, "Faltan aceleradores «&» en la traducción")])] return [] def test_paired_strings (msg, cat): """ Compare the number of certain special strings between the original and the translated text. [type V4A hook].
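For instance (a sketch)::

    # msgid:  "Line one\nLine two"
    # msgstr: "Una sola línea"
    # -> [("msgstr", 0, [(0, 0, "Faltan saltos de línea en la traducción")])]
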
@return: parts """ if msg.msgctxt in ("EMAIL OF TRANSLATORS", "NAME OF TRANSLATORS", "ROLES OF TRANSLATORS"): return [] if msg.msgid in ("Your emails", "Your names", "CREDIT_FOR_TRANSLATORS", "ROLES_OF_TRANSLATORS"): return [] for i in range(len(msg.msgstr)): if i > 0: msgid = msg.msgid_plural else: msgid = msg.msgid - for s in (["\t", u"tabuladores"], - ["\r", u"retornos de carro"], - ["\n", u"saltos de línea"] + for s in (["\t", "tabuladores"], + ["\r", "retornos de carro"], + ["\n", "saltos de línea"] ): cont_orig = msgid.count(s[0]) cont_tran = msg.msgstr[i].count(s[0]) if cont_orig < cont_tran: - return [("msgstr", 0, [(0, 0, u"Sobran " + s[1] + u" en la traducción")])] + return [("msgstr", 0, [(0, 0, "Sobran " + s[1] + " en la traducción")])] elif cont_orig > cont_tran: - return [("msgstr", 0, [(0, 0, u"Faltan " + s[1] + u" en la traducción")])] + return [("msgstr", 0, [(0, 0, "Faltan " + s[1] + " en la traducción")])] return [] def test_paired_brackets (msg, cat): """ Compare number of some brackets between original and translated text. [type V4A hook]. @return: parts """ if msg.msgctxt in ("EMAIL OF TRANSLATORS", "NAME OF TRANSLATORS", "ROLES OF TRANSLATORS"): return [] if msg.msgid in ("Your emails", "Your names", "CREDIT_FOR_TRANSLATORS", "ROLES_OF_TRANSLATORS"): return [] for i in range(len(msg.msgstr)): if i > 0: msgid = msg.msgid_plural else: msgid = msg.msgid - for s in ([u"(", u")", u"paréntesis"], - [u"{", u"}", u"llaves"], - [u"[", u"]", u"corchetes"], - [u"«", u"»", u"comillas españolas"] + for s in (["(", ")", "paréntesis"], + ["{", "}", "llaves"], + ["[", "]", "corchetes"], + ["«", "»", "comillas españolas"] ): cont_orig_open = msgid.count(s[0]) cont_orig_close = msgid.count(s[1]) if cont_orig_open != cont_orig_close: continue cont_tran_open = msg.msgstr[i].count(s[0]) cont_tran_close = msg.msgstr[i].count(s[1]) if cont_tran_open < cont_tran_close: - return [("msgstr", 0, [(0, 0, u"Sobran " + s[2] + u" en la traducción")])] + return [("msgstr", 0, [(0, 0, "Sobran " + s[2] + " en la traducción")])] elif cont_tran_open > cont_tran_close: - return [("msgstr", 0, [(0, 0, u"Faltan " + s[2] + u" en la traducción")])] + return [("msgstr", 0, [(0, 0, "Faltan " + s[2] + " en la traducción")])] return [] _ent_function = re.compile("(?:\w+\:\:)*\w+\(\)", re.U) _ent_parameter = re.compile("(?<=\W)\-\-\w+(?:\-\w+)*", re.U) def test_paired_expressions (msg, cat): """ Compare expressions (functions, parameters) between original and translated text. Should be the same. [type V4A hook]. @return: parts """ if msg.msgctxt in ("EMAIL OF TRANSLATORS", "NAME OF TRANSLATORS", "ROLES OF TRANSLATORS"): return [] if msg.msgid in ("Your emails", "Your names", "CREDIT_FOR_TRANSLATORS", "ROLES_OF_TRANSLATORS"): return [] for i in range(len(msg.msgstr)): if i > 0: msgid = msg.msgid_plural else: msgid = msg.msgid - for expr in ([_ent_function, u"Nombres de función"], - [_ent_parameter, u"Parámetros de orden"] + for expr in ([_ent_function, "Nombres de función"], + [_ent_parameter, "Parámetros de orden"] ): expr_orig = sorted(expr[0].findall(msgid)) expr_trans = sorted(expr[0].findall(msg.msgstr[i])) if expr_orig != expr_trans: - return [("msgstr", 0, [(0, 0, expr[1] + u" distintos en la traducción")])] + return [("msgstr", 0, [(0, 0, expr[1] + " distintos en la traducción")])] return [] _ent_number = re.compile("\b\d+([\s.,:/-]\d+)*\b", re.U) _not_digit = re.compile("\D", re.U) def test_paired_numbers (msg, cat): """ Compare numbers and dates between original and translated text. 
Should be the same (except for comma/colon separators and one-digit numbers). [type V4A hook]. @return: parts """ if msg.msgctxt in ("EMAIL OF TRANSLATORS", "NAME OF TRANSLATORS", "ROLES OF TRANSLATORS"): return [] if msg.msgid in ("Your emails", "Your names", "CREDIT_FOR_TRANSLATORS", "ROLES_OF_TRANSLATORS"): return [] for i in range(len(msg.msgstr)): if i > 0: msgid = msg.msgid_plural else: msgid = msg.msgid number_orig = [] for number in _ent_number.findall(msgid): if len(number) > 1: number_orig += _not_digit.split(number) number_trans = [] for number in _ent_number.findall(msg.msgstr[i]): if len(number) > 1: number_trans += _not_digit.split(number) if sorted(number_orig) != sorted(number_trans): - return [("msgstr", 0, [(0, 0, u"Valores de números distintos en la traducción")])] + return [("msgstr", 0, [(0, 0, "Valores de números distintos en la traducción")])] return [] _ent_context_tags = re.compile("\<(application|bcode|command|email|envar|filename|icode|link|returnvalue)\>(.+?)\<\/\1\>", re.U) def test_paired_context_tags (msg, cat): """ Compare context tags between original and translated text. Some of them should not be changed in the translation. [type V4A hook]. @return: parts """ if msg.msgctxt in ("EMAIL OF TRANSLATORS", "NAME OF TRANSLATORS", "ROLES OF TRANSLATORS"): return [] if msg.msgid in ("Your emails", "Your names", "CREDIT_FOR_TRANSLATORS", "ROLES_OF_TRANSLATORS"): return [] for i in range(len(msg.msgstr)): if i > 0: msgid = msg.msgid_plural else: msgid = msg.msgid for tag in _ent_context_tags.findall(msgid): if not (tag[1] in msg.msgstr[i]): - return [("msgstr", 0, [(0, 0, u"Valor de etiqueta de contexto" + tag[1] + u"traducido indebidamente")])] + return [("msgstr", 0, [(0, 0, "Valor de etiqueta de contexto " + tag[1] + " traducido indebidamente")])] return [] _ent_xml_entities = re.compile("\<\/(application|bcode|command|email|emphasis|envar|filename|icode|interface|link|message|nl|numid|placeholder|resource|shortcut|note|warning|para|title|subtitle|list|item)\>", re.U) def test_paired_xml_entities (msg, cat): """ Compare xml entities between original and translated text. Some of them should not be changed in the translation. [type V4A hook]. @return: parts """ if msg.msgctxt in ("EMAIL OF TRANSLATORS", "NAME OF TRANSLATORS", "ROLES OF TRANSLATORS"): return [] if msg.msgid in ("Your emails", "Your names", "CREDIT_FOR_TRANSLATORS", "ROLES_OF_TRANSLATORS"): return [] for i in range(len(msg.msgstr)): if i > 0: msgid = msg.msgid_plural else: msgid = msg.msgid for tag in _ent_xml_entities.findall(msgid): if not (tag in msg.msgstr[i]): - return [("msgstr", 0, [(0, 0, u"Etiqueta XML" + tag + u"no encontrada en la traducción")])] + return [("msgstr", 0, [(0, 0, "Etiqueta XML " + tag + " no encontrada en la traducción")])] for tag in _ent_xml_entities.findall(msg.msgstr[i]): if not (tag in msgid): - return [("msgstr", 0, [(0, 0, u"Etiqueta XML" + tag + u"no encontrada en el texto original")])] + return [("msgstr", 0, [(0, 0, "Etiqueta XML " + tag + " no encontrada en el texto original")])] return [] diff --git a/pology/lang/es/remove_subs.py b/pology/lang/es/remove_subs.py index 853b4b8c..ee6bdeab 100644 --- a/pology/lang/es/remove_subs.py +++ b/pology/lang/es/remove_subs.py @@ -1,161 +1,161 @@ # -*- coding: utf-8 -*- """ Remove special substrings from parts of the message.
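These are F4A hooks, meant to blank out parts of the message before some other check runs; they can be fetched as, for example (a sketch)::

    from pology.getfunc import get_hook_ireq
    hook = get_hook_ireq("es:remove_subs/remove_tags_without_translation")
    hook(msg, cat)  # modifies msg in place, returns number of errors
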
@author: Javier Viñal @license: GPLv3 """ import re #from pology import PologyError, datadir, _, n_ #from pology.report import report, warning, format_item_list # Capitals words in valid contexts in the translated text according with Spanish grammar # (beggining of paragraph, after some punctuation characters and after a new line) -_valid_capital_word_middle = re.compile(u"(?<=[.:?!>»\"]\s)\w*?[A-ZÁÉÍÓÚÜÑÇ]\w*", re.U) -_valid_capital_word_initial = re.compile(u"^\w*?[A-ZÁÉÍÓÚÜÑÇ]\w*", re.U) +_valid_capital_word_middle = re.compile("(?<=[.:?!>»\"]\s)\w*?[A-ZÁÉÍÓÚÜÑÇ]\w*", re.U) +_valid_capital_word_initial = re.compile("^\w*?[A-ZÁÉÍÓÚÜÑÇ]\w*", re.U) # All capital words in the original English text, -_ent_capital_word = re.compile(u"\w*?[A-Z]\w*", re.U) +_ent_capital_word = re.compile("\w*?[A-Z]\w*", re.U) # All plural full capital words (acronyms) without the final 's'. -_ent_capital_word_plural = re.compile(u"[A-Z0-9]+(?=\'?s\b)", re.U) +_ent_capital_word_plural = re.compile("[A-Z0-9]+(?=\'?s\b)", re.U) def remove_paired_capital_words (msg, cat): """ Remove all capital words from original text and from translated text, except that are located in a place where may be a capital word according the Spanish grammar.[type F4A hook]. @return: number of errors """ # Obtains capitals words in valid contexts in the translated text. for i in range(len(msg.msgstr)): ents = set() ents.update(_valid_capital_word_middle.findall(msg.msgstr[i])) ents.update(_valid_capital_word_initial.findall(msg.msgstr[i])) if i == 0: # Obtains all capitals words in the original English text. ents.update(_ent_capital_word.findall(msg.msgid)) ents.update(_ent_capital_word_plural.findall(msg.msgid)) else: if msg.msgid_plural: ents.update(_ent_capital_word.findall(msg.msgid_plural)) ents.update(_ent_capital_word_plural.findall(msg.msgid_plural)) # Joins both set of words an remove them from the message. for ent in ents: # report(_("@info", "Palabra en mayusculas: %(info)s \n", info=ent)) msg.msgstr[i] = re.sub(r'\b' + ent + r'\b', '~', msg.msgstr[i], 0, re.U) if i == 0: msg.msgid = re.sub(r'\b' + ent + r'\b', '~', msg.msgid, 0, re.U) else: msg.msgid_plural = re.sub(r'\b' + ent + r'\b', '~', msg.msgid_plural, 0, re.U) # The remainning words could have wrong capitalization in the translated message. # TODO: Look the remaining words in a Spanish dictionary. return 0 def remove_original_capital_words (msg, cat): """ Remove all capital words of the original text and from translated text. [type F4A hook]. @return: number of errors """ # Obtains capitals words in valid contexts in the translated text. for i in range(len(msg.msgstr)): ents = set() if i == 0: # Obtains all capitals words in the original English text. ents.update(_ent_capital_word.findall(msg.msgid)) ents.update(_ent_capital_word_plural.findall(msg.msgid)) else: if msg.msgid_plural: ents.update(_ent_capital_word.findall(msg.msgid_plural)) ents.update(_ent_capital_word_plural.findall(msg.msgid_plural)) # Remove English capital words from translated text. for ent in ents: msg.msgstr[i] = re.sub(r'\b' + ent + r'\b', '~', msg.msgstr[i], 0, re.U) return 0 _ent_parameter = re.compile("(%\d%?|\$\{.+?\}|\$\w+|%(?:\d\$)?[ds]|%\|.+?\|)", re.U) def remove_paired_parameters (msg, cat): """ Remove format strings from the original text, and from translation all that are also found in the original text [type F4A hook]. 
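The _ent_parameter pattern above covers several placeholder families in one alternation: KDE-style %1 (optionally %1%), shell-style ${var} and $var, printf-style %s / %d, and %|...| blocks. A small standalone demonstration of the masking idea used by remove_paired_parameters (the sample strings are made up):

    import re

    _param = re.compile(r"(%\d%?|\$\{.+?\}|\$\w+|%(?:\d\$)?[ds]|%\|.+?\|)", re.U)

    msgid = "Copy %1 to ${DEST} as %s?"
    msgstr = "¿Copiar %1 en ${DEST} como %s?"

    # Mask only the parameters present on both sides, as the hook does.
    for par in set(_param.findall(msgid)) & set(_param.findall(msgstr)):
        msgid = msgid.replace(par, "~")
        msgstr = msgstr.replace(par, "~")

    print(msgid)  # Copy ~ to ~ as ~?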
@return: number of errors """ pars_orig = set() pars_orig.update(_ent_parameter.findall(msg.msgid)) pars_orig_plural = set() if msg.msgid_plural: pars_orig_plural.update(_ent_parameter.findall(msg.msgid_plural)) for i in range(len(msg.msgstr)): pars_trans = set(_ent_parameter.findall(msg.msgstr[i])) if i == 0: for par in pars_trans.intersection(pars_orig): msg.msgid = msg.msgid.replace(par, "~") msg.msgstr[i] = msg.msgstr[i].replace(par, "~") else: for par in pars_trans.intersection(pars_orig_plural): msg.msgid_plural = msg.msgid_plural.replace(par, "~") msg.msgstr[i] = msg.msgstr[i].replace(par, "~") return 0 _ent_xml_entity = re.compile("\<\/?\w+\>") _auto_comment_tag = ("trans_comment", "literallayout", "option", "programlisting", "othercredit", "author", "email", "holder", "surname", "personname", "affiliation", "address", "sect1", "chapter", "chapterinfo", "date", "command", "option", "refentrytitle", "refentryinfo", "refname", "synopsis", "literal", "varname", "term", "glossterm", "filename", "entry", "envar", "userinput", "cmdsynopsis", "releaseinfo", "language", "Name", "City", "Region", "Region/state", "unit", "Query", "Kgm") def remove_tags_without_translation (msg, cat): """ Remove all paragraph that belong to contexts that do not have need of translation. [type F4A hook]. @return: number of errors """ if msg.msgctxt in ("EMAIL OF TRANSLATORS", "NAME OF TRANSLATORS", "ROLES OF TRANSLATORS"): msg.msgid = "" msg.msgid_plural = "" for i in range(len(msg.msgstr)): msg.msgstr[i] = "" return 0 # Avoid specially tagged messages. for tagline in msg.auto_comment: for tag in tagline.split(): if tag in _auto_comment_tag: msg.msgid = "" if msg.msgid_plural: msg.msgid_plural = "" for i in range(len(msg.msgstr)): msg.msgstr[i] = "" return 0 if msg.msgctxt: for tag in msg.msgctxt.split(): if tag in _auto_comment_tag: msg.msgid = "" if msg.msgid_plural: msg.msgid_plural = "" for i in range(len(msg.msgstr)): msg.msgstr[i] = "" return 0 return 0 diff --git a/pology/lang/fr/patternAccents.py b/pology/lang/fr/patternAccents.py index 83cbab96..a4992441 100644 --- a/pology/lang/fr/patternAccents.py +++ b/pology/lang/fr/patternAccents.py @@ -1,34 +1,34 @@ # -*- coding: UTF-8 -*- """ Accent equivalence in regular expression patterns. 
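patternAccents lets a rule author write @e in a pattern and have it expand to a class of accent variants. A self-contained illustration with a toy two-letter table (the | separators kept inside the real classes are literal characters within a character class, redundant but harmless):

    import re

    accents = {"e": "[eéèêEÉÈÊ]", "a": "[aàâAÀÂ]"}
    accent_rx = re.compile("@([%s])" % "".join(accents))

    def pattern_accents(pattern):
        # Replace every "@x" with its accent-equivalence class.
        for m in accent_rx.finditer(pattern):
            pattern = pattern.replace("@" + m.group(1), accents[m.group(1)])
        return pattern

    rx = re.compile(pattern_accents("d@etail"))
    print(bool(rx.match("détail")), bool(rx.match("detail")))  # True True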
@author: Sébastien Renard @license: GPLv3 """ import re accents={} -accents[u"e"] = u"[%s]" % u"|".join([u'e', u'é', u'è', u'ê', u'E', u'É', u'È', u'Ê']) -accents[u"é"] = u"[%s]" % u"|".join([u'é', u'è', u'ê', u'É', u'È', u'Ê']) -accents[u"è"] = u"[%s]" % u"|".join([u'é', u'è', u'ê', u'É', u'È', u'Ê']) -accents[u"ê"] = u"[%s]" % u"|".join([u'é', u'è', u'ê', u'É', u'È', u'Ê']) -accents[u"a"] = u"[%s]" % u"|".join([u'a', u'à', u'â', u'A', u'À', u'Â']) -accents[u"à"] = u"[%s]" % u"|".join([u'à', u'â', u'À', u'Â']) -accents[u"â"] = u"[%s]" % u"|".join([u'à', u'â', u'À', u'Â']) -accents[u"u"] = u"[%s]" % u"|".join([u'u', u'ù', u'û', u'U', u'Ù', u'Û']) -accents[u"ù"] = u"[%s]" % u"|".join([u'ù', u'û', u'Ù', u'Û']) -accents[u"û"] = u"[%s]" % u"|".join([u'ù', u'û', u'Ù', u'Û']) -accentPattern=re.compile(u"@([%s])" % u"|".join(accents.keys())) +accents["e"] = "[%s]" % "|".join(['e', 'é', 'è', 'ê', 'E', 'É', 'È', 'Ê']) +accents["é"] = "[%s]" % "|".join(['é', 'è', 'ê', 'É', 'È', 'Ê']) +accents["è"] = "[%s]" % "|".join(['é', 'è', 'ê', 'É', 'È', 'Ê']) +accents["ê"] = "[%s]" % "|".join(['é', 'è', 'ê', 'É', 'È', 'Ê']) +accents["a"] = "[%s]" % "|".join(['a', 'à', 'â', 'A', 'À', 'Â']) +accents["à"] = "[%s]" % "|".join(['à', 'â', 'À', 'Â']) +accents["â"] = "[%s]" % "|".join(['à', 'â', 'À', 'Â']) +accents["u"] = "[%s]" % "|".join(['u', 'ù', 'û', 'U', 'Ù', 'Û']) +accents["ù"] = "[%s]" % "|".join(['ù', 'û', 'Ù', 'Û']) +accents["û"] = "[%s]" % "|".join(['ù', 'û', 'Ù', 'Û']) +accentPattern=re.compile("@([%s])" % "|".join(list(accents.keys()))) def patternAccents(pattern): """Replace every C{@x} in the pattern by the value C{accents["x"]}.""" for accentMatch in accentPattern.finditer(pattern): letter=accentMatch.group(1) pattern=pattern.replace("@%s" % letter, accents[letter]) return pattern diff --git a/pology/lang/fr/patternEntities.py b/pology/lang/fr/patternEntities.py index fb84a1a4..9bceb3ec 100644 --- a/pology/lang/fr/patternEntities.py +++ b/pology/lang/fr/patternEntities.py @@ -1,28 +1,28 @@ # -*- coding: UTF-8 -*- """ Convert special entities in rule patterns. @author: Sébastien Renard @license: GPLv3 """ from pology.resolve import resolve_entities_simple entities={} -entities["cr"]=u"\r" -entities["lf"]=u"\n" -entities["lt"]=u"<" -entities["gt"]=u">" -entities["sp"]=u" " -entities["quot"]=u'\"' -entities["amp"]=u"&" -entities["unbsp"]=u"\xa0" -entities["nbsp"]=u" " +entities["cr"]="\r" +entities["lf"]="\n" +entities["lt"]="<" +entities["gt"]=">" +entities["sp"]=" " +entities["quot"]='\"' +entities["amp"]="&" +entities["unbsp"]="\xa0" +entities["nbsp"]=" " def patternEntities(pattern): """Convert entities in pattern.""" return resolve_entities_simple(pattern, entities) diff --git a/pology/lang/ja/katakana.py b/pology/lang/ja/katakana.py index 5e107504..5916c499 100644 --- a/pology/lang/ja/katakana.py +++ b/pology/lang/ja/katakana.py @@ -1,38 +1,38 @@ # -*- coding: UTF-8 -* """ Retain only Katakana words in the text, separated by spaces. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ def katakana (text): """ Type F1A hook. 
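The Katakana filter keeps only characters from the basic Katakana block (plus the prolonged-sound mark ー), collapses everything else to single spaces, and keeps the middle dot ・ only between two Katakana characters. A standalone sketch with the same range test:

    def is_katakana(c):
        # Basic Katakana block plus the prolonged sound mark.
        return "ァ" <= c <= "ヴ" or c == "ー"

    def katakana_words(text):
        out = []
        for i, c in enumerate(text):
            if is_katakana(c) or (c == "・" and 0 < i < len(text) - 1
                                  and is_katakana(text[i-1]) and is_katakana(text[i+1])):
                out.append(c)
            elif out and out[-1] != " ":
                out.append(" ")
        return "".join(out).strip()

    print(katakana_words("ボブ・ディランの曲"))  # ボブ・ディラン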
@return: text """ ntext = [] for i in range(len(text)): c = text[i] if _is_katakana(c): ntext.append(c) - elif c == u"・": + elif c == "・": c_prev = text[i-1:i] c_next = text[i+1:i+2] if _is_katakana(c_prev) and _is_katakana(c_next): ntext.append(c) else: if ntext and ntext[-1] != " ": ntext.append(" ") ntext = ("".join(ntext)).strip() return ntext def _is_katakana (c): - return (c >= u"ァ" and c <= u"ヴ") or c == u"ー" + return (c >= "ァ" and c <= "ヴ") or c == "ー" diff --git a/pology/lang/nb/exclusion.py b/pology/lang/nb/exclusion.py index 79b5b8ea..79e67401 100644 --- a/pology/lang/nb/exclusion.py +++ b/pology/lang/nb/exclusion.py @@ -1,98 +1,98 @@ # -*- coding: UTF-8 -* """ Catch inofficial ortography forms in Norwegian Bokmål translation. The check expects that the translation is plain text, i.e. that any markup has been removed from it beforehand; otherwise, problems masked by markup may not be reported. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import os import re import codecs from pology import datadir, _, n_ from pology.fsops import collect_files_by_ext from pology.split import split_text def exclude_forms (dictnames): """ Check for excluded ortography forms in translation [hook factory]. @param dictnames: base names of files from which to collect excluded forms; file paths will be assembled as C{/lang/nn/exclusion/.dat} @type dictnames: @return: type V3C hook @rtype: C{(msgstr, msg, cat) -> spans} """ phrases = _load_phrases(dictnames) - maxwords = max(map(lambda x: len(split_text(x)[0]), phrases)) + maxwords = max([len(split_text(x)[0]) for x in phrases]) def hook (msgstr, msg, cat): spans = [] words, interps = split_text(msgstr) for phstart in range(len(words)): for phlen in range(min(maxwords, len(words) - phstart), 0, -1): # Construct and test the following phrases: # - with inner and trailing intersections # - with leading and inner intersections # - with inner intersections for off1, off2 in ((1, 1), (0, 0), (1, 0)): parts = [] if off1 == 0: parts.append(interps[phstart]) parts.append(words[phstart]) for i in range(1, phlen): parts.append(interps[phstart + i]) parts.append(words[phstart + i]) if off2 == 1: parts.append(interps[phstart + phlen]) phrase = _normph("".join(parts)) if phrase in phrases: p1 = ( sum(map(len, words[:phstart])) + sum(map(len, interps[:phstart + off1]))) p2 = ( sum(map(len, words[:phstart + phlen])) + sum(map(len, interps[:phstart + phlen + off2]))) emsg = _("@info", "Excluded form '%(word)s'.", word=msgstr[p1:p2].strip()) spans.append((p1, p2, emsg)) break return spans return hook def _load_phrases (dictnames): phrases = set() for dictname in dictnames: exfile = os.path.join(datadir(), "lang", "nb", "exclusion", dictname + ".dat") phrases1 = codecs.open(exfile, "r", "UTF-8").read().split("\n")[:-1] - phrases1 = map(_normph, phrases1) + phrases1 = list(map(_normph, phrases1)) phrases.update(phrases1) return phrases _wsseq_rx = re.compile(r"\s{2,}", re.U) def _normph (phrase): return _wsseq_rx.sub(r" ", phrase.lower().strip()) diff --git a/pology/lang/nn/exclusion.py b/pology/lang/nn/exclusion.py index b4fc7477..b0d76d66 100644 --- a/pology/lang/nn/exclusion.py +++ b/pology/lang/nn/exclusion.py @@ -1,98 +1,98 @@ # -*- coding: UTF-8 -* """ Catch inofficial ortography forms in Norwegian Nynorsk translation. The check expects that the translation is plain text, i.e. that any markup has been removed from it beforehand; otherwise, problems masked by markup may not be reported. 
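Both exclusion modules implement the same sliding-window phrase lookup: split the text into words, try every window up to the length of the longest excluded phrase, normalize it, and test set membership. A reduced sketch of that loop (plain word splitting and a made-up phrase set; the real code also tracks interpunction so that reported spans map back to the original string):

    phrases = {"enno", "no ko"}  # normalized excluded forms (toy data)
    maxwords = max(len(p.split()) for p in phrases)

    def find_excluded(text):
        words = text.lower().split()
        hits = []
        for start in range(len(words)):
            # Longest window first, as in the original loop.
            for length in range(min(maxwords, len(words) - start), 0, -1):
                candidate = " ".join(words[start:start + length])
                if candidate in phrases:
                    hits.append(candidate)
                    break
        return hits

    print(find_excluded("Det finst no ko her"))  # ['no ko']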
@author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import os import re import codecs from pology import datadir, _, n_ from pology.fsops import collect_files_by_ext from pology.split import split_text def exclude_forms (dictnames): """ Check for excluded ortography forms in translation [hook factory]. @param dictnames: base names of files from which to collect excluded forms; file paths will be assembled as C{/lang/nn/exclusion/.dat} @type dictnames: @return: type V3C hook @rtype: C{(msgstr, msg, cat) -> spans} """ phrases = _load_phrases(dictnames) - maxwords = max(map(lambda x: len(split_text(x)[0]), phrases)) + maxwords = max([len(split_text(x)[0]) for x in phrases]) def hook (msgstr, msg, cat): spans = [] words, interps = split_text(msgstr) for phstart in range(len(words)): for phlen in range(min(maxwords, len(words) - phstart), 0, -1): # Construct and test the following phrases: # - with inner and trailing intersections # - with leading and inner intersections # - with inner intersections for off1, off2 in ((1, 1), (0, 0), (1, 0)): parts = [] if off1 == 0: parts.append(interps[phstart]) parts.append(words[phstart]) for i in range(1, phlen): parts.append(interps[phstart + i]) parts.append(words[phstart + i]) if off2 == 1: parts.append(interps[phstart + phlen]) phrase = _normph("".join(parts)) if phrase in phrases: p1 = ( sum(map(len, words[:phstart])) + sum(map(len, interps[:phstart + off1]))) p2 = ( sum(map(len, words[:phstart + phlen])) + sum(map(len, interps[:phstart + phlen + off2]))) emsg = _("@info", "Excluded form '%(word)s'.", word=msgstr[p1:p2].strip()) spans.append((p1, p2, emsg)) break return spans return hook def _load_phrases (dictnames): phrases = set() for dictname in dictnames: exfile = os.path.join(datadir(), "lang", "nn", "exclusion", dictname + ".dat") phrases1 = codecs.open(exfile, "r", "UTF-8").read().split("\n")[:-1] - phrases1 = map(_normph, phrases1) + phrases1 = list(map(_normph, phrases1)) phrases.update(phrases1) return phrases _wsseq_rx = re.compile(r"\s{2,}", re.U) def _normph (phrase): return _wsseq_rx.sub(r" ", phrase.lower().strip()) diff --git a/pology/lang/sr/accents.py b/pology/lang/sr/accents.py index 246b2d8c..42c5b759 100644 --- a/pology/lang/sr/accents.py +++ b/pology/lang/sr/accents.py @@ -1,178 +1,178 @@ # -*- coding: UTF-8 -*- """ Process letter accents in Serbian Cyrillic text. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ # All accented letters in Serbian Cyrillic, for a given non-accented letter. 
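Most forms in the table that follows are not single code points but a base letter plus a combining mark, which is why the accented strings vary in length and the code later computes _min_accent_len and _max_accent_len. A quick standard-library check of that fact:

    import unicodedata

    base, acute = "\u0430", "\u0301"  # CYRILLIC SMALL LETTER A + COMBINING ACUTE ACCENT
    accented = base + acute           # renders as one glyph, but is two code points

    print(len(accented))  # 2
    print([unicodedata.name(c) for c in accented])
    # ['CYRILLIC SMALL LETTER A', 'COMBINING ACUTE ACCENT']
    print(unicodedata.normalize("NFC", accented) == accented)  # True: no precomposed form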
_accents = { - u"а": (u"а̀", u"а́", u"а̏", u"а̑", u"а̄", u"а̂", u"â", u"ȃ"), - u"А": (u"А̀", u"А́", u"А̏", u"А̑", u"А̄", u"А̂", u"Â", u"Ȃ"), + "а": ("а̀", "а́", "а̏", "а̑", "а̄", "а̂", "â", "ȃ"), + "А": ("А̀", "А́", "А̏", "А̑", "А̄", "А̂", "Â", "Ȃ"), # ...with Latin long-falling/genitive a in NFC, used sometimes as makeshift - u"е": (u"ѐ", u"е́", u"е̏", u"е̑", u"е̄", u"е̂", u"ѐ"), - u"Е": (u"Ѐ", u"Е́", u"Е̏", u"Е̑", u"Е̄", u"Е̂", u"Ѐ"), - u"и": (u"ѝ", u"и́", u"и̏", u"и̑", u"ӣ", u"и̂", u"ѝ", u"ӣ"), - u"И": (u"Ѝ", u"И́", u"И̏", u"И̑", u"Ӣ", u"И̂", u"Ѝ", u"Ӣ"), - u"о": (u"о̀", u"о́", u"о̏", u"о̑", u"о̄", u"о̂", u"ȏ", u"ô"), - u"О": (u"О̀", u"О́", u"О̏", u"О̑", u"О̄", u"О̂", u"Ȏ", u"Ô"), + "е": ("ѐ", "е́", "е̏", "е̑", "е̄", "е̂", "ѐ"), + "Е": ("Ѐ", "Е́", "Е̏", "Е̑", "Е̄", "Е̂", "Ѐ"), + "и": ("ѝ", "и́", "и̏", "и̑", "ӣ", "и̂", "ѝ", "ӣ"), + "И": ("Ѝ", "И́", "И̏", "И̑", "Ӣ", "И̂", "Ѝ", "Ӣ"), + "о": ("о̀", "о́", "о̏", "о̑", "о̄", "о̂", "ȏ", "ô"), + "О": ("О̀", "О́", "О̏", "О̑", "О̄", "О̂", "Ȏ", "Ô"), # ...with Latin long-falling/genitive o in NFC, used sometimes as makeshift - u"у": (u"у̀", u"у́", u"у̏", u"у̑", u"ӯ", u"у̂", u"ӯ"), - u"У": (u"У̀", u"У́", u"У̏", u"У̑", u"Ӯ", u"У̂", u"Ӯ"), - u"р": (u"р̀", u"р́", u"р̏", u"р̑", u"р̄", u"р̂"), - u"Р": (u"Р̀", u"Р́", u"Р̏", u"Р̑", u"Р̄", u"Р̂"), + "у": ("у̀", "у́", "у̏", "у̑", "ӯ", "у̂", "ӯ"), + "У": ("У̀", "У́", "У̏", "У̑", "Ӯ", "У̂", "Ӯ"), + "р": ("р̀", "р́", "р̏", "р̑", "р̄", "р̂"), + "Р": ("Р̀", "Р́", "Р̏", "Р̑", "Р̄", "Р̂"), } # All accented letters bunched together, # and inverted mapping (base for each accented letter). _accents_flat = set() _accents_inverted = {} -for base, accents in _accents.items(): +for base, accents in list(_accents.items()): _accents_flat.update(set(accents)) for accent in accents: _accents_inverted[accent] = base del base, accents # do not pollute exports -_max_accent_len = max(map(len, list(_accents_flat))) -_min_accent_len = min(map(len, list(_accents_flat))) -_accent_len_range = range(_max_accent_len, _min_accent_len - 1, -1) +_max_accent_len = max(list(map(len, list(_accents_flat)))) +_min_accent_len = min(list(map(len, list(_accents_flat)))) +_accent_len_range = list(range(_max_accent_len, _min_accent_len - 1, -1)) # FIXME: The graphing sequences with slashes and backslashes are far # too easy to happen accidentally; think of something better. 
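The tables below are applied through _apply_mapping (end of this file), which at every position tries mapping keys from the longest length down to the shortest (_agraph_len_range, _accent_len_range). Longest-first matching is what keeps such a scanner correct whenever one key is a prefix of another, as in this toy Latin-to-Cyrillic table (a made-up subset, not the module's data):

    mapping = {"l": "л", "lj": "љ", "n": "н", "nj": "њ", "j": "ј",
               "u": "у", "b": "б", "a": "а", "v": "в"}
    lengths = sorted({len(k) for k in mapping}, reverse=True)  # longest first

    def apply_mapping(text):
        out, p = [], 0
        while p < len(text):
            for n in lengths:
                target = mapping.get(text[p:p + n])
                if target is not None:
                    out.append(target)
                    p += n
                    break
            else:
                out.append(text[p])
                p += 1
        return "".join(out)

    print(apply_mapping("ljubav"))  # љубав, not лјубав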
_agraphs_unused = { - ur"\а": ur"а̀", - ur"/а": ur"а́", - ur"\\а": ur"а̏", - ur"//а": ur"а̑", - ur"~а": ur"а̄", - ur"\А": ur"А̀", - ur"/А": ur"А́", - ur"\\А": ur"А̏", - ur"//А": ur"А̑", - ur"~А": ur"А̄", - - ur"\е": ur"ѐ", - ur"/е": ur"е́", - ur"\\е": ur"е̏", - ur"//е": ur"е̑", - ur"~е": ur"е̄", - ur"\Е": ur"Ѐ", - ur"/Е": ur"Е́", - ur"\\Е": ur"Е̏", - ur"//Е": ur"Е̑", - ur"~Е": ur"Е̄", - - ur"\и": ur"ѝ", - ur"/и": ur"и́", - ur"\\и": ur"и̏", - ur"//и": ur"и̑", - ur"~и": ur"ӣ", - ur"\И": ur"Ѝ", - ur"/И": ur"И́", - ur"\\И": ur"И̏", - ur"//И": ur"И̑", - ur"~И": ur"Ӣ", - - ur"\о": ur"о̀", - ur"/о": ur"о́", - ur"\\о": ur"о̏", - ur"//о": ur"о̑", - ur"~о": ur"о̄", - ur"\О": ur"О̀", - ur"/О": ur"О́", - ur"\\О": ur"О̏", - ur"//О": ur"О̑", - ur"~О": ur"О̄", - - ur"\у": ur"у̀", - ur"/у": ur"у́", - ur"\\у": ur"у̏", - ur"//у": ur"у̑", - ur"~у": ur"ӯ", - ur"\У": ur"У̀", - ur"/У": ur"У́", - ur"\\У": ur"У̏", - ur"//У": ur"У̑", - ur"~У": ur"Ӯ", - - ur"\р": ur"р̀", - ur"/р": ur"р́", - ur"\\р": ur"р̏", - ur"//р": ur"р̑", - ur"~р": ur"р̄", - ur"\Р": ur"Р̀", - ur"/Р": ur"Р́", - ur"\\Р": ur"Р̏", - ur"//Р": ur"Р̑", - ur"~Р": ur"Р̄", + r"\а": r"а̀", + r"/а": r"а́", + r"\\а": r"а̏", + r"//а": r"а̑", + r"~а": r"а̄", + r"\А": r"А̀", + r"/А": r"А́", + r"\\А": r"А̏", + r"//А": r"А̑", + r"~А": r"А̄", + + r"\е": r"ѐ", + r"/е": r"е́", + r"\\е": r"е̏", + r"//е": r"е̑", + r"~е": r"е̄", + r"\Е": r"Ѐ", + r"/Е": r"Е́", + r"\\Е": r"Е̏", + r"//Е": r"Е̑", + r"~Е": r"Е̄", + + r"\и": r"ѝ", + r"/и": r"и́", + r"\\и": r"и̏", + r"//и": r"и̑", + r"~и": r"ӣ", + r"\И": r"Ѝ", + r"/И": r"И́", + r"\\И": r"И̏", + r"//И": r"И̑", + r"~И": r"Ӣ", + + r"\о": r"о̀", + r"/о": r"о́", + r"\\о": r"о̏", + r"//о": r"о̑", + r"~о": r"о̄", + r"\О": r"О̀", + r"/О": r"О́", + r"\\О": r"О̏", + r"//О": r"О̑", + r"~О": r"О̄", + + r"\у": r"у̀", + r"/у": r"у́", + r"\\у": r"у̏", + r"//у": r"у̑", + r"~у": r"ӯ", + r"\У": r"У̀", + r"/У": r"У́", + r"\\У": r"У̏", + r"//У": r"У̑", + r"~У": r"Ӯ", + + r"\р": r"р̀", + r"/р": r"р́", + r"\\р": r"р̏", + r"//р": r"р̑", + r"~р": r"р̄", + r"\Р": r"Р̀", + r"/Р": r"Р́", + r"\\Р": r"Р̏", + r"//Р": r"Р̑", + r"~Р": r"Р̄", } _agraphs = { #ur"^а": ur"а̂", #ur"^о": ur"о̂", #ur"^А": ur"А̂", #ur"^О": ur"О̂", # ...use Latin NFC forms at places for the moment. - ur"^а" : ur"â", - ur"^о" : ur"ô", - ur"^А" : ur"Â", - ur"^О" : ur"Ô", + r"^а" : r"â", + r"^о" : r"ô", + r"^А" : r"Â", + r"^О" : r"Ô", } -_max_agraph_len = max(map(len, _agraphs.keys())) -_min_agraph_len = min(map(len, _agraphs.keys())) -_agraph_len_range = range(_max_agraph_len, _min_agraph_len - 1, -1) +_max_agraph_len = max(list(map(len, list(_agraphs.keys())))) +_min_agraph_len = min(list(map(len, list(_agraphs.keys())))) +_agraph_len_range = list(range(_max_agraph_len, _min_agraph_len - 1, -1)) def resolve_agraphs (text): """ Convert accent graphs into real accented letters [type F1A hook]. Accented Cyrillic letters still cannot be widely entered directly by keyboard, and in such cases this module allows converting graphical accent-letter representations into actual Unicode compositions. @note: At the moment, only genitive endings are supported. @return: text """ return _apply_mapping(text, _agraphs, _agraph_len_range) def remove_accents (text): """ Remove accents from all accented letters [type F1A hook]. Sometimes it is convenient to operate on text without accents, e.g. when checking spelling. 
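remove_accents works from an explicit inverted table because some makeshift forms in the data are precomposed Latin letters, but for the purely combining-mark cases the same effect can be approximated with Unicode normalization. A standard-library sketch of that alternative (caution: NFD also folds letters such as й whose canonical decomposition contains a mark, which is one reason to prefer an explicit table):

    import unicodedata

    def strip_combining(text):
        # Decompose, drop combining marks (category Mn), recompose.
        decomposed = unicodedata.normalize("NFD", text)
        kept = "".join(c for c in decomposed if unicodedata.category(c) != "Mn")
        return unicodedata.normalize("NFC", kept)

    print(strip_combining("та\u0301ма"))  # тама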
@return: text """ return _apply_mapping(text, _accents_inverted, _accent_len_range) def _apply_mapping (text, mapping, mlenrange): p = 0 pp = 0 tsegs = [] ltext = len(text) while p < ltext: for mlen in mlenrange: mapfrom = text[p:p + mlen] mapto = mapping.get(mapfrom) if mapto: tsegs.append(text[pp:p]) tsegs.append(mapto) p += mlen - 1 pp = p + 1 break p += 1 tsegs.append(text[pp:p]) return "".join(tsegs) diff --git a/pology/lang/sr/charsets.py b/pology/lang/sr/charsets.py index 63b6369d..fcc4f802 100644 --- a/pology/lang/sr/charsets.py +++ b/pology/lang/sr/charsets.py @@ -1,114 +1,114 @@ # -*- coding: UTF-8 -*- """ Conversions between character sets in Serbian texts. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ from pology import _, n_ from pology.report import warning chset_iso8859_5 = set( -u" !\"#$%&'()*+,-./0123456789:;<=>?@" -u"ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`" -u"abcdefghijklmnopqrstuvwxyz{|}~\u00a0" -u"ЁЂЃЄЅІЇЈЉЊЋЌ­ЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ" -u"абвгдежзийклмнопрстуфхцчшщъыьэюя№ёђѓєѕіїјљњћќ§ўџ" +" !\"#$%&'()*+,-./0123456789:;<=>?@" +"ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`" +"abcdefghijklmnopqrstuvwxyz{|}~\u00a0" +"ЁЂЃЄЅІЇЈЉЊЋЌ­ЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ" +"абвгдежзийклмнопрстуфхцчшщъыьэюя№ёђѓєѕіїјљњћќ§ўџ" ) translit_iso8859_5 = { } chset_iso8859_2 = set( -u" !\"#$%&'()*+,-./0123456789:;<=>?@" -u"ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`" -u"abcdefghijklmnopqrstuvwxyz{|}~\u00a0" -u"Ą˘Ł¤ĽŚ§¨ŠŞŤŹ­ŽŻ°" -u"ą˛ł´ľśˇ¸šşťź˝žż" -u"ŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢß" -u"ŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙" +" !\"#$%&'()*+,-./0123456789:;<=>?@" +"ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`" +"abcdefghijklmnopqrstuvwxyz{|}~\u00a0" +"Ą˘Ł¤ĽŚ§¨ŠŞŤŹ­ŽŻ°" +"ą˛ł´ľśˇ¸šşťź˝žż" +"ŔÁÂĂÄĹĆÇČÉĘËĚÍÎĎĐŃŇÓÔŐÖ×ŘŮÚŰÜÝŢß" +"ŕáâăäĺćçčéęëěíîďđńňóôőö÷řůúűüýţ˙" ) translit_iso8859_2 = { - u"×": u"×", + "×": "×", } translit_ascii = { - u"—": "--", - u"–": "-", - u"„": "\"", - u"“": "\"", - u"‘": "'", - u"’": "'", - u"€": "EUR", - u"©": "c", - u"×": "x", - u"\u2011": "-", # non-breaking hyphen - u"\u00a0": " ", # no-break space - u"\u2009": "", # thin space - u"\u202f": "", # narrow no-break space - u"\u200b": "", # zero-width space - u"ä": "ae", - u"ö": "oe", - u"ü": "ue", + "—": "--", + "–": "-", + "„": "\"", + "“": "\"", + "‘": "'", + "’": "'", + "€": "EUR", + "©": "c", + "×": "x", + "\u2011": "-", # non-breaking hyphen + "\u00a0": " ", # no-break space + "\u2009": "", # thin space + "\u202f": "", # narrow no-break space + "\u200b": "", # zero-width space + "ä": "ae", + "ö": "oe", + "ü": "ue", # TODO: Add more. #u"": "", } def limit_to_isocyr (text): """ Limit characters to those available in ISO-8859-5 [type F1A hook]. If a character is neither available in the target character set nor can be transliterated to it, conversion is undefined, and warning is reported to stderr. @return: text """ return _limit_to_chset(text, chset_iso8859_5, translit_iso8859_5, "ISO-8859-5") def limit_to_isolat (text): """ Limit characters to those available in ISO-8859-2 [type F1A hook]. If a character is neither available in the target character set nor can be transliterated to it, conversion is undefined, and warning is reported to stderr. 
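The _limit_to_chset helper (just below) tries a fixed fallback chain per character: keep it if the target set contains it, else apply the charset-specific transliteration, else the shared ASCII transliteration, else warn and emit a placeholder. A condensed standalone version of that chain, with toy data:

    def limit_to_charset(text, chset, translit, translit_ascii, placeholder="?"):
        out = []
        for c in text:
            if c in chset:
                out.append(c)
            elif c in translit:            # charset-specific mapping comes first
                out.append(translit[c])
            elif c in translit_ascii:      # shared ASCII fallbacks
                out.append(translit_ascii[c])
            else:
                out.append(placeholder)    # nothing applies: placeholder (and a warning)
        return "".join(out)

    print(limit_to_charset("cena: 5\u00a0€", set("cena: 5"), {}, {"\u00a0": " ", "€": "EUR"}))
    # cena: 5 EUR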
@return: text """ return _limit_to_chset(text, chset_iso8859_2, translit_iso8859_2, "ISO-8859-2") def _limit_to_chset (text, chset, translit, cname): ltext = [] for c in text: if c in chset: ltext.append(c) continue ct = translit.get(c) # must come before translit_ascii if ct is not None: ltext.append(ct) continue ct = translit_ascii.get(c) if ct is not None: ltext.append(ct) continue warning(_("@info", "Character '%(char)s' (%(code)s) cannot be transliterated " "into character set %(charset)s, removing it.", char=c, code=("U+%X" % ord(c)), charset=cname)) ltext.append("?") return "".join(ltext) diff --git a/pology/lang/sr/nobr.py b/pology/lang/sr/nobr.py index 062570d9..d99c130f 100644 --- a/pology/lang/sr/nobr.py +++ b/pology/lang/sr/nobr.py @@ -1,106 +1,106 @@ # -*- coding: UTF-8 -*- """ Equip text with no-break characters where possibly helpful. The way text is wrapped in UI, by a general wrapping algorithm, is sometimes really not appropriate for Serbian ortography. For example, hyphen-separated case ending should not be wrapped. This module contains functions to heuristically replace ordinary with no-break characters, where such bad breaks can be expected. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import re from pology import PologyError from pology.lang.sr.wconv import ctol -nobrhyp_char = u"\u2011" +nobrhyp_char = "\u2011" def to_nobr_hyphens (mode=0, wchars="", unsafe=False): """ Replace some ordinary hyphens with no-break hyphens [hook factory]. An ordinary hyphen is replaced in one of the following modes, as given by the C{mode} parameter: - 0: if the hyphen is in between two letters, and either preceded or followed by at most four letters - 1: if the hyphen is in between two letters and followed by exactly one letter Using the C{wchars} parameter, some extra characters other than letters can be treated as equal to letters. Note that the function by default substitutes the hyphen only if there are some Cyrillic letters (or an extra character) in the context, as otherwise the hyphen may be a part of URL, command, etc. This can be relaxed by setting C{unsafe} to C{True}, when all letters are treated equally. @param mode: replacement mode @type mode: int @param wchars: extra characters to consider parts of the word @type wchars: string @param unsafe: whether to replace hyphen even if no Cyrillic letters nearby @type unsafe: bool @return: type F1A hook @rtype: C{(text) -> text} """ wchars = wchars.replace("-", "") # just in case nobrhyp_rxstrs = [] if mode == 0: # Catching possible replacement by text before hyphen. - nobrhyp_rxstrs.append(ur"\b(\w{1,4})(-)([\w%s])" % wchars) + nobrhyp_rxstrs.append(r"\b(\w{1,4})(-)([\w%s])" % wchars) # Catching possible replacement by text after hyphen. - nobrhyp_rxstrs.append(ur"([\w%s])(-)(\w{1,4})\b" % wchars) + nobrhyp_rxstrs.append(r"([\w%s])(-)(\w{1,4})\b" % wchars) elif mode == 1: # Catching possible replacement by text after hyphen. - nobrhyp_rxstrs.append(ur"([\w%s])(-)(\w{1})\b" % wchars) + nobrhyp_rxstrs.append(r"([\w%s])(-)(\w{1})\b" % wchars) else: raise PologyError( _("@info", "Unknown hyphen replacement mode %(mode)s.", mode=mode)) nobrhyp_rxs = [re.compile(x, re.U) for x in nobrhyp_rxstrs] # Function to produce replacement for matched pattern. if not unsafe: def nobrhyp_repl (m): # Replace hyphen with no-break hyphen only if there is at least one # Cyrillic letter in the match, or one of extra characters. 
if ctol(m.group()) != m.group() or m.group(1) in wchars: return m.group(1) + nobrhyp_char + m.group(3) else: return m.group() else: def nobrhyp_repl (m): # Replace hyphen with no-break hyphen unconditionally. return m.group(1) + nobrhyp_char + m.group(3) def hook (text): # Quick check, is there any hypen at all in the string? if text.find("-") < 0: return text # Replace as long as the string changes, as there are situations # that the regexes will not catch in one pass (e.g. аб-вг-дђ). while True: text_prev = text for nobrhyp_rx in nobrhyp_rxs: text = nobrhyp_rx.sub(nobrhyp_repl, text) if text_prev == text: break return text return hook diff --git a/pology/lang/sr/reduce.py b/pology/lang/sr/reduce.py index 1bb26eee..d16fdad7 100644 --- a/pology/lang/sr/reduce.py +++ b/pology/lang/sr/reduce.py @@ -1,106 +1,106 @@ # -*- coding: UTF-8 -*- """ Reductions of Serbian text convenient in various special uses. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ from pology.lang.sr.accents import remove_accents from pology.lang.sr.wconv import hictoecq, hictoicq -_srcyr = u"абвгдђежзијклљмнњопрстћуфхцчџшАБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ" +_srcyr = "абвгдђежзијклљмнњопрстћуфхцчџшАБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ" def words_ec (text): """ Reduce text to space-separated Ekavian Cyrillic words [type F1A hook]. Words containing only Serbian Cyrillic characters are extracted, sorted, and joined by spaces into a string. In case the text contains dialect and script hybridization, it is passed through L{hictoec()} to resolve it into clean Ekavian Cyrillic. In case the text contains accent marks, it is passed through L{remove_accents()} to remove them. """ return _words_w(remove_accents(hictoecq(text))) def words_ec_lw (text): """ Reduce text to space-separated Ekavian Cyrillic words, in lower case [type F1A hook]. Like L{words_ec}, but the result is lowercased. """ return words_ec(text.lower()) def words_ic (text): """ Reduce text to space-separated Ijekavian Cyrillic words [type F1A hook]. Like L{words_ec}, but if the text was hybrid it is resolved into clean Ijekavian Cyrillic (see L{hictoic()}). """ return _words_w(remove_accents(hictoicq(text))) def words_ic_lw (text): """ Reduce text to space-separated Ijekavian Cyrillic words, in lower case [type F1A hook]. Like L{words_ic}, but the result is lowercased. """ return words_ic(text.lower()) def _dlc_select (w): - return u"е" in w or u"и" in w + return "е" in w or "и" in w # ...no len(w) >= 3 because an accelerator marker may have split the word. def words_ic_lw_dlc (text): """ Reduce text to space-separated Ijekavian Cyrillic words containing at least three letters, one of which is 'е' or 'и', in lower case [type F1A hook]. Like L{words_ic}, but the result is lowercased. """ return _words_w(remove_accents(hictoicq(text.lower())), select=_dlc_select) def _words_w (text, select=None): words = [] tlen = len(text) p = 0 while p < tlen: while p < tlen and not text[p].isalpha(): p += 1 pp = p allsrcyr = True while p < tlen and text[p].isalpha(): if text[p] not in _srcyr: allsrcyr = False p += 1 word = text[pp:p] if word and allsrcyr and (not select or select(word)): words.append(word) words.sort() return " ".join(words) diff --git a/pology/lang/sr/trapnakron.py b/pology/lang/sr/trapnakron.py index 99e5a479..1375f289 100644 --- a/pology/lang/sr/trapnakron.py +++ b/pology/lang/sr/trapnakron.py @@ -1,885 +1,885 @@ # -*- coding: UTF-8 -*- """ Constructors of syntagma derivators for trapnakron. 
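Back in the no-break hyphen hook above, the substitution runs to a fixed point because one regex pass cannot catch overlapping matches such as аб-вг-дђ. The same idiom in isolation, with a simplified pattern:

    import re

    rx = re.compile(r"(\w)-(\w{1,2})\b", re.U)

    def sub_to_fixpoint(text):
        # Repeat until nothing changes: the second hyphen of "аб-вг-дђ"
        # only becomes matchable after the first one has been replaced,
        # since the shared "г" was consumed by the first match.
        while True:
            prev = text
            text = rx.sub("\\1\u2011\\2", text)
            if text == prev:
                return text

    print(sub_to_fixpoint("аб-вг-дђ"))  # both hyphens become non-breaking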
Trapnakron -- transcriptions and translation of names and acronyms -- is a collection of syntagma derivator definitions residing in C{pology/lang/sr/trapnakron/}. Its purpose is to support translation efforts in Serbian language, where proper nouns and acronyms are frequently transcribed, and sometimes translated. For translators, it can be a manual reference, or even directly sourced in translated material (see below). For readers, it is a way to obtain original forms of transcribed and translated phrases. Trapnakron web pages are built based on trapnakron source in Pology. This makes links between original and localized forms readily available through internet search engines. Adding C{trapnakron} or C{трапнакрон} keyword to the search phrase causes the relevant trapnakron page to appear within top few hits, and the desired other form will be shown already in the excerpt of the hit, such that is not even necessary to follow it. This frees translators from the burden of providing original forms in parenthesis to the first mentioning (or some similar method), and frees the text of the clutter caused by this. While trapnakron definitions may be manually collected and imported into a basic L{Synder} object, this module provides wrappers which free the user of this manual work, as well as appropriate transformation functions (C{*tf} parameters to C{Synder} constructor) to produce various special behaviors on lookups. Trapnakron constructors are defined by type of textual material, e.g. for plain text or Docbook documentation. Documentation of each constructor states what special lookup behaviors will be available through C{Synder} objects created by it. For a short demonstration, consider this derivation of a person's name:: 钱学森, Qián Xuésēn, Tsien Hsue-shen: Ћен| Сјуесен| Suppose that a translator wants to source it directly in the text, rather than to manually copy the transcription (e.g. to avoid having to update the text should the transcription be modified in the future). The translator therefore writes, using XML entity syntax:: ...пројектовању ракета &qianxuesen-g; привукле су идеје... where C{-g} denotes genitive case. This text can be easily processed into the final form (before going out to readers), using a script based on these few lines:: >>> from pology.lang.sr.trapnakron import trapnakron_plain >>> from pology.resolve import resolve_entities_simple as resents >>> tp = trapnakron_plain() >>> >>> s = u"...пројектовању ракета &qianxuesen-g; привукле су идеје..." >>> print resents(s, tp) ...пројектовању ракета Ћена Сјуесена привукле су идеје... >>> @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import os import re import pology from pology import PologyError, _, n_ from pology.lang.sr.nobr import to_nobr_hyphens, nobrhyp_char from pology.lang.sr.wconv import ctol, cltoa from pology.lang.sr.wconv import hctoc, hctol, hitoe, hitoi, hctocl from pology.lang.sr.wconv import cltoh, tohi from pology.fsops import collect_files_by_ext from pology.normalize import identify, xentitize, simplify from pology.report import format_item_list from pology.resolve import first_to_upper from pology.synder import Synder # Allowed environment compositions, out of, in order: # Ekavian Cyrillic, Ekavian Latin, Ijekavian Cyrillic, Ijekavian Latin. # 1 indicates environment present, 0 absent. _good_eicl_combos = set(( "1000", "0100", "0010", "0001", "1100", "0011", "1010", "0101", "1111", )) # Elements for composing alternatives directives. 
-_alt_sep_scr = u"¦|/" -_alt_sep_dlc = u"¦|/" +_alt_sep_scr = "¦|/" +_alt_sep_dlc = "¦|/" # Keywords of known target markups. _known_markups = ( "plain", "xml", "docbook4", ) # Tags found within people names (groups of synonyms). -_pn_tag_first = (u"i", u"и") -_pn_tag_last = (u"p", u"п") -_pn_tag_middle = (u"s", u"с") +_pn_tag_first = ("i", "и") +_pn_tag_last = ("p", "п") +_pn_tag_middle = ("s", "с") _pn_all_tags = set(sum((_pn_tag_first, _pn_tag_last, _pn_tag_middle), ())) # Tag for derivations with unimportant keys. _nokey_tag = "x" # Disambiguation marker. -_disamb_marker = u"¤" +_disamb_marker = "¤" # Enumeration of known derivation key suffixes, for modifying derived values. _suff_pltext = "_ot" # for "obican tekst" _suff_pltext_id = 10 _suff_ltmarkup = "_lv" # for "laksa varijanta" _suff_ltmarkup_id = 20 _suff_gnmatch_m = "_rm" # for "rod muski" _suff_gnmatch_m_id = 30 _suff_gnmatch_z = "_rz" # for "rod zenski" _suff_gnmatch_z_id = 31 _suff_gnmatch_s = "_rs" # for "rod srednji" _suff_gnmatch_s_id = 32 _suff_gnmatch_u = "_ru" # for "rod muski zivi" _suff_gnmatch_u_id = 33 _suff_gnmatch_mk = "_rmk" # for "rod muski mnozine" _suff_gnmatch_mk_id = 34 _suff_gnmatch_zk = "_rzk" # for "rod zenski mnozine" _suff_gnmatch_zk_id = 35 _suff_gnmatch_sk = "_rsk" # for "rod srednji mnozine" _suff_gnmatch_sk_id = 36 _suff_gnmatch_uk = "_ruk" # for "rod muski zivi mnozine" _suff_gnmatch_uk_id = 37 _gnmatch_suffs = [_suff_gnmatch_m, _suff_gnmatch_z, _suff_gnmatch_s, _suff_gnmatch_u, _suff_gnmatch_mk, _suff_gnmatch_zk, _suff_gnmatch_sk, _suff_gnmatch_uk] _gnmatch_suff_ids = [_suff_gnmatch_m_id, _suff_gnmatch_z_id, _suff_gnmatch_s_id, _suff_gnmatch_u_id, _suff_gnmatch_mk_id, _suff_gnmatch_zk_id, _suff_gnmatch_sk_id, _suff_gnmatch_uk_id] _gnmatch_suff_ids_set = set(_gnmatch_suff_ids) _gnmatch_suffs_genums = [ - (_suff_gnmatch_m_id, (u"м", u"m"), (u"ј", u"j")), - (_suff_gnmatch_z_id, (u"ж", u"ž"), (u"ј", u"j")), - (_suff_gnmatch_s_id, (u"с", u"s"), (u"ј", u"j")), - (_suff_gnmatch_u_id, (u"у", u"u"), (u"ј", u"j")), - (_suff_gnmatch_mk_id, (u"м", u"m"), (u"к", u"k")), - (_suff_gnmatch_zk_id, (u"ж", u"ž"), (u"к", u"k")), - (_suff_gnmatch_sk_id, (u"с", u"s"), (u"к", u"k")), - (_suff_gnmatch_uk_id, (u"у", u"u"), (u"к", u"k")), + (_suff_gnmatch_m_id, ("м", "m"), ("ј", "j")), + (_suff_gnmatch_z_id, ("ж", "ž"), ("ј", "j")), + (_suff_gnmatch_s_id, ("с", "s"), ("ј", "j")), + (_suff_gnmatch_u_id, ("у", "u"), ("ј", "j")), + (_suff_gnmatch_mk_id, ("м", "m"), ("к", "k")), + (_suff_gnmatch_zk_id, ("ж", "ž"), ("к", "k")), + (_suff_gnmatch_sk_id, ("с", "s"), ("к", "k")), + (_suff_gnmatch_uk_id, ("у", "u"), ("к", "k")), ] _suff_systr = "_s" # for "sistemska transkripcija" _suff_systr_id = 40 -_systr_ksuff_esuff = (_suff_systr, u"сист") +_systr_ksuff_esuff = (_suff_systr, "сист") _suff_altdv1 = "_a" # for "alternativno izvodjenje" _suff_altdv1_id = 50 _suff_altdv2 = "_a2" # second alternative _suff_altdv2_id = 51 _suff_altdv3 = "_a3" # third alternative _suff_altdv3_id = 52 _altdv_ksuffs_esuffs = [ - (_suff_altdv1, u"алт"), - (_suff_altdv2, u"алт2"), - (_suff_altdv3, u"алт3"), + (_suff_altdv1, "алт"), + (_suff_altdv2, "алт2"), + (_suff_altdv3, "алт3"), ] _aenv_suff_ids = [_suff_systr_id, # order of elements significant _suff_altdv1_id, _suff_altdv2_id, _suff_altdv3_id] _aenv_suff_ids_set = set(_aenv_suff_ids) _suff_pname_f = "_im" # for "ime" _suff_pname_f_id = 60 _suff_pname_l = "_pr" # for "prezime" _suff_pname_l_id = 61 _pname_suffs = [_suff_pname_f, _suff_pname_l] _pname_suff_ids = [_suff_pname_f_id, _suff_pname_l_id] 
_pname_suff_ids_set = set(_pname_suff_ids) -def trapnakron (envec=u"", envel=u"л", envic=u"иј", envil=u"ијл", +def trapnakron (envec="", envel="л", envic="иј", envil="ијл", markup="plain", tagmap=None, ptsuff=None, ltsuff=None, gnsuff=None, stsuff=None, adsuff=None, nmsuff=None, npkeyto=None, nobrhyp=False, disamb="", runtime=False): """ Main trapnakron constructor, covering all options. The trapnakron constructor sets, either by default or optionally, various transformations to enhance queries to the resulting derivator. Default Behavior ================ Property values are returned as alternatives/hybridized compositions of Ekavian Cyrillic, Ekavian Latin, Ijekavian Cyrillic, and Ijekavian Latin forms, as applicable. Any of these forms can be excluded from derivation by setting its C{env*} parameter to C{None}. C{env*} parameters can also be used to change the priority environment from which the particular form is derived. Derivation and property key separator in compound keys is the ASCII hyphen (C{-}). Derivation keys are derived from syntagmas by applying the L{identify()} function. In derivations where this will result in strange keys, additional keys should be defined through hidden syntagmas. Property keys are transliterated into L{stripped-ASCII}. Conflict resolution for derivation keys is not strict (see L{derivator constructor}). Optional behavior ================= Instead of plain text, properties may be reported with some markup. The markup type is given by C{markup} parameter, and can be one of C{"plain"}, C{"xml"}, C{"docbook4"}. The C{tagmap} parameter contains mapping of derivation keys to tags which should wrap properties of these derivations. Derivation keys can have several suffixes which effect how the properties are reported: - Presence of the suffix given by C{ptsuff} parameter signals that properties should be forced to plain text, if another markup is globally in effect. - Parameter C{ltsuff} states the suffix which produces lighter version of the markup, where applicable (e.g. people names in Docbook). - When fetching a property within a sentence (with keys given e.g. as XML entities), sentence construction may require that the resolved value is of certain gender and number; parameter C{gnsuff} can be used to provide a tuple of 4 suffixes for gender in singular and 4 suffixes for gender in plural, such that the property will resolve only if the value of gender and number matches the gender and number suffix. - Parameters C{stsuff} and C{adsuff} provide suffixes through which systematic transcription and alternative derivations are requested. They are actually tuples, where the first element is the key suffix, and the second element the suffix to primary environment which produces the systematic/alternative environment. C{adsuff} can also be a tuple of tuples, if several alternative derivations should be reachable. - In case the entry is a person's name with tagged first and last name, parameter C{nmsuff} can provide a tuple of 2 suffixes by which only the first or last name are requested, respectively. Ordinary hyphens may be converted into non-breaking hyphens by setting the C{nobrhyp} parameter to C{True}. Non-breaking hyphens are added heuristically, see the L{to_nobr_hyphens()} hook. Useful e.g. to avoid wrapping on hyphen-separated case endings. A property key normally cannot be empty, but C{npkeyto} parameter can be used to automatically substitute another property key when empty property key is seen in request for properties. 
In the simpler version, value of C{npkeyto} is just a string of the key to substitute for empty. In the more complex version, the value is a tuple containing the key to substitute and the list of two or more supplemental property keys: empty key is replaced only if all supplemental property values exist and are equal (see e.g. L{trapnakron_plain} for usage of this). Some property values may have been manually decorated with disambiguation markers (C{¤}), to differentiate them from property values of another derivation which would otherwise appear equal under a certain normalization. By default such markers are removed, but instead they can be substituted with a string given by C{disamb} parameter. Some derivations are defined only for purposes of obtaining their properties in scripted translations at runtime. They are by default not included, but can be by setting the C{runtime} parameter to C{True}. @param envec: primary environment for Ekavian Cyrillic derivation @type envec: string or C{None} @param envel: primary environment for Ekavian Latin derivation @type envel: string or C{None} @param envic: primary environment for Ijekavian Cyrillic derivation @type envic: string or C{None} @param envil: primary environment for Ijekavian Latin derivation @type envil: string or C{None} @param markup: target markup @type markup: string @param tagmap: tags to assign to properties by derivation keys @type tagmap: dict string -> string @param ptsuff: derivation key suffix to report plain text properties @type ptsuff: string @param ltsuff: derivation key suffix to report properties in lighter markup @type ltsuff: string @param gnsuff: suffixes by gender and number, to have no resolution if gender or number do not match @type gnsuff: [(string, string)*] @param stsuff: derivation key and environment name suffixes to report systematic transcriptions @type stsuff: (string, string) @param adsuff: derivation key and environment name suffixes to report alternative derivations @type adsuff: (string, string) or ((string, string)*) @param nmsuff: suffixes for fetching only first or last name of a person @type nmsuff: (string, string) @param npkeyto: property key to substitute for empty key, when given @type npkeyto: string or (string, [string*]) @param nobrhyp: whether to convert some ordinary into non-breaking hyphens @type nobrhyp: bool @param disamb: string to replace each disambiguation marker with @type disamb: string @param runtime: whether to include runtime-only derivations @type runtime: bool @returns: trapnakron derivator @rtype: L{Synder} """ env0s = [envec, envel, envic, envil] combo = "".join([(x is not None and "1" or "0") for x in env0s]) if combo not in _good_eicl_combos: raise PologyError( _("@info", "Invalid combination of Ekavian/Ijekavian Cyrillic/Latin " "environments to trapnakron derivator.")) if markup not in _known_markups: raise PologyError( _("@info", "Unknown markup type '%(mtype)s' to trapnakron derivator " "(known markups: %(mtypelist)s).", mtype=markup, mtypelist=format_item_list(_known_markups))) # Compose environment fallback chains. 
env = [] envprops = [] # [(islatin, isije)*] vd = lambda e, d: e if e is not None else d if envec is not None: env.append((envec,)) envprops.append((False, False)) if envel is not None: - env.append((envel, vd(envec, u""))) + env.append((envel, vd(envec, ""))) envprops.append((True, False)) if envic is not None: - env.append((envic, vd(envec, u""))) + env.append((envic, vd(envec, ""))) envprops.append((False, True)) if envil is not None: - env.append((envil, vd(envel, u"л"), vd(envic, u"иј"), vd(envec, u""))) + env.append((envil, vd(envel, "л"), vd(envic, "иј"), vd(envec, ""))) envprops.append((True, True)) # Setup up requests by derivation key suffix. mvends = {} if ptsuff: mvends[ptsuff] = _suff_pltext_id if ltsuff: mvends[ltsuff] = _suff_ltmarkup_id if gnsuff: if len(gnsuff) != 8: raise PologyError( _("@info", "Sequence of gender-number suffixes must have " "exactly 8 elements.")) - mvends.update(zip(gnsuff, _gnmatch_suff_ids)) + mvends.update(list(zip(gnsuff, _gnmatch_suff_ids))) aenvs = {} if adsuff or stsuff: kesuffs = [] # must have same order as _aenv_suff_ids if stsuff is not None: kesuffs.append(stsuff) if not isinstance(adsuff[0], tuple): kesuffs.append(adsuff) else: kesuffs.extend(adsuff) for (ksuff, esuff), suff_id in zip(kesuffs, _aenv_suff_ids): mvends[ksuff] = suff_id # Compose environment fallback chain for this suffix. aenv = [] for env1 in env: aenv1 = [] for esuff1 in (esuff, ""): for env0 in env1: aenv1.append(env0 + esuff1) aenv.append(tuple(aenv1)) aenvs[suff_id] = tuple(aenv) if nmsuff: if len(nmsuff) != 2: raise PologyError( _("@info", "Sequence of person name suffixes must have " "exactly 2 elements.")) - mvends.update(zip(nmsuff, _pname_suff_ids)) + mvends.update(list(zip(nmsuff, _pname_suff_ids))) # Setup substitution of empty property keys. expkeys = [] if isinstance(npkeyto, tuple): npkeyto, expkeys = npkeyto # Create transformators. dkeytf = _sd_dkey_transf(mvends, tagmap) pkeytf = _sd_pkey_transf(npkeyto, expkeys) pvaltf = _sd_pval_transf(envprops, markup, nobrhyp, disamb) ksyntf = _sd_ksyn_transf(markup, False, disamb) envtf = _sd_env_transf(aenvs) # Build the derivator. sd = Synder(env=env, ckeysep="-", dkeytf=dkeytf, dkeyitf=identify, pkeytf=pkeytf, pkeyitf=norm_pkey, pvaltf=pvaltf, ksyntf=ksyntf, envtf=envtf, strictkey=False) # Collect synder files composing the trapnakron. sdfiles = _get_trapnakron_files(runtime) # Import into derivator. for sdfile in sdfiles: sd.import_file(sdfile) return sd def rootdir (): """ Get root directory to trapnakron derivation files. @returns: root directory path @rtype: string """ return os.path.join(pology.datadir(), "lang", "sr", "trapnakron") def _get_trapnakron_files (runtime=False): root = rootdir() files = collect_files_by_ext(root, ["sd"], recurse=False) if runtime: rtroot = os.path.join(root, "runtime") rtfiles = collect_files_by_ext(rtroot, ["sd"], recurse=False) files.extend(rtfiles) return files -def trapnakron_plain (envec=u"", envel=u"л", envic=u"иј", envil=u"ијл"): +def trapnakron_plain (envec="", envel="л", envic="иј", envil="ијл"): """ Constructs trapnakron suitable for application to plain text. Calls L{trapnakron} with the following setup: - Markup is plain text (C{plain}). - Suffixes: C{_rm} ("rod muski") for resolving the property value only if it is of masculine gender, C{_rz} for feminine, C{_rs} for neuter; C{_s} for systematic transcription, C{_a}, C{_a2} and C{_a3} for other alternatives; C{_im} and C{_pr} for person's last and first name. 
- Non-breaking hyphens are heuristically replacing ordinary hyphens. - Empty property key is converted into C{am} (accusative masculine descriptive adjective), providing that it is equal to C{gm} (genitive masculine descriptive adjective); i.e. if the descriptive adjective is invariable. """ return trapnakron( envec, envel, envic, envil, markup="plain", gnsuff=_gnmatch_suffs, stsuff=_systr_ksuff_esuff, adsuff=_altdv_ksuffs_esuffs, nmsuff=_pname_suffs, npkeyto=("am", ("am", "gm")), nobrhyp=True, ) -def trapnakron_ui (envec=u"", envel=u"л", envic=u"иј", envil=u"ијл"): +def trapnakron_ui (envec="", envel="л", envic="иј", envil="ијл"): """ Constructs trapnakron suitable for application to UI texts. Like L{trapnakron_plain}, except that disambiguation markers are not removed but substituted with an invisible character, and runtime-only derivations are included too. Retaining disambiguation markers is useful when a normalized form (typically nominative) is used at runtime as key to fetch other properties of the derivation, and the normalization is such that it would fold two different derivations to same keys if the originating forms were left undecorated. """ return trapnakron( envec, envel, envic, envil, markup="plain", gnsuff=_gnmatch_suffs, stsuff=_systr_ksuff_esuff, adsuff=_altdv_ksuffs_esuffs, nmsuff=_pname_suffs, npkeyto=("am", ("am", "gm")), nobrhyp=True, - disamb=u"\u2060", + disamb="\u2060", runtime=True, ) -def trapnakron_docbook4 (envec=u"", envel=u"л", envic=u"иј", envil=u"ијл", +def trapnakron_docbook4 (envec="", envel="л", envic="иј", envil="ијл", tagmap=None): """ Constructs trapnakron suitable for application to Docbook 4 texts. Calls L{trapnakron} with the following setup: - Markup is Docbook 4 (C{docbook4}). - Suffixes: C{_ot} ("obican tekst") for plain-text properties, C{_lv} ("laksa varijanta") for lighter variant of the markup. Lighter markup currently applies to: people names (no outer C{}, e.g. when it should be elideded due to particular text segmentation on Docbook->PO extraction). Also the suffixes as for L{trapnakron_plain}. - Non-breaking hyphens and empty property keys are treated like in L{trapnakron_plain}. """ return trapnakron( envec, envel, envic, envil, markup="docbook4", tagmap=tagmap, ptsuff=_suff_pltext, ltsuff=_suff_ltmarkup, gnsuff=_gnmatch_suffs, stsuff=_systr_ksuff_esuff, adsuff=_altdv_ksuffs_esuffs, nmsuff=_pname_suffs, npkeyto=("am", ("am", "gm")), nobrhyp=True, runtime=True, # needed for resolution of UI references ) # Transformation for derivation keys: # - lowercase first letter if upper-case, and indicate value uppercasing # - strip special suffixes and indicate value modifications based on them def _sd_dkey_transf (suffspec, tagmap): def transf (dkey, sd): # Whether to uppercase the first letter of properties. fcap = dkey[0:1].isupper() if fcap: dkey = dkey[0].lower() + dkey[1:] # Collect and strip all known special suffixes. found_suff_ids = set() while True: plen_suff_ids = len(found_suff_ids) - for suff, suff_id in suffspec.items(): + for suff, suff_id in list(suffspec.items()): if dkey.endswith(suff): dkey = dkey[:-len(suff)] found_suff_ids.add(suff_id) if len(found_suff_ids) == plen_suff_ids: break # Tag which wraps the property values of this derivation. tag = tagmap.get(dkey) if tagmap else None # Whether to use plain text instead of markup, where applicable. pltext = _suff_pltext_id in found_suff_ids # Whether to use lighter variant of the markup, where applicable. 
ltmarkup = _suff_ltmarkup_id in found_suff_ids # Whether the gender and number is matching. if _gnmatch_suff_ids_set.intersection(found_suff_ids): gstr = sd.get2(dkey, "_rod") nstr = sd.get2(dkey, "_broj", "j") genders = list(set(map(ctol, hctocl(gstr)))) if gstr else [] numbers = list(set(map(ctol, hctocl(nstr)))) if nstr else [] if ( not (len(genders) == 1) or not (len(numbers) == 1) or not all([( x[0] not in found_suff_ids or (genders[0] in x[1] and numbers[0] in x[2])) for x in _gnmatch_suffs_genums]) ): dkey = None # Whether to use one of alternative environments. esuffid = None found_aenv_suff_ids = _aenv_suff_ids_set.intersection(found_suff_ids) if found_aenv_suff_ids: esuffid = tuple(found_aenv_suff_ids)[0] # Whether to select only first or last name (persons). nsuffid = None found_pname_suff_ids = _pname_suff_ids_set.intersection(found_suff_ids) if found_pname_suff_ids: nsuffid = tuple(found_pname_suff_ids)[0] return dkey, fcap, tag, ltmarkup, pltext, esuffid, nsuffid return transf, "self" # Transformation for property keys: # - try to convert empty into non-empty key def _sd_pkey_transf (npkeyto, npkey_eqpkeys): def transf (pkey, dkey, sd): # If key not empty, return it as-is. if pkey: return pkey # Empty ending allowed if all properties requested # by supplementary keys are both existing and equal. # In that case, report the indicated key instead of empty. alleq = True ref_pval = None for tpkey in npkey_eqpkeys: pval = sd.get2(dkey, tpkey) if pval is None: alleq = False break if ref_pval is None: ref_pval = pval elif ref_pval != pval: alleq = False break if alleq: return npkeyto else: return pkey return transf, "dkey", "self" # Transformation for property values: # - capitalize on request from key processing # - add tags on request from key processing # - optionally replace ordinary with no-break hyphens # - resolve known taggings according to selected markup # - add outer tags according to selected markup # - replace disambiguation markers with invisible characters # - construct hybridized forms out of multiple values # If the property key starts with underscore, only hybridization is performed. def _sd_pval_transf (envprops, markup, nobrhyp, disamb): def transf (mtsegs, pkey, dkrest, sd): fcap, tag, ltmarkup, pltext, d5, nsuffid = dkrest if pkey.startswith("_"): fcap = False tag = None pltext = True pvals = [] for tsegs, (islatin, isije) in zip(mtsegs, envprops): if tsegs is None: return None pval1 = _compose_text(tsegs, markup, nobrhyp, disamb, fcap, tag, ltmarkup, pltext, nsuffid, pkey, islatin) if pval1 is None: return None pvals.append(pval1) pval = _hybridize(envprops, pvals) return pval return transf, "pkey", "dkrest", "self" # Transformation for derivation syntagmas. # Like for property value transformation, # except for alternatives/hybridization. def _sd_ksyn_transf (markup, nobrhyp, disamb): def transf (tsegs, dkrest, sd): fcap, tag, ltmarkup, pltext, d5, nsuffid = dkrest ksyn = _compose_text(tsegs, markup, nobrhyp, disamb, fcap, tag, ltmarkup, pltext, nsuffid) return ksyn return transf, "dkrest", "self" # Transformation for derivation environments. # Returns a non-default environment on request from keys processing. def _sd_env_transf (aenvs): def transf (env, dkrest): d1, d2, d3, d4, esuffid, d6 = dkrest if esuffid is not None: return aenvs[esuffid] else: return env return transf, "dkrest" def _compose_text (tsegs, markup, nobrhyp, disamb, fcap, tag, ltmarkup, pltext, nsuffid, pkey=None, tolatin=False): # Tagging and escaping. 
tagsubs="%(v)s" vescape = None if markup in ("xml", "docbook4"): tagsubs = "<%(t)s>%(v)s" vescape = xentitize # All unique tags to current segments. atags = set(sum([x[1] for x in tsegs], [])) if atags.intersection(_pn_all_tags): # A person name. markup_mod = markup if not pltext else "plain" text = _compose_person_name(tsegs, fcap, markup_mod, ltmarkup, nsuffid, pkey) else: # Ordinary derivations. text = simplify("".join([x[0] for x in tsegs])) if _nokey_tag in atags and " " in text: # before anything else text = text[text.find(" "):].lstrip() if fcap: # before adding outer tags text = first_to_upper(text) if vescape: # before adding outer tags text = vescape(text) if tag and not pltext: text = tagsubs % dict(t=tag, v=text) if text is None: return None text = text.replace(_disamb_marker, disamb or "") if nobrhyp: # before conversion to Latin text = to_nobr_hyphens(unsafe=True)(text) if tolatin: text = ctol(text) return text # Combine Ekavian/Ijekavian Cyrillic/Latin forms # into hybrid Ijekavian Cyrillic text. def _hybridize (envprops, pvals): if len(envprops) == 4: # different scripts and dialects cvalc = tohi(pvals[0], pvals[2], delims=_alt_sep_dlc) cvall = tohi(pvals[1], pvals[3], delims=_alt_sep_dlc) if ctol(cvalc) != cvall: cval = cltoh(cvalc, cvall, delims=_alt_sep_scr, full=True) else: cval = cvalc elif len(envprops) == 2: if envprops[0][0] == envprops[1][0]: # different dialects cval = tohi(pvals[0], pvals[1], delims=_alt_sep_dlc) else: # different scripts cval = cltoh(pvals[0], pvals[1], delims=_alt_sep_scr, full=True) else: cval = pvals[0] return cval # Convert tagged person name into destination markup. def _compose_person_name (tsegs, fcap, markup, light, nsuffid, pkey): # Reduce the name to one of its elements if requested. # If the reduction results in empty string, revert to full name. upperlast = False if nsuffid is not None: ntsegs = [] for seg, tags in tsegs: tag = tags[0] if len(tags) > 0 else None if ( (tag in _pn_tag_first and nsuffid == _suff_pname_f_id) or (tag in _pn_tag_last and nsuffid == _suff_pname_l_id) ): ntsegs.append((seg, tags)) if "".join([seg for seg, tags in ntsegs]).strip(): tsegs = ntsegs # Take care to uppercase title to last name ("von", "al", etc.) # if last name alone is selected. upperlast = nsuffid == _suff_pname_l_id # Otherwise, if the requested property is of special type, # cancel the derivation if full name contains several name elements. # FIXME: Actually do this once decided how the client should supply # the test for special keys. 
    elif False: #pkey and len(pkey) > 2:
        seentags = set()
        for seg, tags in tsegs:
            if not seg.strip():
                continue
            tag = tags[0] if len(tags) > 0 else None
            if tag in _pn_tag_first:
                seentags.add(_pn_tag_first[0])
            elif tag in _pn_tag_last:
                seentags.add(_pn_tag_last[0])
            elif tag in _pn_tag_middle:
                seentags.add(_pn_tag_middle[0])
            else:
                seentags.add(None)
        if len(seentags) > 1:
            return None

    if markup == "docbook4":
        name_segs = []
        for seg, tags in tsegs:
            seg = xentitize(seg).strip()
            if not seg:
                continue
            tag = tags[0] if len(tags) > 0 else None
            if tag in _pn_tag_first:
                name_segs.append(" <firstname>%s</firstname>" % seg)
            elif tag in _pn_tag_last:
                if upperlast:
                    seg = seg[0].upper() + seg[1:]
                    upperlast = False
                name_segs.append(" <surname>%s</surname>" % seg)
            elif tag in _pn_tag_middle:
                name_segs.append(" <othername>%s</othername>" % seg)
            else: # untagged
                name_segs.append(" %s" % seg)
        name = "".join(name_segs).strip()
        if not light:
            name = "<personname>%s</personname>" % name
    else:
        name = simplify("".join([seg for seg, tags in tsegs]))
        if upperlast:
            name = name[0].upper() + name[1:]

    return name


def norm_pkey (pkey):
    """
    Normalize internal property keys in trapnakron.

    @param pkey: property key or keys to normalize
    @type pkey: string or (string*) or [string*]

    @returns: normalized keys
    @rtype: as input
    """

-    if isinstance(pkey, basestring):
+    if isinstance(pkey, str):
        return cltoa(pkey)
    elif isinstance(pkey, tuple):
        return tuple(map(cltoa, pkey))
    elif isinstance(pkey, list):
-        return map(cltoa, pkey)
+        return list(map(cltoa, pkey))
    else:
        raise PologyError(
            _("@info",
              "Normalization of property keys requested "
              "on unsupported data type '%(type)s'.",
              type=type(pkey)))


_norm_rtkey_rx = re.compile(r"\s", re.U)

def norm_rtkey (text):
    """
    Normalize text into runtime key for translation scripting.

    @param text: text to normalize into runtime key
    @type text: string

    @returns: runtime key
    @rtype: string
    """

    return _norm_rtkey_rx.sub("", text).lower()

diff --git a/pology/lang/sr/trapres.py b/pology/lang/sr/trapres.py
index 326d9dde..6277629b 100644
--- a/pology/lang/sr/trapres.py
+++ b/pology/lang/sr/trapres.py
@@ -1,152 +1,152 @@
# -*- coding: UTF-8 -*-
"""
Resolve trapnakron references in translations.

@author: Chusslove Illich (Часлав Илић)
@license: GPLv3
"""

from pology import PologyError, _
import pology.lang.sr.trapnakron as T
from pology.comments import manc_parse_list
from pology.markup import xml_entities, html_entities
from pology.resolve import resolve_entities


_known_cons = {}
def _get_names ():
    _known_cons[""] = T.trapnakron # base constructor
    for name in dir(T): # specialized constructors
        if name.startswith("trapnakron_"):
            _known_cons[name[len("trapnakron_"):]] = getattr(T, name)
_get_names()


def froments (name, args=(), kwargs={}, vfilter=None, testsub=False):
    """
    Resolve trapnakron references in translation using
    XML entity format [hook factory].

    If an entity cannot be resolved as a trapnakron reference,
    a warning is output and the entity is left unresolved.
    Instead of leaving the entity unresolved, an illustrative expansion
    for the property key given by the reference can be substituted
    by setting the C{testsub} parameter to C{True}.

    Entities in a given message can be manually ignored through
    C{ignore-entity:} translation comment, which contains
    a comma-separated list of entity names::

        # ignore-entity: foo, bar
        msgid "Blah, blah, &foo;, blah, blah, &bar;."
        msgstr "Бла, бла, &foo;, бла, бла, &bar;."

    Standard XML and HTML entities are ignored by default.

    @param name: suffix of trapnakron constructor, e.g.
"ui" for L{trapnakron_ui} @type name: string @param args: positional arguments to send to the constructor @type args: tuple @param kwargs: keyword arguments to send to the constructor @type kwargs: dict @param vfilter: format string (with single C{%s} directive) or function to apply to every resolved reference @type vfilter: string or (string)->string @param testsub: whether to substitute test forms in place of undefined references @type testsub: bool @return: type F3C hook @rtype: C{(msgstr, msg, cat) -> msgstr} """ trapcon = _known_cons.get(name) if trapcon is None: raise PologyError( _("@info \"trapnakron\" is a shorthand for " "\"Transcriptions and Translations of Names and Acronyms\" " "in Serbian", "Unknown trapnakron constructor '%(name)s'.", name=name)) tp = trapcon(*args, **kwargs) # Setup dummy replacement for undefined references. undefrepl = None if testsub: dkeysub1 = "__test1234a__" dkeysub2 = "__test1234b__" - tp.import_string(u""" + tp.import_string(""" >%s/base/aff.sd %s: лопт|а, лопт|е+2, лопт|ин, лопта|сти %s: ваљ|ак, ваљ|ци+, ваљк|ов, ваљка|сти """ % (T.rootdir(), dkeysub1, dkeysub2)) def undefrepl (ref): res = ref.rsplit("-", 1) if len(res) != 2: return None dkey, pkey = res if pkey == "": pkey = "n" dkeysub = dkeysub1 if len(pkey) == 2 and pkey.endswith("k"): dkeysub = dkeysub2 ckeysub = dkeysub + "-" + pkey return tp[ckeysub].upper() # Entitites normally ignored on resolution. # FIXME: This should go by markup type advertised in catalog header. ignored_refs = {} ignored_refs.update(xml_entities) ignored_refs.update(html_entities) def hook (msgstr, msg, cat): srcstr = "%s:%d(%d)" % (cat.filename, msg.refline, msg.refentry) locally_ignored = manc_parse_list(msg, "ignore-entity:", ",") if locally_ignored: ignored_refs_mod = ignored_refs.copy() ignored_refs_mod.update([(x, None) for x in locally_ignored]) else: ignored_refs_mod = ignored_refs res = resolve_entities(msgstr, tp, ignored_refs_mod, srcname=srcstr, vfilter=vfilter, undefrepl=undefrepl) msgstr, resolved, unknown = res return msgstr return hook _froments_t1_hook = None def froments_t1 (msgstr, msg, cat): """ A testing specialization of L{froments}: C{name="ui"}, C{vfilter="^%s"}, C{testsub=True} [type F3C hook]. """ global _froments_t1_hook if not _froments_t1_hook: - _froments_t1_hook = froments("ui", vfilter=u"^%s", testsub=True) + _froments_t1_hook = froments("ui", vfilter="^%s", testsub=True) return _froments_t1_hook(msgstr, msg, cat) def froments_t1db (msgstr, msg, cat): """ A testing specialization of L{froments}: C{name="docbook4"}, C{vfilter="^%s"}, C{testsub=True} [type F3C hook]. """ global _froments_t1_hook if not _froments_t1_hook: - _froments_t1_hook = froments("docbook4", vfilter=u"^%s", testsub=True) + _froments_t1_hook = froments("docbook4", vfilter="^%s", testsub=True) return _froments_t1_hook(msgstr, msg, cat) diff --git a/pology/lang/sr/wconv.py b/pology/lang/sr/wconv.py index 30b51fc0..7fc719ec 100644 --- a/pology/lang/sr/wconv.py +++ b/pology/lang/sr/wconv.py @@ -1,768 +1,768 @@ # -*- coding: UTF-8 -* """ Conversions between scripts and dialects in Serbian. Serbian standard literary language can be written in two dialects, Ekavian and Ijekavian, and two scripts, Cyrillic and Latin. Dialects and scripts can be freely combined, resulting in four official writing standards: Ekavian Cyrillic, Ekavian Latin, Ijekavian Cyrillic, and Ijekavian Latin. Some automatic and semi-automatic conversions between them are possible. 
Script Transliteration
======================

For plain text containing only Serbian words (including well adapted
loans), it is trivial to transliterate from Cyrillic to Latin script.
It is only necessary to take care when converting Cyrillic Љ, Њ, Џ into
Latin digraphs Lj, Nj, Dž, because sometimes they should be full
upper-case (e.g. Љубљана→Ljubljana, ЉУБЉАНА→LJUBLJANA). But this is
easily resolvable algorithmically, by checking whether the previous or
the next letter is upper-case too.

To transliterate from Latin to Cyrillic is somewhat harder, because in
rare cases the digraphs nj, lj, dž are not single letters but pairs of
standalone letters; i.e. they do not map to Cyrillic љ, њ, џ, but to
лј, нј, дж (dablju→даблју, konjunkcija→конјункција,
nadživeti→надживети). The only way to handle this is by having
a dictionary of special cases.

Furthermore, in today's practice texts are rarely as clean as assumed
above. They are frequently riddled with foreign Latin phrases (such as
proper names), quasiphrases (such as electronic addresses), and
constructive elements (such as markup tags). On the other hand, foreign
Cyrillic phrases are quite infrequent (they may be found e.g. in texts
on linguistic topics). This means that in practice transliteration from
Cyrillic to Latin remains straightforward, but from Latin to Cyrillic
decidedly not so.

Script Hybridization
====================

Sometimes the result of direct transliteration from Cyrillic to Latin
is against the established Latin practice in a certain field,
even if valid according to official orthography.
Then it becomes necessary to specially handle some parts of the text
(e.g. transliterations or lack thereof of foreign proper names).

Alternatives directives are a way to compose "hybrid" Cyrillic-Latin
text, out of which both ordinary Cyrillic and non-directly
transliterated Latin texts can be automatically derived.
For example, this hybrid text::

    Различите ~@/линукс/Linux/ дистрибуције...

can be automatically resolved into::

    Различите линукс дистрибуције...
    Različite Linux distribucije...

String C{~@} is the head of an alternatives directive.
It is followed by a single character, which is then used to delimit
the Cyrillic and Latin parts, in that order, out of the surrounding text.
(For all details on the format of alternatives directives, see
L{resolve_alternatives()<pology.resolve.resolve_alternatives>}).
Transliteration from Cyrillic to Latin is performed only on text
outside of alternatives directives.

Dialect Hybridization
=====================

Both the Ekavian and Ijekavian dialects may be represented within a
single text. Such hybrid text is basically Ijekavian, but jat-reflexes
are marked by inserting one of the jat-reflex ticks C{›}, C{‹}, C{▹}, C{◃}::

    Д‹ио б‹иљежака о В›јештичјој р›ијеци.

Clean Ijekavian text is then obtained by just removing jat-reflex ticks
preceding valid jat-reflexes, and Ekavian by applying the jat-reflex map::

    Дио биљежака о Вјештичјој ријеци.
    Део бележака о Вештичјој реци.

The jat-reflex mapping rules are as follows, grouped by tick:
  - ›ије→е, ›је→е
  - ‹иј→еј, ‹иљ→ел, ‹ио→ео, ‹ље→ле, ‹ње→не
  - ▹ије→и, ▹је→и
  - ◃ијел→ео, ◃ијен→ењ, ◃ит→ет, ◃ил→ел, ◃јел→ео, ◃тн→тњ, ◃шње→сне

For very rare special cases, it is possible to directly provide
different forms for Ekavian and Ijekavian, in that order,
by using an alternatives directive::

    Гд›је с' ~#/то/ба/ пошо̑?

Compared to alternatives directives for scripts, the only difference is
that here the directive head is C{~#}.
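
In terms of the hooks defined below, the jat-reflex example above
resolves as (illustrative)::

    >>> hitoi("Д‹ио б‹иљежака о В›јештичјој р›ијеци.")
    'Дио биљежака о Вјештичјој ријеци.'
    >>> hitoe("Д‹ио б‹иљежака о В›јештичјој р›ијеци.")
    'Део бележака о Вештичјој реци.'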
Alternatives directives for script and dialect can thus be mixed without conflicts, in single text and even interwoven (when interweaving, different delimiters must be used). @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ from pology import PologyError, _, n_ from pology.diff import word_diff, tdiff from pology.report import warning, format_item_list from pology.resolve import resolve_alternatives_simple from pology.resolve import resolve_alternatives # Transliteration table Serbian Cyrillic->Latin. _dict_c2l = { - u'а':u'a', u'б':u'b', u'в':u'v', u'г':u'g', u'д':u'd', u'ђ':u'đ', - u'е':u'e', u'ж':u'ž', u'з':u'z', u'и':u'i', u'ј':u'j', u'к':u'k', - u'л':u'l', u'љ':u'lj',u'м':u'm', u'н':u'n', u'њ':u'nj',u'о':u'o', - u'п':u'p', u'р':u'r', u'с':u's', u'т':u't', u'ћ':u'ć', u'у':u'u', - u'ф':u'f', u'х':u'h', u'ц':u'c', u'ч':u'č', u'џ':u'dž',u'ш':u'š', - u'А':u'A', u'Б':u'B', u'В':u'V', u'Г':u'G', u'Д':u'D', u'Ђ':u'Đ', - u'Е':u'E', u'Ж':u'Ž', u'З':u'Z', u'И':u'I', u'Ј':u'J', u'К':u'K', - u'Л':u'L', u'Љ':u'Lj',u'М':u'M', u'Н':u'N', u'Њ':u'Nj',u'О':u'O', - u'П':u'P', u'Р':u'R', u'С':u'S', u'Т':u'T', u'Ћ':u'Ć', u'У':u'U', - u'Ф':u'F', u'Х':u'H', u'Ц':u'C', u'Ч':u'Č', u'Џ':u'Dž',u'Ш':u'Š', + 'а':'a', 'б':'b', 'в':'v', 'г':'g', 'д':'d', 'ђ':'đ', + 'е':'e', 'ж':'ž', 'з':'z', 'и':'i', 'ј':'j', 'к':'k', + 'л':'l', 'љ':'lj','м':'m', 'н':'n', 'њ':'nj','о':'o', + 'п':'p', 'р':'r', 'с':'s', 'т':'t', 'ћ':'ć', 'у':'u', + 'ф':'f', 'х':'h', 'ц':'c', 'ч':'č', 'џ':'dž','ш':'š', + 'А':'A', 'Б':'B', 'В':'V', 'Г':'G', 'Д':'D', 'Ђ':'Đ', + 'Е':'E', 'Ж':'Ž', 'З':'Z', 'И':'I', 'Ј':'J', 'К':'K', + 'Л':'L', 'Љ':'Lj','М':'M', 'Н':'N', 'Њ':'Nj','О':'O', + 'П':'P', 'Р':'R', 'С':'S', 'Т':'T', 'Ћ':'Ć', 'У':'U', + 'Ф':'F', 'Х':'H', 'Ц':'C', 'Ч':'Č', 'Џ':'Dž','Ш':'Š', # accented NFC: - u'ѐ':u'è', u'ѝ':u'ì', u'ӣ':u'ī', u'ӯ':u'ū', - u'Ѐ':u'È', u'Ѝ':u'Ì', u'Ӣ':u'Ī', u'Ӯ':u'Ū', + 'ѐ':'è', 'ѝ':'ì', 'ӣ':'ī', 'ӯ':'ū', + 'Ѐ':'È', 'Ѝ':'Ì', 'Ӣ':'Ī', 'Ӯ':'Ū', # frequent accented from NFD to NFC (keys now 2-char): - u'а̂':u'â', u'о̂':u'ô', u'а̑':u'ȃ', u'о̑':u'ȏ', + 'а̂':'â', 'о̂':'ô', 'а̑':'ȃ', 'о̑':'ȏ', } # Transliteration table Serbian Cyrillic->ASCII, basic stripped. _dict_c2a_stripped = _dict_c2l.copy() _dict_c2a_stripped.update({ - u'ђ':u'dj', u'ж':u'z', u'ћ':u'c', u'ч':u'c', u'џ':u'dz', u'ш':u's', - u'Ђ':u'Dj', u'Ж':u'Z', u'Ћ':u'C', u'Ч':u'C', u'Џ':u'Dz', u'Ш':u'S', + 'ђ':'dj', 'ж':'z', 'ћ':'c', 'ч':'c', 'џ':'dz', 'ш':'s', + 'Ђ':'Dj', 'Ж':'Z', 'Ћ':'C', 'Ч':'C', 'Џ':'Dz', 'Ш':'S', }) # Transliteration table Serbian Latin->ASCII, basic stripped. _dict_l2a_stripped = { - u'đ':u'dj', u'ž':u'z', u'ć':u'c', u'č':u'c', u'š':u's', - u'Đ':u'Dj', u'Ž':u'Z', u'Ć':u'C', u'Č':u'C', u'Š':u'S', + 'đ':'dj', 'ž':'z', 'ć':'c', 'č':'c', 'š':'s', + 'Đ':'Dj', 'Ž':'Z', 'Ć':'C', 'Č':'C', 'Š':'S', } # Transliteration table Serbian any->ASCII, basic stripped. _dict_cl2a_stripped = {} _dict_cl2a_stripped.update(_dict_c2a_stripped) _dict_cl2a_stripped.update(_dict_l2a_stripped) # Transliteration table English in Serbian Cyrillic->Latin, by keyboard layout. _dict_c2a_englay = _dict_c2l.copy() _dict_c2a_englay.update({ - u'љ':u'q', u'њ':u'w', u'ж':u'y', u'џ':u'x', - u'Љ':u'Q', u'Њ':u'W', u'Ж':u'Y', u'Џ':u'X', + 'љ':'q', 'њ':'w', 'ж':'y', 'џ':'x', + 'Љ':'Q', 'Њ':'W', 'Ж':'Y', 'Џ':'X', }) def ctol (text): """ Transliterate text from Cyrillic to proper Latin [type F1A hook]. """ return _ctol_w(text, _dict_c2l) def cltoa (text): """ Transliterate text from Cyrillic or Latin to stripped ASCII [type F1A hook]. 
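
    For example (a minimal sketch, per the stripped-ASCII tables above)::

        >>> cltoa("Џепни речник")
        'Dzepni recnik'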
""" return _ctol_w(text, _dict_cl2a_stripped) def ectol (text): """ Transliterate text from English in Cyrillic by keyboard layout to proper English [type F1A hook]. """ return _ctol_w(text, _dict_c2a_englay) def _ctol_w (text, trdict): # NOTE: Converted directly from C++ code, # perhaps something more efficient is possible. tlen = len(text) - ntext = u"" + ntext = "" for i in range(tlen): c = text[i] c2 = text[i:i+2] r = trdict.get(c2) or trdict.get(c) if r is not None: if len(r) > 1 and c.isupper() \ and ( (i + 1 < tlen and text[i + 1].isupper()) \ or (i > 0 and text[i - 1].isupper())): ntext += r.upper() else: ntext += r else: ntext += c return ntext # Head of alternatives directives for script. _shyb_althead = "~@" def hctoc (text): """ Resolve hybrid Cyrillic text with script alternatives into plain Cyrillic text [type F1A hook]. """ return resolve_alternatives_simple(text, 1, 2, althead=_shyb_althead) def hctol (text): """ Resolve hybrid Cyrillic text with script alternatives into plain Latin text [type F1A hook]. """ return resolve_alternatives_simple(text, 2, 2, althead=_shyb_althead, outfilter=ctol) def hctocl (htext): """ Resolve hybrid Cyrillic-Latin text into clean Cyrillic and clean Latin. @param htext: hybrid text @type htext: string @returns: Cyrillic and Latin texts @rtype: (string, string) """ return hctoc(htext), hctol(htext) -def cltoh (textc, textl, delims=u"/|¦", full=False): +def cltoh (textc, textl, delims="/|¦", full=False): """ Construct hybrid Cyrillic text out of clean Cyrillic and Latin texts. Hybridization is performed by inserting alternatives directives for parts which cannot be resolved by direct transliteration. If C{full} is set to C{True}, complete texts are unconditionally wrapped into single alternatives directive. @param textc: Cyrillic text @type textc: string @param textl: Latin text @type textl: string @param delims: possible delimiter characters @type delims: string @param full: whether to wraf full texts as single alternatives directive @type full: bool @returns: hybrid Cyrillic text @rtype: string """ if not full: wdiff = word_diff(ctol(textc), textl) textc = _padc(textc) segs = [] i = 0 ic = 0 while i < len(wdiff): tag, seg = wdiff[i] if tag == " ": segc = textc[ic:ic + len(seg)] segs.append(segc) else: seg2 = wdiff[i + 1][1] if i + 1 < len(wdiff) else "" if tag == "-": segc = textc[ic:ic + len(seg)] segl = seg2 else: segc = textc[ic:ic + len(seg2)] segl = seg i += 1 segs.append(_shyb_althead + _delimit([segc, segl], delims)) ic += len(seg) i += 1 return _unpadc("".join(segs)) else: return _shyb_althead + _delimit([textc, textl], delims) return "".join(segs) -_padc_chr = u"\u0004" -_padc_alphas = (u"љ", u"њ", u"џ", u"Љ", u"Њ", u"Џ") +_padc_chr = "\u0004" +_padc_alphas = ("љ", "њ", "џ", "Љ", "Њ", "Џ") def _padc (text): for alpha in _padc_alphas: text = text.replace(alpha, _padc_chr + alpha) return text def _unpadc (text): for alpha in _padc_alphas: text = text.replace(_padc_chr + alpha, alpha) return text # Ijekavian to Ekavian map (Latin script and letter cases derived afterwards). # All Ijekavian-Ekavian form pairs have to be unique across all groups. # Within a group, one Ijekavian form must not be in the prefix of another. 
_reflex_spec = (
-    (u"›", {
-        u"ије": u"е",
-        u"је": u"е",
+    ("›", {
+        "ије": "е",
+        "је": "е",
    }),
-    (u"‹", {
-        u"иј": u"еј", # гријати → грејати
-        u"иљ": u"ел", # биљешка → белешка
-        u"ио": u"ео", # дио → део
-        u"ље": u"ле", # љето → лето
-        u"ње": u"не", # гњев → гнев
+    ("‹", {
+        "иј": "еј", # гријати → грејати
+        "иљ": "ел", # биљешка → белешка
+        "ио": "ео", # дио → део
+        "ље": "ле", # љето → лето
+        "ње": "не", # гњев → гнев
    }),
-    (u"▹", {
-        u"ије": u"и", # налијевати → наливати
-        u"је": u"и", # утјецај → утицај
+    ("▹", {
+        "ије": "и", # налијевати → наливати
+        "је": "и", # утјецај → утицај
    }),
-    (u"◃", {
-        u"ијел": u"ео", # бијел → бео
-        u"ијен": u"ењ", # лијен → лењ
-        u"ил": u"ел", # вриједила → вредела
-        u"ит": u"ет", # вриједити → вредети
-        u"јел": u"ео", # одјел → одео
-        u"тн": u"тњ", # љетни → летњи
-        u"шње": u"сне", # побјешњели → побеснели
+    ("◃", {
+        "ијел": "ео", # бијел → бео
+        "ијен": "ењ", # лијен → лењ
+        "ил": "ел", # вриједила → вредела
+        "ит": "ет", # вриједити → вредети
+        "јел": "ео", # одјел → одео
+        "тн": "тњ", # љетни → летњи
+        "шње": "сне", # побјешњели → побеснели
    }),
)

def _derive_reflex_specs (reflex_spec):

    reflex_spec_dehyb = []
    reflex_spec_hyb = {}
    for tick, refmap in reflex_spec:
        # Derive data for dehybridization.
        # Derive Latin cases (must be done before other cases).
-        refmap.update([map(ctol, x) for x in refmap.items()])
+        refmap.update([list(map(ctol, x)) for x in list(refmap.items())])
        # Derive cases with first letter in uppercase.
-        refmap.update([map(unicode.capitalize, x) for x in refmap.items()])
+        refmap.update([list(map(str.capitalize, x)) for x in list(refmap.items())])
        # Derive cases with all letters in uppercase.
-        refmap.update([map(unicode.upper, x) for x in refmap.items()])
+        refmap.update([list(map(str.upper, x)) for x in list(refmap.items())])
        # Compute minimum and maximum reflex lengths.
-        ijklen_min = min(map(len, refmap.keys()))
-        ijklen_max = max(map(len, refmap.keys()))
+        ijklen_min = min(list(map(len, list(refmap.keys()))))
+        ijklen_max = max(list(map(len, list(refmap.keys()))))
        reflex_spec_dehyb.append((tick, refmap, ijklen_min, ijklen_max))

        # Derive data for hybridization:
        # {(btrk, ekvlen, ijklen): {ijkfrm: [(ekvfrm, tick)...]}}
-        for ijkfrm, ekvfrm in refmap.items():
+        for ijkfrm, ekvfrm in list(refmap.items()):
            # Compute backtracking from position of jat-reflex difference.
            btrk = 0
            while (    btrk < len(ijkfrm) and btrk < len(ekvfrm)
                   and ijkfrm[btrk] == ekvfrm[btrk]
            ):
                btrk += 1
            pkey = (btrk, len(ekvfrm), len(ijkfrm))
            if pkey not in reflex_spec_hyb:
                reflex_spec_hyb[pkey] = {}
            if ijkfrm not in reflex_spec_hyb[pkey]:
                reflex_spec_hyb[pkey][ijkfrm] = []
            reflex_spec_hyb[pkey][ijkfrm].append((ekvfrm, tick))

    # Convert hybridization data into list of pairs.
    # Sort such that on hybridization reflexes are tried by
    # increasing backtrack,
    # decreasing smaller length of the two reflexes,
    # decreasing greater length of the two reflexes.
    tmplst = []
-    pkeys = reflex_spec_hyb.keys()
+    pkeys = list(reflex_spec_hyb.keys())
    pkeys.sort(key=lambda x: (x[0], -min(x[1], x[2]), -max(x[1], x[2])))
    reflex_spec_hyb = [k + (reflex_spec_hyb[k],) for k in pkeys]

    return reflex_spec_dehyb, reflex_spec_hyb

_reflex_spec_dehyb, _reflex_spec_hyb = _derive_reflex_specs(_reflex_spec)

# Head of alternatives directives for dialect.
_dhyb_althead = "~#"


def hitoe (text):
    """
    Resolve hybrid Ijekavian text with jat-reflex ticks and dialect
    alternatives into plain Ekavian text [type F1A hook].
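
    For example (a minimal sketch; behavior follows the reflex map above)::

        >>> hitoe("Д‹ио б‹иљежака.")
        'Део бележака.'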
""" return _hito_w(text) def hitoeq (text): """ Like L{hitoe}, but does not output warnings on problems [type F1A hook]. """ return _hito_w(text, silent=True) def hitoi (text): """ Resolve hybrid Ijekavian text with jat-reflex ticks and dialect alternatives into plain Ijekavian text [type F1A hook]. """ return _hito_w(text, toijek=True) def hitoiq (text): """ Like L{hitoi}, but does not output warnings on problems [type F1A hook]. """ return _hito_w(text, toijek=True, silent=True) def _hito_w (text, toijek=False, silent=False, validate=False): errspans = [] if validate else None for tick, refmap, ijklen_min, ijklen_max in _reflex_spec_dehyb: text = _hito_w_simple(text, tick, refmap, ijklen_min, ijklen_max, toijek, silent, errspans) srcname = "" if (not silent and not validate) else None selalt = 1 if not toijek else 2 text, ngood, allgood = resolve_alternatives(text, selalt, 2, althead=_dhyb_althead, srcname=srcname) if not allgood and validate: errmsg = n_("@info \"alternatives directive\" is a term", "Malformed Ekavian-Ijekavian alternatives directive " "encountered after %(num)d good directive.", "Malformed Ekavian-Ijekavian alternatives directive " "encountered after %(num)d good directives.", num=ngood) errspans.append((0, 0, errmsg)) if not validate: return text else: return errspans def _hito_w_simple (text, tick, refmap, ijklen_min, ijklen_max, toijek, silent, errspans): segs = [] p = 0 while True: pp = p p = text.find(tick, p) if p < 0: segs.append(text[pp:]) break segs.append(text[pp:p]) pp = p p += len(tick) if p >= len(text) or not text[p:p + 1].isalpha(): segs.append(tick) continue ijklen = ijklen_min ekvfrm = None while ijklen <= ijklen_max and ekvfrm is None: ijkfrm = text[p:p + ijklen] ekvfrm = refmap.get(ijkfrm) ijklen += 1 if ekvfrm is not None: segs.append(ekvfrm if not toijek else ijkfrm) p += len(ijkfrm) else: segs.append(tick) errmsg = _("@info \"jat\" is the name of an old Serbian letter", "Unknown jat-reflex starting from '%(snippet)s'.", snippet=text[pp:pp + 20]) if not silent: warning(errmsg) if errspans is not None: errspans.append((pp, pp + ijklen_max, errmsg)) return "".join(segs) def validate_dhyb (text): """ Check whether dialect-hybrid text is valid [type V1A hook]. """ return _hito_w(text, silent=True, validate=True) def hitoei (htext): """ Resolve hybrid Ijekavian-Ekavain text into clean Ekavian and Ijekavian. @param htext: hybrid text @type htext: string @returns: Ekavian and Ijekavian text @rtype: (string, string) """ return hitoe(htext), hitoi(htext) -def tohi (text1, text2, ekord=None, delims=u"/|¦", parthyb=False): +def tohi (text1, text2, ekord=None, delims="/|¦", parthyb=False): """ Construct hybrid Ijekavian text out of Ekavian and Ijekavian texts. Hybridization is performed by merging Ekavian and Ijekavian forms into Ijekavian forms with inserted jat-reflex ticks. Input texts can be both in Cyrillic and Latin, and piecewise so. Texts also do not have to be clean Ekavian and Ijekavian, as hybridization is performed only at difference segments. Order of text arguments is not important as long as all difference segments can be merged (i.e. the function is comutative in that case). If a difference segment cannot be merged by jat-reflex ticks, then the resolution depends on C{ekord} parameter. If it is C{None}, then the segment of C{text2} is taken into result. 
If it is C{1} or C{2}, then the segments of C{text1} and C{text2} are combined in a dialect alternatives directive (C{~#/.../.../}); the number determines which segment is put first in the directive (i.e. considered Ekavian), that of C{text1} or of C{text2}. Any other value of C{ekord} leads to undefined behavior. It is possible that input texts are already partially hybridized, and only some parts of them need to be additionally hybridized. Setting C{parthyb} to C{True} will tell the function to detect and skip already hybridized segments, and hybridize only the rest. @param text1: first text @type text1: string @param text2: second text @type text2: string @param ekord: enumerates the text to be considered Ekavian when adding alternatives directives @type ekord: None, 1, 2 @param delims: possible delimiter characters for alternatives directives @type delims: string @param parthyb: whether input texts are already partially hybridized @type parthyb: bool @returns: hybrid Ijekavian text @rtype: string """ len1 = len(text1); len2 = len(text2) i1 = 0; i1p = 0; i2 = 0; i2p = 0 segs = [] while True: while i1 < len1 and i2 < len2 and text1[i1] == text2[i2]: if not parthyb: i1 += 1 i2 += 1 else: i1 += _step_over_hyb(text1, i1) i2 += _step_over_hyb(text2, i2) if i1 == len1 and i2 == len2: segs.append(text1[i1p:]) # same as text2[i2p:] break # Try to hybridize difference by jat-reflex ticks. tick = None for texte, texti, ie, ii, order12 in ( (text1, text2, i1, i2, True), (text2, text1, i2, i1, False), ): frms = [] for btrk, lene, leni, refmap in _reflex_spec_hyb: ieb = ie - btrk iib = ii - btrk if ieb < 0 or iib < 0: continue frme = texte[ieb:ieb + lene] frmi = texti[iib:iib + leni] for cfrme, ctick in refmap.get(frmi, []): if cfrme == frme: tick = ctick break if tick: break if tick: break if tick: # Hybridization by difference marks possible. segs.append(text1[i1p:i1 - btrk]) # same as text2[i2p:i2 - btrk] segs.append(tick + frmi) i1p = i1 - btrk + (lene if order12 else leni) i2p = i2 - btrk + (leni if order12 else lene) else: # Hybridization by difference marks not possible. # Use alternatives directive, or pure Ijekavian. 
            i1b = i1; i2b = i2
            while (    i1b > i1p and i2b > i2p
                   and (text1[i1b - 1].isalpha() + text2[i2b - 1].isalpha() == 1)
            ):
                i1b -= 1; i2b -= 1
            segs.append(text1[i1p:i1b])
            wdiff = word_diff(text1[i1b:], text2[i2b:])
            frm1s = []
            frm2s = []
            while wdiff and wdiff[0][0] != " ":
                tag, seg = wdiff.pop(0)
                if tag != "+":
                    frm1s.append(seg)
                if tag != "-":
                    frm2s.append(seg)
            frm1 = "".join(frm1s)
            frm2 = "".join(frm2s)
            i1p = i1b + len(frm1)
            i2p = i2b + len(frm2)
            if ekord == 1:
                segs.append(_dhyb_althead + _delimit([frm1, frm2], delims))
            elif ekord == 2:
                segs.append(_dhyb_althead + _delimit([frm2, frm1], delims))
            else:
                segs.append(frm2)

        i1 = i1p
        i2 = i2p

    htext = "".join(segs)

    return htext


_reflex_spec_dehyb_by_tick = dict((x[0], x[1:]) for x in _reflex_spec_dehyb)

def _step_over_hyb (text, pos):

    refspec = _reflex_spec_dehyb_by_tick.get(text[pos])
    if refspec is not None: # there is a reflex
        refmap, ijklen_min, ijklen_max = refspec
        ijklen = ijklen_min
        ekvfrm = None
        while ijklen <= ijklen_max and ekvfrm is None:
            ijkfrm = text[pos + 1:pos + 1 + ijklen]
            ekvfrm = refmap.get(ijkfrm)
            ijklen += 1
        if ekvfrm is not None:
            steplen = 1 + len(ijkfrm)
        else: # malformed reflex
            steplen = 1
    elif text.startswith(_dhyb_althead, pos): # there is an alternatives directive
        if pos + len(_dhyb_althead) < len(text):
            sep = text[pos + len(_dhyb_althead)]
            pos2 = text.find(sep, pos + len(_dhyb_althead) + 1)
            if pos2 >= 0:
                pos3 = text.find(sep, pos2 + 1)
                if pos3 >= 0:
                    steplen = pos3 - pos + 1
                else: # malformed directive
                    steplen = 1
            else: # malformed directive
                steplen = 1
        else: # malformed directive
            steplen = 1
    else: # there is plain text
        steplen = 1

    return steplen


def hictoec (text):
    """
    Resolve hybrid Ijekavian-Ekavian Cyrillic-Latin text into
    clean Ekavian Cyrillic text [type F1A hook].
    """

    return hctoc(hitoe(text))


def hictoecq (text):
    """
    Like L{hictoec}, but does not output warnings on problems
    [type F1A hook].
    """

    return hctoc(hitoeq(text))


def hictoel (text):
    """
    Resolve hybrid Ijekavian-Ekavian Cyrillic-Latin text into
    clean Ekavian Latin text [type F1A hook].
    """

    return hctol(hitoe(text))


def hictoic (text):
    """
    Resolve hybrid Ijekavian-Ekavian Cyrillic-Latin text into
    clean Ijekavian Cyrillic text [type F1A hook].
    """

    return hctoc(hitoi(text))


def hictoicq (text):
    """
    Like L{hictoic}, but does not output warnings on problems
    [type F1A hook].
    """

    return hctoc(hitoiq(text))


def hictoil (text):
    """
    Resolve hybrid Ijekavian-Ekavian Cyrillic-Latin text into
    clean Ijekavian Latin text [type F1A hook].
    """

    return hctol(hitoi(text))


def hictoall (htext):
    """
    Resolve hybrid Ijekavian-Ekavian Cyrillic-Latin text into
    all four clean variants.
@param htext: hybrid text @type htext: string @returns: Ekavian Cyrillic, Ekavian Latin, Ijekavian Cyrillic, and Ijekavian Latin text @rtype: (string, string, string, string) """ htextc = hctoc(htext) htextl = hctol(htext) return hitoe(htextc), hitoe(htextl), hitoi(htextc), hitoi(htextl) def _delimit (alts, delims): good = False for delim in delims: good = True for alt in alts: if delim in alt: good = False break if good: break if not good: fmtalts = format_item_list(["{%s}" % x for x in alts]) raise PologyError( _("@info", "No delimiter from '%(delimstr)s' can be used for " "alternatives directive containing: %(snippetlist)s.", delimstr=delims, snippetlist=fmtalts)) return delim + delim.join(alts) + delim diff --git a/pology/markup.py b/pology/markup.py index f0339493..2ac52d99 100644 --- a/pology/markup.py +++ b/pology/markup.py @@ -1,2051 +1,2051 @@ # -*- coding: UTF-8 -*- """ Convert and validate markup in text. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import os import re import codecs import xml.parsers.expat import difflib from pology import PologyError, datadir, _, n_ from pology.comments import manc_parse_flag_list from pology.diff import adapt_spans from pology.entities import read_entities from pology.getfunc import get_result_ireq from pology.msgreport import report_on_msg from pology.multi import Multidict from pology.report import format_item_list # Pipe flag used to manually prevent check for a particular message. flag_no_check_markup = "no-check-markup" _nlgr_rx = re.compile(r"\n{2,}") _wsgr_rx = re.compile(r"\s+") def plain_to_unwrapped (text): """ Convert wrapped plain text to unwrapped. Two or more newlines are considered as paragraph boundaries and left in, while all other newlines are removed. Whitespace in the text is simplified throughout. @param text: text to unwrap @type text: string @returns: unwrapped text @rtype: string """ # Strip leading and trailing whitespace. text = text.strip() # Strip leading and trailing whitespace in all lines. text = "\n".join([x.strip() for x in text.split("\n")]) # Mask all paragraph breaks. pbmask = "\x04\x04" text = _nlgr_rx.sub(pbmask, text) # Replace all whitespace groups with single space. text = _wsgr_rx.sub(" ", text) # Unmask paragraph breaks. text = text.replace(pbmask, "\n\n") return text xml_entities = { "lt": "<", "gt": ">", "apos": "'", "quot": "\"", "amp": "&", } WS_SPACE = "\x04~sp" WS_TAB = "\x04~tb" WS_NEWLINE = "\x04~nl" _ws_masks = { WS_SPACE: " ", WS_TAB: "\t", WS_NEWLINE: "\n", } -_ws_unmasks = dict([(y, x) for x, y in _ws_masks.items()]) +_ws_unmasks = dict([(y, x) for x, y in list(_ws_masks.items())]) def xml_to_plain (text, tags=None, subs={}, ents={}, keepws=set(), ignels=set()): """ Convert any XML-like markup to plain text. By default, all tags in the text are replaced with a single space; entities, unless one of the XML default (C{<}, C{>}, C{&}, C{"}, C{'}), are left untouched; all whitespace groups are simplified to single space and leading and trailing removed. If only a particular subset of tags should be taken into account, it can be specified by the C{tags} parameter, as a sequence of tag names (the sequence is internally converted to set before processing). If a tag should be replaced with a special sequence of characters (either opening or closing tag), or the text wrapped by it replaced too, this can be specified by the C{subs} parameter. It is a dictionary of 3-tuples by tag name, which tells what to replace with the opening tag, the closing tag, and the wrapped text. 
    For example, to replace C{<i>foobar</i>} with C{/foobar/},
    the dictionary entry would be C{{"i": ("/", "/", None)}}
    (where the final C{None} states not to touch the wrapped text);
    to replace C{<code>...</code>} with C{@@@} (i.e. remove the code
    segment completely but leave in a marker that there was something),
    the entry is C{{"code": ("", "", "@@@")}}.
    The replacement for the wrapped text can also be a function,
    taking a string and returning a string.
    Note that whitespace is automatically simplified, so if whitespace
    given by the replacements should be exactly preserved, use C{WS_*}
    string constants in place of corresponding whitespace characters.

    To have some entities other than the XML default replaced with
    proper values, a dictionary of known entities with values may be
    provided using the C{ents} parameter.

    Whitespace can be preserved within some elements, as given by
    their tags in the C{keepws} sequence.

    Some elements may be completely removed, as given by the C{ignels}
    sequence. Each element of the sequence should either be a tag,
    or a (tag, type) tuple, where type is the value of the C{type}
    attribute of the element, if any.

    It is assumed that the markup is well-formed; if it is not,
    the result is undefined, but a best attempt at conversion is made.

    There are several other functions in this module which deal with
    well-known markups, such that it is not necessary to use
    this function with C{tags}, C{subs}, or C{ents} manually specified.

    If you only want to resolve entities from a known set, instead of
    calling this function with empty C{tags} and entities given in C{ents},
    consider using the more powerful L{pology.resolve.resolve_entities}.

    @param text: markup text to convert to plain
    @type text: string
    @param tags: known tags
    @type tags: sequence of strings
    @param subs: replacement specification
    @type subs: dictionary of 3-tuples
    @param ents: known entities and their values
    @type ents: dictionary
    @param keepws: tags of elements in which to preserve whitespace
    @type keepws: sequence of strings
    @param ignels: tags or tag/types of elements to completely remove
    @type ignels: sequence of strings and (string, string) tuples

    @returns: plain text version
    @rtype: string
    """

    # Convert some sequences to sets, for faster membership checks.
    if tags is not None and not isinstance(tags, set):
        tags = set(tags)
    if not isinstance(keepws, set):
        keepws = set(keepws)
    if not isinstance(ignels, set):
        ignels = set(ignels)

    # Resolve user-supplied entities before tags,
    # as they may contain more markup.
    # (Resolve default entities after tags,
    # because the default entities can introduce invalid markup.)
    text = _resolve_ents(text, ents, xml_entities)

    # Build element tree, trying to work around badly formed XML
    # (but do note when the closing element is missing).
    #
    # Element tree is constructed as list of tuples:
    # (tag, opening_tag_literal, closing_tag_literal, atype, content)
    # where atype is the value of type attribute (if any),
    # and content is a sublist for given element;
    # tag may be #text, when the content is string.
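    # For example (illustrative), parsing "a <b>c</b> d" would yield roughly:
    #   [("#text", None, None, None, "a "),
    #    ["b", "<b>", "</b>", None, [("#text", None, None, None, "c")]],
    #    ("#text", None, None, None, " d")]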
eltree = [] curel = eltree parent = [] any_tag = False p = 0 while True: pp = p p = text.find("<", p) if p < 0: break curel.append(("#text", None, None, None, text[pp:p])) tag_literal, tag, atype, opening, closing, p = _parse_tag(text, p) if p < 0: break if opening: # opening tag any_tag = True curel.append([tag, tag_literal, None, atype, []]) parent.append(curel) curel = curel[-1][-1] if closing: # closing tag (can be both opening and closing) if parent: curel = parent.pop() if not opening: # Record closing tag literal if not opening as well. curel[-1][2] = tag_literal else: # faulty markup, move top element eltree = [[tag, None, tag_literal, None, curel]] curel = eltree curel.append(("#text", None, None, None, text[pp:])) # Replace tags. text = _resolve_tags(eltree, tags, subs, keepws, ignels) # Resolve default entities. text = _resolve_ents(text, xml_entities) return text def _parse_tag (text, p): # text[p] must be "<" tag = "" atype = None opening = True closing = False tlen = len(text) pp = p in_str = False in_tag = False in_attr = False in_lead = True in_afterslash = False in_aftereq = False in_aftertag = False in_afterattr = False ntag = "" nattr = "" while True: p += 1 if p >= tlen: break if in_lead and not text[p].isspace(): in_lead = False opening = text[p] != "/" if opening: in_tag = True p_tag = p else: in_afterslash = True elif in_afterslash and not text[p].isspace(): in_afterslash = False in_tag = True p_tag = p elif in_tag and (text[p].isspace() or text[p] in "/>"): in_tag = False in_aftertag = True tag = text[p_tag:p] ntag = tag.lower() elif in_aftertag and not (text[p].isspace() or text[p] in "/>"): in_aftertag = False in_attr = True p_attr = p elif in_attr and (text[p].isspace() or text[p] in "=/>"): in_attr = False if text[p] != "=": in_afterattr = True else: in_aftereq = True attr = text[p_attr:p] nattr = attr.lower() elif in_aftereq and text[p] in ('"', "'"): in_aftereq = False in_str = True quote_char = text[p] p_str = p + 1 elif in_str and text[p] == quote_char: in_str = False s = text[p_str:p].strip().replace(" ", "") if nattr == "type": atype = s elif in_afterattr and text[p] == "=": in_afterattr = False in_aftereq = True if not in_str and text[p] == "/": closing = True if not in_str and text[p] == ">": break p += 1 tag_literal = text[pp:p] return tag_literal, tag, atype, opening, closing, p _entity_rx = re.compile(r"&([\w:][\w\d.:-]*);", re.U) def _resolve_ents (text, ents={}, ignents={}): """ Resolve XML entities as described in L{xml_to_plain}, ignoring some. """ # There may be entities within entities, so replace entities in each # entity value too before substituting in the main text. ntext = [] p = 0 while True: pp = p p = text.find("&", p) if p < 0: break ntext.append(text[pp:p]) m = _entity_rx.match(text, p) if m: name = m.group(1) if name not in ignents: value = ents.get(name) if value is not None: # FIXME: Endless recursion if the entity repeats itself. value = _resolve_ents(value, ents, ignents) ntext.append(value) else: # Put entity back as-is. ntext.append(m.group(0)) else: # ignored entity, do not touch ntext.append(text[p:m.span()[1]]) p = m.span()[1] else: ntext.append(text[p]) # the ampersand p += 1 ntext.append(text[pp:]) text = "".join(ntext) return text # Ordinary around masked whitespace. _wsgr_premask_rx = re.compile(r"\s+(\x04~\w\w)") _wsgr_postmask_rx = re.compile(r"(\x04~\w\w)\s+") def _resolve_tags (elseq, tags=None, subs={}, keepws=set(), ignels=set()): """ Replace XML tags as described in L{xml_to_plain}, given the parsed tree. 
Split into top and recursive part. """ # Text with masked whitespace where significant. text = _resolve_tags_r(elseq, tags, subs, keepws, ignels) # Simplify whitespace. text = _wsgr_rx.sub(" ", text) text = _wsgr_premask_rx.sub(r"\1", text) text = _wsgr_postmask_rx.sub(r"\1", text) text = text.strip() # Unmask significant whitespace. text = _unmask_ws(text) # Remove excess newlines even if supposedly significant. text = text.strip("\n") text = _nlgr_rx.sub("\n\n", text) return text def _resolve_tags_r (elseq, tags=None, subs={}, keepws=set(), ignels=set()): segs = [] for el in elseq: if el[0] in ignels or (el[0], el[3]) in ignels: # Complete element is ignored (by tag, or tag/type). continue if el[0] == "#text": segs.append(el[-1]) elif tags is None or el[0] in tags: repl_pre, repl_post, repl_cont = subs.get(el[0], [" ", " ", None]) if repl_pre is None: repl_pre = "" if repl_post is None: repl_post = "" repl_cont_orig = repl_cont - if not isinstance(repl_cont, basestring): + if not isinstance(repl_cont, str): repl_cont = _resolve_tags_r(el[-1], tags, subs, keepws, ignels) if el[0] in keepws: # Mask whitespace in wrapped text. repl_cont = _mask_ws(repl_cont) if callable(repl_cont_orig): repl_cont = repl_cont_orig(repl_cont) # If space not significant, # find first non-whitespace characters in wrapped text # and shift them before surrounding replacements. if el[0] not in keepws: lcont = len(repl_cont) p1 = 0 while p1 < lcont and repl_cont[p1].isspace(): p1 += 1 p2 = lcont - 1 while p2 > 0 and repl_cont[p2].isspace(): p2 -= 1 repl_pre = repl_cont[:p1] + repl_pre repl_post = repl_post + repl_cont[p2+1:] repl_cont = repl_cont[p1:p2+1] segs.append(repl_pre + repl_cont + repl_post) else: # Ignored tag, put back verbatim. repl_pre = el[1] if repl_pre is None: repl_pre = "" repl_post = el[2] if repl_post is None: repl_post = "" repl_cont = _resolve_tags_r(el[-1], tags, subs, keepws, ignels) segs.append(repl_pre + repl_cont + repl_post) return "".join(segs) def _mask_ws (text): - for mask, ws in _ws_masks.items(): + for mask, ws in list(_ws_masks.items()): text = text.replace(ws, mask) return text def _unmask_ws (text): - for mask, ws in _ws_masks.items(): + for mask, ws in list(_ws_masks.items()): text = text.replace(mask, ws) return text _html_tags = set(""" a address applet area b base basefont big blockquote body br button caption center cite code col colgroup dd del dfn dir div dl dt em fieldset font form frame frameset h1 h2 h3 h4 h5 h6 head hr html i iframe img input ins isindex kbd label legend li link map menu meta noframes noscript ol option p param pre s samp script select small span strike strong style sub sup table tbody td textarea tfoot th thead title tr tt u ul var xmp """.split()) _html_subs = { "_nows" : ("", "", None), "_parabr": (WS_NEWLINE*2, WS_NEWLINE*2, None), } _html_subs.update([(x, _html_subs["_nows"]) for x in _html_tags]) _html_subs.update([(x, _html_subs["_parabr"]) for x in "br dd dl dt h1 h2 h3 h4 h5 h6 hr li p pre td th tr" "".split()]) _html_ents = { # in addition to default XML entities - "nbsp": u"\xa0", + "nbsp": "\xa0", } _html_keepws = set(""" code pre xmp """.split()) _html_ignels = set([ ("style", "text/css"), ]) def html_to_plain (text): """ Convert HTML markup to plain text. @param text: HTML text to convert to plain @type text: string @returns: plain text version @rtype: string """ return xml_to_plain(text, _html_tags, _html_subs, _html_ents, _html_keepws, _html_ignels) def html_plain (*args, **kwargs): """ Deprecated name for L{html_to_plain}. 
""" return html_to_plain(*args, **kwargs) _qtrich_tags = set(""" qt html a b big blockquote body br center cite code dd dl dt em font h1 h2 h3 h4 h5 h6 head hr i img li meta nobr ol p pre s span strong style sub sup table td th tr tt u ul var """.split()) _qtrich_subs = { "_nows" : ("", "", None), "_parabr": (WS_NEWLINE*2, WS_NEWLINE*2, None), } _qtrich_subs.update([(x, _qtrich_subs["_nows"]) for x in _qtrich_tags]) _qtrich_subs.update([(x, _qtrich_subs["_parabr"]) for x in "br dd dl dt h1 h2 h3 h4 h5 h6 hr li p pre td th tr" "".split()]) _qtrich_ents = { # in addition to default XML entities - "nbsp": u"\xa0", + "nbsp": "\xa0", } _qtrich_keepws = set(""" code pre """.split()) _qtrich_ignels = set([ ("style", "text/css"), ]) def qtrich_to_plain (text): """ Convert Qt rich-text markup to plain text. @param text: Qt rich text to convert to plain @type text: string @returns: plain text version @rtype: string """ return xml_to_plain(text, _qtrich_tags, _qtrich_subs, _qtrich_ents, _qtrich_keepws, _qtrich_ignels) _kuit_tags = set(""" kuit kuil title subtitle para list item note warning filename link application command resource icode bcode shortcut interface emphasis placeholder email envar message numid nl """.split()) _kuit_subs = { "_nows" : ("", "", None), "_parabr" : ("", WS_NEWLINE*2, None), "_ws" : (" ", " ", None), "_ui" : ("[", "]", None), } _kuit_subs.update([(x, _kuit_subs["_nows"]) for x in _kuit_tags]) _kuit_subs.update([(x, _kuit_subs["_ws"]) for x in "placeholder".split()]) _kuit_subs.update([(x, _kuit_subs["_parabr"]) for x in "title subtitle para item nl" "".split()]) _kuit_subs.update([(x, _kuit_subs["_ui"]) for x in "interface".split()]) _kuit_ents = { # in addition to default XML entities } _kuit_keepws = set(""" icode bcode """.split()) _kuit_ignels = set([ ]) def kuit_to_plain (text): """ Convert KUIT markup to plain text. @param text: KUIT text to convert to plain @type text: string @returns: plain text version @rtype: string """ return xml_to_plain(text, _kuit_tags, _kuit_subs, _kuit_ents, _kuit_keepws, _kuit_ignels) _htkt_tags = set(list(_qtrich_tags) + list(_kuit_tags)) -_htkt_subs = dict(_qtrich_subs.items() + _kuit_subs.items()) -_htkt_ents = dict(_qtrich_ents.items() + _kuit_ents.items()) +_htkt_subs = dict(list(_qtrich_subs.items()) + list(_kuit_subs.items())) +_htkt_ents = dict(list(_qtrich_ents.items()) + list(_kuit_ents.items())) _htkt_keepws = set(list(_qtrich_keepws) + list(_kuit_keepws)) _htkt_ignels = set(list(_qtrich_ignels) + list(_kuit_ignels)) def kde4_to_plain (text): """ Convert KDE4 GUI markup to plain text. KDE4 GUI texts may contain both Qt rich-text and KUIT markup, even mixed in the same text. Note that the conversion cannot be achieved, in general, by first converting Qt rich-text, and then KUIT, or vice versa. For example, if the text has C{<} entity, after first conversion it will become plain C{<}, and interfere with second conversion. @param text: KDE4 text to convert to plain @type text: string @returns: plain text version @rtype: string """ return xml_to_plain(text, _htkt_tags, _htkt_subs, _htkt_ents, _htkt_keepws, _htkt_ignels) # Assembled on first use. 
_dbk_tags = None
_dbk_subs = None
_dbk_ents = None
_dbk_keepws = None
_dbk_ignels = None

def _prep_docbook4_to_plain ():

    global _dbk_tags, _dbk_subs, _dbk_ents, _dbk_keepws, _dbk_ignels

    specpath = os.path.join(datadir(), "spec", "docbook4.l1")
    docbook4_l1 = collect_xml_spec_l1(specpath)
    _dbk_tags = set(docbook4_l1.keys())

    _dbk_subs = {
        "_nows" : ("", "", None),
        "_parabr" : ("", WS_NEWLINE*2, None),
        "_ws" : (" ", " ", None),
        "_ui" : ("[", "]", None),
        "_uipath" : ("", "", lambda s: re.sub(r"\]\s*\[", "->", s, flags=re.U)),
    }
    _dbk_subs.update([(x, _dbk_subs["_nows"]) for x in _dbk_tags])
    _dbk_subs.update([(x, _dbk_subs["_parabr"]) for x in
                      "para title".split()]) # FIXME: Add more.
    _dbk_subs.update([(x, _dbk_subs["_ws"]) for x in
                      "contrib address firstname placeholder surname "
                      "primary secondary "
                      "".split()])
    _dbk_subs.update([(x, _dbk_subs["_ui"]) for x in
                      "guilabel guibutton guiicon guimenu guisubmenu "
                      "guimenuitem "
                      "".split()])
    _dbk_subs.update([(x, _dbk_subs["_uipath"]) for x in
                      "menuchoice "
                      "".split()])
    _dbk_ents = { # in addition to default XML entities
    }
    _dbk_keepws = set("""
        screen programlisting
    """.split()) # FIXME: Add more.
    _dbk_ignels = set([
    ])

def docbook4_to_plain (text):
    """
    Convert Docbook 4.x markup to plain text.

    @param text: Docbook text to convert to plain
    @type text: string

    @returns: plain text version
    @rtype: string
    """

    if _dbk_tags is None:
        _prep_docbook4_to_plain()

    return xml_to_plain(text, _dbk_tags, _dbk_subs, _dbk_ents,
                        _dbk_keepws, _dbk_ignels)


def collect_xml_spec_l1 (specpath):
    """
    Collect lightweight XML format specification, level 1.

    Level 1 specification is the dictionary of all known tags,
    with allowed attributes and subtags for each.

    File of the level 1 specification is in the following format::

        # A comment.
        # Tag with unconstrained attributes and subtags:
        tagA;
        # Tag with constrained attributes and unconstrained subtags:
        tagF : attr1 attr2 ...;
        # Tag with unconstrained attributes and constrained subtags:
        tagF > stag1 stag2 ...;
        # Tag with constrained attributes and subtags:
        tagF : attr1 attr2 ... > stag1 stag2 ...;
        # Tag with no attributes and unconstrained subtags:
        tagA :;
        # Tag with unconstrained attributes and no subtags:
        tagA >;
        # Tag with no attributes and no subtags:
        tagA :>;
        # Attribute value constrained by a regular expression:
        .... attr1=/^(val1|val2|val3)$/i ...
        # Reserved dummy tag specifying attributes common to all tags:
        pe-common-attrib : attrX attrY;

    The specification can contain a dummy tag named C{pe-common-attrib},
    stating attributes which are common to all tags, instead of having to
    list them with each and every tag.

    To make an attribute mandatory, its name should be prefixed with
    an exclamation sign (!).

    Specification file must be UTF-8 encoded.

    @param specpath: path to level 1 specification file
    @type specpath: string

    @return: level 1 specification
    @rtype: dict
    """

    ch_comm = "#"
    ch_attr = ":"
    ch_attre = "="
    ch_mattr = "!"
ch_stag = ">" ch_end = ";" dtag_attr = "pe-common-attrib" valid_tag_rx = re.compile("^[\w-]+$") valid_attr_rx = re.compile("^[\w-]+$") - c_tag, c_attr, c_attre, c_stag = range(4) + c_tag, c_attr, c_attre, c_stag = list(range(4)) ifs = codecs.open(specpath, "r", "UTF-8").read() lenifs = len(ifs) pos = [0, 1, 1] def signal (msg, bpos): emsg = _("@info \"L1-spec\" is shorthand for " "\"level 1 specification\"", "[L1-spec] %(file)s:%(line)d:%(col)d: %(msg)s", file=specpath, line=bpos[0], col=bpos[1], msg=msg) raise PologyError(emsg) def advance (stoptest, cmnt=True): ind = pos[0] oind = ind substr = [] sep = None while ind < lenifs and sep is None: if cmnt and ifs[ind] == ch_comm: ind = ifs.find("\n", ind) if ind < 0: break else: sep = stoptest(ind) if sep is None: substr.append(ifs[ind]) ind += 1 else: ind += len(sep) pos[0] = ind rawsubstr = ifs[oind:ind] p = rawsubstr.rfind("\n") if p >= 0: pos[1] += rawsubstr.count("\n") pos[2] = len(rawsubstr) - p else: pos[2] += len(rawsubstr) return "".join(substr), sep def make_rx_lint (rx_str, rx_flags, wch, lincol): try: rx = re.compile(rx_str, rx_flags) except: signal(_("@info the regex is already quoted when inserted", "Cannot compile regular expression %(regex)s.", regex=(wch + rx_str + wch)), lincol) return lambda x: rx.search(x) is not None spec = {} ctx = c_tag entry = None while pos[0] < lenifs: if ctx == c_tag: t = lambda i: ( ifs[i] in (ch_attr, ch_stag, ch_end) and ifs[i] or None) tag, sep = advance(t) tag = tag.strip() if tag: if sep is None: signal(_("@info", "Entry not terminated after the initial tag."), lincol) if not valid_tag_rx.search(tag) and tag != dtag_attr: signal(_("@info", "Invalid tag name '%(tag)s'.", tag=tag), lincol) entry = _L1Element(tag) spec[tag] = entry if sep == ch_attr: ctx = c_attr elif sep == ch_stag: ctx = c_stag elif sep == ch_end: ctx = c_tag else: break elif ctx == c_attr: if entry.attrs is None: entry.attrs = set() lincol = tuple(pos[1:]) t = lambda i: ( ( ifs[i].isspace() or ifs[i] in (ch_attre, ch_stag, ch_end)) and ifs[i] or [None])[0] attr, sep = advance(t) attr = attr.strip() if attr: if attr.startswith(ch_mattr): attr = attr[len(ch_mattr):] entry.mattrs.add(attr) if attr in entry.attrs: signal(_("@info", "Duplicate attribute '%(attr)s'.", attr=attr), lincol) if not valid_attr_rx.search(attr): signal(_("@info", "Invalid attribute name '%(attr)s'.", attr=attr), lincol) entry.attrs.add(attr) lastattr = attr if sep.isspace(): ctx = c_attr elif sep == ch_attre: ctx = c_attre elif sep == ch_stag: ctx = c_stag elif sep == ch_end: ctx = c_tag else: signal(_("@info", "Entry not terminated after the attribute list."), lincol) elif ctx == c_attre: lincol = tuple(pos[1:]) t = lambda i: not ifs[i].isspace() and ifs[i] or None sub, wch = advance(t) if wch is None: signal(_("@info", "End of input inside the value constraint."), lincol) t = lambda i: ifs[i] == wch and ifs[i] or None rx_str, sep = advance(t, cmnt=False) if sep is None: signal(_("@info", "End of input inside the value constraint."), lincol) t = lambda i: (not ifs[i].isalpha() and [""] or [None])[0] rx_flag_spec, sep = advance(t) rx_flags = re.U seen_flags = set() lincol = tuple(pos[1:]) for c in rx_flag_spec: if c in seen_flags: signal(_("@info", "Regex flag '%(flag)s' is already issued.", flag=c), lincol) if c == "i": rx_flags |= re.I else: signal(_("@info", "Unknown regex flag '%(flag)s'.", flag=c), lincol) seen_flags.add(c) entry.avlints[lastattr] = make_rx_lint(rx_str, rx_flags, wch, lincol) ctx = c_attr elif ctx == c_stag: if entry.stags is None: 
                entry.stags = set()
            lincol = tuple(pos[1:])
            t = lambda i: (    (ifs[i].isspace() or ifs[i] == ch_end)
                           and ifs[i] or [None])[0]
            stag, sep = advance(t)
            stag = stag.strip()
            if stag:
                if stag in entry.stags:
                    signal(_("@info",
                             "Repeated subtag '%(tag)s'.",
                             tag=stag),
                           lincol)
                entry.stags.add(stag)
            if sep == ch_end:
                ctx = c_tag
            else:
                signal(_("@info",
                         "Entry not terminated after the subtag list."),
                       lincol)

    # Add common attributes to each tag.
    dentry_attr = spec.pop(dtag_attr, [])
    if dentry_attr:
        for attr in dentry_attr.attrs:
            attre = dentry_attr.avlints.get(attr)
-            for entry in spec.values():
+            for entry in list(spec.values()):
                if entry.attrs is None:
                    entry.attrs = set()
                if attr not in entry.attrs:
                    entry.attrs.add(attr)
                    if attre:
                        entry.avlints[attr] = attre

    return spec


class _L1Element:

    def __init__ (self, tag=None, attrs=None, mattrs=set(), avlints={},
                  stags=None):

        # The tag of this element (string).
        self.tag = tag
        # Possible attributes (set, or None meaning any).
        self.attrs = attrs
        # Mandatory attributes (set).
        self.mattrs = mattrs
        # Validator functions for attribute values, per attribute (dict).
        # Validator does not have to be defined for each attribute.
        self.avlints = avlints
        # Possible subelements by tag (set, or None meaning any).
        self.stags = stags


# Simplified matching of XML entity name (sans ampersand and semicolon).
_simple_ent_rx = re.compile(r"^([\w.:-]+|#[0-9]+)$", re.U)

# Get line/column segment in error report.
_lin_col_rx = re.compile(r":\s*line\s*\d+,\s*column\s*\d+", re.I)

# Dummy top tag for topless texts.
_dummy_top = "_"


# Global data for XML checking.
class _Global: pass
_g_xml_l1 = _Global()

def validate_xml_l1 (text, spec=None, xmlfmt=None, ents=None,
                     casesens=True, accelamp=False):
    """
    Validate XML markup in text against L{level1} specification.

    Text is not required to have a top tag; if it does not, a dummy one
    will be assigned to assure that the check passes.

    If C{spec} is C{None}, text is only checked to be well-formed.

    If C{ents} is C{None}, entities in the text are ignored by the check;
    otherwise, an entity not belonging to the known set is considered
    erroneous. Default XML entities (C{<}, C{>}, C{&}, C{"}, C{'}) are
    automatically added to the set of known entities.

    Tag and attribute names can be made case-insensitive by setting
    C{casesens} to C{False}.

    If text is a part of user interface, and the environment may use
    the literal ampersand as accelerator marker, it can be allowed to
    pass the check by setting C{accelamp} to C{True}.

    Text can be one or more entity definitions of the form
    C{<!ENTITY name 'value'>}, when a special check is applied.

    The result of the check is a list of erroneous spans in the text,
    each given by start and end index (in Python standard semantics),
    and the error description, packed in a tuple.
    If there are no errors, an empty list is returned.
    Reported spans need not be formally complete with respect to
    the error location, but are heuristically determined to be short
    and provide good visual indication of what triggers the error.
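
    For instance, a minimal well-formedness check (no specification,
    entities ignored; the snippet is deliberately malformed)::

        spans = validate_xml_l1("Press <guibutton>OK</guibutton.")
        for start, end, errmsg in spans:
            print(errmsg)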
    @param text: text to check
    @type text: string
    @param spec: markup definition
    @type spec: L{level1} specification
    @param xmlfmt: name of the particular XML format (for error messages)
    @type xmlfmt: string
    @param ents: set of known entities
    @type ents: sequence
    @param casesens: whether tag names are case-sensitive
    @type casesens: bool
    @param accelamp: whether to allow ampersand as accelerator marker
    @type accelamp: bool

    @returns: erroneous spans in the text
    @rtype: list of (int, int, string) tuples
    """

    if text.lstrip().startswith("<!ENTITY"):
        return _validate_xml_entdef(text, xmlfmt)

    # Make sure the text has a top tag.
    text_orig = text
    if accelamp:
        text = _escape_amp_accel(text)
    text = "<%s>%s</%s>" % (_dummy_top, text, _dummy_top)

    # Prepare parser.
    xenc = "UTF-8"
    parser = xml.parsers.expat.ParserCreate(xenc)
    parser.UseForeignDTD() # not to barf on non-default XML entities
    parser.StartElementHandler = _handler_start_element
    parser.DefaultHandler = _handler_default

    # Link state for handlers.
    g = _g_xml_l1
    g.text = text
    g.spec = spec
    g.xmlfmt = xmlfmt or "XML"
    g.ents = ents
    g.casesens = casesens
    g.xenc = xenc
    g.parser = parser
    g.errcnt = 0
    g.spans = []
    g.tagstack = []

    # Parse and check.
    try:
        parser.Parse(text.encode(xenc), True)
-    except xml.parsers.expat.ExpatError, e:
+    except xml.parsers.expat.ExpatError as e:
        errmsg = _("@info a problem in the given type of markup "
                   "(e.g. HTML, Docbook)",
                   "%(mtype)s markup: %(snippet)s.",
                   mtype=g.xmlfmt, snippet=e.args[0])
        span = _make_span(text, e.lineno, e.offset, errmsg)
        g.spans.append(span)

    # Adapt spans back to original text.
    pure_spans = [x[:2] for x in g.spans]
    pure_spans = adapt_spans(text_orig, text, pure_spans, merge=False)

    # Remove unhelpful line/column in error messages.
    errmsgs = []
    for errmsg, span in zip([x[2] for x in g.spans], pure_spans):
        m = _lin_col_rx.search(errmsg)
        if m:
            errmsg = errmsg[:m.start()] + errmsg[m.end():]
        errmsgs.append(errmsg)

    # Put spans back together.
    g.spans = [x + (y,) for x, y in zip(pure_spans, errmsgs)]

    return g.spans


_ts_fence = "|/|"

def _escape_amp_accel (text):

    p_ts = text.find(_ts_fence)
    in_script = False
    p1 = 0
    found_accel = False
    while True:

        # Bracket possible entity reference.
        p1 = text.find("&", p1)
        if p1 < 0:
            break
        if not in_script and p_ts >= 0 and p1 > p_ts:
            in_script = True
            found_accel = False
        p2 = text.find(";", p1)

        # An accelerator marker if no semicolon in rest of the text
        # or the bracketed segment does not look like an entity,
        # and it is in front of an alphanumeric or itself.
        nc = text[p1 + 1:p1 + 2]
        if (    (p2 < 0 or not _simple_ent_rx.match(text[p1 + 1:p2]))
            and (nc.isalnum() or nc == "&")
        ):
            # Check if the next one is an ampersand too,
            # i.e. if it's a self-escaped accelerator marker.
            namp = 1
            if (    text[p1 + 1:p1 + 2] == "&"
                and not _simple_ent_rx.match(text[p1 + 2:p2])
            ):
                namp += 1

            # Escape the marker if first or self-escaped,
            # or currently in scripted part (in which there can be
            # any number of non-escaped markers).
            if not found_accel or namp > 1 or in_script:
                escseg = "&amp;" * namp
                text = text[:p1] + escseg + text[p1 + namp:]
                p1 += len(escseg)
                if namp == 1:
                    found_accel = True
            else:
                p1 += namp
        elif p2 > p1:
            p1 = p2
        else:
            break

    return text


def _handler_start_element (tag, attrs):

    g = _g_xml_l1

    if g.spec is None:
        return

    # Normalize names to lower case if allowed.
    if not g.casesens:
        tag = tag.lower()
-        attrs = dict([(x.lower(), y) for x, y in attrs.items()])
+        attrs = dict([(x.lower(), y) for x, y in list(attrs.items())])

    # Check existence of the tag.
if tag not in g.spec and tag != _dummy_top: errmsg = _("@info", "%(mtype)s markup: unrecognized tag '%(tag)s'.", mtype=g.xmlfmt, tag=tag) span = _make_span(g.text, g.parser.CurrentLineNumber, g.parser.CurrentColumnNumber + 1, errmsg) g.spans.append(span) return if tag == _dummy_top: return elspec = g.spec[tag] errmsgs = [] # Check applicability of attributes and validity of their values. if elspec.attrs is not None: - for attr, aval in attrs.items(): + for attr, aval in list(attrs.items()): if attr not in elspec.attrs: errmsgs.append(_("@info", "%(mtype)s markup: invalid attribute " "'%(attr)s' to tag '%(tag)s'.", mtype=g.xmlfmt, attr=attr, tag=tag)) else: avlint = elspec.avlints.get(attr) if avlint and not avlint(aval): errmsgs.append(_("@info", "%(mtype)s markup: invalid value " "'%(val)s' to attribute '%(attr)s'.", mtype=g.xmlfmt, val=aval, attr=attr)) # Check proper parentage. if g.tagstack: ptag = g.tagstack[-1] pelspec = g.spec.get(ptag) if ( pelspec is not None and pelspec.stags is not None and tag not in pelspec.stags ): errmsgs.append(_("@info", "%(mtype)s markup: tag '%(tag1)s' cannot be " "a subtag of '%(tag2)s'.", mtype=g.xmlfmt, tag1=tag, tag2=ptag)) # Record element stack. g.tagstack.append(tag) for errmsg in errmsgs: span = _make_span(g.text, g.parser.CurrentLineNumber, g.parser.CurrentColumnNumber + 1, errmsg) g.spans.append(span) def _handler_default (text): g = _g_xml_l1 if g.ents is not None and text.startswith('&') and text.endswith(';'): ent = text[1:-1] errmsg = None if ent.startswith("#"): if nument_to_char(ent) is None: errmsg = _("@info", "%(mtype)s markup: invalid numeric " "entity '%(ent)s'.", mtype=g.xmlfmt, ent=ent) elif ent not in g.ents and ent not in xml_entities: nearents = [] #difflib.get_close_matches(ent, g.ents) if nearents: if len(nearents) > 5: # do not overwhelm message fmtents = format_item_list(nearents[:5], incmp=True) else: fmtents = format_item_list(nearents) errmsg = _("@info", "%(mtype)s markup: unknown entity '%(ent)s' " "(suggestions: %(entlist)s).", mtype=g.xmlfmt, ent=ent, entlist=fmtents) else: errmsg = _("@info", "%(mtype)s markup: unknown entity '%(ent)s'.", mtype=g.xmlfmt, ent=ent) if errmsg is not None: span = _make_span(g.text, g.parser.CurrentLineNumber, g.parser.CurrentColumnNumber + 1, errmsg) g.spans.append(span) # Text to fetch from the reported error position in XML stream. _near_xml_error_rx = re.compile(r"\W*[\w:.-]*[^\w\s>]*(\s*>)?", re.U) def _make_span (text, lno, col, errmsg): # Find problematic position. clno = 1 p = 0 while clno < lno: p = text.find("\n", p) if p < 0: break p += 1 clno += 1 if p < 0: return (0, len(text)) # Scoop some reasonable nearby text. 
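The two handlers above are what turn unrecognized tags and unknown entities into error spans. A hypothetical sketch of the entity path (the entity set and text are made up):

from pology.markup import validate_xml_l1

known_ents = set(["nbsp"])  # hypothetical known-entity set
# 'nbsp' is known, 'hellip' is not, so one error span is expected.
spans = validate_xml_l1("Save&nbsp;as&hellip;", ents=known_ents)
assert len(spans) == 1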
m = _near_xml_error_rx.match(text, p + col - 1) if not m: return (0, len(text), errmsg) start, end = m.span() while text[start].isalnum(): if start == 0: break start -= 1 return (start, end, errmsg) _entname_rx = re.compile(r"^([\w:][\w\d.:-]*)$", re.U) def _validate_xml_entdef (text, xmlfmt): state = "void" pos = 0 tlen = len(text) errmsg = None dhead = "!ENTITY" def next_nws (pos): while pos < tlen and text[pos].isspace(): pos += 1 return pos def next_ws (pos, ows=()): while pos < tlen and not text[pos].isspace() and text[pos] not in ows: pos += 1 return pos errend = lambda: (_("@info", "%(mtype)s markup: premature end of entity definition.", mtype=xmlfmt), tlen) while True: if state == "void": pos = next_nws(pos) if pos == tlen: break elif text[pos] != "<": errmsg = _("@info", "%(mtype)s markup: expected opening angle bracket " "in entity definition.", mtype=xmlfmt) pos1 = pos + 1 else: pos += 1 state = "head" elif state == "head": pos = next_nws(pos) if pos == tlen: errmsg, pos1 = errend() else: pos1 = next_ws(pos) head = text[pos:pos1] if head != dhead: errmsg = _("@info", "%(mtype)s markup: expected '%(keyword)s' " "in entity definition.", mtype=xmlfmt, keyword=dhead) else: pos = pos1 state = "name" elif state == "name": pos = next_nws(pos) pos1 = next_ws(pos, ("'", "\"")) name = text[pos:pos1] if not _entname_rx.match(name): errmsg = _("@info", "%(mtype)s markup: invalid entity name '%(name)s' " "in entity definition.", mtype=xmlfmt, name=name) else: pos = pos1 state = "value" elif state == "value": pos = next_nws(pos) if pos == tlen: errmsg, pos1 = errend() elif text[pos] not in ("'", "\""): errmsg = _("@info", "%(mtype)s markup: expected opening quote " "(ASCII single or double) in entity definition.", mtype=xmlfmt) pos1 = pos + 1 else: quote = text[pos] pos1 = text.find(quote, pos + 1) if pos1 < 0: errmsg = _("@info", "%(mtype)s markup: unclosed entity value " "in entity definition.", mtype=xmlfmt) pos1 = tlen else: value = text[pos + 1:pos1] # FIXME: Validate value? Does not have to be valid # on its own, in principle. pos = pos1 + 1 state = "tail" elif state == "tail": pos = next_nws(pos) if pos == tlen: errmsg, pos1 = errend() elif text[pos] != ">": errmsg = _("@info", "%(mtype)s markup: expected closing angle bracket " "in entity definition.", mtype=xmlfmt) pos1 = pos + 1 else: pos += 1 state = "void" if errmsg: break spans = [] if errmsg: if pos1 is None: pos1 = pos spans = [(pos, pos1, errmsg)] return spans def check_xml (strict=False, entities={}, mkeyw=None): """ Check general XML markup in translation [hook factory]. Text is only checked to be well-formed XML, and possibly also whether encountered entities are defined. Markup errors are reported to stdout. C{msgstr} can be either checked only if the C{msgid} is valid itself, or regardless of the validity of the original. This is governed by the C{strict} parameter. Entities in addition to XML's default (C{<}, etc.) may be provided using the C{entities} parameter. Several types of values with different semantic are possible: - if C{entities} is C{None}, unknown entities are ignored on checking - if string, it is understood as a general function evaluation L{request}, and its result expected to be (name, value) dictionary-like object - otherwise, C{entities} is considered to be a (name, value) dictionary If a message has L{sieve flag} C{no-check-markup}, the check is skipped for that message. 
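The entity-definition state machine above can be exercised through validate_xml_l1, which routes any text starting with <!ENTITY to it (a sketch, not from the patch):

from pology.markup import validate_xml_l1

assert validate_xml_l1("<!ENTITY kde 'K Desktop Environment'>") == []
# Unclosed value: exactly one error span is expected.
assert len(validate_xml_l1("<!ENTITY kde 'K Desktop")) == 1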
If one or several markup keywords are given through the C{mkeyw} parameter, the check is skipped for all messages in a catalog which does not report one of the given keywords by its L{markup()} method. See L{set_markup()} for the list of markup keywords recognized at the moment. @param strict: whether to require valid C{msgstr} even if C{msgid} is not @type strict: bool @param entities: additional entities to consider as known @type entities: C{None}, dict, or string @param mkeyw: markup keywords for taking catalogs into account @type mkeyw: string or list of strings @return: type S3C hook @rtype: C{(msgstr, msg, cat) -> numerr} """ return _check_xml_w(validate_xml_l1, strict, entities, mkeyw, False) def check_xml_sp (strict=False, entities={}, mkeyw=None): """ Like L{check_xml}, except that erroneous spans are returned instead of reporting problems to stdout [hook factory]. @return: type V3C hook @rtype: C{(msgstr, msg, cat) -> spans} """ return _check_xml_w(validate_xml_l1, strict, entities, mkeyw, True) # Worker for C{check_xml*} hook factories. def _check_xml_w (check, strict, entities, mkeyw, spanrep, ignctxt=(), ignid=(), ignctxtsw=(), ignidsw=()): if mkeyw is not None: - if isinstance(mkeyw, basestring): + if isinstance(mkeyw, str): mkeyw = [mkeyw] mkeyw = set(mkeyw) # Lazy-evaluated data. ldata = {} def eval_ldata (): ldata["entities"] = _get_entities(entities) def checkf (msgstr, msg, cat): if ( mkeyw is not None and not mkeyw.intersection(cat.markup() or set()) ): return [] if spanrep else 0 if ( msg.msgctxt in ignctxt or msg.msgid in ignid or (msg.msgctxt is not None and msg.msgctxt.startswith(ignctxtsw)) or msg.msgid.startswith(ignidsw) ): return [] if spanrep else 0 if not ldata: eval_ldata() entities = ldata["entities"] if ( flag_no_check_markup in manc_parse_flag_list(msg, "|") or ( not strict and ( check(msg.msgid, ents=entities) - or check(msg.msgid_plural or u"", ents=entities))) + or check(msg.msgid_plural or "", ents=entities))) ): return [] if spanrep else 0 spans = check(msgstr, ents=entities) if spanrep: return spans else: for span in spans: if span[2:]: report_on_msg(span[2], msg, cat) return len(spans) return checkf # Cache for loaded entities, by entity specification string, # to speed up when several markup hooks are using the same setup. _loaded_entities_cache = {} def _get_entities (entspec): - if not isinstance(entspec, basestring): + if not isinstance(entspec, str): return entspec entities = _loaded_entities_cache.get(entspec) if entities is not None: return entities entities = get_result_ireq(entspec) _loaded_entities_cache[entspec] = entities return entities _docbook4_l1 = None def validate_docbook4_l1 (text, ents=None): """ Validate Docbook 4.x markup in text against L{level1} specification. Markup definition is extended to include C{<placeholder-N/>} elements, which C{xml2po} uses to segment text when extracting markup documents into PO templates. See L{validate_xml_l1} for description of the C{ents} parameter and the return value.
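A sketch of consuming the S3C hook built by check_xml above; the catalog path is hypothetical:

from pology.catalog import Catalog
from pology.markup import check_xml

check = check_xml(strict=False, entities=None)  # entities=None: entities ignored
cat = Catalog("example.po")  # hypothetical PO file
for msg in cat:
    if msg.translated:
        for i in range(len(msg.msgstr)):
            check(msg.msgstr[i], msg, cat)  # returns numerr; errors go to stdout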
@param text: text to check @type text: string @param ents: set of known entities (in addition to default) @type ents: sequence @returns: erroneous spans in the text @rtype: list of (int, int, string) tuples """ global _docbook4_l1 if _docbook4_l1 is None: specpath = os.path.join(datadir(), "spec", "docbook4.l1") _docbook4_l1 = collect_xml_spec_l1(specpath) xmlfmt = _("@item markup type", "Docbook4") return validate_xml_l1(text, spec=_docbook4_l1, xmlfmt=xmlfmt, ents=ents) _db4_meta_msgctxt = set(( )) _db4_meta_msgid = set(( "translator-credits", )) _db4_meta_msgid_sw = ( "@@image:", ) def check_docbook4 (strict=False, entities={}, mkeyw=None): """ Check XML markup in translations of Docbook 4.x catalogs [hook factory]. See L{check_xml} for description of parameters. @return: type S3C hook @rtype: C{(msgstr, msg, cat) -> numerr} """ return _check_xml_w(validate_docbook4_l1, strict, entities, mkeyw, False, ignid=_db4_meta_msgid, ignctxt=_db4_meta_msgctxt, ignidsw=_db4_meta_msgid_sw) def check_docbook4_sp (strict=False, entities={}, mkeyw=None): """ Like L{check_docbook4}, except that erroneous spans are returned instead of reporting problems to stdout [hook factory]. @return: type V3C hook @rtype: C{(msgstr, msg, cat) -> spans} """ return _check_xml_w(validate_docbook4_l1, strict, entities, mkeyw, True, ignid=_db4_meta_msgid, ignctxt=_db4_meta_msgctxt, ignidsw=_db4_meta_msgid_sw) def check_docbook4_msg (strict=False, entities={}, mkeyw=None): """ Check for any known problem in translation in messages in Docbook 4.x catalogs [hook factory]. Currently performed checks: - Docbook markup - cross-message insertion placeholders See L{check_xml} for description of parameters. @return: type V4A hook @rtype: C{(msg, cat) -> parts} """ check_markup = check_docbook4_sp(strict, entities, mkeyw) def checkf (msg, cat): hl = [] for i in range(len(msg.msgstr)): spans = [] spans.extend(check_markup(msg.msgstr[i], msg, cat)) spans.extend(check_placeholder_els(msg.msgid, msg.msgstr[i])) if spans: hl.append(("msgstr", i, spans)) return hl return checkf _entpath_html = os.path.join(datadir(), "spec", "html.entities") html_entities = read_entities(_entpath_html) _html_l1 = None def validate_html_l1 (text, ents=None): """ Validate HTML markup in text against L{level1} specification. At the moment, this function can only check HTML markup if well-formed in the XML sense, although HTML allows omission of some closing tags. See L{validate_xml_l1} for description of the C{ents} parameter and the return value. @param text: text to check @type text: string @param ents: set of known entities (in addition to default) @type ents: sequence @returns: erroneous spans in the text @rtype: list of (int, int, string) tuples """ global _html_l1 if _html_l1 is None: specpath = os.path.join(datadir(), "spec", "html.l1") _html_l1 = collect_xml_spec_l1(specpath) if ents is not None: ents = Multidict([ents, html_entities]) xmlfmt = _("@item markup type", "HTML") return validate_xml_l1(text, spec=_html_l1, xmlfmt=xmlfmt, ents=ents, accelamp=True, casesens=False) def check_html (strict=False, entities={}, mkeyw=None): """ Check HTML markup in translations [hook factory]. See L{check_xml} for description of parameters. See notes on checking HTML markup to L{validate_html_l1}. 
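And a sketch of the V4A variant defined just above, which returns highlight parts instead of reporting (the catalog path is again hypothetical):

from pology.catalog import Catalog
from pology.markup import check_docbook4_msg

checkf = check_docbook4_msg(strict=True)
cat = Catalog("docs.po")  # hypothetical Docbook PO file
for msg in cat:
    # Each part pairs a field name, an item index, and the error spans.
    for field, item, spans in checkf(msg, cat):
        print(msg.refline, field, item, spans)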
@return: type S3C hook @rtype: C{(msgstr, msg, cat) -> numerr} """ return _check_xml_w(validate_html_l1, strict, entities, mkeyw, False) def check_html_sp (strict=False, entities={}, mkeyw=None): """ Like L{check_html}, except that erroneous spans are returned instead of reporting problems to stdout [hook factory]. @return: type V3C hook @rtype: C{(msgstr, msg, cat) -> spans} """ return _check_xml_w(validate_html_l1, strict, entities, mkeyw, True) _qtrich_l1 = None def validate_qtrich_l1 (text, ents=None): """ Validate Qt rich-text markup in text against L{level1} specification. At the moment, this function can only check Qt rich-text if well-formed in the XML sense, although Qt rich-text allows HTML-type omission of closing tags. See L{validate_xml_l1} for description of the C{ents} parameter and the return value. @param text: text to check @type text: string @param ents: set of known entities (in addition to default) @type ents: sequence @returns: erroneous spans in the text @rtype: list of (int, int, string) tuples """ global _qtrich_l1 if _qtrich_l1 is None: specpath = os.path.join(datadir(), "spec", "qtrich.l1") _qtrich_l1 = collect_xml_spec_l1(specpath) if ents is not None: ents = Multidict([ents, html_entities]) xmlfmt = _("@item markup type", "Qt-rich") return validate_xml_l1(text, spec=_qtrich_l1, xmlfmt=xmlfmt, ents=ents, accelamp=True, casesens=False) def check_qtrich (strict=False, entities={}, mkeyw=None): """ Check Qt rich-text markup in translations [hook factory]. See L{check_xml} for description of parameters. See notes on checking Qt rich-text to L{validate_qtrich_l1}. @return: type S3C hook @rtype: C{(msgstr, msg, cat) -> numerr} """ return _check_xml_w(validate_qtrich_l1, strict, entities, mkeyw, False) def check_qtrich_sp (strict=False, entities={}, mkeyw=None): """ Like L{check_qtrich}, except that erroneous spans are returned instead of reporting problems to stdout [hook factory]. @return: type V3C hook @rtype: C{(msgstr, msg, cat) -> spans} """ return _check_xml_w(validate_qtrich_l1, strict, entities, mkeyw, True) _entpath_kuit = os.path.join(datadir(), "spec", "kuit.entities") kuit_entities = read_entities(_entpath_kuit) _kuit_l1 = None def validate_kuit_l1 (text, ents=None): """ Validate KUIT markup in text against L{level1} specification. KUIT is the semantic markup for user interface in KDE4. See L{validate_xml_l1} for description of the C{ents} parameter and the return value. @param text: text to check @type text: string @param ents: set of known entities (in addition to default) @type ents: sequence @returns: erroneous spans in the text @rtype: list of (int, int, string) tuples """ global _kuit_l1 if _kuit_l1 is None: specpath = os.path.join(datadir(), "spec", "kuit.l1") _kuit_l1 = collect_xml_spec_l1(specpath) if ents is not None: ents = Multidict([ents, kuit_entities]) xmlfmt = _("@item markup type", "KUIT") return validate_xml_l1(text, spec=_kuit_l1, xmlfmt=xmlfmt, ents=ents, accelamp=True) _kde4_l1 = None _kde4_ents = None def validate_kde4_l1 (text, ents=None): """ Validate markup in texts used in KDE4 GUI. KDE4 GUI texts may contain both Qt rich-text and KUIT markup, even mixed in the same text. See L{validate_xml_l1} for description of the C{ents} parameter and the return value. 
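The HTML and Qt rich-text validators above share the lenient accelerator handling and case-insensitive tag names; a sketch (markup must still be well-formed in the XML sense, as noted in the docstrings):

from pology.markup import validate_html_l1

# A literal '&' accelerator marker passes; tag case is normalized.
assert validate_html_l1("&Save <B>now</B>") == []
# An unclosed tag is still reported.
assert validate_html_l1("<b>bold") != []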
@param text: text to check @type text: string @param ents: set of known entities (in addition to default) @type ents: sequence @returns: erroneous spans in the text @rtype: list of (int, int, string) tuples """ global _kde4_l1, _kde4_ents if _kde4_l1 is None: _kde4_l1 = {} spath1 = os.path.join(datadir(), "spec", "qtrich.l1") _kde4_l1.update(collect_xml_spec_l1(spath1)) spath2 = os.path.join(datadir(), "spec", "kuit.l1") _kde4_l1.update(collect_xml_spec_l1(spath2)) _kde4_ents = {} _kde4_ents.update(html_entities) _kde4_ents.update(kuit_entities) if ents is not None: ents = Multidict([ents, _kde4_ents]) xmlfmt = _("@item markup type", "KDE4") return validate_xml_l1(text, spec=_kde4_l1, xmlfmt=xmlfmt, ents=ents, accelamp=True, casesens=False) def check_kde4 (strict=False, entities={}, mkeyw=None): """ Check XML markup in translations of KDE4 UI catalogs [hook factory]. See L{check_xml} for description of parameters. @return: type S3C hook @rtype: C{(msgstr, msg, cat) -> numerr} """ return _check_xml_w(validate_kde4_l1, strict, entities, mkeyw, False) def check_kde4_sp (strict=False, entities={}, mkeyw=None): """ Like L{check_kde4}, except that erroneous spans are returned instead of reporting problems to stdout [hook factory]. @return: type V3C hook @rtype: C{(msgstr, msg, cat) -> spans} """ return _check_xml_w(validate_kde4_l1, strict, entities, mkeyw, True) _pango_l1 = None def validate_pango_l1 (text, ents=None): """ Validate Pango markup in text against L{level1} specification. See L{validate_xml_l1} for description of the C{ents} parameter and the return value. @param text: text to check @type text: string @param ents: set of known entities (in addition to default) @type ents: sequence @returns: erroneous spans in the text @rtype: list of (int, int, string) tuples """ global _pango_l1 if _pango_l1 is None: specpath = os.path.join(datadir(), "spec", "pango.l1") _pango_l1 = collect_xml_spec_l1(specpath) if ents is not None: ents = Multidict([ents, html_entities]) xmlfmt = _("@item markup type", "Pango") return validate_xml_l1(text, spec=_pango_l1, xmlfmt=xmlfmt, ents=ents, accelamp=True, casesens=False) def check_pango (strict=False, entities={}, mkeyw=None): """ Check XML markup in translations of Pango UI catalogs [hook factory]. See L{check_xml} for description of parameters. @return: type S3C hook @rtype: C{(msgstr, msg, cat) -> numerr} """ return _check_xml_w(validate_pango_l1, strict, entities, mkeyw, False) def check_pango_sp (strict=False, entities={}, mkeyw=None): """ Like L{check_pango}, except that erroneous spans are returned instead of reporting problems to stdout [hook factory]. @return: type V3C hook @rtype: C{(msgstr, msg, cat) -> spans} """ return _check_xml_w(validate_pango_l1, strict, entities, mkeyw, True) _digits_dec = set("0123456789") _digits_hex = set("0123456789abcdefABCDEF") def nument_to_char (nument): """ Convert numeric XML entity to character. Numeric XML entities can be decimal, C{&#DDDD;}, or hexadecimal, C{&#xHHHH;}, where C{D} and C{H} stand for number system's digits. 4 digits is the maximum, but there can be less. If the entity cannot be converted to a character, for whatever reason, C{None} is reported. 
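A sketch of the combined KDE4 validator defined above, which accepts KUIT and Qt rich-text tags alike (it assumes <filename> is in the KUIT specification and <b> in the Qt rich-text one, as their spec files suggest):

from pology.markup import validate_kde4_l1

assert validate_kde4_l1("Open <filename>fstab</filename> now.") == []
assert validate_kde4_l1("<b>bold</b> text") == []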
@param nument: numeric entity, with or without C{&} and C{;} @type nument: string @return: character represented by the entity @rtype: string or None """ if nument[:1] == "&": nument = nument[1:-1] if nument[:1] != "#": return None if nument[1:2] == "x": known_digits = _digits_hex numstr = nument[2:] base = 16 else: known_digits = _digits_dec numstr = nument[1:] base = 10 if len(numstr) > 4 or len(numstr) < 1: return None unknown_digits = set(numstr).difference(known_digits) if unknown_digits: return None - return unichr(int(numstr, base)) + return chr(int(numstr, base)) def validate_xmlents (text, ents={}, default=False, numeric=False): """ Check whether XML-like entities in the text are among the known ones. The text does not have to be XML markup as such. No XML parsing is performed, only a raw search for XML-like entities. @param text: text with entities to check @type text: string @param ents: known entities @type ents: sequence @param default: whether default XML entities are allowed (C{&}, etc.) @type default: bool @param numeric: whether numeric character entities are allowed @type numeric: bool @returns: erroneous spans in the text @rtype: list of (int, int, string) tuples """ spans = [] p = 0 while True: p = text.find("&", p) if p < 0: break pp = p m = _entity_rx.match(text, p) if m: p = m.end() ent = m.group(1) errmsg = None if numeric and ent.startswith("#"): if nument_to_char(ent) is None: errmsg = _("@info", "Invalid numeric entity '%(ent)s'.", ent=ent) elif ent not in ents and (not default or ent not in xml_entities): nearents = [] #difflib.get_close_matches(ent, ents) if nearents: if len(nearents) > 5: # do not overwhelm message fmtents = format_item_list(nearents[:5], incmp=True) else: fmtents = format_item_list(nearents) errmsg = _("@info", "Unknown entity '%(ent)s' " "(suggestions: %(entlist)s).", ent=ent, entlist=fmtents) else: errmsg = _("@info", "Unknown entity '%(ent)s'.", ent=ent) if errmsg is not None: spans.append((pp, p, errmsg)) else: p += 1 return spans def check_xmlents (strict=False, entities={}, mkeyw=None, default=False, numeric=False): """ Check existence of XML entities in translations [hook factory]. See L{check_xml} for description of parameters C{strict}, C{entities}, and C{mkeyw}. See L{validate_xmlents} for parameters C{default} and C{numeric}, and for general notes on checking entities. @return: type S3C hook @rtype: C{(msgstr, msg, cat) -> numerr} """ def check (text, ents): return validate_xmlents(text, ents, default=default, numeric=numeric) return _check_xml_w(check, strict, entities, mkeyw, False) def check_xmlents_sp (strict=False, entities={}, mkeyw=None, default=False, numeric=False): """ Like L{check_xmlents}, except that erroneous spans are returned instead of reporting problems to stdout [hook factory]. @return: type V3C hook @rtype: C{(msgstr, msg, cat) -> spans} """ def check (text, ents): return validate_xmlents(text, ents, default=default, numeric=numeric) return _check_xml_w(check, strict, entities, mkeyw, True) _placeholder_el_rx = re.compile(r"<\s*placeholder-(\d+)\s*/\s*>") def check_placeholder_els (orig, trans): """ Check if the sets of C{<placeholder-N/>} elements match between original and translated text. C{<placeholder-N/>} elements are added into text by C{xml2po}, for finer segmentation of markup documents extracted into PO templates. See L{validate_xml_l1} for description of the return value.
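A few quick checks of the helpers above (a sketch, not from the patch):

from pology.markup import nument_to_char, validate_xmlents

assert nument_to_char("&#65;") == "A"
assert nument_to_char("&#x41;") == "A"
assert nument_to_char("&#xZZZ;") is None  # non-hex digits

# 'broken' is neither in the given set nor a default XML entity.
spans = validate_xmlents("a &broken; b", ents=set(["good"]), default=True)
assert len(spans) == 1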
@param orig: original text @type orig: string @param trans: translated text @type trans: string @returns: erroneous spans in translation @rtype: list of (int, int, string) tuples """ spans = [] orig_plnums = set() for m in _placeholder_el_rx.finditer(orig): orig_plnums.add(m.group(1)) trans_plnums = set() for m in _placeholder_el_rx.finditer(trans): trans_plnums.add(m.group(1)) missing_plnums = list(orig_plnums.difference(trans_plnums)) extra_plnums = list(trans_plnums.difference(orig_plnums)) if missing_plnums: tags = "".join(["<placeholder-%s/>" % x for x in missing_plnums]) errmsg = _("@info", "Missing placeholder tags in translation: %(taglist)s.", taglist=format_item_list(tags)) spans.append((0, 0, errmsg)) elif extra_plnums: # do not report both, single glitch may cause them tags = "".join(["<placeholder-%s/>" % x for x in extra_plnums]) errmsg = _("@info", "Superfluous placeholder tags in translation: %(taglist)s.", taglist=format_item_list(tags)) spans.append((0, 0, errmsg)) return spans diff --git a/pology/match.py b/pology/match.py index 729128ba..93f06f78 100644 --- a/pology/match.py +++ b/pology/match.py @@ -1,555 +1,555 @@ # -*- coding: UTF-8 -*- """ Matchers and matcher helpers for various objects. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import locale import re from pology import _, n_ from pology.comments import parse_summit_branches from pology.fsops import str_to_unicode from pology.message import MessageUnsafe from pology.remove import remove_accel_msg from pology.report import error _all_ops = set() _unary_ops = set(["not"]) _all_ops.update(_unary_ops) _binary_ops = set(["and", "or"]) _all_ops.update(_binary_ops) class ExprError (Exception): """ Exception for errors in matching expressions. """ def __init__ (self, expr=None, msg=None, start=None, end=None): """ Constructor. All the parameters are made available as instance variables. @param expr: the complete expression that caused the problem @type expr: string or None @param msg: the description of the problem @type msg: string or None @param start: start position of the problem into the expression string @type start: int or None @param end: end position of the problem @type end: int or None """ self.expr = expr self.msg = msg self.start = start self.end = end def __unicode__ (self): if self.expr is not None and self.start is not None: start = self.start if self.end is not None: end = self.end else: end = self.start + 10 subexpr = self.expr[start:end] if start > 0: subexpr = "..." + subexpr if end < len(self.expr): subexpr = subexpr + "..." else: subexpr = None if self.msg is not None and subexpr is not None: repstr = _("@info", "Invalid expression at %(col)d [%(snippet)s]: " "%(reason)s.", col=self.start, snippet=subexpr, reason=self.msg) elif self.msg is not None: repstr = _("@info", "Invalid expression: %(reason)s.", reason=self.msg) elif subexpr is not None: repstr = _("@info", "Invalid expression at %(col)d [%(snippet)s].", col=self.start, snippet=subexpr) else: repstr = _("@info", "Invalid expression.") - return unicode(repstr) + return str(repstr) def __str__ (self): return self.__unicode__().encode(locale.getpreferredencoding()) def make_filtered_msg (msg, cat, accels=None, filters=[]): """ TODO: Write documentation. """ # Must not modify contents of real message.
msgf = MessageUnsafe(msg) # - remove accelerators if accels is not None: old_accels = cat.accelerator() cat.set_accelerator(accels) remove_accel_msg(msgf, cat) if accels is not None: cat.set_accelerator(old_accels) # - apply msgstr filters for filtr in filters: for i in range(len(msgf.msgstr)): msgf.msgstr[i] = filtr(msgf.msgstr[i]) return msgf def make_msg_matcher (exprstr, mopts=None, abort=False): """ Build expression matcher for messages. For expression syntax, check C{find-messages} sieve documentation for C{fexpr} parameter. TODO: Put this instruction here. The C{mopts} parameter, if given, defines global matching options. It can be either a dictionary or an object with data attributes, and can contain the following keys/attributes (in parenthesis: type and default value in case the key is not present): - C{case} (C{bool}, C{False}): C{True} for case-sensitive matching The built matcher function takes up to four parameters, in order: - C{msgf}: filtered message (to really match against) - C{msg}: raw message (to properly report matched spans) - C{cat}: catalog in which the message resides - C{hl}: L{highlight specification} (to be filled with matched spans, can be omitted from the call) Matcher function returns C{True} if the message is matched, C{False} otherwise. In case an error in expression is encountered while building the matcher, either L{ExprError} exception may be thrown or execution aborted, depending on the parameter C{abort}. @param exprstr: expression string @type exprstr: string @param mopts: global matching options @type mopts: dict or attribute object @param abort: on errors in expression, abort execution if C{True}, raise L{ExprError} if C{False} @type abort: bool @return: matcher function @rtype: (msgf, msg, cat, hl=[])->bool """ mopts = _prep_attrobj(mopts, dict( case=False, )) try: expr, p = _build_expr_r(exprstr, 0, len(exprstr), mopts) if p < len(exprstr): raise ExprError(exprstr, _("@item:intext", "premature end of expression")) - except ExprError, e: + except ExprError as e: if abort: error(str_to_unicode(str(e))) else: raise return expr def make_msg_fmatcher (exprstr, mopts=None, accels=None, filters=[], abort=False): """ Build expression matcher for messages, with filtering. Like L{make_msg_matcher}, except that matchers built by this function do their own filtering, and so omit the first argument. For semantics of C{accels} and C{filters}, see this module documentation on C{accel} and C{filter} sieve parameters. 
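A usage sketch (not from the patch) for make_msg_matcher and the filtering variant whose definition follows, with an expression in the find-messages sieve syntax and a hypothetical catalog path:

from pology.catalog import Catalog
from pology.match import make_msg_fmatcher

# Case-insensitive msgstr match, excluding fuzzy messages.
matchf = make_msg_fmatcher("msgstr/error/i and not flag/fuzzy/")
cat = Catalog("example.po")  # hypothetical
hits = [msg for msg in cat if matchf(msg, cat)]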
@param exprstr: expression string @type exprstr: string @param mopts: global matching options @type mopts: attribute object @param accels: possible accelerator markers @type accels: sequence of strings or C{None} @param filters: filters to apply to text fields [F1A hooks] @type filters: (text)->text @param abort: on errors, abort execution if C{True}, raise exception if C{False} @type abort: bool @return: matcher function @rtype: (msg, cat, hl=[])->bool """ raw_matcher = make_msg_matcher(exprstr, mopts=mopts, abort=abort) def matcher (msg, cat, hl=[]): msgf = make_filtered_msg(msg, cat, accels, filters) return raw_matcher(msgf, msg, cat, hl) return matcher def _prep_attrobj (aobj, dctdef=None): if aobj is None or isinstance(aobj, dict): dct = aobj or {} class _Data: pass aobj = _Data() - for key, value in dct.items(): + for key, value in list(dct.items()): setattr(aobj, key, value) - for key, val in (dctdef or {}).items(): + for key, val in list((dctdef or {}).items()): if not hasattr(aobj, key): setattr(aobj, key, val) return aobj def _build_expr_r (exprstr, start, end, params): p = start tstack = [] can_unary = True can_binary = False can_operand = True while p < end: while p < end and exprstr[p].isspace() and exprstr[p] != ")": p += 1 if p == end or exprstr[p] == ")": break # Parse current subexpression, matcher, or operator. if exprstr[p] == "(": if not can_operand: raise ExprError(exprstr, _("@item:intext", "expected operator"), p) expr, p = _build_expr_r(exprstr, p + 1, end, params) if p == end or exprstr[p] != ")": raise ExprError(exprstr, _("@item:intext", "no closing parenthesis"), p) tstack.append(expr) can_operand = False can_unary = False can_binary = True p += 1 elif exprstr[p].isalpha(): pp = p while p < end and exprstr[p].isalnum(): p += 1 tok = exprstr[pp:p].lower() if tok in _all_ops: if tok in _unary_ops and not can_unary: raise ExprError(exprstr, _("@item:intext", "unexpected unary operator"), pp) if tok in _binary_ops and not can_binary: raise ExprError(exprstr, _("@item:intext", "unexpected binary operator"), pp) can_operand = True can_unary = True can_binary = False tstack.append(tok) else: if not can_operand: raise ExprError(exprstr, _("@item:intext", "expected an operator"), pp) expr, p = _build_expr_matcher(tok, exprstr, p, end, params) tstack.append(expr) can_operand = False can_unary = False can_binary = True else: raise ExprError(exprstr, _("@item:intext", "expected token starting with a letter"), p + 1) # Update expression as possible. 
updated = True while updated: updated = False if ( len(tstack) >= 2 and tstack[-2] in _unary_ops and tstack[-1] not in _all_ops ): def closure (): # for closure over cexpr* cexpr1 = tstack.pop() op = tstack.pop() if op == "not": cexpr = lambda *a: not cexpr1(*a) else: # cannot happen raise ExprError(exprstr, _("@item:intext", "unknown unary operator '%(op)s'", op=op)) return cexpr tstack.append(closure()) updated = True if ( len(tstack) >= 3 and tstack[-3] not in _all_ops and tstack[-2] in _binary_ops and tstack[-1] not in _all_ops ): def closure (): # for closure over cexpr* cexpr2 = tstack.pop() op = tstack.pop() cexpr1 = tstack.pop() if op == "and": cexpr = lambda *a: cexpr1(*a) and cexpr2(*a) elif op == "or": cexpr = lambda *a: cexpr1(*a) or cexpr2(*a) else: # cannot happen raise ExprError(exprstr, _("@item:intext", "unknown binary operator '%(op)s'", op=op)) return cexpr tstack.append(closure()) updated = True if len(tstack) >= 2: raise ExprError(exprstr, _("@item:intext", "premature end of expression"), end) if len(tstack) == 0: raise ExprError(exprstr, _("@item:intext", "expected subexpression"), start) return tstack[0], p # Matchers taking a value. _op_matchers = set(["msgctxt", "msgid", "msgstr", "comment", "flag", "branch"]) # Matchers not taking a value. _nop_matchers = set(["transl", "obsol", "active", "plural"]) # Matchers which produce a regular expression out of their value. _rx_matchers = set(["msgctxt", "msgid", "msgstr", "comment", "flag"]) # All matchers together. _all_matchers = set() _all_matchers.update(_op_matchers) _all_matchers.update(_nop_matchers) def _build_expr_matcher (mname, exprstr, start, end, params): if mname not in _all_matchers: raise ExprError(exprstr, _("@item:intext", "unknown matcher '%(match)s'", match=mname), start - len(mname)) # Get matcher value, if any. mval = None p = start if mname in _op_matchers: c = exprstr[p:p + 1] if p == end or c.isspace() or c.isalnum() or c in ("(", ")"): raise ExprError(exprstr, _("@item:intext", "expected parameter delimiter"), p) delim = exprstr[p] pp = p + 1 p = exprstr.find(delim, p + 1, end) if p < 0: raise ExprError(exprstr, _("@item:intext", "expected closing delimiter"), end - 1) mval = exprstr[pp:p] # Get match modifiers, if any. mmods = [] c = exprstr[p:p + 1] if p < end and not c.isspace() and not c.isalnum() and c not in ("(", ")"): p += 1 pp = p while p < end and exprstr[p].isalnum(): p += 1 mmods = list(exprstr[pp:p]) - #print "{%s}{%s}{%s}" % (mname, mval, mmods) + #print("{%s}{%s}{%s}" % (mname, mval, mmods)) return make_matcher(mname, mval, mmods, params), p _matcher_mods = { "msgctxt": ["c", "i"], "msgid": ["c", "i"], "msgstr": ["c", "i"], "comment": ["c", "i"], } def make_matcher (name, value, mods, params, neg=False): """ TODO: Write documentation. 
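The closures built above combine matchers without operator precedence, so parentheses group subexpressions explicitly; a sketch:

from pology.match import make_msg_matcher

matchf = make_msg_matcher("(msgid/save/i or msgid/open/i) and transl")
# matchf(msgf, msg, cat, hl=[]) -> bool; msgf is the filtered message,
# e.g. as produced by make_filtered_msg above.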
""" known_mods = _matcher_mods.get(name, []) bad_mods = set(mods).difference(known_mods) if bad_mods: raise ExprError(None, _("@item:intext", "unknown modifiers %(modlist)s " "to matcher '%(match)s'", modlist=format_item_list(bad_mods), match=name)) if name in _rx_matchers: rxflags = re.U if "i" in mods or (not params.case and "c" not in mods): rxflags |= re.I try: regex = re.compile(value, rxflags) except: raise ExprError(None, _("@item:intext", "invalid regular expression '%(regex)s'", regex=value)) if 0: pass elif name == "msgctxt": def matcher (msgf, msg, cat, hl=[]): texts = [] if msgf.msgctxt is not None: texts += [(msgf.msgctxt, "msgctxt", 0)] return _rx_in_any_text(regex, texts, hl) elif name == "msgid": def matcher (msgf, msg, cat, hl=[]): texts = [(msgf.msgid, "msgid", 0)] if msgf.msgid_plural is not None: texts += [(msgf.msgid_plural, "msgid_plural", 0)] return _rx_in_any_text(regex, texts, hl) elif name == "msgstr": def matcher (msgf, msg, cat, hl=[]): texts = [(msgf.msgstr[i], "msgstr", i) for i in range(len(msgf.msgstr))] return _rx_in_any_text(regex, texts, hl) elif name == "comment": def matcher (msgf, msg, cat, hl=[]): texts = [] texts.extend([(msgf.manual_comment[i], "manual_comment", i) for i in range(len(msgf.manual_comment))]) texts.extend([(msgf.auto_comment[i], "auto_comment", i) for i in range(len(msgf.auto_comment))]) texts.extend([(msgf.source[i][0], "source", i) for i in range(len(msgf.source))]) return _rx_in_any_text(regex, texts, hl) elif name == "transl": def matcher (msgf, msg, cat, hl=[]): if value is None or value: return msg.translated else: return not msg.translated elif name == "obsol": def matcher (msgf, msg, cat, hl=[]): if value is None or value: return msg.obsolete else: return not msg.obsolete elif name == "active": def matcher (msgf, msg, cat, hl=[]): if value is None or value: return msg.translated and not msg.obsolete else: return not msg.translated or msg.obsolete elif name == "plural": def matcher (msgf, msg, cat, hl=[]): if value is None or value: return msg.msgid_plural is not None else: return msg.msgid_plural is None elif name == "maxchar": def matcher (msgf, msg, cat, hl=[]): otexts = [msgf.msgid] if msgf.msgid_plural is not None: otexts.append(msgf.msgid_plural) ttexts = msgf.msgstr onchar = sum([len(x) for x in otexts]) // len(otexts) tnchar = sum([len(x) for x in ttexts]) // len(ttexts) return onchar <= value and tnchar <= value elif name == "lspan": try: start, end = value.split(":", 1) start = int(start) if start else 0 end = int(end) if end else None except: raise ExprError(value, _("@item:intext", "invalid line span"), 0) def matcher (msgf, msg, cat, hl=[]): cend = end if cend is None: cend = cat[-1].refline + 1 return msg.refline >= start and msg.refline < cend elif name == "espan": try: start, end = value.split(":", 1) start = int(start) if start else 0 end = int(end) if end else None except: raise ExprError(value, _("@item:intext", "invalid entry span"), 0) def matcher (msgf, msg, cat, hl=[]): cend = end if cend is None: cend = cat[-1].refentry + 1 return msg.refentry >= start and msg.refentry < cend elif name == "branch": def matcher (msgf, msg, cat, hl=[]): return value in parse_summit_branches(msg) elif name == "flag": def matcher (msgf, msg, cat, hl=[]): #FIXME: How to highlight flags? 
(then use _rx_in_any_text) for flag in msgf.flag: if regex.search(flag): return True return False else: raise ExprError(name, _("@item:intext", "unknown matcher"), 0) if neg: return lambda *a: not matcher(*a) else: return matcher def _rx_in_any_text (regex, texts, hl): match = False hl_dct = {} for text, hl_name, hl_item in texts: # Go through all matches, to highlight them all. for m in regex.finditer(text): hl_key = (hl_name, hl_item) if hl_key not in hl_dct: hl_dct[hl_key] = ([], text) hl_dct[hl_key][0].append(m.span()) match = True - hl.extend([x + y for x, y in hl_dct.items()]) + hl.extend([x + y for x, y in list(hl_dct.items())]) return match diff --git a/pology/message.py b/pology/message.py index 992503e1..1623b023 100644 --- a/pology/message.py +++ b/pology/message.py @@ -1,1028 +1,1028 @@ # -*- coding: UTF-8 -*- """ Message entries in PO catalogs. Classes from this module define the entries proper, while the header entry is handled by L{pology.header}. @see: L{pology.header} @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ from pology.colors import ColorString, cjoin from pology.escape import escape_c from pology.wrap import wrap_field, wrap_comment, wrap_comment_unwrap from pology.monitored import Monitored, Monlist, Monset, Monpair _Message_spec = { "manual_comment" : {"type" : Monlist, - "spec" : {"*" : {"type" : unicode}}}, + "spec" : {"*" : {"type" : str}}}, "auto_comment" : {"type" : Monlist, - "spec" : {"*" : {"type" : unicode}}}, + "spec" : {"*" : {"type" : str}}}, "source" : {"type" : Monlist, "spec" : {"*" : {"type" : Monpair, - "spec" : {"first" : {"type" : unicode}, + "spec" : {"first" : {"type" : str}, "second" : {"type" : int}}}}}, "flag" : {"type" : Monset, - "spec" : {"*" : {"type" : unicode}}}, + "spec" : {"*" : {"type" : str}}}, "obsolete" : {"type" : bool}, - "msgctxt_previous" : {"type" : (unicode, type(None))}, - "msgid_previous" : {"type" : (unicode, type(None))}, - "msgid_plural_previous" : {"type" : (unicode, type(None))}, + "msgctxt_previous" : {"type" : (str, type(None))}, + "msgid_previous" : {"type" : (str, type(None))}, + "msgid_plural_previous" : {"type" : (str, type(None))}, - "msgctxt" : {"type" : (unicode, type(None))}, - "msgid" : {"type" : unicode}, - "msgid_plural" : {"type" : (unicode, type(None))}, + "msgctxt" : {"type" : (str, type(None))}, + "msgid" : {"type" : str}, + "msgid_plural" : {"type" : (str, type(None))}, "msgstr" : {"type" : Monlist, - "spec" : {"*" : {"type" : unicode}}}, + "spec" : {"*" : {"type" : str}}}, - "key" : {"type" : unicode, "derived" : True}, - "fmt" : {"type" : unicode, "derived" : True}, - "inv" : {"type" : unicode, "derived" : True}, - "trn" : {"type" : unicode, "derived" : True}, + "key" : {"type" : str, "derived" : True}, + "fmt" : {"type" : str, "derived" : True}, + "inv" : {"type" : str, "derived" : True}, + "trn" : {"type" : str, "derived" : True}, "fuzzy" : {"type" : bool}, "untranslated" : {"type" : bool, "derived" : True}, "translated" : {"type" : bool, "derived" : True}, "active" : {"type" : bool, "derived" : True}, - "format" : {"type" : unicode, "derived" : True}, + "format" : {"type" : str, "derived" : True}, "refline" : {"type" : int}, "refentry" : {"type" : int}, } # Exclusive groupings. 
_Message_single_fields = ( "msgctxt_previous", "msgid_previous", "msgid_plural_previous", "msgctxt", "msgid", "msgid_plural", ) _Message_list_fields = ( "manual_comment", "auto_comment", "msgstr", ) _Message_list2_fields = ( "source", ) _Message_set_fields = ( "flag", ) _Message_state_fields = ( "fuzzy", "obsolete", ) # Convenience groupings. _Message_all_fields = (() + _Message_single_fields + _Message_list_fields + _Message_list2_fields + _Message_set_fields + _Message_state_fields ) _Message_sequence_fields = (() + _Message_list_fields + _Message_list2_fields + _Message_set_fields ) _Message_key_fields = ( "msgctxt", "msgid", ) _Message_mandatory_fields = ( "msgid", "msgstr", ) _Message_currprev_fields = ( ("msgctxt", "msgctxt_previous"), ("msgid", "msgid_previous"), ("msgid_plural", "msgid_plural_previous"), ) _Message_fmt_fields = ( "msgctxt", "msgid", "msgid_plural", "msgstr", "obsolete", "fuzzy", ) _Message_inv_fields = ( "obsolete", "fuzzy", "manual_comment", "msgctxt_previous", "msgid_previous", "msgid_plural_previous", "msgctxt", "msgid", "msgid_plural", "msgstr", ) def _escape (text): text = escape_c(text) if isinstance(text, ColorString): text = text.replace(""", "\\"") return text class Message_base (object): """ Abstract base class for entries in PO catalogs. Elements of the message are accessed through instance attributes. Some of them are read-only, typically those that are derived from the normal read-write attributes and cannot be set independently. The precise type of each attribute depends on the subclass through which it is accessed, but has a general behavior of one of the standard types. E.g. when the behavior is that of a list, the type is stated as C{list*}. All strings are assumed unicode, except where noted otherwise. Regardless of the exact composition of the message, each message object will have all the instance attributes listed. In case the message actually does not have an element corresponding to an instance attribute, that attribute will have an appropriate null value. Only the read-only attributes are provided by this base class, while the read-write attributes are to be provided by its subclasses. All are listed here, however, as the interface that all subclasses should implement. @ivar manual_comment: manual (translator) comments (C{# ...}) @type manual_comment: list* of strings @ivar auto_comment: automatic (extracted) comments (C{#. ...}) @type auto_comment: list* of strings @ivar source: source references, as filepath:lineno pairs (C{#: ...}) @type source: list* of pairs* @ivar flag: message flags (C{#, ...}) @type flag: set* of strings @ivar obsolete: whether entry is obsolete (C{#~ ...}) @type obsolete: bool @ivar msgctxt_previous: previous context field (C{#| msgctxt "..."}) @type msgctxt_previous: string or None @ivar msgid_previous: previous message field (C{#| msgid "..."}) @type msgid_previous: string or None @ivar msgid_plural_previous: previous plural field (C{#| msgid_plural "..."}) @type msgid_plural_previous: string or None @ivar msgctxt: context field (C{msgctxt "..."}) @type msgctxt: string or None @ivar msgid: message field (C{msgid "..."}) @type msgid: string @ivar msgid_plural: plural field (C{msgid_plural "..."}) @type msgid_plural: string or None @ivar msgstr: translation fields (C{msgstr "..."}, C{msgstr[n] "..."}) @type msgstr: list* of strings @ivar key: (read-only) key composition Message key is formed by the parts of the message which define unique entry in a catalog. 
The value is an undefined serialization of C{msgctxt} and C{msgid}. @type key: string @ivar fmt: (read-only) format composition Format composition consists of all message parts which determine the contents of the compiled message in the MO file, including whether it is compiled at all. The value is an undefined serialization of: C{msgctxt}, C{msgid}, C{msgid_plural}, C{msgstr}, C{fuzzy}, C{obsolete}. @type fmt: string @ivar inv: (read-only) extraction-invariant composition Extraction-invariant parts of the message are those that are not dependent on the placement and comments to the message in the code. In effect, these are the parts which are not eliminated when the message is obsoleted after merging. The value is an undefined serialization of: C{msgctxt}, C{msgid}, C{msgid_plural}, C{msgstr}, C{fuzzy}, C{obsolete}, C{manual_comment}, C{msgctxt_previous}, C{msgid_previous}, C{msgid_plural_previous}. @type inv: string @ivar trn: (read-only) translator-controlled composition Translator-controlled parts of the message are those that are normally modified by a translator when working on a PO file. The value is an undefined serialization of: C{msgstr}, C{fuzzy}, C{manual_comment}. @type trn: string @ivar fuzzy: whether the message is fuzzy The state of fuzziness can also be checked and set by looking for and adding/removing the C{fuzzy} flag from the set of flags, but this is needed frequently enough to deserve a standalone attribute. Note: To "thoroughly" unfuzzy the message, see method L{unfuzzy}. @type fuzzy: bool @ivar untranslated: (read-only) whether the message is untranslated (False for fuzzy messages) @type untranslated: bool @ivar translated: (read-only) whether the message is translated (False for fuzzy messages) @type translated: bool @ivar active: (read-only) whether the translation of the message is used at destination (C{False} for untranslated, fuzzy and obsolete messages) @type active: bool @ivar format: (read-only) the format flag of the message (e.g. C{c-format}) or empty string @type format: string @ivar refline: referent line number of the message inside the catalog Valid only if there were no modifications to the catalog, otherwise undefined (made valid again after syncing the catalog). Normally this is the line number of C{msgid} keyword, but not guaranteed to be so. @type refline: int @ivar refentry: referent entry number of the message inside the catalog Valid only if there were no additions/removals of messages from the catalog, otherwise undefined (made valid again after syncing the catalog). @type refentry: int @ivar key_previous: (read-only) previous key composition Like L{key}, except this is for previous fields. If there are no previous fields, this is C{None}. The value is an undefined serialization of C{msgctxt_previous} and C{msgid_previous}. @type key_previous: string or C{None} @see: L{Message} @see: L{MessageUnsafe} """ def __init__ (self, getsetattr): """ Internal constructor for subclasses' usage. @param getsetattr: the object with C{__getattr__} and C{__setattr__} methods, as handler for unhandled instance attributes """ self.__dict__["^getsetattr"] = getsetattr self._colorize_prev = 0 def __getattr__ (self, att): """ Attribute getter. Processes read-only attributes, and sends others to the getter given by the constructor.
@param att: name of the attribute to get @returns: attribute value """ if 0: pass elif att == "translated": if self.fuzzy: return False # Consider message translated if at least one msgstr is translated: # that's how gettext tools do, but then they report an error for # missing argument in non-translated msgstrs. for val in self.msgstr: if val: return True return False elif att == "untranslated": if self.fuzzy: return False for val in self.msgstr: if val: return False return True elif att == "active": return self.translated and not self.obsolete elif att == "key": return self._compose(["msgctxt", "msgid"]) elif att == "fmt": return self._compose(["msgctxt", "msgid", "msgid_plural", "msgstr", "fuzzy", "obsolete"]) elif att == "inv": return self._compose(["msgctxt", "msgid", "msgid_plural", "msgstr", "fuzzy", "obsolete", "manual_comment", "msgctxt_previous", "msgid_previous", "msgid_plural_previous"]) elif att == "trn": return self._compose(["msgstr", "fuzzy", "manual_comment"]) elif att == "format": format_flag = "" for flag in self.flag: if flag.find("-format") >= 0: format_flag = flag break return format_flag elif att == "fuzzy": - return u"fuzzy" in self.flag + return "fuzzy" in self.flag elif att == "key_previous": if self.msgid_previous is not None: return self._compose(["msgctxt_previous", "msgid_previous"]) else: return None else: return self.__dict__["^getsetattr"].__getattr__(self, att) def _compose (self, fields): fmtvals = [] for field in fields: val = self.get(field) if field in _Message_state_fields: - fval = val and u"1" or u"0" + fval = val and "1" or "0" elif field in _Message_list_fields: - fval = u"\x02".join([u"%s" % x for x in val]) + fval = "\x02".join(["%s" % x for x in val]) elif field in _Message_list2_fields: - fval = u"\x02".join([u"%s:%s" % tuple(x) for x in val]) + fval = "\x02".join(["%s:%s" % tuple(x) for x in val]) elif field in _Message_set_fields: - vlst = [u"%s" % x for x in val] + vlst = ["%s" % x for x in val] vlst.sort() - fval = u"\x02".join(vlst) + fval = "\x02".join(vlst) else: - fval = val is None and u"\x00" or u"%s" % val + fval = val is None and "\x00" or "%s" % val fmtvals.append(fval) return "\x04".join(fmtvals) def get (self, att, default=None): """ Get attribute value. Allows accessing the message like a dictionary. @param att: name of the attribute to get @type att: string @param default: value to return if attribute does not exist @returns: value of the attribute or the default value """ if hasattr(self, att): return getattr(self, att) else: return default def __setattr__ (self, att, val): """ Attribute setter. May act upon some attributes (e.g. checks), but finally passes all of them to the setter given by the constructor. @param att: name of the attribute to set @param val: value to set the attribute to """ if 0: pass elif att == "fuzzy": if val == True: - self.flag.add(u"fuzzy") - elif u"fuzzy" in self.flag: - self.flag.remove(u"fuzzy") + self.flag.add("fuzzy") + elif "fuzzy" in self.flag: + self.flag.remove("fuzzy") else: self.__dict__["^getsetattr"].__setattr__(self, att, val) def __eq__ (self, omsg): """ Reports whether messages are equal in all apparent parts. "Apparent" parts include all those which are visible in the PO file. I.e. the check will ignore internal states, like line caches, etc. @returns: C{True} if messages are equal in apparent parts @rtype: bool """ # Make messages the same type. # NOTE: All this instead of just omsg = type(self)(omsg) # for the sake of performance. 
if not isinstance(omsg, Message_base): omsg = MessageUnsafe(omsg) msg = self if isinstance(self, Message) and isinstance(omsg, MessageUnsafe): msg = MessageUnsafe(msg) elif isinstance(self, MessageUnsafe) and isinstance(omsg, Message): omsg = MessageUnsafe(omsg) for field in _Message_all_fields: if msg.get(field) != omsg.get(field): return False return True def __ne__ (self, omsg): """ Reports whether messages are not equal in some apparent parts. Equivalent to C{not (self == omsg)}. @returns: C{False} if messages are equal in all apparent parts @rtype: bool """ return not self.__eq__(omsg) def _renew_lines_bymod (self, mod, wrapf=wrap_field, force=False, colorize=0): prefix = {} if self.obsolete: prefix["curr"] = "#~ " prefix["prev"] = "#~| " else: prefix["curr"] = "" prefix["prev"] = "#| " if force or mod["manual_comment"] or not self._lines_manual_comment: self._lines_manual_comment = [] for manc in self.manual_comment: ls = wrap_comment_unwrap("", manc) if colorize >= 2: ls = [ColorString("%s") % x for x in ls] self._lines_manual_comment.extend(ls) if force or mod["auto_comment"] or not self._lines_auto_comment: self._lines_auto_comment = [] for autoc in self.auto_comment: ls = wrap_comment_unwrap(".", autoc) if colorize >= 2: ls = [ColorString("%s") % x for x in ls] self._lines_auto_comment.extend(ls) if force or mod["source"] or not self._lines_source: self._lines_source = [] srcrefs = [] for src in self.source: if src[1] > 0: srcrefs.append(src[0] + ":" + str(src[1])) else: srcrefs.append(src[0]) if srcrefs: ls = wrap_comment(":", cjoin(srcrefs, " ")) if colorize >= 2: ls = [ColorString("%s") % x for x in ls] self._lines_source = ls if force or mod["flag"] or not self._lines_flag: self._lines_flag = [] # Rearange so that fuzzy is first, if present. flst = [] for fl in self.flag: - if fl == u"fuzzy": + if fl == "fuzzy": if colorize >= 1: fl = ColorString("%s") % fl flst.insert(0, fl) else: flst.append(fl) if flst: ls = wrap_comment(",", cjoin(flst, ", ")) if colorize >= 2: ls = [ColorString("%s") % x for x in ls] self._lines_flag = ls for att in _Message_single_fields: att_lins = "_lines_" + att if force or mod[att] or not self.__dict__[att_lins]: # modcount of this string > 0 or lines not cached or forced self.__dict__[att_lins] = [] msgsth = getattr(self, att) if msgsth is not None or att in _Message_mandatory_fields: if msgsth is None: - msgsth = u"" + msgsth = "" if att.endswith("_previous"): fname = att[:-len("_previous")] pstat = "prev" else: fname = att pstat = "curr" if colorize >= 1: fname = ColorString("%s") % fname self.__dict__[att_lins] = wrapf(fname, _escape(msgsth), prefix[pstat]) # msgstr must be renewed if the plurality of the message changed. new_plurality = ( getattr(self, "_lines_msgstr", []) and ( ( self.msgid_plural is None and "msgstr[" in self._lines_msgstr[0]) or ( self.msgid_plural is not None and "msgstr[" not in self._lines_msgstr[0]))) if force or mod["msgstr"] or not self._lines_msgstr or new_plurality: self._lines_msgstr = [] - msgstr = self.msgstr or [u""] + msgstr = self.msgstr or [""] if self.msgid_plural is None: fname = "msgstr" if colorize >= 1: fname = ColorString("%s") % fname self._lines_msgstr.extend(wrapf(fname, _escape(msgstr[0]), prefix["curr"])) else: for i in range(len(msgstr)): fname = "msgstr[%d]" % i if colorize >= 1: fname = ColorString("%s") % fname self._lines_msgstr.extend(wrapf(fname, _escape(msgstr[i]), prefix["curr"])) # Marshal the lines into proper order. 
self._lines_all = [] lins = self._lines_all lins.extend(self._lines_manual_comment) lins.extend(self._lines_auto_comment) if not self.obsolete: # no source for an obsolete message lins.extend(self._lines_source) lins.extend(self._lines_flag) # Actually, it might make sense regardless... ## Old originals makes sense only for a message with a fuzzy flag. #if self.fuzzy: lins.extend(self._lines_msgctxt_previous) lins.extend(self._lines_msgid_previous) lins.extend(self._lines_msgid_plural_previous) lins.extend(self._lines_msgctxt) lins.extend(self._lines_msgid) lins.extend(self._lines_msgid_plural) lins.extend(self._lines_msgstr) if self._lines_all[-1] != "\n": - lins.extend(u"\n") + lins.extend("\n") def to_lines (self, wrapf=wrap_field, force=False, colorize=0): """ The line-representation of the message. Lines are returned with newlines included. @param wrapf: the function used for wrapping message fields (msgctxt, msgid, ...) As arguments the function should accept the field name, the field text, and the prefix to all lines, and return the list of wrapped lines (with newlines included). @type wrapf: string, string, string -> list of strings @param force: whether to force reformatting of all elements. Subclasses may keep a track of lines exactly as read from the PO file, and allow reformatting of only the modified elements of the message. @type force: bool @param colorize: whether and how much to colorize the message. Typically useful when the message is output to terminal, HTML file, etc. as accompanying information to a user. If the value is 0, no colorization is applied; 1 gives conservative colorization, 2 and more full colorization. @type colorize: int @returns: formatted lines @rtype: list of strings @see: L{pology.wrap} """ # Renew lines if one of: forced, no lines formed yet, no modcounter, # different colorization. if colorize != self._colorize_prev: force = True if force or getattr(self, "modcount", True) or not self._lines_all: self._renew_lines(wrapf, force, colorize) self._colorize_prev = colorize return self._lines_all def to_string (self, wrapf=wrap_field, force=False, colorize=0): """ The string-representation of the message. Passes the arguments to L{to_lines} and joins the resulting list. @see: L{to_lines} """ return cjoin(self.to_lines(wrapf, force, colorize)) def _append_to_list (self, other, att): self_list = getattr(self, att) other_list = getattr(other, att) for el in other_list: self_list.append(el) def _overwrite_list (self, other, att): # Overwrites self list by element-assignment/append/pop, # so that modification history is tracked. self_list = getattr(self, att) other_list = getattr(other, att) self_len = len(self_list) other_len = len(other_list) if self_len <= other_len: for i in range(self_len): self_list[i] = other_list[i] for i in range(self_len, other_len): self_list.append(other_list[i]) else: for i in range(other_len): self_list[i] = other_list[i] for i in range(other_len, self_len): self_list.pop() def unfuzzy (self): """ Thoroughly unfuzzy the message. Strictly speaking, a message is fuzzy if it has the C{fuzzy} flag set. Thus a message can be unfuzzied by removing this flag, either manually from the C{flag} set, or through attribute C{fuzzy}. But if there were previous fields (e.g. C{msgid_previous}) added to the message when it was made fuzzy on merge, they will remain in the message after it has been unfuzzied in this way. 
This is normally not wanted, and in such cases this method may be used to I{thoroughly} unfuzzy the message: remove C{fuzzy} flag, set C{fuzzy} attribute to C{False}, and all C{*_previous} attributes to C{None}. If the message is not strictly fuzzy upon this call, it is undefined whether any present previous fields will be left untouched, or removed nonetheless. @returns: C{True} if the message was unfuzzied, C{False} otherwise """ if not self.fuzzy: return False self.fuzzy = False # also removes fuzzy flag self.msgctxt_previous = None self.msgid_previous = None self.msgid_plural_previous = None return True def clear (self, keepmanc=False, msgstrlen=None): """ Revert message to pristine untranslated state. Reverting to untranslated state removes manual comments (by default), C{fuzzy} flag, and previous fields, and clears C{msgstr} fields. @param keepmanc: do not remove manual comments @type keepmanc: bool @param msgstrlen: the number of empty msgstr fields; if C{None}, the existing number of fields is preserved @type msgstrlen: int """ if not keepmanc: self.manual_comment = type(self.manual_comment)() self.fuzzy = False # also removes fuzzy flag self.msgctxt_previous = None self.msgid_previous = None self.msgid_plural_previous = None if msgstrlen is None: msgstrlen = len(self.msgstr) - self.msgstr = type(self.msgstr)([u""] * msgstrlen) + self.msgstr = type(self.msgstr)([""] * msgstrlen) def state (self): """ Coded description of the translation state of the message. Code string can be one of: "T" (translated), "F" (fuzzy), "U" (untranslated), "OT" (obsolete translated), "OF" (obsolete fuzzy), "OU" (obsolete untranslated). @returns: coded translation state @rtype: string """ if not self.obsolete: if self.fuzzy: return "F" elif self.translated: return "T" else: return "U" else: if self.fuzzy: return "OF" elif self.translated: return "OT" else: return "OU" def set (self, omsg): """ Copy all parts from the other message. All mutable parts are deeply copied. @param omsg: the message from which to copy the parts @type omsg: instance of L{Message_base} @returns: self """ return self._set_parts(omsg, _Message_all_fields) def set_key (self, omsg): """ Copy all key parts from the other message. See L{key} attribute for the description and list of key parts. All mutable parts are deeply copied. @param omsg: the message from which to copy the parts @type omsg: instance of L{Message_base} @returns: self """ return self._set_parts(omsg, _Message_key_fields) def set_fmt (self, omsg): """ Copy all format parts from the other message. See L{fmt} attribute for the description and list of format parts. All mutable parts are deeply copied. @param omsg: the message from which to copy the parts @type omsg: instance of L{Message_base} @returns: self """ return self._set_parts(omsg, _Message_fmt_fields) def set_inv (self, omsg): """ Copy extraction-invariant parts from the other message. See L{inv} attribute for the description and list of extraction-invariant parts. All mutable parts are deeply copied. @param omsg: the message from which to copy the parts @type omsg: instance of L{Message_base} @returns: self """ return self._set_parts(omsg, _Message_inv_fields) def _set_parts (self, omsg, parts): """ Worker for set* methods.
""" for part in parts: oval = omsg.get(part) val = self.get(part) if oval is not None: if part in _Message_list2_fields: oval = type(val)([type(x)(x) for x in oval]) elif part in _Message_sequence_fields: oval = type(val)(oval) elif val is not None: oval = type(val)(oval) setattr(self, part, oval) return self class Message (Message_base, Monitored): # order important for get/setattr """ The default class for catalog entries. The interface is inherited from L{Message_base}, but when used through this class it behaves in a special way: the modifications are I{monitored}, such that no new attributes can be created by assignment and all assignments are checked for value types. If you don't need to modify the messages after creation, consider using the faster L{MessageUnsafe} class. The loosely defined types in the base class (those with a star) are resolved into one of C{Mon*} types: L{Monlist}, L{Monset}, L{Monpair}. They implement some, but not all, of the functionality of their standard counterparts. @see: L{Message_base} @see: L{MessageUnsafe} @see: L{pology.monitored} """ def __init__ (self, init={}): """ Initializes the message elements by the values in the dictionary. The dictionary keys are like the names of attributes in the interface, and not all must be supplied. Those left out will be initialized to appropriate null values. The monitored sequences should be supplied as their ordinary counterparts (e.g. a C{list} in place of L{Monlist}), @param init: dictionary of initial values @type init: dict """ # NOTE: Make sure all sequences are shallow copied. Message_base.__init__(self, Monitored) self._manual_comment = Monlist(init.get("manual_comment", [])[:]) self._auto_comment = Monlist(init.get("auto_comment", [])[:]) - self._source = Monlist(map(Monpair, init.get("source", [])[:])) + self._source = Monlist(list(map(Monpair, init.get("source", [])[:]))) self._flag = Monset(init.get("flag", [])) self._obsolete = init.get("obsolete", False) self._msgctxt_previous = init.get("msgctxt_previous", None) self._msgid_previous = init.get("msgid_previous", None) self._msgid_plural_previous = init.get("msgid_plural_previous", None) self._msgctxt = init.get("msgctxt", None) - self._msgid = init.get("msgid", u"") + self._msgid = init.get("msgid", "") self._msgid_plural = init.get("msgid_plural", None) self._msgstr = Monlist(init.get("msgstr", [])[:]) - self._fuzzy = (u"fuzzy" in self._flag and not self._obsolete) + self._fuzzy = ("fuzzy" in self._flag and not self._obsolete) self._refline = init.get("refline", -1) self._refentry = init.get("refentry", -1) self.assert_spec_init(_Message_spec) # Line caches. 
self._lines_all = init.get("_lines_all", [])[:] self._lines_manual_comment = init.get("_lines_manual_comment", [])[:] self._lines_auto_comment = init.get("_lines_auto_comment", [])[:] self._lines_source = init.get("_lines_source", [])[:] self._lines_flag = init.get("_lines_flag", [])[:] self._lines_msgctxt_previous = init.get("_lines_msgctxt_previous", [])[:] self._lines_msgid_previous = init.get("_lines_msgid_previous", [])[:] self._lines_msgid_plural_previous = init.get("_lines_msgid_plural_previous", [])[:] self._lines_msgctxt = init.get("_lines_msgctxt", [])[:] self._lines_msgid = init.get("_lines_msgid", [])[:] self._lines_msgid_plural = init.get("_lines_msgid_plural", [])[:] self._lines_msgstr = init.get("_lines_msgstr", [])[:] def _renew_lines (self, wrapf=wrap_field, force=False, colorize=0): if not self.obsolete_modcount: mod = {} mod["manual_comment"] = ( self.manual_comment_modcount or self.manual_comment.modcount) mod["auto_comment"] = ( self.auto_comment_modcount or self.auto_comment.modcount) mod["source"] = self.source_modcount or self.source.modcount mod["flag"] = self.flag_modcount or self.flag.modcount for att in _Message_single_fields: mod[att] = getattr(self, att + "_modcount") > 0 mod["msgstr"] = self.msgstr_modcount or self.msgstr.modcount else: # Must recompute all lines if the message has been modified # by changing the obsolete status. mod = None force = True return self._renew_lines_bymod(mod, wrapf, force, colorize) class MessageUnsafe (Message_base): """ The lightweight class for catalog entries, for read-only applications. Unlike the L{Message}, this class does nothing special with attributes. The interface attributes are implemented as in L{Message_base}, where the starred lists are standard lists, starred sets standard sets, etc. There is no assignment and type checking, nor modification monitoring. You should use this class when messages are not expected to be modified, for the performance benefit. The top modification counter still exists, but only as an ordinary inactive attribute, which the client code can manually increase to signal that the message has changed. This may be necessary for some client code, which relies on the top counter to function properly. @see: L{Message_base} """ def __init__ (self, init={}): """ Initializes the message elements by the values in the dictionary. The dictionary keys are like the names of attributes in the interface, and not all must be supplied. Those left out will be initialized to appropriate null values. @param init: dictionary of initial values @type init: dict """ # NOTE: Make sure all sequences are shallow copied. Message_base.__init__(self, object) self.manual_comment = list(init.get("manual_comment", [])) self.auto_comment = list(init.get("auto_comment", [])) self.source = [tuple(x) for x in init.get("source", [])] self.flag = set(init.get("flag", [])) self.obsolete = init.get("obsolete", False) self.msgctxt_previous = init.get("msgctxt_previous", None) self.msgid_previous = init.get("msgid_previous", None) self.msgid_plural_previous = init.get("msgid_plural_previous", None) self.msgctxt = init.get("msgctxt", None) - self.msgid = init.get("msgid", u"") + self.msgid = init.get("msgid", "") self.msgid_plural = init.get("msgid_plural", None) - self.msgstr = list(init.get("msgstr", [u""])) + self.msgstr = list(init.get("msgstr", [""])) self.refline = init.get("refline", -1) self.refentry = init.get("refentry", -1) # No need to look for line caches, as lines must always be reformatted.
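# --- Editor's illustrative sketch, not part of the patch. A minimal example
# --- of the Message/MessageUnsafe split described in the docstrings above;
# --- the msgid/msgstr values are made up, everything else is the documented
# --- constructor and modification-counter behaviour.
from pology.message import Message, MessageUnsafe

msg = Message({"msgid": "Apple", "msgstr": ["Jabuka"]})
assert msg.modcount == 0
msg.msgstr[0] = "Pomme"        # monitored assignment, counter is bumped
assert msg.modcount > 0

umsg = MessageUnsafe({"msgid": "Apple", "msgstr": ["Jabuka"]})
umsg.msgstr[0] = "Pomme"       # plain list assignment, nothing is tracked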
def _renew_lines (self, wrapf=wrap_field, force=False, colorize=0): # No monitoring, content must always be reformatted. return self._renew_lines_bymod(None, wrapf, True, colorize) diff --git a/pology/monitored.py b/pology/monitored.py index 83f78056..0ac8a208 100644 --- a/pology/monitored.py +++ b/pology/monitored.py @@ -1,518 +1,518 @@ # -*- coding: UTF-8 -*- """ Framework for monitored classes. Includes the base class that monitored classes should inherit from, and some monitored partial counterparts to standard Python data types. Monitored objects are limited to a prescribed set of public instance variables, and, optionally, the values which can be assigned to those are limited to a prescribed set of types. Each public instance variable has a I{shadowing} modification counter, an instance variable which counts the changes made to the variable which it shadows. As of yet, this module and its functionality are for internal use in core PO interface classes (L{Catalog}, L{Message}...), not intended for creation of monitored classes in client code. Use this documentation only to find out which of the methods available in standard data types are available through their monitored counterparts (e.g. L{Monlist} compared to C{list}). @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ from pology import PologyError, _, n_ # ============================================================================= # Internal functions. def _gather_modcount (obj): modcount = 0 - for cnt in getattr(obj, "#", {}).values(): # own counts + for cnt in list(getattr(obj, "#", {}).values()): # own counts modcount += cnt for att in getattr(obj, "_spec", {}): # sub counts if att != "*": # single sub counts if not obj._spec[att].get("derived", False): modcount += getattr(obj.__dict__["_" + att], "modcount", 0) else: for itemobj in obj.__dict__[att]: # sequence sub counts modcount += getattr(itemobj, "modcount", 0) return modcount def _scatter_modcount (obj, val): if hasattr(obj, "#"): for att in obj.__dict__["#"]: obj.__dict__["#"][att] = val for att in getattr(obj, "_spec", {}): if att != "*": if not obj._spec[att].get("derived", False): subobj = obj.__dict__["_" + att] if hasattr(subobj, "modcount"): subobj.modcount = val else: for itemobj in obj.__dict__[att]: if hasattr(itemobj, "modcount"): itemobj.modcount = val def _assert_spec_single (att, obj, spec): if "type" in spec: if not isinstance(obj, spec["type"]): if att != "*": raise PologyError( _("@info", "Expected %(type1)s for attribute '%(attr)s', " "got %(type2)s.", type1=spec["type"], attr=att, type2=type(obj))) else: raise PologyError( _("@info", "Expected %(type1)s for sequence element, " "got %(type2)s.", type1=spec["type"], type2=type(obj))) if "spec" in spec: _assert_spec_init(obj, spec["spec"]) def _assert_spec_init (self, spec): - for att, subspec in spec.items(): + for att, subspec in list(spec.items()): if att != "*": if not subspec.get("derived", False): _assert_spec_single(att, self.__dict__["_" + att], subspec) else: for itemobj in self.__dict__[att]: _assert_spec_single(att, itemobj, subspec) # All checks done, add spec and counts. self._spec = spec self.__dict__["#"] = {} for att in spec: if not spec[att].get("derived", False): self.__dict__["#"][att] = 0
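# --- Editor's illustrative sketch, not part of the patch. How the modcount
# --- bookkeeping implemented in this module behaves on a monitored list;
# --- the list values are made up, Monlist and modcount are the documented API.
from pology.monitored import Monlist

lst = Monlist(["a", "b"])
assert lst.modcount == 0
lst.append("c")                 # append bumps the element counter
assert lst.modcount > 0
lst.modcount = 0                # only 0 may be assigned; resets all counters
assert lst.modcount == 0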
""" def __getattr__ (self, att): attp = "_" + att if att.startswith("_"): return self.__dict__[att] elif att == "modcount": return _gather_modcount(self) elif att.endswith("_modcount"): return self.__dict__["#"][att[:-len("_modcount")]] elif att == "#": return self.__dict__[att] else: return self.__dict__[attp] def __setattr__ (self, att, val): if att.startswith("_"): self.__dict__[att] = val else: if att == "modcount" or att.endswith("_modcount"): # Only set if given to 0, ignore silently other values. if isinstance(val, int) and val == 0: if att == "modcount": _scatter_modcount(self, val) else: attb = att[:-len("_modcount")] attbp = "_" + attb self.__dict__["#"][attb] = val _scatter_modcount(self.__dict__[attbp], val) else: self.assert_spec_setattr(att, val) attp = "_" + att cval = self.__dict__[attp] if cval != val: self.__dict__["#"][att] += 1 if hasattr(cval, "modcount"): mc_diff = cval.modcount - val.modcount if mc_diff > 0: self.__dict__["#"][att] += mc_diff self.__dict__[attp] = val def __eq__ (self, other): return isinstance(self, type(other)) and self.data() == other.data() def __ne__ (self, other): return not isinstance(self, type(other)) or self.data() != other.data() def data (self): if hasattr(self, "_spec"): d = {} for att in self._spec: if att != "*": subobj = self.__getattr__(att) else: subobj = self.__dict__[att] if hasattr(subobj, "data"): d[att] = subobj.data() else: d[att] = subobj d["#"] = self.__dict__["#"] return d def assert_spec_init (self, spec): _assert_spec_init(self, spec) def assert_spec_setattr (self, att, subobj): if not hasattr(self, "_spec"): return if att in self._spec: spec = self._spec[att] if spec.get("derived", False): raise PologyError( _("@info", "Derived attribute '%(attr)s' is read-only.", attr=att)) _assert_spec_single(att, subobj, spec) elif att.endswith("_modcount"): if not isinstance(subobj, int): raise PologyError( _("@info", "Expected %(type1)s for attribute '%(attr)s', " "got %(type2)s.", type1=int, attr=att, type2=type(subobj))) else: raise PologyError( _("@info", "Attribute '%(attr)s' is not among specified.", attr=att)) def assert_spec_getattr (self, att): if not hasattr(self, "_spec"): return if att not in self._spec: raise PologyError( _("@info", "Attribute '%(attr)s' is not among specified.", attr=att)) def assert_spec_setitem (self, itemobj): if not hasattr(self, "_spec"): return if "*" in self._spec: _assert_spec_single("*", itemobj, self._spec["*"]) else: raise PologyError( _("@info", "Object '%(obj)s' is not specified to be a sequence.", obj=self)) def assert_spec_getitem (self): if not hasattr(self, "_spec"): return if "*" not in self._spec: raise PologyError( _("@info", "Object '%(obj)s' is not specified to be a sequence.", obj=self)) # ============================================================================= # Monitored pair. _Monpair_spec = { "first" : {}, "second" : {}, } class Monpair (Monitored): """ Monitored pair (counterpart to two-element C{tuple}). @ivar first: the first element of the pair @ivar second: the second element of the pair """ def __init__ (self, init=None): """ Create a pair with two elements. All methods behave as their namesakes in standard C{tuple}. @param init: 2-element sequence or another pair @param init: tuple, list,... 
or Monpair """ if not isinstance(init, Monpair): pair = tuple(init) if len(pair) != 2: raise PologyError( _("@info", "Initializer sequence for a pair must contain " "exactly two elements.")) self._first, self._second = pair else: self._first = init.first self._second = init.second self.assert_spec_init(_Monpair_spec) def __repr__ (self): elfmt = ", ".join((repr(self._first), repr(self._second))) return "%s([%s])" % (self.__class__.__name__, elfmt) def __str__ (self): return self.__repr__() def __len__ (self): return 2 def __iter__ (self): return iter((self._first, self._second)) def __getitem__ (self, i): if i == 0: return self._first elif i == 1: return self._second else: raise IndexError # ============================================================================= # Monitored list. _Monlist_spec = { "*" : {}, } class Monlist (Monitored): """ Monitored list. """ def __init__ (self, lst=None): """ Create a monitored list from a sequence. All methods behave as their namesakes in standard C{list}. @param lst: sequence of elements @type lst: any convertible into list by C{list()} """ if lst is not None: self.__dict__["*"] = list(lst) else: self.__dict__["*"] = list() self.assert_spec_init(_Monlist_spec) def __repr__ (self): elfmt = ", ".join(repr(x) for x in self.__dict__["*"]) return "%s([%s])" % (self.__class__.__name__, elfmt) def __str__ (self): return self.__repr__() def __len__ (self): return len(self.__dict__["*"]) def __getitem__ (self, i): self.assert_spec_getitem() if not isinstance(i, slice): return self.__dict__["*"][i] else: return Monlist(self.__dict__["*"][i]) def __setitem__ (self, i, val): if not isinstance(i, slice): self.assert_spec_setitem(val) else: for v in val: self.assert_spec_setitem(v) cval = self.__dict__["*"][i] if cval != val: self.__dict__["#"]["*"] += 1 if hasattr(cval, "modcount"): mc_diff = cval.modcount - val.modcount if mc_diff > 0: self.__dict__["#"]["*"] += mc_diff self.__dict__["*"][i] = val def __delitem__ (self, i): self.assert_spec_getitem() nitems = len(self.__dict__["*"]) del self.__dict__["*"][i] if len(self.__dict__["*"]) != nitems: self.__dict__["#"]["*"] += 1 def __eq__ (self, other): if len(self.__dict__["*"]) != len(other): return False for i in range(len(other)): if self.__dict__["*"][i] != other[i]: return False return True def __ne__ (self, other): return not self.__eq__(other) def __add__ (self, other): lst = Monlist(self.__dict__["*"]) lst.extend(other) return lst def append (self, val): self.assert_spec_setitem(val) self.__dict__["*"].append(val) self.__dict__["#"]["*"] += 1 def extend (self, other): for val in other: self.append(val) def remove (self, val): self.assert_spec_setitem(val) if val in self.__dict__["*"]: self.__dict__["*"].remove(val) self.__dict__["#"]["*"] += 1 def pop (self, i=None): if i is None: val = self.__dict__["*"].pop() else: val = self.__dict__["*"].pop(i) self.__dict__["#"]["*"] += 1 return val def insert (self, i, val): self.assert_spec_setitem(val) self.__dict__["*"].insert(i, val) self.__dict__["#"]["*"] += 1 # ============================================================================= # Monitored set. _Monset_spec = { "*" : {}, } class Monset (Monitored): """ Monitored set. """ def __init__ (self, st=None): """ Create a monitored set from a sequence. All methods behave as their namesakes in standard C{set}. 
@param st: sequence of elements @type st: any convertible into list by C{list()} """ self.__dict__["*"] = list() if st is not None: for val in st: if val not in self.__dict__["*"]: self.__dict__["*"].append(val) self.assert_spec_init(_Monset_spec) def __repr__ (self): elfmt = ", ".join(repr(x) for x in self.__dict__["*"]) return "%s([%s])" % (self.__class__.__name__, elfmt) def __str__ (self): return self.__repr__() def __len__ (self): return len(self.__dict__["*"]) def __iter__ (self): return iter(self.__dict__["*"]) def __eq__ (self, other): if len(self.__dict__["*"]) != len(other): return False for i in range(len(other)): if self.__dict__["*"][i] not in other: return False return True def __ne__ (self, other): return not self.__eq__(other) def __contains__ (self, val): return val in self.__dict__["*"] def add (self, val): self.assert_spec_setitem(val) if val not in self.__dict__["*"]: self.__dict__["*"].append(val) self.__dict__["#"]["*"] += 1 def remove (self, val): self.assert_spec_setitem(val) if val in self.__dict__["*"]: self.__dict__["*"].remove(val) self.__dict__["#"]["*"] += 1 def items (self): return list(self.__dict__["*"]) diff --git a/pology/msgreport.py b/pology/msgreport.py index b617a819..1fddc396 100644 --- a/pology/msgreport.py +++ b/pology/msgreport.py @@ -1,756 +1,756 @@ # -*- coding: utf-8 -*- """ Report info, warning and error messages. Functions for Pology tools to report PO messages to the user at runtime, in different contexts and scenarios. May colorize some output. @author: Chusslove Illich (Часлав Илић) @author: Nick Shaforostoff (Николай Шафоростов) @license: GPLv3 """ # NOTE: These functions are not in pology.report module, # as that would cause circular module dependencies. from copy import deepcopy import os import re import sys from pology import _, n_ from pology.message import Message from pology.colors import ColorString, cjoin, cinterp from pology.diff import adapt_spans from pology.escape import escape_c as escape from pology.monitored import Monpair from pology.report import report, warning, error, format_item_list # FIXME: Make this a public function in getfunc module. _modules_on_request = {} def _get_module (name, cmsg=None): if name not in _modules_on_request: try: _modules_on_request[name] = __import__(name) except: if cmsg: warning(_("@info", "Cannot import module '%(mod)s'; consequence:\n" "%(msg)s", mod=name, msg=cmsg)) else: warning(_("@info", "Cannot import module '%(mod)s'.", mod=name)) _modules_on_request[name] = None return _modules_on_request[name] def report_on_msg (text, msg, cat, subsrc=None, file=sys.stdout): """ Report on a PO message. Outputs the message reference (catalog name and message position), along with the report text. @param text: text to report @type text: string @param msg: the message for which the text is reported @type msg: L{Message_base} @param cat: the catalog where the message lives @type cat: L{Catalog} @param subsrc: more detailed source of the message @type subsrc: C{None} or string @param file: send output to this file descriptor @type file: C{file} """ posinfo = _msg_pos_fmt(cat.filename, msg.refline, msg.refentry) text = cinterp("%s: %s", posinfo, text) report(text, subsrc=subsrc, showcmd=False) def warning_on_msg (text, msg, cat, subsrc=None, file=sys.stderr): """ Warning on a PO message. Outputs the message reference (catalog name and the message position), along with the warning text.
@param text: text to report @type text: string @param msg: the message for which the text is reported @type msg: L{Message_base} @param cat: the catalog where the message lives @type cat: L{Catalog} @param subsrc: more detailed source of the message @type subsrc: C{None} or string @param file: send output to this file descriptor @type file: C{file} """ posinfo = _msg_pos_fmt(cat.filename, msg.refline, msg.refentry) text = cinterp("%s: %s", posinfo, text) warning(text, subsrc=subsrc, showcmd=False) def error_on_msg (text, msg, cat, code=1, subsrc=None, file=sys.stderr): """ Error on a PO message (aborts the execution). Outputs the message reference (catalog name and message position), along with the error text. Aborts execution with the given code. @param text: text to report @type text: string @param msg: the message for which the text is reported @type msg: L{Message_base} @param cat: the catalog where the message lives @type cat: L{Catalog} @param code: the exit code @type code: int @param subsrc: more detailed source of the message @type subsrc: C{None} or string @param file: send output to this file descriptor @type file: C{file} """ posinfo = _msg_pos_fmt(cat.filename, msg.refline, msg.refentry) text = cinterp("%s: %s", posinfo, text) error(text, code=code, subsrc=subsrc, showcmd=True) def report_on_msg_hl (highlight, msg, cat, fmsg=None, subsrc=None, file=sys.stdout): """ Report on parts of a PO message. For each of the spans found in the L{highlight} specification which have a note attached, outputs the position reference (catalog name, message position, spanned segment) and the span note. The highlight can be relative to a somewhat modified, filtered message instead of the original one. @param highlight: highlight specification @type highlight: L{highlight} @param msg: the message for which the text is reported @type msg: L{Message_base} @param cat: the catalog where the message lives @type cat: L{Catalog} @param fmsg: filtered message to which the highlight corresponds @type fmsg: L{Message_base} @param subsrc: more detailed source of the message @type subsrc: C{None} or string @param file: send output to this file descriptor @type file: C{file} """ refpos = _msg_pos_fmt(cat.filename, msg.refline, msg.refentry) if not fmsg: # use original message as filtered if not given fmsg = msg for hspec in highlight: name, item, spans = hspec[:3] if name == "msgctxt": - text = msg.msgctxt or u"" - ftext = fmsg.msgctxt or u"" + text = msg.msgctxt or "" + ftext = fmsg.msgctxt or "" elif name == "msgid": text = msg.msgid ftext = fmsg.msgid elif name == "msgid_plural": - text = msg.msgid_plural or u"" - ftext = fmsg.msgid_plural or u"" + text = msg.msgid_plural or "" + ftext = fmsg.msgid_plural or "" elif name == "msgstr": text = msg.msgstr[item] ftext = fmsg.msgstr[item] # TODO: Add more fields. else: warning(_("@info", "Unknown field '%(field)s' " "in highlighting specification.", field=name)) continue if len(hspec) > 3: # Override filtered text from filtered message # by filtered text from the highlight spec. ftext = hspec[3] spans = adapt_spans(text, ftext, spans, merge=False) if msg.msgid_plural is not None and name == "msgstr": name = "%s_%d" % (name, item) for span in spans: if len(span) < 3: continue start, end, snote = span if isinstance(start, int) and isinstance(end, int): seglen = end - start if seglen > 0: segtext = text[start:end] if len(segtext) > 30: segtext = segtext[:27] + "..." 
posinfo = "%s:%d:\"%s\"" % (name, start, escape(segtext)) else: posinfo = "%s:%d" % (name, start) else: posinfo = "%s" % name posinfo = ColorString("%s") % posinfo rtext = cinterp("%s[%s]: %s", refpos, posinfo, snote) report(rtext, subsrc=subsrc, showcmd=False) def report_msg_to_lokalize (msg, cat, report=None): """ Open catalog in Lokalize and jump to message. Lokalize is a CAT tool for KDE 4, U{http://userbase.kde.org/Lokalize}. This function opens the catalog in Lokalize (if not already open) and jumps to the given message within it. If the message is obsolete, it will be ignored. @param msg: the message which should be jumped to in Lokalize @type msg: L{Message_base} @param cat: the catalog in which the message resides @type cat: L{Catalog} @param report: simple text or highlight specification @type report: string or L{highlight} """ dbus = _get_module("dbus", _("@info", "Communication with Lokalize not possible. " "Try installing the '%(pkg)s' package.", pkg="python-dbus")) if not dbus: return if msg.obsolete: return # If report is a highlight specification, # flatten it into lines of notes by spans. if isinstance(report, list): notes=[] for hspec in report: for span in hspec[2]: if len(span) > 2: notes.append(span[2]) report = cjoin(notes, "\n") try: try: globals()['lokalizeobj'] except: bus = dbus.SessionBus() - lokalize_dbus_instances=lambda:filter(lambda name: name.startswith('org.kde.lokalize'),bus.list_names()) + lokalize_dbus_instances=lambda:[name for name in bus.list_names() if name.startswith('org.kde.lokalize')] for lokalize_dbus_instance in lokalize_dbus_instances(): try: globals()['lokalizeinst']=lokalize_dbus_instance globals()['lokalizeobj']=bus.get_object(globals()['lokalizeinst'],'/ThisIsWhatYouWant') globals()['openFileInEditor']=globals()['lokalizeobj'].get_dbus_method('openFileInEditor','org.kde.Lokalize.MainWindow') globals()['visitedcats']={} except: pass if 'openFileInEditor' not in globals(): return index=globals()['openFileInEditor'](os.path.abspath(cat.filename)) editorobj=dbus.SessionBus().get_object(globals()['lokalizeinst'],'/ThisIsWhatYouWant/Editor/%d' % index) if cat.filename not in globals()['visitedcats']: globals()['visitedcats'][cat.filename]=1 setEntriesFilteredOut=editorobj.get_dbus_method('setEntriesFilteredOut','org.kde.Lokalize.Editor') setEntriesFilteredOut(True) setEntryFilteredOut=editorobj.get_dbus_method('setEntryFilteredOut','org.kde.Lokalize.Editor') setEntryFilteredOut(msg.refentry-1,False) gotoEntry=editorobj.get_dbus_method('gotoEntry','org.kde.Lokalize.Editor') gotoEntry(msg.refentry-1) if report: addTemporaryEntryNote=editorobj.get_dbus_method('addTemporaryEntryNote','org.kde.Lokalize.Editor') addTemporaryEntryNote(msg.refentry-1,report.resolve(ctype="none")) except: return def report_msg_content (msg, cat, wrapf=None, force=False, note=None, delim=None, highlight=None, showmsg=True, fmsg=None, showfmsg=False, subsrc=None, file=sys.stdout): """ Report the content of a PO message. Provides the message reference, consisting of the catalog name and the message position within it, the message contents, and any notes on particular segments. Parts of the message can be highlighted using colors. Parameter C{highlight} provides the highlighting specification, as list of tuples where each tuple consists of: name of the message element to highlight, element index (used when the element is a list of values), list of spans, and optionally the filtered text of the element value. 
For example, to highlight spans C{(5, 10)} and C{(15, 25)} in the C{msgid}, and C{(30, 40)} in C{msgstr}, the highlighting specification would be:: [("msgid", 0, [(5, 10), (15, 25)]), ("msgstr", 0, [(30, 40)])] Names of the elements that can presently be highlighted are: C{"msgctxt"}, C{"msgid"}, C{"msgid_plural"}, C{"msgstr"}, C{"manual_comment"}, C{"auto_comment"}, C{"source"}, C{"flag"}. For unique fields the element index is not used, but 0 should be given for consistency (may be enforced later). Span tuples can have a third element, following the indices, which is the note about why the particular span is highlighted; there may be more elements after the note, and these are all ignored. If start or end index of a span is not an integer, then the note is taken as relating to the complete field. Sometimes the match to which the spans correspond has been made on a filtered value of the message field (e.g. after accelerator markers or tags have been removed). In that case, the filtered text can be given as the fourth element of the tuple, after the list of spans, and the function will try to fit spans from filtered onto original text. More globally, if the complete highlight is relative to a modified, filtered version of the message, this message can be given as C{fmsg} parameter. The display of content can be controlled by C{showmsg} parameter; if it is C{False}, only the message reference and span notes are shown. Similarly for the C{showfmsg} parameter, which controls the display of the content of filtered message (if given by C{fmsg}). To show the filtered message may be useful for debugging filtering in cases when it is not straightforward, or it is user-defined. @param msg: the message to report the content for @type msg: L{Message_base} @param cat: the catalog where the message lives @type cat: L{Catalog} or C{None} @param wrapf: the function used for wrapping message fields in output. See L{to_lines()} method of message classes for details. If not given, it will be taken from the catalog (see L{Catalog.wrapf}). @type wrapf: (string)->[string...] @param force: whether to force reformatting of cached message content @type force: bool @param note: note about why the content is being reported @type note: string @param delim: text to print on the line following the message @type delim: C{None} or string @param highlight: highlighting specification of message elements @type highlight: (see description) @param showmsg: show content of the message @type showmsg: bool @param fmsg: filtered message @type fmsg: L{Message_base} @param showfmsg: show content of the filtered message, if any @type showfmsg: bool @param subsrc: more detailed source of the message @type subsrc: C{None} or string @param file: output stream @type file: file """ rsegs = [] wrapf = wrapf or cat.wrapf() notes_data = [] if highlight: msg = Message(msg) # must work on copy, highlight modifies it ffmsg = fmsg or msg # use original message as filtered if not given # Unify spans for same parts, to have single coloring pass per part # (otherwise markup can get corrupted). highlightd = {} for hspec in highlight: name, item, spans = hspec[:3] pkey = (name, item) phspec = highlightd.get(pkey) if phspec is None: # Make needed copies in order not to modify # the original highlight when adding stuff later. highlightd[pkey] = list(hspec) highlightd[pkey][2] = list(spans) else: phspec[2].extend(spans) # Take filtered text if available and not already taken. 
if len(hspec) > 3 and len(phspec) <= 3: phspec.append(hspec[3]) - highlight = highlightd.values() + highlight = list(highlightd.values()) for hspec in highlight: name, item, spans = hspec[:3] def hl (text, ftext): if len(hspec) > 3: # Override filtered text from filtered message # by filtered text from the highlight spec. ftext = hspec[3] aspans = adapt_spans(text, ftext, spans, merge=False) notes_data.append((text, name, item, aspans)) text = _highlight_spans(text, spans, "red", ftext=ftext) return text if name == "msgctxt": if msg.msgctxt or ffmsg.msgctxt: - msg.msgctxt = hl(msg.msgctxt or u"", ffmsg.msgctxt or u"") + msg.msgctxt = hl(msg.msgctxt or "", ffmsg.msgctxt or "") elif name == "msgid": msg.msgid = hl(msg.msgid, ffmsg.msgid) elif name == "msgid_plural": - msg.msgid_plural = hl(msg.msgid_plural or u"", - ffmsg.msgid_plural or u"") + msg.msgid_plural = hl(msg.msgid_plural or "", + ffmsg.msgid_plural or "") elif name == "msgstr": msg.msgstr[item] = hl(msg.msgstr[item], ffmsg.msgstr[item]) elif name == "manual_comment": msg.manual_comment[item] = hl(msg.manual_comment[item], ffmsg.manual_comment[item]) elif name == "auto_comment": msg.auto_comment[item] = hl(msg.auto_comment[item], ffmsg.auto_comment[item]) elif name == "source": msg.source[item] = Monpair((hl(msg.source[item][0], ffmsg.source[item][0]), msg.source[item][1])) elif name == "flag": pass # FIXME: How to do this? else: warning(_("@info", "Unknown field '%(field)s' " "in highlighting specification.", field=name)) # Report the message. msegs = [] if cat is not None: msegs += [_msg_pos_fmt(cat.filename, msg.refline, msg.refentry) + "\n"] if showmsg: msgstr = msg.to_string(wrapf=wrapf, force=force, colorize=1) msegs += [msgstr.rstrip() + "\n"] if msegs: rsegs.append(cjoin(msegs).rstrip()) # Report notes. if note is not None: # global notestr = _("@info", "[note] %(msg)s", msg=note) rsegs.append(notestr) if notes_data: # span notes note_ord = 1 for text, name, item, spans in notes_data: if msg.msgid_plural is not None and name == "msgstr": name = "%s_%d" % (name, item) for span in spans: if len(span) < 3: continue start, end, snote = span if isinstance(start, int) and isinstance(end, int): seglen = end - start if seglen > 0: segtext = text[start:end] if len(segtext) > 30: segtext = _("@item:intext shortened longer text", "%(snippet)s...", snippet=segtext[:27]) posinfo = "%s:%d:\"%s\"" % (name, start, escape(segtext)) else: posinfo = "%s:%d" % (name, start) else: posinfo = "%s" % name posinfo = ColorString("%s") % posinfo rsegs.append(_("@info", "[%(pos)s]: %(msg)s", pos=posinfo, msg=snote)) note_ord += 1 # Report the filtered message, if given and requested. if fmsg and showfmsg: fmtnote = (ColorString("%s") % _("@info", ">>> Filtered message was:")) rsegs.append(fmtnote) fmsgstr = fmsg.to_string(wrapf=wrapf, force=force, colorize=1) mstr = fmsgstr.rstrip() + "\n" rsegs.append(mstr.rstrip()) if delim: rsegs.append(delim) rtext = cjoin(rsegs, "\n").rstrip() report(rtext, subsrc=subsrc, file=file) def rule_error(msg, cat, rule, highlight=None, fmsg=None, showmsg=True, predelim=False): """ Print formatted rule error message on screen.
@param msg: pology.message.Message object @param cat: pology.catalog.Catalog object @param rule: pology.rules.Rule object @param highlight: highlight specification (see L{report_msg_content}) @param fmsg: filtered message which the rule really matched @param showmsg: whether to show contents of message (either filtered or original) @param predelim: whether to also print delimiter before the rule error """ # Some info on the rule. rinfo = _("@info", "rule %(rule)s ==> " "%(msg)s", rule=rule.displayName, msg=rule.hint) if showmsg: delim = "-" * 40 if predelim: report(delim) report_msg_content(msg, cat, highlight=highlight, fmsg=fmsg, showfmsg=(fmsg is not None), note=rinfo, delim=delim) else: report_on_msg(rinfo, msg, cat) report_on_msg_hl(highlight, msg, cat, fmsg) def multi_rule_error (msg, cat, rspec, showmsg=True, predelim=False): """ Print formatted rule error messages on screen. Like L{rule_error}, but reports multiple failed rules at once. Contents of the matched message are shown only once for all rules, with all highlights embedded, and all rule information following. This holds unless there are several different filtered messages, when rule failures are reported in groups by filtered message. @param msg: the message matched by rules @type msg: Message @param cat: the catalog in which the message resides @type cat: Catalog @param rspec: specification of failed rules. This is a list in which each element can be one of: - rule - tuple of rule and highlight specification (see L{report_msg_content} for details on highlight specifications). Highlight can be None. - tuple of rule, highlight, and filtered message which the rule really matched. Highlight and filtered message can be None. @type rspec: [(Rule|(Rule, highlight)|(Rule, highlight, Message))*] @param showmsg: whether to show contents of message (both original and filtered if given) @type showmsg: bool @param predelim: whether to also print delimiter before the first error @type predelim: bool """ # Expand elements in rule specification to full lengths. rspec_mod = [] for el in rspec: if not isinstance(el, tuple): el = (el,) el_mod = el + tuple(None for i in range(3 - len(el))) rspec_mod.append(el_mod) rspec = rspec_mod # Split into groups by distinct filtered messages, # or make one dummy group if content display not requested. if showmsg: rspec_groups = [] for rule, hl, fmsg in rspec: rlhls = None for ofmsg, rlhls in rspec_groups: if fmsg == ofmsg: # check for apparent equality break if rlhls is None: rlhls = [] rspec_groups.append((fmsg, rlhls)) rlhls.append((rule, hl)) else: rlhls = [] rspec_groups = [(None, rlhls)] for rule, hl, fmsg in rspec: rlhls.append((rule, hl)) # Report each rule group.
for fmsg, rlhls in rspec_groups: rinfos = [] highlight = [] for rule, hl in rlhls: rinfos.append(_("@info", "rule %(rule)s ==> " "%(msg)s", rule=rule.displayName, msg=rule.hint)) highlight.extend(hl) if len(rinfos) > 1: note = cjoin([""] + rinfos, "\n") elif rinfos: note = rinfos[0] if showmsg: delim = "-" * 40 if predelim: report(delim) report_msg_content(msg, cat, highlight=highlight, fmsg=fmsg, showfmsg=(fmsg is not None), note=note, delim=delim) else: report_on_msg(note, msg, cat) report_on_msg_hl(highlight, msg, cat, fmsg) def rule_xml_error(msg, cat, rule, span, pluralId=0): """Create and return rule error message in XML format @param msg: pology.message.Message object @param cat: pology.catalog.Catalog object @param span: list of 2-tuple (start, end) of offending spans @param rule: pology.rules.Rule object @param pluralId: msgstr count in case of plural form. Default to 0 @return: XML message as a list of unicode strings""" xmlError=[] xmlError.append("\t<error>\n") xmlError.append("\t\t<line>%s</line>\n" % msg.refline) xmlError.append("\t\t<refentry>%s</refentry>\n" % msg.refentry) - xmlError.append("\t\t<msgctxt><![CDATA[%s]]></msgctxt>\n" % _escapeCDATA(msg.msgctxt or u"")) + xmlError.append("\t\t<msgctxt><![CDATA[%s]]></msgctxt>\n" % _escapeCDATA(msg.msgctxt or "")) xmlError.append("\t\t<msgid><![CDATA[%s]]></msgid>\n" % _escapeCDATA(msg.msgid)) xmlError.append("\t\t<msgstr><![CDATA[%s]]></msgstr>\n" % _escapeCDATA(msg.msgstr[pluralId])) for begin, end in span: if isinstance(begin, int) and isinstance(end, int): xmlError.append("\t\t<highlight begin='%s' end='%s'/>\n" % (begin, end)) #xmlError.append("\t\t<start>%s</start>\n" % span[0]) #xmlError.append("\t\t<end>%s</end>\n" % span[1]) xmlError.append("\t\t<pattern><![CDATA[%s]]></pattern>\n" % rule.rawPattern) xmlError.append("\t\t<hint><![CDATA[%s]]></hint>\n" % rule.hint) xmlError.append("\t</error>\n") return xmlError def spell_error(msg, cat, faultyWord, suggestions): """Print formatted spell error message on screen @param msg: pology.message.Message object @param cat: pology.catalog.Catalog object @param faultyWord: badly spelled word @param suggestions: list of correct words to suggest""" report("-"*40) report(ColorString("%s:%d(%d)") % (cat.filename, msg.refline, msg.refentry)) if msg.msgctxt: report(_("@info", "Context: %(snippet)s", snippet=msg.msgctxt)) #TODO: color in red part of context that make the mistake report(_("@info", "Faulty word: %(word)s", word=faultyWord)) if suggestions: report(_("@info", "Suggestions: %(wordlist)s", wordlist=format_item_list(suggestions))) def spell_xml_error(msg, cat, faultyWord, suggestions, pluralId=0): """Create and return spell error message in XML format @param msg: pology.message.Message object @param cat: pology.catalog.Catalog object @param faultyWord: badly spelled word @param suggestions: list of correct words to suggest @param pluralId: msgstr count in case of plural form. Default to 0 @return: XML message as a list of unicode strings""" xmlError=[] xmlError.append("\t<error>\n") xmlError.append("\t\t<line>%s</line>\n" % msg.refline) xmlError.append("\t\t<refentry>%s</refentry>\n" % msg.refentry) - xmlError.append("\t\t<msgctxt><![CDATA[%s]]></msgctxt>\n" % _escapeCDATA(msg.msgctxt or u"")) + xmlError.append("\t\t<msgctxt><![CDATA[%s]]></msgctxt>\n" % _escapeCDATA(msg.msgctxt or "")) xmlError.append("\t\t<msgid><![CDATA[%s]]></msgid>\n" % _escapeCDATA(msg.msgid)) xmlError.append("\t\t<msgstr><![CDATA[%s]]></msgstr>\n" % _escapeCDATA(msg.msgstr[pluralId])) xmlError.append("\t\t<faulty>%s</faulty>\n" % faultyWord) for suggestion in suggestions: xmlError.append("\t\t<suggestion>%s</suggestion>\n" % suggestion) xmlError.append("\t</error>\n") return xmlError
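# --- Editor's illustrative sketch, not part of the patch. Driving
# --- report_msg_content() with a highlight specification, as documented in
# --- its docstring above; the catalog path and the span note are hypothetical.
from pology.catalog import Catalog
from pology.msgreport import report_msg_content

cat = Catalog("alpha.po")                      # hypothetical PO file
for msg in cat:
    if "KDE" in msg.msgid:
        start = msg.msgid.index("KDE")
        # One highlighted span in msgid, with an attached note.
        highlight = [("msgid", 0, [(start, start + 3, "project name")])]
        report_msg_content(msg, cat, highlight=highlight, delim="-" * 40)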
# Format string for message reference, based on the file descriptor. def _msg_pos_fmt (path, line, col): return (ColorString("%s:%d" "(#%d)") % (path, line, col)) def _escapeCDATA(text): """Escape CDATA tags to allow inclusion into CDATA @param text: text to convert @type text: str or unicode @return: modified string""" text=text.replace("<![CDATA[", "<_!_[CDATA[") text=text.replace("]]>", "]_]_>") return text def _highlight_spans (text, spans, color, ftext=None): """ Adds colors around highlighted spans in text. Spans are given as list of index tuples C{[(start1, end1), ...]} where start and end index have standard Python semantics. Span tuples can have more than two elements, with indices followed by additional elements, which are ignored by this function. If start or end index in a span is not an integer, the span is ignored. The C{color} parameter is one of the color tags available in L{ColorString} markup. If C{ftext} is not C{None}, spans are understood as relative to it, and the function will try to adapt them to the main text (see L{pology.diff.adapt_spans}). @param text: text to be highlighted @type text: string @param spans: spans to highlight @type spans: list of tuples @param color: color tag @type color: string @param ftext: text to which spans are actually relative @type ftext: string @returns: highlighted text @rtype: string """ if not spans or color is None: return text # Adapt spans regardless if filtered text has been given or not, # to fix any overlapping and put into expected ordering. if ftext is None: ftext = text spans = adapt_spans(text, ftext, spans, merge=True) if not spans: return text ctext = "" cstart = 0 for span in spans: if not isinstance(span[0], int) or not isinstance(span[1], int): continue ctext += text[cstart:span[0]] ctext += (ColorString("<%s>%%s</%s>" % (color, color)) % text[span[0]:span[1]]) # outside, to have auto-escaping cstart = span[1] ctext += text[span[1]:] return ctext diff --git a/pology/multi.py b/pology/multi.py index ad285e97..b97631b3 100644 --- a/pology/multi.py +++ b/pology/multi.py @@ -1,105 +1,105 @@ # -*- coding: UTF-8 -*- """ Collections of multiple sequences. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ class Multidict (object): """ Several dictionaries readable as one. Allows getting elements from several dictionary-like sequences as if they were one, without really creating a single union of items from all of them. This is useful when it is more expensive to create the union than to look sequentially in each dictionary in turn. All methods named same as in C{dict} have same semantics too. """ def __init__ (self, dicts): """ Constructor. Order of dictionaries in the list matters, firstmost has highest priority when looking for a key. Collected sequences need to implement the following methods of a dictionary: C{__getitem__}, C{__contains__}, C{iterkeys}, C{itervalues}, C{iteritems}. Iterators have to implement the C{__next__} method, and raise C{StopIteration} when exhausted.
@param dicts: sequence of dictionary-like objects @type dicts: list of dict """ self._dicts = dicts def __contains__ (self, key): for d in self._dicts: if key in d: return True return False def __getitem__ (self, key): for d in self._dicts: if key in d: return d[key] raise KeyError(key) def __iter__ (self): return self.iterkeys() def get (self, key, defval=None): for d in self._dicts: if key in d: return d[key] return defval def iterkeys (self): - return self._Iterator(lambda x: x.iterkeys()) + return self._Iterator(lambda x: iter(x.keys())) def itervalues (self): - return self._Iterator(lambda x: x.itervalues()) + return self._Iterator(lambda x: iter(x.values())) def iteritems (self): - return self._Iterator(lambda x: x.iteritems()) + return self._Iterator(lambda x: iter(x.items())) class _Iterator (object): def __init__ (self, getit): self._iters = [getit(d) for d in self._dicts] def __iter__ (self): return self - def next (self): + def __next__ (self): while self._iters: try: - return self._iters[0].next() + return next(self._iters[0]) except StopIteration: self._iters.pop(0) raise StopIteration diff --git a/pology/normalize.py b/pology/normalize.py index 4eb6fe7c..3006f5ed 100644 --- a/pology/normalize.py +++ b/pology/normalize.py @@ -1,491 +1,491 @@ # -*- coding: UTF-8 -*- """ Various normalizations for strings and PO elements. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import os import re import unicodedata from pology import _, n_ from pology.message import MessageUnsafe from pology.monitored import Monlist, Monpair from pology.report import warning _wsseq_rx = re.compile(r"[ \t\n]+", re.U) def simplify (s): """ Simplify ASCII whitespace in the string. All leading and trailing ASCII whitespace are removed, all inner ASCII whitespace sequences are replaced with space. @param s: string to normalize @type s: string @returns: normalized string @rtype: string """ return _wsseq_rx.sub(" ", s.strip()) _uwsseq_rx = re.compile(r"\s+", re.U) def usimplify (s): """ Simplify whitespace in the string. Like L{simplify}, but takes into account all whitespace defined by Unicode. @param s: string to normalize @type s: string @returns: normalized string @rtype: string """ return _uwsseq_rx.sub(" ", s.strip()) def shrink (s): """ Remove all whitespace from the string. @param s: string to normalize @type s: string @returns: normalized string @rtype: string """ return _uwsseq_rx.sub("", s) def tighten (s): """ Remove all whitespace and lowercase the string. @param s: string to normalize @type s: string @returns: normalized string @rtype: string """ return _uwsseq_rx.sub("", s.lower()) _non_ascii_ident_rx = re.compile(r"[^a-z0-9_]", re.U|re.I) def identify (s): """ Construct a uniform-case ASCII-identifier out of the string. ASCII-identifier is constructed in the following order: - string is decomposed into Unicode NFKD - string is lowercased - every character that is neither an ASCII alphanumeric nor the underscore is removed - if the string starts with a digit, underscore is prepended @param s: string to normalize @type s: string @returns: normalized string @rtype: string """ ns = s # Decompose. ns = unicodedata.normalize("NFKD", ns) # Lowercase. ns = ns.lower() # Remove non-identifier chars. ns = _non_ascii_ident_rx.sub("", ns) # Prefix with underscore if first char is digit. if ns[0:1].isdigit(): ns = "_" + ns return ns
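# --- Editor's illustrative sketch, not part of the patch. The normalization
# --- helpers defined in this module, applied to made-up inputs; the xentitize
# --- result assumes the standard XML entity substitutions shown below.
from pology.normalize import identify, simplify, xentitize

assert simplify("  a\t b \n") == "a b"          # inner runs collapse to one space
assert identify("1 Über-Fix") == "_1uberfix"    # NFKD, lowercase, strip, prefix
assert xentitize('a<b & "c"') == "a&lt;b &amp; &quot;c&quot;"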
def xentitize (s): """ Replace characters having default XML entities with the entities. The replacements are: - C{&amp;} for ampersand - C{&lt;} and C{&gt;} for less-than and greater-than signs - C{&apos;} and C{&quot;} for ASCII single and double quotes @param s: string to normalize @type s: string @returns: normalized string @rtype: string """ ns = s ns = ns.replace("&", "&amp;") # must come first ns = ns.replace("<", "&lt;") ns = ns.replace(">", "&gt;") ns = ns.replace("'", "&apos;") ns = ns.replace('"', "&quot;") return ns # As defined by http://www.unicode.org/faq/unsup_char.html. _invisible_character_codepoints = ([] + [0x200C, 0x200D] # cursive joiners - + range(0x202A, 0x202E + 1) # bidirectional format controls + + list(range(0x202A, 0x202E + 1)) # bidirectional format controls + [0x00AD] # soft hyphen + [0x2060, 0xFEFF] # word joiners + [0x200B] # the zero width space - + range(0x2061, 0x2064 + 1) # invisible math operators + + list(range(0x2061, 0x2064 + 1)) # invisible math operators + [0x115F, 0x1160] # Jamo filler characters - + range(0xFE00, 0xFE0F + 1) # variation selectors + + list(range(0xFE00, 0xFE0F + 1)) # variation selectors ) -_invchstr = "".join(map(unichr, _invisible_character_codepoints)) +_invchstr = "".join(map(chr, _invisible_character_codepoints)) _invisible_character_replrx = re.compile("[%s]" % _invchstr, re.U) def noinvisible (s): """ Remove all invisible characters from the string. Invisible characters are those which have zero width, i.e. do not have any visual representation in the text (when the text is rendered proportionally). See U{http://www.unicode.org/faq/unsup_char.html} for the list of these characters as defined by Unicode. @param s: string to normalize @type s: string @returns: normalized string @rtype: string """ ns = _invisible_character_replrx.sub("", s) return ns def demangle_srcrefs (collsrcs=None, collsrcmap=None, truesrcheads=None, compexts=None): """ Resolve source references in message created by intermediate extraction [hook factory]. Sometimes the messages from a source file in the format not known to C{xgettext(1)} are first extracted by a preextraction tool into a format known to C{xgettext}, and then by C{xgettext} to PO template. This is the intermediate extraction, and the files that C{xgettext} gets to operate on are intermediate files. When intermediate extraction is performed, the source references in the resulting PO template are going to be "mangled", pointing to the intermediate files rather than to the true source files. This hook factory will produce a function that will resolve intermediate into true source references, i.e. "demangle" them, where possible. One mode of intermediate extraction is to extract multiple sources into a collective intermediate file. This file may have a standardized name throughout a collection of catalogs, or it may be specific by catalog. For demangling to be possible in this case, the preextraction tool has to provide true source references in the extracted comments (C{#.}) of the messages. When that is the case, parameter C{collsrcs} is used to specify the sequence of names of generally known intermediate files, parameter C{collsrcmap} of those specific by catalog (as dictionary of catalog name to sequence of intermediate file names), and parameter C{truesrcheads} specifies the sequence of initial strings in extracted comments which are followed by the true source reference. (If C{truesrcheads} is C{None} or empty, this mode of demangling is disabled.) For example, collective-intermediate extraction:: #. file: apples.clt:156 #: resources.cpp:328 msgid "Granny Smith" msgstr "" #.
file: peaches.clt:49 #: resources.cpp:2672 msgid "Redhaven" msgstr "" is demangled by setting C{collsrcs=["resources.cpp"]} and C{truesrcheads=["file:"]}. Another mode of intermediate extraction is for each source file to be extracted into a single paired intermediate file, which is named the same as the true source plus an additional extension. In this mode, parameter C{compexts} specifies the list of known composite extensions (including the leading dot), which will be demangled by stripping the final extension from the path. For example, paired-intermediate extraction:: #: apples.clt.h:156 msgid "Granny Smith" msgstr "" #: peaches.clt.h:49 msgid "Redhaven" msgstr "" is demangled by setting C{compexts=[".clt.h"]}. @param collsrcs: general intermediate file names @type collsrcs: sequence of strings @param collsrcmap: catalog-specific intermediate file names @type collsrcmap: {string: *} @param truesrcheads: prefixes to true file references in comments @type truesrcheads: sequence of strings @param compexts: composite intermediate file extensions @type compexts: sequence of strings @return: type F4A hook @rtype: C{(cat, msg) -> numerr} """ def hook (msg, cat): numerr = 0 truerefs = [] # Demangle source references in collective-intermediate mode if truesrcheads: # Collect source references from extracted comments. cmnts = [] for cmnt in msg.auto_comment: hasrefs = False for head in truesrcheads: if cmnt.startswith(head): refs = [x.split(":") for x in cmnt[len(head):].split()] hasrefs = all((len(x) == 2 and x[1].isdigit()) for x in refs) if not hasrefs: numerr += 1 break if hasrefs: refs = [(path, int(lno)) for path, lno in refs] truerefs.extend(refs) else: cmnts.append(cmnt) msg.auto_comment[:] = cmnts # Exclude intermediates from source references. for path, lno in msg.source: bname = os.path.basename(path) if (not ( (collsrcs and bname in collsrcs) or ( collsrcmap and bname in collsrcmap.get(cat.name, {}))) ): truerefs.append((path, lno)) # Demangle source references in paired-intermediate mode if compexts: for path, lno in msg.source: for ext in compexts: if path.endswith(ext): p = path.rfind(".") if p > 0: path = path[:p] else: numerr += 1 break truerefs.append((path, lno)) if isinstance(msg, MessageUnsafe): msg.source = truerefs else: - msg.source = Monlist(map(Monpair, truerefs)) + msg.source = Monlist(list(map(Monpair, truerefs))) return numerr return hook def uniq_source (msg, cat): """ Make message source references unique [type F4A hook]. Sometimes source references of a message can be non-unique due to particularities of extraction or later processing. This hook makes them unique, while preserving the ordering. """ uniqrefs = [] for path, line in msg.source: ref = (os.path.normpath(path), line) if ref not in uniqrefs: uniqrefs.append(ref) if isinstance(msg, MessageUnsafe): msg.source = uniqrefs else: - msg.source = Monlist(map(Monpair, uniqrefs)) + msg.source = Monlist(list(map(Monpair, uniqrefs)))
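# --- Editor's illustrative sketch, not part of the patch. Applying the F4A
# --- hooks from this module over a whole catalog; the PO path and the "Tag:"
# --- comment prefix are hypothetical, the hook signatures are as defined here.
from pology.catalog import Catalog
from pology.normalize import uniq_source, uniq_auto_comment

cat = Catalog("gamma.po")                         # hypothetical PO file
uniq_ac = uniq_auto_comment(onlyheads=["Tag:"])   # factory -> (msg, cat) hook
for msg in cat:
    uniq_source(msg, cat)
    uniq_ac(msg, cat)
cat.sync()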
def uniq_auto_comment (onlyheads=None): """ Remove non-unique automatic comment lines in message [hook factory]. Sometimes the message extraction tool adds automatic comments to provide more context for the message (for example, XML tag path to the current message). If the message is found more than once in the same context, such comment lines get repeated. This hook can be used to make auto comment lines unique; either fully, or only those with certain prefixes given by C{onlyheads} parameter. @param onlyheads: prefixes of comment lines which should be made unique @type onlyheads: sequence of strings @return: type F4A hook @rtype: C{(cat, msg) -> numerr} """ if onlyheads is not None and not isinstance(onlyheads, tuple): onlyheads = tuple(onlyheads) def hook (msg, cat): seen_cmnts = set() cmnts = [] for cmnt in msg.auto_comment: if onlyheads is None or cmnt.startswith(onlyheads): if cmnt not in seen_cmnts: cmnts.append(cmnt) seen_cmnts.add(cmnt) else: cmnts.append(cmnt) msg.auto_comment[:] = cmnts return hook def canonical_header (hdr, cat): """ Check and rearrange content of a PO header into canonical form [type F4B hook]. @return: number of errors @rtype: int """ nerr = 0 nerr += _fix_authors(hdr, cat) return nerr -_yr1_rx = re.compile(ur"^\s*(\d{4}|\d{2})\s*$") -_yr2_rx = re.compile(ur"^\s*(\d{4}|\d{2})\s*[-—–]\s*(\d{4}|\d{2})\s*$") +_yr1_rx = re.compile(r"^\s*(\d{4}|\d{2})\s*$") +_yr2_rx = re.compile(r"^\s*(\d{4}|\d{2})\s*[-—–]\s*(\d{4}|\d{2})\s*$") def _fix_authors (hdr, cat): nerr = 0 # Parse authors data from the header. authors = {} problems = False pos = 0 for a in hdr.author: pos += 1 m = re.search(r"(.*?)<(.*?)>(.*)$", a) if not m: warning(_("@info", "%(file)s: Cannot parse name and email address " "from translator comment '%(cmnt)s'.", file=cat.filename, cmnt=a)) problems = True nerr += 1 continue name, email, rest = m.groups() name = simplify(name) email = simplify(email) m = re.search(r"^\s*,(.+?)\.?\s*$", rest) if not m: warning(_("@info", "%(file)s: Missing years in " "translator comment '%(cmnt)s'.", file=cat.filename, cmnt=a)) problems = True nerr += 1 continue yearstr = m.group(1) years = [] for yspec in yearstr.split(","): m = _yr1_rx.search(yspec) or _yr2_rx.search(yspec) if not m: warning(_("@info", "%(file)s: Cannot parse years in " "translator comment '%(cmnt)s'.", file=cat.filename, cmnt=a)) problems = True nerr += 1 break if len(m.groups()) == 1: ystr = m.group(1) if len(ystr) == 2: ystr = (ystr[0] == "9" and "19" or "20") + ystr years.append(int(ystr)) else: - years.extend(range(int(m.group(1)), int(m.group(2)) + 1)) + years.extend(list(range(int(m.group(1)), int(m.group(2)) + 1))) if not years: continue if name not in authors: authors[name] = {"email": "", "pos": 0, "years": set()} authors[name]["email"] = email authors[name]["pos"] = pos authors[name]["years"].update(years) # If there were any problems, do not touch author comments. if problems: return nerr # Post-process authors data. authlst = [] - for name, adata in authors.items(): + for name, adata in list(authors.items()): adata["years"] = list(adata["years"]) adata["years"].sort() - adata["years"] = map(str, adata["years"]) + adata["years"] = list(map(str, adata["years"])) adata["name"] = name authlst.append(adata) authlst.sort(key=lambda x: (min(x["years"]), x["pos"])) # Construct new author comments. authcmnts = Monlist() for a in authlst: - acmnt = u"%s <%s>, %s." % (a["name"], a["email"], + acmnt = "%s <%s>, %s." % (a["name"], a["email"], ", ".join(a["years"])) authcmnts.append(acmnt) hdr.author = authcmnts return nerr diff --git a/pology/proj/kde/header.py b/pology/proj/kde/header.py index cb96ab23..def18c77 100644 --- a/pology/proj/kde/header.py +++ b/pology/proj/kde/header.py @@ -1,96 +1,96 @@ # -*- coding: UTF-8 -*- """ Additional header operations for KDE Translation Project.
@author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import os import re from pology import _, n_ from pology.report import warning from pology.proj.kde.cattype import get_project_subdir from pology.proj.kde.cattype import is_txt_cat, is_qt_cat, is_docbook_cat from pology.proj.kde.cattype import is_html_cat, is_unknown_cat def equip_header (hdr, cat): """ Add extra information to header [type F4B hook]. The following header fields are set: - C{Language}: the language code of translation; set only if the language can be determined - C{X-Environment}: linguistic subset of the language of translation (team choices on terminology, ortography...); set to C{kde} if not existing, otherwise left untouched. - C{X-Accelerator-Marker}: accelerator marker character which may be encountered in text - C{X-Text-Markup}: text markups (e.g. Qt rich text, Docbook...) which may be encountered in text, as keywords For the hook to function properly, the local checkout of language catalogs must match the repository structure up to a certain level. See the documentation on C{check-tp-kde} sieve for details. TODO: Put that instruction here. """ cname = cat.name csubdir = get_project_subdir(cat.filename) if not csubdir: warning(_("@info TP stands for Translation Project", "Cannot determine KDE TP subdirectory " "of '%(file)s', skipping header updates.", file=cat.filename)) return 1 pathels = os.path.abspath(cat.filename).split(os.path.sep) lang_rx = re.compile(r"^[a-z]{2}(_[A-Z]{2}|@[a-z]+)?$") lang = None if len(pathels) >= 5 and pathels[-4] == "summit": if lang_rx.search(pathels[-5]): lang = pathels[-5] elif len(pathels) >= 4: if lang_rx.search(pathels[-4]): lang = pathels[-4] if is_txt_cat(cname, csubdir): accmark = "" mtypes = [""] elif is_qt_cat(cname, csubdir): accmark = "&" mtypes = ["qtrich"] elif is_docbook_cat(cname, csubdir): accmark = "" mtypes = ["docbook4"] elif is_html_cat(cname, csubdir): accmark = "" mtypes = ["html"] elif is_unknown_cat(cname, csubdir): accmark = None mtypes = None else: # default to native KDE4 catalog accmark = "&" mtypes = ["kde4"] fvs = [] fvs.append(("Language", lang, "Language-Team", False)) - fvs.append(("X-Environment", u"kde", None, True)) + fvs.append(("X-Environment", "kde", None, True)) if accmark is not None: fvs.append(("X-Accelerator-Marker", accmark, None, False)) if mtypes is not None: fvs.append(("X-Text-Markup", ", ".join(mtypes), None, False)) for fnam, fval, fnamaft, fkeep in fvs: if fval is None: continue existing = hdr.select_fields(fnam) if not (existing and fkeep): if len(existing) > 1: hdr.remove_field(fnam) - hdr.set_field(unicode(fnam), unicode(fval), after=fnamaft) + hdr.set_field(str(fnam), str(fval), after=fnamaft) return 0 diff --git a/pology/remove.py b/pology/remove.py index c8cfba56..3dce8770 100644 --- a/pology/remove.py +++ b/pology/remove.py @@ -1,784 +1,784 @@ # -*- coding: utf-8 -*- """ Remove special substrings from text. 
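Most helpers in this module come in two shapes: text hooks C{(text, msg, cat) -> text} and message hooks C{(msg, cat) -> numerr}. A minimal sketch of composing two of the text hooks defined below into a single filter::

    def plain_text (text, msg, cat):
        # Strip the accelerator marker first, then any markup.
        text = remove_accel_text(text, msg, cat)
        text = remove_markup_text(text, msg, cat)
        return text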
@author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import re from pology import _, n_ from pology.comments import manc_parse_field_values, manc_parse_list import pology.markup as M from pology.msgreport import warning_on_msg from pology.resolve import remove_accelerator as _rm_accel_in_text from pology.resolve import remove_fmtdirs as _rm_fmtd_in_text_single from pology.resolve import remove_literals as _rm_lit_in_text_single from pology.resolve import resolve_entities_simple def _rm_accel_in_msg (msg, accels, greedy=False): msg.msgid = _rm_accel_in_text(msg.msgid, accels, greedy) if msg.msgid_plural: msg.msgid_plural = _rm_accel_in_text(msg.msgid_plural, accels, greedy) for i in range(len(msg.msgstr)): msg.msgstr[i] = _rm_accel_in_text(msg.msgstr[i], accels, greedy) if msg.msgid_previous: msg.msgid_previous = _rm_accel_in_text(msg.msgid_previous, accels, greedy) if msg.msgid_plural_previous: msg.msgid_plural_previous = _rm_accel_in_text(msg.msgid_plural_previous, accels, greedy) return 0 def _get_accel_marker (msg, cat): accels = manc_parse_field_values(msg, "accelerator-marker") if not accels: accels = cat.accelerator() return accels def remove_accel_text (text, msg, cat): """ Remove accelerator marker from one of the text fields of the message [type F3A hook]. Accelerator marker is determined from the catalog, by calling its L{accelerator()} method. Use L{set_accelerator()} to set possible accelerator markers after the catalog has been opened, in case it does not specify any on its own. If catalog reports C{None} for accelerators, text is not touched. Accelerator marker can also be specified for a particular message, by embedded C{accelerator-marker} field in manual comments:: # accelerator-marker: _ This overrides accelerator marker reported by the catalog. @return: text @see: L{pology.resolve.remove_accelerator} """ accels = _get_accel_marker(msg, cat) return _rm_accel_in_text(text, accels) def remove_accel_text_greedy (text, msg, cat): """ Like L{remove_accel_text}, except that if catalog reports C{None} for accelerators, some frequent marker characters are removed [type F3A hook]. @return: text @see: L{pology.resolve.remove_accelerator} """ accels = _get_accel_marker(msg, cat) return _rm_accel_in_text(text, accels, greedy=True) def remove_accel_msg (msg, cat): """ Remove accelerator marker from all applicable text fields in the message, as if L{remove_accel_text} was applied to each [type F4A hook]. @return: number of errors @see: L{pology.resolve.remove_accelerator} """ accels = _get_accel_marker(msg, cat) return _rm_accel_in_msg(msg, accels) def remove_accel_msg_greedy (msg, cat): """ Like L{remove_accel_msg}, except that if catalog reports C{None} for accelerators, some frequent marker characters are removed [type F4A hook]. 
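For example, with C{&} in effect as the accelerator marker (values illustrative)::

    remove_accel_text("Foo &Bar", msg, cat)    # -> "Foo Bar"
    remove_accel_text("Foo & Bar", msg, cat)   # -> "Foo & Bar" (ignored)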
@return: number of errors @see: L{pology.resolve.remove_accelerator} """ accels = _get_accel_marker(msg, cat) return _rm_accel_in_msg(msg, accels, greedy=True) def _rm_markup_in_text (text, mtypes): if mtypes is None: return text for mtype in mtypes: mtype = mtype.lower() if 0: pass elif mtype == "html": text = M.html_to_plain(text) elif mtype == "kde4": text = M.kde4_to_plain(text) elif mtype == "qtrich": text = M.qtrich_to_plain(text) elif mtype == "kuit": text = M.kuit_to_plain(text) elif mtype == "docbook4" or mtype == "docbook": text = M.docbook4_to_plain(text) elif mtype == "xml": text = M.xml_to_plain(text) elif mtype == "xmlents": # FIXME: Only default XML entities can be handled as-is; # perhaps markup remover should also take entity mapping # as argument, and pass it here? text = resolve_entities_simple(text, M.xml_entities) return text def _rm_markup_in_msg (msg, mtypes): msg.msgid = _rm_markup_in_text(msg.msgid, mtypes) if msg.msgid_plural: msg.msgid_plural = _rm_markup_in_text(msg.msgid_plural, mtypes) for i in range(len(msg.msgstr)): msg.msgstr[i] = _rm_markup_in_text(msg.msgstr[i], mtypes) if msg.msgid_previous: msg.msgid_previous = _rm_markup_in_text(msg.msgid_previous, mtypes) if msg.msgid_plural_previous: msg.msgid_plural_previous = _rm_markup_in_text(msg.msgid_plural_previous, mtypes) return 0 def remove_markup_text (text, msg, cat): """ Remove markup from one of the text fields of the message [type F3A hook]. Expected markup types are determined from the catalog, by calling its L{markup()} method. Use L{set_markup()} to set expected markup types after the catalog has been opened, in case it does not specify any on its own. If catalog reports C{None} for markup types, text is not touched. @return: text """ mtypes = cat.markup() return _rm_markup_in_text(text, mtypes) def remove_markup_msg (msg, cat): """ Remove markup from all applicable text fields in the message, as if L{remove_markup_text} was applied to each [type F4A hook]. @return: number of errors """ mtypes = cat.markup() return _rm_markup_in_msg(msg, mtypes) def _format_flags (msg): return [x for x in msg.flag if x.endswith("-format")] def _rm_fmtd_in_text (text, formats, subs=""): for format in formats: text = _rm_fmtd_in_text_single(text, format, subs=subs) return text def _rm_fmtd_in_msg (msg, subs=""): formats = _format_flags(msg) msg.msgid = _rm_fmtd_in_text(msg.msgid, formats, subs) if msg.msgid_plural: msg.msgid_plural = _rm_fmtd_in_text(msg.msgid_plural, formats, subs) for i in range(len(msg.msgstr)): msg.msgstr[i] = _rm_fmtd_in_text(msg.msgstr[i], formats, subs) if msg.msgid_previous: msg.msgid_previous = _rm_fmtd_in_text(msg.msgid_previous, formats, subs) if msg.msgid_plural_previous: msg.msgid_plural_previous = _rm_fmtd_in_text(msg.msgid_plural_previous, formats, subs) return 0 def remove_fmtdirs_text (text, msg, cat): """ Remove format directives from one of the text fields of the message [type F3A hook]. The type of format directives is determined from message format flags. @return: text @see: L{pology.resolve.remove_fmtdirs} """ return _rm_fmtd_in_text(text, _format_flags(msg)) def remove_fmtdirs_text_tick (tick): """ Like L{remove_fmtdirs_text}, except that each format directive is replaced by a non-whitespace "tick" instead of plainly removed [hook factory]. 
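For example, for a message flagged as C{c-format} (values illustrative)::

    hook = remove_fmtdirs_text_tick("~")
    hook("Open %s now", msg, cat)   # -> "Open ~ now"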
@param tick: the tick sequence @type tick: string @return: type F3A hook @rtype: C{(cat, msg, text) -> text} """ def hook (text, msg, cat): return _rm_fmtd_in_text(text, _format_flags(msg), tick) return hook def remove_fmtdirs_msg (msg, cat): """ Remove format directives from all applicable text fields in the message, as if L{remove_fmtdirs_text} was applied to each [type F4A hook]. @return: number of errors """ return _rm_fmtd_in_msg(msg) def remove_fmtdirs_msg_tick (tick): """ Remove format directives from all applicable text fields in the message, as if L{remove_fmtdirs_text_tick} was applied to each [hook factory]. @param tick: the tick sequence @type tick: string @return: type F4A hook @rtype: C{(cat, msg, text) -> numerr} """ def hook (msg, cat): return _rm_fmtd_in_msg(msg, tick) return hook def _literals_spec (msg, cat): fname = "literal-segment" rx_strs = manc_parse_field_values(msg, fname) # Compile regexes. # Empty regex indicates not to do any heuristic removal. rxs = [] heuristic = True for rx_str in rx_strs: if rx_str: try: rxs.append(re.compile(rx_str, re.U|re.S)) except: warning_on_msg(_("@info", "Field %(field)s states " "malformed regex '%(re)s'.", field=fname, re=rx_str), msg, cat) else: heuristic = False return [], rxs, heuristic def _rm_lit_in_text (text, substrs, regexes, heuristic, subs=""): return _rm_lit_in_text_single(text, subs=subs, substrs=substrs, regexes=regexes, heuristic=heuristic) def _rm_lit_in_msg (msg, cat, strs, rxs, heu, subs=""): msg.msgid = _rm_lit_in_text(msg.msgid, strs, rxs, heu, subs) if msg.msgid_plural: msg.msgid_plural = _rm_lit_in_text(msg.msgid_plural, strs, rxs, heu, subs) for i in range(len(msg.msgstr)): msg.msgstr[i] = _rm_lit_in_text(msg.msgstr[i], strs, rxs, heu, subs) if msg.msgid_previous: msg.msgid_previous = _rm_lit_in_text(msg.msgid_previous, strs, rxs, heu, subs) if msg.msgid_plural_previous: msg.msgid_plural_previous = _rm_lit_in_text(msg.msgid_plural_previous, strs, rxs, heu, subs) return 0 def remove_literals_text (text, msg, cat): """ Remove literal segments from one of the text fields of the message [type F3A hook]. Literal segments are URLs, email addresses, command line options, etc. anything symbolic that the machine, rather than human alone, should parse. Note format directives are excluded here, see L{remove_fmtdirs_text} for removing them. By default, literals are removed heuristically, but this can be influenced by embedded C{literal-segment} fields in manual comments. For example:: # literal-segment: foobar states that all C{foobar} segments are literals. The field value is actually a regular expression, and there may be several such fields:: # literal-segment: \w+bar # literal-segment: foo[&=] ### a sub comment To prevent any heuristic removal of literals, add a C{literal-segment} field with empty value. @return: text @see: L{pology.resolve.remove_literals} """ strs, rxs, heu = _literals_spec(msg, cat) return _rm_lit_in_text(text, strs, rxs, heu) def remove_literals_text_tick (tick): """ Like L{remove_literals_text}, except that each literal segment is replaced by a non-whitespace "tick" instead of plainly removed [hook factory]. 
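For example, a URL caught by the heuristic is replaced rather than dropped (values illustrative, assuming no C{literal-segment} fields on the message)::

    hook = remove_literals_text_tick("~")
    hook("See http://kde.org for details", msg, cat)   # -> "See ~ for details"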
@param tick: the tick sequence @type tick: string @return: type F3A hook @rtype: C{(cat, msg, text) -> text} """ def hook (text, msg, cat): strs, rxs, heu = _literals_spec(msg, cat) return _rm_lit_in_text(text, strs, rxs, heu, tick) return hook def remove_literals_msg (msg, cat): """ Remove literal segments from all applicable text fields in the message, as if L{remove_literals_text} was applied to each [type F4A hook]. @return: number of errors """ strs, rxs, heu = _literals_spec(msg, cat) return _rm_lit_in_msg(msg, cat, strs, rxs, heu) def remove_literals_msg_tick (tick): """ Remove literal segments from all applicable text fields in the message, as if L{remove_literals_text_tick} was applied to each [hook factory]. @param tick: the tick sequence @type tick: string @return: type F4A hook @rtype: C{(cat, msg, text) -> numerr} """ def hook (msg, cat): strs, rxs, heu = _literals_spec(msg, cat) return _rm_lit_in_msg(msg, cat, strs, rxs, heu, tick) return hook def remove_marlits_text (text, msg, cat): """ Remove literals by markup from one of the text fields of the message [type F3A hook]. An "intersection" of L{remove_markup_text} and L{remove_literals_text}, where literals segments are determined by markup, and both the segment text and its markup is removed. See documentation of these hooks for notes on what is considered literal and how markup type is determined. @return: text """ strs, rxs, heu = [], _marlit_rxs(msg, cat), False return _rm_lit_in_text(text, strs, rxs, heu) def remove_marlits_msg (msg, cat): """ Remove literal segments by markup from all applicable text fields in the message, as if L{remove_marlits_text} was applied to each [type F4A hook]. @return: number of errors """ strs, rxs, heu = [], _marlit_rxs(msg, cat), False return _rm_lit_in_msg(msg, cat, strs, rxs, heu) class _Cache: pass _marlit_cache = _Cache() _marlit_cache.mtypes = None _marlit_cache.tags = set() _marlit_cache.rxs = [] _marlit_cache.acmnt_tag_rx = re.compile(r"^\s*tag:\s*(\w+)\s*$", re.I) _marlit_cache.rxs_all = [re.compile(r".*", re.S)] def _marlit_rxs (msg, cat): # Update regex cache due to markup type. mtypes = cat.markup() if _marlit_cache.mtypes != mtypes: _marlit_cache.mtypes = mtypes _marlit_cache.tags = set() _marlit_cache.rxs = [] for mtype in mtypes or []: _marlit_cache.tags.update(_marlit_tags(mtype)) rx = _build_tagged_rx(_marlit_cache.tags) _marlit_cache.rxs.append(rx) # Check if the whole message is under a literal tag. for acmnt in msg.auto_comment: m = _marlit_cache.acmnt_tag_rx.search(acmnt) if m: tag = m.group(1).strip().lower() if tag in _marlit_cache.tags: return _marlit_cache.rxs_all return _marlit_cache.rxs def _marlit_tags (mtype): tags = "" if 0: pass elif mtype == "html": tags = """ tt code """ elif mtype == "kde4": tags = """ icode bcode filename envar command numid tt code """ elif mtype == "qtrich": tags = """ tt code """ elif mtype == "kuit": tags = """ icode bcode filename envar command numid """ elif mtype == "docbook4" or mtype == "docbook": tags = """ literal filename envar command option function markup varname screen programlisting userinput computeroutput """ elif mtype == "xml": pass return set(tags.split()) def _build_tagged_rx (tags): - if isinstance(tags, basestring): + if isinstance(tags, str): tags = tags.split() # For proper regex matching, tags that begin with a substring # equal to another full tag must come before that full tag. # So sort tags first by length, then by alphabet. 
tags = sorted(tags, key=lambda x: (-len(x), x)) basetagged_rxsub = r"<\s*(%s)\b[^<]*>.*?<\s*/\s*\1\s*>" tagged_rx = re.compile(basetagged_rxsub % "|".join(tags), re.I|re.S) return tagged_rx def remove_ignored_entities_msg (msg, cat): """ Remove locally ignored entities from all applicable text fields in the message [type F4A hook]. Entities are ignored by listing them in the embedded C{ignore-entities} fields in manual comments. For example:: # ignore-entity: foobar, froobaz will remove entities C{&foobar;} and C{&froobaz;} from all text fields. @return: number of errors """ locally_ignored = manc_parse_list(msg, "ignore-entity:", ",") if not locally_ignored: return 0 msg.msgid = _rm_ent_in_text(msg.msgid, locally_ignored) if msg.msgid_plural: msg.msgid_plural = _rm_ent_in_text(msg.msgid_plural, locally_ignored) for i in range(len(msg.msgstr)): msg.msgstr[i] = _rm_ent_in_text(msg.msgstr[i], locally_ignored) return 0 def _rm_ent_in_text (text, entities): for entity in entities: text = text.replace("&%s;" % entity, "") return text def rewrite_msgid (msg, cat): """ Rewrite parts of C{msgid} based on translator comments [type F4A hook]. Translator comments may issue C{rewrite-msgid} directives to modify parts of C{msgid} (as well as C{msgid_plural}) fields by applying a search regular expression and replace pattern. The search and replace pattern are wrapped and separated by any character consistently used, such as slashes. Examples:: # rewrite-msgid: /foo/bar/ # rewrite-msgid: /foo (\\w+) fam/bar \\1 bam/ # rewrite-msgid: :foo/bar:foo/bam: If a search pattern is not valid, a warning on message is issued. Search pattern is case-sensitive. @return: number of errors """ nerrors = 0 # Collect and compile regular expressions. fname = "rewrite-msgid" rwspecs = manc_parse_field_values(msg, fname) rwrxs = [] for rwspec in rwspecs: sep = rwspec[0:1] if not sep: warning_on_msg(_("@info", "No patterns in rewrite directive."), msg, cat) nerrors += 1 continue lst = rwspec.split(sep) if len(lst) != 4 or lst[0] or lst[3]: warning_on_msg(_("@info", "Wrongly separated patterns in " "rewrite directive '%(dir)s'.", dir=rwspec), msg, cat) nerrors += 1 continue srch, repl = lst[1], lst[2] try: rx = re.compile(srch, re.U) except: warning_on_msg(_("@info", "Invalid search pattern in " "rewrite directive '%(dir)s'.", dir=rwspec), msg, cat) nerrors += 1 continue rwrxs.append((rx, repl, rwspec)) for rx, repl, rwspec in rwrxs: try: msg.msgid = rx.sub(repl, msg.msgid) if msg.msgid_plural is not None: msg.msgid_plural = rx.sub(repl, msg.msgid_plural) except: warning_on_msg(_("@info", "Error in application of " "rewrite directive '%(dir)s'.", dir=rwspec), msg, cat) nerrors += 1 return nerrors def rewrite_inverse (msg, cat): """ Rewrite message by replacing all its elements with that of another message which has the same C{msgstr[0]} [type F4A hook]. Translator comments may issue C{rewrite-inverse} directives to replace all message parts with those from another message having the same C{msgstr[0]} field. The argument to the directive is a regular expression search pattern on C{msgid} and C{msgctxt} (leading and trailing whitespace get stripped) which is used to select the particular message if more than one other messages have same C{msgstr[0]}. Examples:: # rewrite-inverse: # rewrite-inverse: Foo If the pattern does not match or it matches more than one other message, current message is not touched; also if the pattern is left empty and there is more than one other message. 
Search pattern is applied to C{msgctxt} and C{msgid} in turn, and the message is matched if any matches. Search pattern is case-sensitive. If more than one C{rewrite-inverse} directive is seen, or the search pattern is not valid, a warning on message is issued and current message is not touched. This hook is then executed again on the resulting message, in case the new translator comments contain another C{rewrite-inverse} directive. @return: number of errors """ # Collect and compile regular expressions. fname = "rewrite-inverse" rwspecs = manc_parse_field_values(msg, fname) if not rwspecs: return 0 if len(rwspecs) > 1: warning_on_msg(_("@info", "More than one inverse rewrite directive " "encountered."), msg, cat) return 1 srch = rwspecs[0] try: rx = re.compile(srch, re.U) except: warning_on_msg(_("@info", "Invalid search pattern '%(pattern)s' in " "inverse rewrite directive.", pattern=srch), msg, cat) return 1 msgs = cat.select_by_msgstr(msg.msgstr[0], lazy=True) msgs = [x for x in msgs if x.key != msg.key] # remove current if not msgs: warning_on_msg(_("@info", "There are no other messages with same translation, " "needed by inverse rewrite directive."), msg, cat) return 1 match = lambda x: ( (x.msgctxt is not None and rx.search(x.msgctxt)) or rx.search(x.msgid)) sel_msgs = [x for x in msgs if match(x)] # remove non-matched if not sel_msgs: warning_on_msg(_("@info", "Inverse rewrite directive matches none of " "the other messages with same translation."), msg, cat) return 1 if len(sel_msgs) > 1: warning_on_msg(_("@info", "Inverse rewrite directive matches more than " "one other message with same translation."), msg, cat) return 1 # Copy all parts of the other message. omsg = sel_msgs[0] msg.msgid = omsg.msgid if msg.msgid_plural is not None and omsg.msgid_plural is not None: msg.msgid_plural = omsg.msgid_plural # Copy comments and recurse. msg.set(omsg) nerrors = rewrite_inverse(msg, cat) return nerrors _ent_rx = re.compile(r"&[\w.:-]+;", re.U) def remove_paired_ents (msg, cat): """ Remove all XML-like entities from original, and from translation all that are also found in original [type F4A hook]. To remove all entities from original, and all entitities from translation that also exist in original, may be useful prior to markup checks, when list of known entities is not available. @return: number of errors """ return _rm_paired_ents(msg, cat) def remove_paired_ents_tick (tick): """ Like L{remove_paired_ents}, except that each XML-like entity is replaced by a non-whitespace "tick" instead of plainly removed [hook factory]. @param tick: the tick sequence @type tick: string @return: type F3A hook @rtype: C{(cat, msg, text) -> text} """ def hook (msg, cat): return _rm_paired_ents(msg, cat, tick) return hook def _rm_paired_ents (msg, cat, tick=''): ents_orig = set() ents_orig.update(_ent_rx.findall(msg.msgid)) for ent in ents_orig: msg.msgid = msg.msgid.replace(ent, tick) if msg.msgid_plural: ents_orig.update(_ent_rx.findall(msg.msgid_plural)) for ent in ents_orig: msg.msgid_plural = msg.msgid_plural.replace(ent, tick) for i in range(len(msg.msgstr)): ents_trans = set(_ent_rx.findall(msg.msgstr[i])) for ent in ents_trans.intersection(ents_orig): msg.msgstr[i] = msg.msgstr[i].replace(ent, tick) return 0 diff --git a/pology/report.py b/pology/report.py index 30350f73..108aeed6 100644 --- a/pology/report.py +++ b/pology/report.py @@ -1,354 +1,354 @@ # -*- coding: UTF-8 -*- """ Report info, warning and error messages. Functions for Pology tools to issue reports to the user at runtime. 
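A minimal usage sketch of the functions defined below::

    from pology.report import report, warning

    report("Processing catalogs...")
    warning("Unexpected field value", subsrc="alpha.po")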
May colorize some output. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import os import sys import locale import time from pology import _, n_, t_, TextTrans from pology.colors import ColorString _prev_text_cr = [None, None] def encwrite (file, text): """ Write unicode text to file using best encoding guess. If the file has been opened with explicit encoding, that encoding is used. Otherwise a guess is made based on the environment locale. @param file: file to write to @type file: C{file} @param text: text to write @type text: string or unicode """ enc = getattr(file, "encoding", None) or locale.getpreferredencoding() text = text.encode(enc, "replace") # If last output was returning to line start with CR, clean up the line. if _prev_text_cr[0] is not None and not _prev_text_cr[1].closed: cstr = "\r%s\r" % (" " * len(_prev_text_cr[0])) _prev_text_cr[0] = None _prev_text_cr[1].write(cstr) _prev_text_cr[1] = None # If current output is returning to line start with CR, record it. if text.endswith("\r"): cstr = text if "\n" in cstr: cstr = cstr[cstr.rfind("\n") + 1:] _prev_text_cr[0] = cstr _prev_text_cr[1] = file file.write(text) def report (text, showcmd=False, subsrc=None, file=sys.stdout, newline=True): """ Generic report. Text is output to the file descriptor, with one newline appended by default. @param text: text to report @type text: string @param showcmd: whether to show the command name @type showcmd: bool @param subsrc: more detailed source of the text @type subsrc: C{None} or string @param file: send output to this file descriptor @type file: C{file} @param newline: whether to append newline to output @type newline: bool """ if not isinstance(text, ColorString): text = ColorString("%s") % text text = text.resolve(dest=file) cmdname = None if showcmd: cmdname = os.path.basename(sys.argv[0]) lines = text.split("\n") for i in range(len(lines)): if i == 0: if cmdname and subsrc: head = "%s (%s): " % (cmdname, subsrc) elif cmdname: head = "%s: " % cmdname elif subsrc: head = "(%s): " % subsrc else: head = "" lhead = len(head) else: if lhead: head = "... " else: head = "" lines[i] = head + lines[i] if newline: lines.append("") text = "\n".join(lines) encwrite(file, text) def warning (text, showcmd=True, subsrc=None, file=sys.stderr): """ Generic warning. @param text: text to report @type text: string @param showcmd: whether to show the command name @type showcmd: bool @param subsrc: more detailed source of the text @type subsrc: C{None} or string @param file: send output to this file descriptor @type file: C{file} """ rtext = _("@info", "[warning] %(msg)s", msg=text) report(rtext, showcmd=showcmd, subsrc=subsrc, file=file) def error (text, code=1, showcmd=True, subsrc=None, file=sys.stderr): """ Generic error (aborts the execution). Exits with the given code. @param text: text to report @type text: string @param code: the exit code @type code: int @param showcmd: whether to show the command name @type showcmd: bool @param file: send output to this file descriptor @type file: C{file} """ rtext = _("@info", "[error] %(msg)s", msg=text) report(rtext, showcmd=showcmd, subsrc=subsrc, file=file) sys.exit(code) def init_file_progress (fpaths, timeint=1.0, stream=sys.stderr, addfmt=None): """ Create a function to output progress bar while processing files. When a collection of files is about to be processed, this function can be used to construct a progress update function, which shows and updates the progress bar in the terminal. 
The progress update function can be called as frequently as desired during processing of a particular file, with file path as argument. For example:: update_progress = init_file_progress(file_paths) for file_path in file_paths: for line in open(file_path).readlines(): update_progress(file_path) # ... # Processing. # ... update_progress() # clears last progress line Parameter C{timeint} determines the frequency of update, in seconds. It should be chosen such that the progress updates themselves (formatting, writing out to shell) are only a small fraction of total processing time. The output stream for the progress bar can be specified by the C{stream} parameter. Additional formatting for the progress bar may be supplied by the C{addfmt} parameter. It can be one of: a function taking one string parameter (the basic progress bar) and returning a string, a delayed translation (L{TextTrans}) with single named formatting directive C{%(file)s}, or a plain string with same formatting directive. @param fpaths: collection of file paths @type fpaths: list of strings @param timeint: update interval in seconds @type timeint: float @param stream: the stream to output progress to @type stream: file @param addfmt: additional format for the progress line @type addfmt: (text) -> text or L{TextTrans} or string @returns: progress updating function @rtype: (file_path) -> None """ if not fpaths or not stream.isatty(): return lambda x=None: x try: import curses curses.setupterm() except: return lambda x=None: x def postfmt (pstr): if callable(addfmt): pstr = addfmt(pstr) elif isinstance(addfmt, TextTrans): pstr = addfmt.with_args(file=pstr).to_string() elif addfmt: pstr = addfmt % dict(file=pstr) if isinstance(pstr, ColorString): pstr = pstr.resolve(dest=stream) return pstr pfmt = ("%%1s %%%dd/%d %%s" % (len(str(len(fpaths))), len(fpaths))) - pspins = [u"–", u"\\", u"|", u"/"] + pspins = ["–", "\\", "|", "/"] i_spin = [0] i_file = [0] seen_fpaths = set() otime = [-timeint] enc = getattr(stream, "encoding", None) or locale.getpreferredencoding() minenclen = len(postfmt(pfmt % (pspins[0], 0, "")).encode(enc, "replace")) def update_progress (fpath=None): ntime = time.time() if ntime - otime[0] >= timeint: otime[0] = ntime elif fpath in seen_fpaths: return if fpath: i_spin[0] = (i_spin[0] + 1) % len(pspins) if fpath not in seen_fpaths: seen_fpaths.add(fpath) i_file[0] += 1 # Squeeze file path to fit into the terminal width. curses.setupterm() acolfp = curses.tigetnum("cols") - minenclen - 2 # 2 for \r\r rfpath = fpath infix = "..." lenred = 1 while len(rfpath.encode(enc, "replace")) > acolfp: hlfp = (len(fpath) - len(infix)) // 2 - lenred lenred += 1 rfpath = fpath[:hlfp] + infix + fpath[-hlfp:] pstr = postfmt(pfmt % (pspins[i_spin[0]], i_file[0], rfpath)) encwrite(stream, "\r%s\r" % pstr) else: encwrite(stream, "") stream.flush() return update_progress def list_options (optparser, short=False, both=False): """ Simple list of all option names found in the option parser. The list is composed of option names delimited by newlines. If an option has both a short and a long name, the behavior is determined by the parameters C{short} and C{both}. If neither is C{True}, only the long name is added to the list. If only C{short} is C{True}, only the short name is added to the list. If C{both} is C{True} both names are added to the list, in the order determined by C{short} -- if C{True}, short name is listed first.
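For example, with a stock C{optparse} parser (a sketch; the option is invented)::

    from optparse import OptionParser

    op = OptionParser()
    op.add_option("-q", "--quiet", action="store_true")
    print(list_options(op))               # long names, one per line
    print(list_options(op, short=True))   # short names where available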
The list is sorted by long option names where available, with short name listed before or after the long name, depending on C{short} (C{True} for before). @param optparser: option parser @type optparser: OptionParser @param short: whether to prefer short names @type short: bool @param both: whether to show both long and short name of an option @type both: bool @returns: formatted list of option names @rtype: string """ optnames = [] for opt in optparser.option_list: if str(opt) != opt.get_opt_string(): sname, lname = str(opt).split("/") if both: onames = [sname, lname] if short else [lname, sname] else: onames = [sname] if short else [lname] else: onames = [opt.get_opt_string()] optnames.append(onames) elind = -1 if short else 0 optnames.sort(key=lambda x: x[elind].lstrip("-")) fmtlist = "\n".join(sum(optnames, [])) return fmtlist def format_item_list (items, incmp=False, quoted=False): """ Format inline item list, for insertion into text. @param items: items to list @type items: sequence of elements convertible to string by str() @param incmp: whether some items are omitted from the list @type incmp: bool @param quoted: whether each item should be quoted @type quoted: bool @returns: inline formatted list of items @rtype: string """ sep = _("@item:intext general separator for inline lists of items, " "e.g. \", \" in \"apples, bananas, cherries, and plums\"", ", ") sep_last = _("@item:intext last separator for inline lists of items, " "e.g. \", and \" in \"apples, bananas, cherries, and plums\"", ", and ") sep_two = _("@item:intext separator for inline list of exactly two items, " "e.g. \" and \" in \"apples and bananas\"", " and ") ellipsis = _("@item:intext trailing string for incomplete lists, " "e.g. \"...\" in \"apples, bananas, cherries...\"", "...") quoting = t_("@item:intext quotes around each element in the list", "'%(el)s'") - itemstrs = map(unicode, items) + itemstrs = list(map(str, items)) if quoted: itemstrs = [quoting.with_args(el=x).to_string() for x in items] if not incmp: if len(itemstrs) == 0: - return u"" + return "" elif len(itemstrs) == 1: return itemstrs[0] elif len(itemstrs) == 2: return sep_two.join(itemstrs) else: return sep.join(itemstrs[:-1]) + sep_last + itemstrs[-1] else: return sep.join(itemstrs) + ellipsis diff --git a/pology/resolve.py b/pology/resolve.py index 78f0ceec..92d4ae8f 100644 --- a/pology/resolve.py +++ b/pology/resolve.py @@ -1,883 +1,883 @@ # -*- coding: UTF-8 -*- """ Replace value-defining segments in text with their values. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import difflib import os import re from pology import PologyError, _, n_ from pology.report import warning, format_item_list # Default starting string of alternatives directives. DEFAULT_ALTHEAD = "~@" _entity_ref_rx = re.compile(r"&([\w:][\w\d.:-]*);", re.U) def resolve_entities (text, entities, ignored=set(), srcname=None, vfilter=None, undefrepl=None): """ Replace XML entities in the text with their values. Entity values are defined by the supplied dictionary of name-value pairs. Not all entities need to be replaced; some can be explicitly ignored. If an entity is neither defined nor ignored, a warning will be reported to standard output if C{srcname} is given. An undefined entity is by default left untouched in the resulting text. Instead, the parameter C{undefrepl} can be used to supply a string to substitute for every undefined entity, or a function which takes the undefined entity name and returns the string to substitute.
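For example, with a toy entity map (C{&beta;} is intentionally left undefined)::

    text, resolved, unknown = resolve_entities(
        "A &alpha; and &beta;", {"alpha": "letter"}, srcname="demo")
    # text == "A letter and &beta;"
    # resolved == ["alpha"], unknown == ["beta"]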
@param text: the text to transform @type text: string @param entities: entity name-value pairs @type entities: has .get() with dict.get() semantics @param ignored: entities to ignore; a sequence of entity names, or function taking the entity name and returning C{True} if ignored @type ignored: a sequence or (string)->bool @param srcname: if not None, report unknown entities to standard output, with this parameter as source identifier @type srcname: None or string @param vfilter: format string (with single C{%s} directive) or function to apply to every resolved entity value @type vfilter: string or (string)->string @param undefrepl: string or function to use in case of undefined entity @type undefrepl: string or (string)->string @returns: the resulting text, resolved entity names, and unknown entity names @rtype: (string, [string...], [string...]) """ ignoredf = ignored if callable(ignored) else lambda x: x in ignored unknown = [] resolved = [] segs = [] p = 0 while True: pp = p p = text.find("&", p) if p < 0: segs.append(text[pp:]) break segs.append(text[pp:p]) m = _entity_ref_rx.match(text, p) if m: entref = m.group(0) entname = m.group(1) if not ignoredf(entname): entval = entities.get(entname) entvalr = entval if entval is not None: resolved.append(entname) else: unknown.append(entname) if undefrepl is not None: - if isinstance(undefrepl, basestring): + if isinstance(undefrepl, str): entvalr = undefrepl else: entvalr = undefrepl(entname) if entvalr is not None: if vfilter is not None: - if isinstance(vfilter, basestring): + if isinstance(vfilter, str): entvalr = vfilter % entvalr else: entvalr = vfilter(entvalr) # Recurse in case entity resolves into new entities. res = resolve_entities(entvalr, entities, ignoredf, srcname, vfilter, undefrepl) entvalr, resolved_extra, unknown_extra = res resolved.extend(resolved_extra) unknown.extend(unknown_extra) segs.append(entvalr) else: segs.append(entref) if entval is None and srcname is not None: # Try to suggest some near matches. #nears = difflib.get_close_matches(entname, entities) # FIXME: Too slow for a lot of entities. nears = [] if nears: warning(_("@info", "%(file)s: Unknown entity '%(ent)s' " "(near matches: %(entlist)s).", file=srcname, ent=entname, entlist=format_item_list(nears))) else: warning(_("@info", "%(file)s: Unknown entity '%(ent)s'.", file=srcname, ent=entname)) else: segs.append(entref) p += len(entref) else: segs.append("&") p += 1 new_text = type(text)("").join(segs) return new_text, resolved, unknown def resolve_entities_simple (text, entities, ignored=set(), srcname=None, vfilter=None): """ As L{resolve_entities}, but returns only the resolved text. @returns: the resulting text @rtype: string @see: L{resolve_entities} """ return resolve_entities(text, entities, ignored, srcname=srcname, vfilter=vfilter)[0] def resolve_alternatives (text, select, total, althead=DEFAULT_ALTHEAD, altfilter=None, outfilter=None, condf=None, srcname=None): """ Replace alternatives directives in the text with the selected alternative. Alternatives directives are of the form C{~@/.../.../...}, for example:: I see a ~@/pink/white/ elephant. where C{~@} is the directive head, followed by a character that defines the delimiter of alternatives (like in C{sed} command). The number of alternatives per directive is not defined by the directive itself, but is provided as an external parameter. An alternatives directive is resolved into one of the alternative substrings by the given index of the alternative (one-based).
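For instance, selecting the second of two alternatives::

    text, nres, ok = resolve_alternatives(
        "I see a ~@/pink/white/ elephant.", 2, 2)
    # text == "I see a white elephant.", nres == 1, ok == True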
Before substituting the directive, the selected alternative can be filtered through the function given by the C{altfilter} parameter. Text outside of directives can be filtered as well, piece by piece, through the function given by the C{outfilter} parameter. If an alternatives directive is malformed (e.g. too few alternatives), it may be reported to standard output. Unless all encountered directives were well-formed, the original text is returned instead of the partially resolved one. @param text: the text to transform @type text: string @param select: index of the alternative to select (one-based) @type select: int > 0 @param total: number of alternatives per directive @type total: int > 0 @param althead: directive head to use instead of the default one @type althead: string @param altfilter: filter to apply to chosen alternatives @type altfilter: (string) -> string @param outfilter: filter to apply to text outside of directives @type outfilter: (string) -> string @param condf: resolve current alternative directive only when this function returns C{True} on call with each alternative as argument @type condf: None or C{(x_1, ..., x_n) -> True/False} @param srcname: if not None, report malformed directives to standard output, with this string as source identifier @type srcname: None or string @returns: resulting text, number of resolved alternatives, and an indicator of well-formedness (C{True} if all directives well-formed) @rtype: string, int, bool """ alt_head = althead alt_hlen = len(althead) if outfilter is None: outfilter = lambda x: x if altfilter is None: altfilter = lambda x: x original_text = text - new_text = u"" + new_text = "" nresolved = 0 malformed = False p = -1 while True: pp = p + 1 p = text.find(alt_head, pp) if p < 0: new_text += outfilter(text[pp:]) break ps = p # Append segment prior to alternatives directive to the result. new_text += outfilter(text[pp:p]) rep_text = text[p:] # text segment for error reporting # Must have at least 2 characters after the head. if len(text) < p + alt_hlen + 2: malformed = True if srcname is not None: warning(_("@info", "%(file)s: Malformed alternatives directive " "'...%(snippet)s'.", file=srcname, snippet=rep_text)) break # Read the separating character. p += alt_hlen sep = text[p] # Parse the requested number of alternatives, # choose the one with matching index for the result. alts = [] for i in range(total): pp = p + 1 p = text.find(sep, pp) # Must have exactly the given total number of alternatives. if p < 0: malformed = True if srcname is not None: warning(_("@info", "%(file)s: Too few alternatives in " "the alternatives directive '...%(snippet)s'.", file=srcname, snippet=rep_text)) break alts.append(text[pp:p]) if malformed: break # Replace the alternative if admissible, or leave directive untouched. isel = select - 1 if isel < len(alts) and (not condf or condf(*alts)): new_text += altfilter(alts[isel]) nresolved += 1 else: new_text += text[ps:p+1] if malformed: new_text = original_text nresolved = 0 return new_text, nresolved, not malformed def resolve_alternatives_simple (text, select, total, althead=DEFAULT_ALTHEAD, altfilter=None, outfilter=None, condf=None, srcname=None): """ As L{resolve_alternatives}, but return only the resolved text.
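For example::

    resolve_alternatives_simple("~@/apple/apples/", 1, 2)   # -> "apple"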
@returns: the resulting text @rtype: string @see: L{resolve_alternatives} """ res = resolve_alternatives(text, select, total, althead, altfilter, outfilter, condf, srcname) ntext, d1, valid = res if not valid: return text return ntext def first_to_case (text, upper=True, nalts=0, althead=DEFAULT_ALTHEAD): """ Change case of the first letter in the text. Text may also have alternatives directives (see L{resolve_alternatives}). In that case, if the first letter is found within an alternative, change cases for first letters in other alternatives of the same directive too. If lowercasing is requested, it is not done if both the first and the second letter are uppercase (e.g. acronyms, all-caps writting). @param text: the text to transform @type text: string @param upper: whether to transform to uppercase (lowercase otherwise) @type upper: bool @param nalts: if non-zero, the number of alternatives per directive @type nalts: int @param althead: alternatives directive head instead of the default one @type althead: string @returns: the resulting text @rtype: string @see: L{resolve_alternatives} """ alt_head = althead alt_hlen = len(althead) tlen = len(text) remalts = 0 checkcase = True intag = False ncchanged = 0 textcc = "" i0 = 0 i = 0 while i < tlen: i0 = i c = text[i] cchange = False if c == "<": # A markup tag is just starting. intag = True elif c == ">": # A markup tag is just ending. intag = False elif ( not intag and nalts and not remalts and text[i:i+alt_hlen] == alt_head): # An alternatives directive is just starting. i += 2 if i >= tlen: # malformed directive, bail out textcc = text break # Record alternatives separator, set number of remaining # alternatives, reactivate case checking. altsep = text[i] remalts = nalts checkcase = True elif not intag and remalts and c == altsep: # Alternative separator found, reduce number of remaining # alternatives and reactivate case checking. remalts -= 1 checkcase = True elif not intag and checkcase and c.isalpha(): # Case check is active and the character is a letter; # request case change. cchange = True # No more case checks until next alternatives separator. checkcase = False # Go to next character. i += 1 # Check if previous segment should be added with case change, or as is. cseg = text[i0:i] if cchange: ncchanged += 1 if upper: textcc += cseg.upper() else: # Find first next letter, for two-uppercase check. i1 = i while i1 < tlen and not text[i1].isalpha(): i1 += 1 if i1 == tlen or not cseg.isupper() or not text[i1].isupper(): textcc += cseg.lower() else: textcc += cseg else: textcc += cseg # If any letter has been upcased and there are no more alternatives # to be processed, we're done. if ncchanged > 0 and remalts == 0: textcc += text[i:] break return textcc def first_to_upper (text, nalts=0, althead=DEFAULT_ALTHEAD): """ Uppercase the first letter in the text. A shortcut for L{first_to_case} for uppercasing. @see: L{first_to_case} """ return first_to_case(text, upper=True, nalts=nalts, althead=althead) def first_to_lower (text, nalts=0, althead=DEFAULT_ALTHEAD): """ Lowercase the first letter in the text. A shortcut for L{first_to_case} for lowercasing. @see: L{first_to_case} """ return first_to_case(text, upper=False, nalts=nalts, althead=althead) def expand_vars (text, varmap, head="%"): """ Expand variables in the text. Expansion directives start with a directive head (C{head} parameter), followed by variable name consisting of alphanumeric characters and underscores, and ending by any other character. 
Variable name may also be explicitly delimited within braces. Variable values for substitution are looked up by name in the C{varmap} dictionary; if not found, C{NameError} is raised. Some examples:: expand_vars("Mary had a little %mammal.", {"mammal":"lamb"}) expand_vars("Quite a %{critic}esque play.", {"critic":"burl"}) expand_vars("Lost in single ~A.", {"A":"parenthesis"}, "~") Dictionary values are filtered as C{"%s" % value} prior to substitution. Directive head may be escaped by repeating it twice in a row. @param text: string to expand @type text: string @param varmap: mapping of variable names to values @type varmap: (name, value) dictionary @param head: opening sequence for expansion directive @type head: string """ p = 0 hlen = len(head) tlen = len(text) ntext = [] while p < tlen: pp = p p = text.find(head, pp) if p < 0: ntext.append(text[pp:]) break ntext.append(text[pp:p]) p += hlen if p < tlen and text[p:p+hlen] == head: # escaped ntext.append(head) p += hlen continue if p == tlen: raise PologyError( _("@info", "Empty variable expansion directive " "at column %(col)d in string '%(str)s'.", col=(p - hlen), str=text)) braced = False if text[p] == "{": braced = True p += 1 pp = p while p < tlen: c = text[p] if ( (not braced and not (c.isalnum() or c == "_")) or (braced and c == "}") ): break p += 1 if braced and p == tlen: raise PologyError( _("@info", "Unclosed variable expansion directive " "at column %(col)d in string '%(str)s'.", col=(pp - 1 - hlen), str=text)) varname = text[pp:p] if braced: p += 1 varvalue = varmap.get(varname) if varvalue is None: raise PologyError( _("@info", "Unknown variable '%(var)s' in variable expansion directive " "at column %(col)d in string '%(str)s'.", var=varname, col=pp, str=text)) ntext.append("%s" % varvalue) return type(text)("").join(ntext) _usual_accels = list("_&~^") def remove_accelerator (text, accels=None, greedy=False): """ Remove accelerator from the text. Accelerator markers are characters which determine which letter in the text will be used as keyboard accelerator in user interface. They are usually a single non-alphanumeric character, and inserted before the letter which should be the accelerator, e.g. C{"Foo &Bar"}, C{"Foo _Bar"}, etc. Sometimes, especially in CJK texts, accelerator letter is separated out in parenthesis, at the start or end of the text, such as C{"Foo Bar (&B)"}. This function will try to remove the accelerator in a smart way. E.g. it will ignore ampersand in C{"Foo & Bar"}, and completely remove a CJK-style accelerator. If C{accels} is C{None}, the behavior depends on the value of C{greedy}. If it is C{False}, text is removed as is. If it is C{True}, some usual accelerator markers are considered: C{_}, C{&}, C{~}, and C{^}. @param text: text to clear of the accelerator @type text: string @param accels: possible accelerator markers @type accels: sequence of strings or C{None} @param greedy: whether to try known markers if C{accels} is C{None} @type greedy: bool @returns: text without the accelerator @rtype: string """ if accels is None: if not greedy: return text else: accels = _usual_accels for accel in accels: alen = len(accel) p = 0 while True: p = text.find(accel, p) if p < 0: break if text[p + alen:p + alen + 1].isalnum(): # If the accelerator marker is &, do not remove it if it # looks like an XML entity (less damage than otherwise). if accel == "&": m = _entity_ref_rx.match(text, p) if m: p = m.span()[1] continue # Valid accelerator. 
text = text[:p] + text[p + alen:] # May have been an accelerator in style of # "()" at the start or end of text. if (text[p - 1:p] == "(" and text[p + 1:p + 2] == ")"): # Check if at start or end, ignoring non-alphanumerics. tlen = len(text) p1 = p - 2 while p1 >= 0 and not text[p1].isalnum(): p1 -= 1 p1 += 1 p2 = p + 2 while p2 < tlen and not text[p2].isalnum(): p2 += 1 p2 -= 1 if p1 == 0: text = text[:p - 1].lstrip() + text[p2 + 1:] elif p2 + 1 == tlen: text = text[:p1] + text[p + 2:].rstrip() # Do not break, remove all accelerator markers, # as it is indeterminate which one is the real one. if text[p + alen:p + 2 * alen] == accel: # Escaped accelerator marker. text = text[:p] + text[p + alen:] p += alen return text def remove_fmtdirs (text, format, subs=""): """ Remove format directives from the text. Format directives are used to substitute values in the text. An example text with directives in several formats:: "%d men on a %s man's chest." # C "%(num)d men on a %(attrib)s man's chest." # Python "%1 men on a %2 man's chest." # KDE/Qt Format is specified by a string keyword. The following formats are known at the moment: C{c}, C{qt}, c{kde}, c{python}. Format string may also have C{-format} appended to the keyword, for compatibility with Gettext format flags. @param text: text from which to remove format directives @type text: string @param format: format keyword @type format: string @param subs: text to replace format directives instead of just removing it @type subs: string @returns: text without format directives @rtype: string """ format = format.lower() if format.endswith("-format"): format = format[:format.rfind("-")] if 0: pass elif format == "c": text = _remove_fmtdirs_c(text, subs) elif format in ("kde", "qt"): # FIXME: Actually, there are some differences between the two. text = _remove_fmtdirs_qt(text, subs) elif format == "python": text = _remove_fmtdirs_python(text, subs) # must be first text = _remove_fmtdirs_c(text, subs) return text #_fmtdir_tail_c = r"[ +-]?\d*\.?\d*[a-z]" # A conversion specifier begins with the % character. After the % character come the following in this order: # [flags] Control the conversion (optional). # [width] Defines the number of characters to print (optional). # [.precision] Defines the amount of precision to print for a number type (optional). # [modifier] Overrides the size (type) of the argument (optional). # [type] The type of conversion to be applied (required). # from http://www.acm.uiuc.edu/webmonkeys/book/c_guide/2.12.html#printf _fmtdir_tail_c = r"[ +-0]?(\d+|\*)?(\.(\d+|\*))?[hlL]?[cdieEfgGosuxXpn%]" _fmtdir_tail_c_rx = re.compile(_fmtdir_tail_c) def _remove_fmtdirs_c (text, subs=""): p = 0 nsegs = [] while True: pp = p p = text.find("%", p) if p < 0: nsegs.append(text[pp:]) break nsegs.append(text[pp:p]) p += 1 if text[p:p+1] == "%": nsegs.append("%") p += 1 continue m = _fmtdir_tail_c_rx.match(text, p) if m: p = m.span()[1] if subs: nsegs.append(subs) return type(text)("").join(nsegs) _fmtdir_tail_python_rx = re.compile(r"(\(.*?\))?" 
+ _fmtdir_tail_c) def _remove_fmtdirs_python (text, subs=""): p = 0 nsegs = [] while True: pp = p p = text.find("%", p) if p < 0: nsegs.append(text[pp:]) break nsegs.append(text[pp:p]) p += 1 if text[p:p+1] == "%": nsegs.append("%") p += 1 continue m = _fmtdir_tail_python_rx.match(text, p) if m: p = m.span()[1] if subs: nsegs.append(subs) return type(text)("").join(nsegs) _fmtdir_tail_qt_rx = re.compile(r"L?\d{1,2}") def _remove_fmtdirs_qt (text, subs=""): p = 0 nsegs = [] while True: pp = p p = text.find("%", p) if p < 0: nsegs.append(text[pp:]) break nsegs.append(text[pp:p]) p += 1 m = _fmtdir_tail_qt_rx.match(text, p) if m: p = m.span()[1] if subs: nsegs.append(subs) else: nsegs.append("%") return type(text)("").join(nsegs) def remove_literals (text, subs="", substrs=[], regexes=[], heuristic=True): """ Remove literal substrings from the text. Literal substrings are URLs, email addresses, web site names, command options, etc. This function will heuristically try to remove such substrings from the text. Additional literals to remove may be specified as verbatim substrings (C{substrs} parameter) and regular expressions (C{regexes}). These are applied before the internal heuristic matchers. Heuristic removal may be entirely disabled by setting C{heuristic} to C{False}. @param text: text from which to remove literals @type text: string @param subs: text to replace literals instead of just removing them @type subs: string @param substrs: additional substrings to remove by direct string match @type substrs: sequence of strings @param regexes: additional substrings to remove by regex match @type regexes: sequence of compiled regular expressions @param heuristic: whether to apply heuristic at all @type heuristic: bool @returns: text without literals @rtype: string """ # Apply explicit literals before heuristics. for substr in substrs: text = text.replace(substr, subs) for regex in regexes: text = regex.sub(subs, text) if heuristic: text = _remove_literals_url(text, subs) text = _remove_literals_email(text, subs) text = _remove_literals_web(text, subs) # after URLs and email text = _remove_literals_cmd(text, subs) text = _remove_literals_file(text, subs) return text def _remove_by_rx (text, rx, subs=""): p = 0 nsegs = [] while True: m = rx.search(text, p) if not m: nsegs.append(text[p:]) break p1, p2 = m.span() nsegs.append(text[p:p1]) if subs: nsegs.append(subs) p = p2 return type(text)("").join(nsegs) _literal_url_rx = re.compile(r"\S+://\S*[\w\d&=]", re.U) def _remove_literals_url (text, subs=""): return _remove_by_rx(text, _literal_url_rx, subs) _literal_web_rx = re.compile(r"\w[\w-]{2,}(\.[\w-]{2,})+", re.U) def _remove_literals_web (text, subs=""): return _remove_by_rx(text, _literal_web_rx, subs) _literal_email_rx = re.compile(r"\w[\w.-]*@\w+\.[\w.-]*\w") def _remove_literals_email (text, subs=""): return _remove_by_rx(text, _literal_email_rx, subs) _literal_cmd_rx = re.compile(r"[a-z\d_-]+\(\d\)", re.I) _literal_cmdopt_rx = re.compile(r"(? 
numerr} """ - dst_inds = map(set, zip(*mapping))[1] + dst_inds = list(map(set, list(zip(*mapping))))[1] num_plurals = max(dst_inds) + 1 - if sorted(dst_inds) != range(num_plurals): + if sorted(dst_inds) != list(range(num_plurals)): raise PologyError( _("@info", "Gaps in destination indices for conversion of plural forms " "(expected (%(list1)s), got (%(list2)s)).", - list1=format_item_list(range(num_plurals)), + list1=format_item_list(list(range(num_plurals))), list2=format_item_list(sorted(dst_inds)))) - ord_src_inds = zip(*sorted(mapping, key=lambda x: x[1]))[0] + ord_src_inds = list(zip(*sorted(mapping, key=lambda x: x[1])))[0] def hook (cat): - cat.header.set_field(u"Plural-Forms", unicode(plhead), + cat.header.set_field("Plural-Forms", str(plhead), after="Content-Transfer-Encoding") for msg in cat: if msg.msgid_plural is not None: msg.msgstr[:] = [msg.msgstr[i] for i in ord_src_inds] return 0 return hook diff --git a/pology/rules.py b/pology/rules.py index b9918f8f..61697654 100644 --- a/pology/rules.py +++ b/pology/rules.py @@ -1,1549 +1,1549 @@ # -*- coding: UTF-8 -*- """ Match messages by rules of arbitrary specificity. A message-matching rule, represented by L{Rule} object, is a series of pattern matches to be applied to the message, leading to the decision of whether or not the rule as whole matches the message. Patterns can be of different kinds, act on different parts of the message, and be applied in a boolean-like combinations. See C{doc/user/lingo.docbook#sec-lgrules} for detailed discussion of rules. @author: Sébastien Renard @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ from codecs import open from locale import getlocale from os.path import dirname, basename, isdir, join, isabs from os import listdir import re import sys from time import time from pology import PologyError, datadir, _, n_ from pology.message import MessageUnsafe from pology.config import strbool from pology.getfunc import get_hook_ireq, split_ireq from pology.report import report, warning, format_item_list from pology.tabulate import tabulate from pology.timeout import timed_out TIMEOUT=8 # Time in sec after which a rule processing is timeout def printStat(rules): """Print rules match statistics @param rules: list of rule files """ statRules=[r for r in rules if r.count!=0 and r.stat is True] if statRules: statRules.sort(key=lambda x: x.time) data=[] rown=[r.displayName for r in statRules] data.append([r.count for r in statRules]) data.append([r.time/r.count*1000 for r in statRules]) totTimeMsg=sum(data[-1])/1000 data.append([r.time for r in statRules]) totTime=sum(data[-1]) data.append([r.time/totTime*100 for r in statRules]) report(_("@label", "Rule application statistics:")) coln=[_("@title:column", "calls"), _("@title:column avg = average", "avg-time [ms]"), _("@title:column tot = total", "tot-time [s]"), _("@title:column", "time-share")] dfmt=[ "%d", "%.3f", "%.1f", "%.2f%%"] report(tabulate(data, rown=rown, coln=coln, dfmt=dfmt, colorize=True)) report(_("@info statistics", "Total application time [s]: %(num).1f", num=totTime)) report(_("@info statistics", "Average application time per message [ms]: %(num).1f", num=totTimeMsg*1000)) def loadRules(lang, envs=[], envOnly=False, ruleFiles=None, stat=False, printInfo=False): """Load rules for a given language @param lang: lang as a string in two caracter (i.e. fr). 
If none or empty, try to autodetect language @param envs: also load rules applicable in these environments @param envOnly: load only rules applicable in given environments @param ruleFiles: a list of rule files to load instead of internal @param stat: stat is a boolean to indicate if rule should gather count and time execution @param printInfo: whether to output information about loading of rules @return: list of rules objects or None if rules cannot be found (with complaints on stdout) """ ruleDir="" # Rules directory rules=[] # List of rule objects langDir=join(datadir(), "lang") # Base of rule files per language # Collect rule files. if ruleFiles is not None: if printInfo: report(_("@info:progress", "Using external rules.")) else: ruleDir=join(langDir, lang, "rules") if not isdir(ruleDir): raise PologyError( _("@info", "There are no internal rules for language '%(langcode)s'.", langcode=lang)) if printInfo: report(_("@info:progress", "Using internal rules for language '%(langcode)s'.", langcode=lang)) ruleFiles=[join(ruleDir, f) for f in listdir(ruleDir) if f.endswith(".rules")] # Parse rules. seenMsgFilters = {} for ruleFile in ruleFiles: rules.extend(loadRulesFromFile(ruleFile, stat, set(envs), seenMsgFilters)) # Remove rules with specific but different to given environments, # or any rule not in given environments in environment-only mode. # FIXME: This should be moved to loadRulesFromFile. srules=[] for rule in rules: if envOnly and rule.environ not in envs: continue elif rule.environ and rule.environ not in envs: continue srules.append(rule) rules=srules # When operating in specific environments, for rules with # equal identifiers eliminate all but the one in the last environment. if envs: envsByIdent={} for rule in rules: if rule.ident: if rule.ident not in envsByIdent: envsByIdent[rule.ident]=set() envsByIdent[rule.ident].add(rule.environ) srules=[] for rule in rules: eliminate=False if rule.ident and len(envsByIdent[rule.ident])>1: iEnv=((rule.environ is None and -1) or envs.index(rule.environ)) for env in envsByIdent[rule.ident]: iEnvOther=((env is None and -1) or envs.index(env)) if iEnv= len(lines): if not fileStack: lines = None break lines, filePath, lno = fileStack.pop() if lines is None: break lno += 1 fields, lno = _parseRuleLine(lines, lno) # End of rule bloc # FIXME: Remove 'not fields' when global directives too # start with something. This will eliminate rule separation # by empty lines, and skipping comment-only lines. if lines[lno - 1].strip().startswith("#"): continue if not fields or fields[0][0] in (_rule_start,): if inRule: inRule=False if msgFilters is None: msgFilters = globalMsgFilters if ruleFilters is None: ruleFilters = globalRuleFilters # Use previously assembled filter with the same signature, # to be able to compare filter functions by "is". 
msgFilterSig = _filterFinalSig(msgFilters) msgFilterFunc = seenMsgFilters.get(msgFilterSig) if msgFilterFunc is None: msgFilterFunc = _msgFilterComposeFinal(msgFilters) seenMsgFilters[msgFilterSig] = msgFilterFunc ruleFilterSig = _filterFinalSig(ruleFilters) ruleFilterFunc = seenRuleFilters.get(ruleFilterSig) if ruleFilterFunc is None: ruleFilterFunc = _ruleFilterComposeFinal(ruleFilters) seenRuleFilters[ruleFilterSig] = ruleFilterFunc rules.append(Rule(pattern, msgpart, hint=hint, valid=valid, stat=stat, casesens=casesens, ident=ident, disabled=disabled, manual=manual, environ=(environ or globalEnviron), mfilter=msgFilterFunc, rfilter=ruleFilterFunc, trigger=triggerFunc)) - pattern=u"" + pattern="" msgpart="" - hint=u"" + hint="" ident=None disabled=False manual=False casesens=True environ=None msgFilters=None ruleFilters=None triggerFunc=None elif inGroup: inGroup=False validGroup[validGroupName]=valid - validGroupName=u"" + validGroupName="" valid=[] if not fields: continue # Beginning of rule (pattern or special) if fields[0][0]==_rule_start: inRule=True keyword=fields[0][1] if keyword in _trigger_msgparts: msgpart=keyword pattern=fields[1][0] for mmod in fields[1][1]: if mmod not in _trigger_matchmods: raise _SyntaxError( _("@info", "Unknown match modifier '%(mod)s' " "in trigger pattern.", mod=mmod)) casesens=("i" not in fields[1][1]) elif keyword in _trigger_specials: casesens, rest = _triggerParseGeneral(fields[1:]) if keyword == "hook": triggerFunc = _triggerFromHook(rest) else: raise _SyntaxError( _("@info", "Unknown keyword '%(kw)s' in rule trigger.", kw=keyword)) # valid line (for rule or validGroup) elif fields[0][0]=="valid": if not inRule and not inGroup: raise _SyntaxError( _("@info", "Directive '%(dir)s' outside of rule or " "validity group.", dir="valid")) valid.append(fields[1:]) # Rule hint elif fields[0][0]=="hint": if not inRule: raise _SyntaxError( _("@info", "Directive '%(dir)s' outside of rule.", dir="hint")) hint=fields[0][1] # Rule identifier elif fields[0][0]=="id": if not inRule: raise _SyntaxError( _("@info", "Directive '%(dir)s' outside of rule.", dir="id")) ident=fields[0][1] if ident in identLines: (prevLine, prevEnviron)=identLines[ident] if prevEnviron==globalEnviron: raise _IdentError(ident, prevLine) identLines[ident]=(lno, globalEnviron) # Whether rule is disabled elif fields[0][0]=="disabled": if not inRule: raise _SyntaxError( _("@info", "Directive '%(dir)s' outside of rule.", dir="disabled")) disabled=True # Whether rule is manually applied elif fields[0][0]=="manual": if not inRule: raise _SyntaxError( _("@info", "Directive '%(dir)s' outside of rule.", dir="manual")) manual=True # Validity group elif fields[0][0]=="validGroup": if inGroup: raise _SyntaxError( _("@info", "Directive '%(dir)s' inside validity group.", dir="validGroup")) if inRule: # Use of validGroup directive inside a rule block validGroupName=fields[1][0] valid.extend(validGroup[validGroupName]) else: # Beginning of validGroup inGroup=True validGroupName=fields[1][0] # Switch rule environment elif fields[0][0]=="environment": if inGroup: raise _SyntaxError( _("@info", "Directive '%(dir)s' inside validity group.", dir="environment")) envName=fields[1][0] if inRule: # Environment specification for current rule. environ=envName else: # Environment switch for rules that follow. globalEnviron=envName # Add or remove filters elif ( fields[0][0].startswith("addFilter") or fields[0][0] in ["removeFilter", "clearFilters"]): # Select the proper filter lists on which to act.
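# Inside a rule, the directives act on rule-local copies of the global # filter lists, so the changes stay scoped to that rule; at top level # they act on the global lists themselves (see the branches below).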
if inRule: if msgFilters is None: # local filters not created yet msgFilters = globalMsgFilters[:] # shallow copy if ruleFilters is None: ruleFilters = globalRuleFilters[:] currentMsgFilters = msgFilters currentRuleFilters = ruleFilters currentEnviron = environ or globalEnviron else: currentMsgFilters = globalMsgFilters currentRuleFilters = globalRuleFilters currentEnviron = globalEnviron if fields[0][0].startswith("addFilter"): filterType = fields[0][0][len("addFilter"):] handles, parts, fenvs, rest = _filterParseGeneral(fields[1:]) if fenvs is None and currentEnviron: fenvs = [currentEnviron] if filterType == "Regex": func, sig = _filterCreateRegex(rest) elif filterType == "Hook": func, sig = _filterCreateHook(rest) else: raise _SyntaxError( _("@info", "Unknown filter directive '%(dir)s'.", dir=fields[0][0])) msgParts = set(parts).difference(_filterKnownRuleParts) if msgParts: totFunc, totSig = _msgFilterSetOnParts(msgParts, func, sig) currentMsgFilters.append([handles, fenvs, totFunc, totSig]) ruleParts = set(parts).difference(_filterKnownMsgParts) if ruleParts and (not envs or not fenvs or envs.intersection(fenvs)): totFunc, totSig = _ruleFilterSetOnParts(ruleParts, func, sig) currentRuleFilters.append([handles, fenvs, totFunc, totSig]) elif fields[0][0] == ("removeFilter"): _filterRemove(fields[1:], (currentMsgFilters, currentRuleFilters), envs) else: # remove all filters if len(fields) != 1: raise _SyntaxError( _("@info", "Expected no fields in " "all-filter removal directive.")) # Must not lose reference to the selected lists. while currentMsgFilters: currentMsgFilters.pop() while currentRuleFilters: currentRuleFilters.pop() # Include another file elif fields[0][0] == "include": if inRule or inGroup: raise _SyntaxError( _("@info", "Directive '%(dir)s' inside a rule or group.", dir="include")) fileStack.append((lines, filePath, lno)) lines, filePath, lno = _includeFile(fields[1:], filePath) else: raise _SyntaxError( _("@info", "Unknown directive '%(dir)s'.", dir=fields[0][0])) - except _IdentError, e: + except _IdentError as e: raise PologyError( _("@info", "Identifier '%(id)s' at %(file)s:%(line)d " "previously encountered at %(pos)s.", id=e.args[0], file=filePath, line=lno, pos=e.args[1])) - except IOError, e: + except IOError as e: raise PologyError( _("@info", "Cannot read rule file '%(file)s'.
The error was: %(msg)s", file=filePath, msg=e.args[0])) - except _SyntaxError, e: + except _SyntaxError as e: raise PologyError( _("@info", "Syntax error at %(file)s:%(line)d:\n%(msg)s", file=filePath, line=lno, msg=e.args[0])) return rules def _checkFields (directive, fields, knownFields, mandatoryFields=set(), unique=True): fieldDict = dict(fields) if unique and len(fieldDict) != len(fields): raise _SyntaxError( _("@info", "Duplicate fields in '%(dir)s' directive.", dir=directive)) if not isinstance(knownFields, set): knownFields = set(knownFields) unknownFields = set(fieldDict).difference(knownFields) if unknownFields: raise _SyntaxError( _("@info", "Unknown fields in '%(dir)s' directive: %(fieldlist)s.", dir=directive, fieldlist=format_item_list(unknownFields))) for name in mandatoryFields: if name not in fieldDict: raise _SyntaxError( _("@info", "Mandatory field '%(field)s' missing in '%(dir)s' directive.", field=name, dir=directive)) def _includeFile (fields, includingFilePath): _checkFields("include", fields, ["file"], ["file"]) fieldDict = dict(fields) relativeFilePath = fieldDict["file"] if isabs(relativeFilePath): filePath = relativeFilePath else: filePath = join(dirname(includingFilePath), relativeFilePath) if filePath.endswith(".rules"): warning(_("@info", "Including one rule file into another, " "'%(file1)s' from '%(file2)s'.", file1=filePath, file2=includingFilePath)) lines=open(filePath, "r", "UTF-8").readlines() lines.append("\n") # sentry line return lines, filePath, 0 def _filterRemove (fields, filterLists, envs): _checkFields("removeFilter", fields, ["handle", "env"], ["handle"]) fieldDict = dict(fields) handleStr = fieldDict["handle"] fenvStr = fieldDict.get("env") if fenvStr is not None: fenvs = [x.strip() for x in fenvStr.split(",")] if not envs or not envs.intersection(fenvs): # We are operating in no environment, or no operating environment # is listed among the selected; skip removal. 
return handles = set([x.strip() for x in handleStr.split(",")]) seenHandles = set() for flist in filterLists: k = 0 while k < len(flist): commonHandles = flist[k][0].intersection(handles) if commonHandles: flist.pop(k) seenHandles.update(commonHandles) else: k += 1 unseenHandles = handles.difference(seenHandles) if unseenHandles: raise PologyError( _("@info", "No filters with these handles to remove: %(handlelist)s.", handlelist=format_item_list(unseenHandles))) _filterKnownMsgParts = set([ "msg", "msgid", "msgstr", "pmsgid", "pmsgstr", ]) _filterKnownRuleParts = set([ "pattern", ]) _filterKnownParts = set( list(_filterKnownMsgParts) + list(_filterKnownRuleParts)) def _filterParseGeneral (fields): handles = set() parts = [] envs = None rest = [] for field in fields: name, value = field if name == "handle": handles = set([x.strip() for x in value.split(",")]) elif name == "on": parts = [x.strip() for x in value.split(",")] unknownParts = set(parts).difference(_filterKnownParts) if unknownParts: raise _SyntaxError( _("@info", "Unknown message parts for the filter to act on: " "%(partlist)s.", partlist=format_item_list(unknownParts))) elif name == "env": envs = [x.strip() for x in value.split(",")] else: rest.append(field) if not parts: raise _SyntaxError( _("@info", "No message parts specified for the filter to act on.")) return handles, parts, envs, rest def _msgFilterSetOnParts (parts, func, sig): chain = [] parts = list(parts) parts.sort() for part in parts: if part == "msg": chain.append(_filterOnMsg(func)) elif part == "msgstr": chain.append(_filterOnMsgstr(func)) elif part == "msgid": chain.append(_filterOnMsgid(func)) elif part == "pmsgstr": chain.append(_filterOnMsgstrPure(func)) elif part == "pmsgid": chain.append(_filterOnMsgidPure(func)) def composition (msg, cat): for func in chain: func(msg, cat) totalSig = sig + "\x04" + ",".join(parts) return composition, totalSig def _filterFinalSig (filterList): sigs = [x[3] for x in filterList] finalSig = "\x05".join(sigs) return finalSig def _msgFilterComposeFinal (filterList): if not filterList: return None fenvs_funcs = [(x[1], x[2]) for x in filterList] def composition (msg, cat, envs): for fenvs, func in fenvs_funcs: # Apply filter if environment-agnostic or in an operating environment. 
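# E.g. a filter added with env="kde" (an illustrative environment name) # is applied only when "kde" is among the operating environments, while # a filter without an env specification is applied unconditionally.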
if fenvs is None or envs.intersection(fenvs): func(msg, cat) return composition def _filterOnMsg (func): def aggregate (msg, cat): func(msg, cat) return aggregate def _filterOnMsgstr (func): def aggregate (msg, cat): for i in range(len(msg.msgstr)): tmp = func(msg.msgstr[i], msg, cat) if tmp is not None: msg.msgstr[i] = tmp return aggregate def _filterOnMsgid (func): def aggregate (msg, cat): tmp = func(msg.msgid, msg, cat) if tmp is not None: msg.msgid = tmp if msg.msgid_plural is not None: tmp = func(msg.msgid_plural, msg, cat) if tmp is not None: msg.msgid_plural = tmp return aggregate def _filterOnMsgstrPure (func): def aggregate (msg, cat): for i in range(len(msg.msgstr)): tmp = func(msg.msgstr[i]) if tmp is not None: msg.msgstr[i] = tmp return aggregate def _filterOnMsgidPure (func): def aggregate (msg, cat): tmp = func(msg.msgid) if tmp is not None: msg.msgid = tmp if msg.msgid_plural is not None: tmp = func(msg.msgid_plural) if tmp is not None: msg.msgid_plural = tmp return aggregate def _ruleFilterSetOnParts (parts, func, sig): chain = [] parts = list(parts) parts.sort() for part in parts: if part == "pattern": chain.append((_filterOnPattern(func), part)) def composition (value, part): if part not in _filterKnownRuleParts: raise PologyError( _("@info", "Unknown rule part '%(part)s' for the filter to act on.", part=part)) for func, fpart in chain: if fpart == part: value = func(value) return value totalSig = sig + "\x04" + ",".join(parts) return composition, totalSig def _ruleFilterComposeFinal (filterList): if not filterList: return None funcs = [x[2] for x in filterList] def composition (value, part): for func in funcs: value = func(value, part) return value return composition def _filterOnPattern (func): def aggregate (pattern): tmp = func(pattern) if tmp is not None: pattern = tmp return pattern return aggregate _filterRegexKnownFields = set(["match", "repl", "casesens"]) def _filterCreateRegex (fields): _checkFields("addFilterRegex", fields, _filterRegexKnownFields, ["match"]) fieldDict = dict(fields) caseSens = _fancyBool(fieldDict.get("casesens", "0")) flags = re.U | re.S if not caseSens: flags |= re.I matchStr = fieldDict["match"] matchRx = re.compile(matchStr, flags) replStr = fieldDict.get("repl", "") def func (text): return matchRx.sub(replStr, text) sig = "\x04".join([matchStr, replStr, str(caseSens)]) return func, sig def _filterCreateHook (fields): _checkFields("addFilterHook", fields, ["name"], ["name"]) fieldDict = dict(fields) hookSpec = fieldDict["name"] hook = get_hook_ireq(hookSpec, abort=False) sigSegs = [] for el in split_ireq(hookSpec): if el is not None: sigSegs.append(el) else: sigSegs.append("\x00") sig = "\x04".join(sigSegs) return hook, sig def _triggerParseGeneral (fields): casesens = True rest = [] for field in fields: name, value = field if name == "casesens": casesens = _fancyBool(value) else: rest.append(field) return casesens, rest _triggerKnownMsgParts = set([ "msg", "msgid", "msgstr", "pmsgid", "pmsgstr", ]) def _triggerFromHook (fields): _checkFields("hook", fields, ["name", "on"], ["name", "on"]) fieldDict = dict(fields) hook = get_hook_ireq(fieldDict["name"], abort=False) msgpart = fieldDict["on"].strip() if msgpart not in _triggerKnownMsgParts: raise PologyError( _("@info", "Unknown message part '%(part)s' for trigger to act on.", part=msgpart)) if msgpart == "msg": def trigger (msg, cat): return hook(msg, cat) elif msgpart == "msgid": def trigger (msg, cat): hl = [] hl.append(("msgid", 0, hook(msg.msgid, msg, cat))) if msg.msgid_plural is 
not None: hl.append(("msgid_plural", 0, hook(msg.msgid_plural, msg, cat))) return hl elif msgpart == "msgstr": def trigger (msg, cat): hl = [] for i in range(len(msg.msgstr)): hl.append(("msgstr", i, hook(msg.msgstr[i], msg, cat))) return hl elif msgpart == "pmsgid": def trigger (msg, cat): hl = [] hl.append(("msgid", 0, hook(msg.msgid))) if msg.msgid_plural is not None: hl.append(("msgid_plural", 0, hook(msg.msgid_plural))) return hl elif msgpart == "pmsgstr": def trigger (msg, cat): hl = [] for i in range(len(msg.msgstr)): hl.append(("msgstr", i, hook(msg.msgstr[i]))) return hl return trigger def _fancyBool (string): value = strbool(string) if value is None: raise PologyError( _("@info", "Cannot convert '%(val)s' to a boolean value.", val=string)) return value _trigger_msgparts = set([ # For matching in all messages. "msgctxt", "msgid", "msgstr", # For matching in plural messages part by part. "msgid_singular", "msgid_plural", "msgstr_0", "msgstr_1", "msgstr_2", "msgstr_3", "msgstr_4", "msgstr_5", "msgstr_6", "msgstr_7", "msgstr_8", "msgstr_9", # ought to be enough ]) _trigger_specials = set([ "hook", ]) _trigger_matchmods = [ "i", ] class Rule(object): """Represent a single rule""" _knownKeywords = set(("env", "cat", "catrx", "span", "after", "before", "ctx", "msgid", "msgstr", "head", "srcref", "comment")) _regexKeywords = set(("catrx", "span", "after", "before", "ctx", "msgid", "msgstr", "srcref", "comment")) _twoRegexKeywords = set(("head",)) _listKeywords = set(("env", "cat")) def __init__(self, pattern, msgpart, hint=None, valid=[], stat=False, casesens=True, ident=None, disabled=False, manual=False, environ=None, mfilter=None, rfilter=None, trigger=None): """Create a rule @param pattern: valid regexp pattern that triggers the rule @type pattern: unicode @param msgpart: part of the message to be matched by C{pattern} @type msgpart: string @param hint: hint given to the user when the rule matches @type hint: unicode @param valid: list of validity cases that can cancel a rule match @type valid: list of unicode key=value @param casesens: whether regex matching will be case-sensitive @type casesens: bool @param ident: rule identifier @type ident: unicode or C{None} @param disabled: whether rule is disabled @type disabled: bool @param manual: whether rule is manually applied @type manual: bool @param environ: environment in which the rule applies @type environ: string or C{None} @param mfilter: filter to apply to the message before checking @type mfilter: (msg, cat, envs) -> None @param rfilter: filter to apply to rule strings (e.g. on regex patterns) @type rfilter: (string) -> string @param trigger: function to act as trigger instead of C{pattern} applied to C{msgpart} @type trigger: (msg, cat, envs) -> L{highlight} """ # Define instance variables self.pattern=None # Compiled regexp into re.pattern object self.msgpart=msgpart # The part of the message to match self.valid=None # Parsed valid definition self.hint=hint # Hint message returned to the user self.ident=ident # Rule identifier self.disabled=disabled # Whether rule is disabled self.manual=manual # Whether rule is manually applied self.count=0 # Number of times the rule has been triggered self.time=0 # Total time of rule process calls self.stat=stat # Whether to gather stat or not.
Default is false (10% perf hit due to time() call) self.casesens=casesens # Whether regex matches are case-sensitive self.environ=environ # Environment in which to apply the rule self.mfilter=mfilter # Function to filter the message before checking self.rfilter=rfilter # Function to filter the rule strings self.trigger=None # Function to use as trigger instead of pattern if trigger is None and msgpart not in _trigger_msgparts: raise PologyError( _("@info", "Unknown message part '%(part)s' set for the rule's " "trigger pattern.", part=msgpart)) # Flags for regex compilation. self.reflags=re.U|re.S if not self.casesens: self.reflags|=re.I # Setup trigger. if not trigger: self.setPattern(pattern) else: self.setTrigger(trigger) # Parse valid key=value arguments self.setValid(valid) def setPattern(self, pattern): """Compile pattern @param pattern: pattern as a unicode string""" try: if self.rfilter: pattern=self.rfilter(pattern, "pattern") self.pattern=re.compile(pattern, self.reflags) - except Exception, e: + except Exception as e: warning(_("@info", "Invalid pattern '%(pattern)s', disabling rule:\n" "%(msg)s", pattern=pattern, msg=e)) self.disabled=True self.rawPattern=pattern self.trigger=None # invalidate any trigger function if self.ident: self.displayName=_("@item:intext", "[id=%(rule)s]", rule=self.ident) else: self.displayName=_("@item:intext", "[pattern=%(pattern)s]", pattern=self.rawPattern) def setTrigger(self, trigger): """ Use trigger function instead of pattern. @param trigger: function to act as trigger @type trigger: (msg, cat, envs) -> L{highlight} """ self.trigger=trigger self.pattern=None # invalidate any pattern self.rawPattern="" if self.ident: self.displayName=_("@item:intext", "[id=%(rule)s]", rule=self.ident) else: self.displayName=_("@item:intext", "[function]") def setValid(self, valid): """Parse valid key=value arguments of valid list @param valid: list of valid entries, as unicode key=value pairs""" self.valid=[] for item in valid: try: entry=[] # Empty valid entry for (key, value) in item: key=key.strip() bkey = key if key.startswith("!"): bkey = key[1:] if bkey not in Rule._knownKeywords: warning(_("@info", "Invalid keyword '%(kw)s' in " "validity definition, skipped.", kw=key)) continue if self.rfilter: value=self.rfilter(value, "pattern") if bkey in Rule._regexKeywords: # Compile regexp value=re.compile(value, self.reflags) elif bkey in Rule._listKeywords: # List of comma-separated words value=[x.strip() for x in value.split(",")] elif bkey in Rule._twoRegexKeywords: # Split into the two regexes and compile them. frx, vrx=value[1:].split(value[:1]) value=(re.compile(frx, self.reflags), re.compile(vrx, self.reflags)) entry.append((key, value)) self.valid.append(entry) - except Exception, e: + except Exception as e: warning(_("@info", "Invalid validity definition '%(dfn)s', skipped. " "The error was:\n%(msg)s", dfn=item, msg=e)) continue #@timed_out(TIMEOUT) def process (self, msg, cat, envs=set(), nofilter=False): """ Apply rule to the message. If the rule matches, a I{highlight specification} of offending spans is returned (see L{report_msg_content}); otherwise an empty list. The rule will normally apply its own filters to the message before matching (on a local copy, the original message will not be affected). If the message is already appropriately filtered, this self-filtering can be prevented by setting C{nofilter} to C{True}.
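A minimal usage sketch (the pattern, hint and surrounding objects are illustrative, not taken from a real rule file):: rule = Rule("foobar", "msgstr", hint="Do not use 'foobar'.") for msg in cat: spans = rule.process(msg, cat) if spans: report_msg_content(msg, cat, highlight=spans)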
@param msg: message to which the texts belong @type msg: instance of L{Message_base} @param cat: catalog to which the message belongs @type cat: L{Catalog} @param envs: environments in which the rule is applied @type envs: set @param nofilter: avoid filtering the message if C{True} @type nofilter: bool @return: highlight specification (may be empty list) """ if self.pattern is None and self.trigger is None: warning(_("@info", "Rule trigger not defined, rule skipped.")) return [] # If this rule belongs to a specific environment, # and it is not among operating environments, # cancel the rule immediately. if self.environ and self.environ not in envs: return [] # Cancel immediately if the rule is disabled. if self.disabled: return [] if self.stat: begin=time() # Apply own filters to the message if not filtered already. if not nofilter: msg = self._filter_message(msg, cat, envs) if self.pattern: failed_spans = self._processWithPattern(msg, cat, envs) else: failed_spans = self._processWithTrigger(msg, cat, envs) # Update stats for matched rules. self.count += 1 if self.stat: self.time += time() - begin return failed_spans def _create_text_spec (self, msgpart, msg): if 0: pass elif msgpart == "msgid": text_spec = [("msgid", 0, msg.msgid)] if msg.msgid_plural is not None: text_spec += [("msgid_plural", 0, msg.msgid_plural)] elif msgpart == "msgstr": text_spec = [("msgstr", i, msg.msgstr[i]) for i in range(len(msg.msgstr))] elif msgpart == "msgctxt": text_spec = [] if msg.msgctxt is not None: text_spec = [("msgctxt", 0, msg.msgctxt)] elif msgpart == "msgid_singular": text_spec = [("msgid", 0, msg.msgid)] elif msgpart == "msgid_plural": text_spec = [] if msg.msgid_plural is not None: text_spec += [("msgid_plural", 0, msg.msgid_plural)] elif msgpart.startswith("msgstr_"): item = int(msgpart.split("_")[1]) text_spec = [("msgstr", item, msg.msgstr[item])] else: raise PologyError( _("@info", "Unknown message part '%(part)s' referenced in the rule.", part=msgpart)) return text_spec def _processWithPattern (self, msg, cat, envs): text_spec = self._create_text_spec(self.msgpart, msg) failed_spans = {} for part, item, text in text_spec: # Get full data per match. pmatches = list(self.pattern.finditer(text)) if not pmatches: # Main pattern does not match anything, go to next text. continue # Test all matched segments. for pmatch in pmatches: # First validity entry that matches excepts the current segment. cancel = False for entry in self.valid: if self._is_valid(pmatch.group(0), pmatch.start(), pmatch.end(), text, entry, msg, cat, envs): cancel = True break if not cancel: # Record the span of problematic segment. skey = (part, item) if skey not in failed_spans: failed_spans[skey] = (part, item, [], text) failed_spans[skey][2].append(pmatch.span()) - return failed_spans.values() + return list(failed_spans.values()) def _processWithTrigger (self, msg, cat, envs): # Apply trigger. possibly_failed_spans = self.trigger(msg, cat) # Try to clear spans with validity tests. 
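# (This is the same cancellation logic as in _processWithPattern, # only applied to the spans which the trigger function reported.)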
failed_spans = {} for spanspec in possibly_failed_spans: part, item, spans = spanspec[:3] ftext = None if len(spanspec) > 3: ftext = spanspec[3] part_item = part if part == "msgstr": part_item = part + "_" + str(item) text_spec = self._create_text_spec(part_item, msg) if ftext is None: # the trigger didn't do any own filtering ftext = text_spec[0][2] # message field which contains the span for span in spans: mstart, mend = span[:2] # may contain 3rd element, error text pmatch = ftext[mstart:mend] cancel = False for entry in self.valid: if self._is_valid(pmatch, mstart, mend, ftext, entry, msg, cat, envs): cancel = True break if not cancel: # Record the span of problematic segment. skey = (part, item) if skey not in failed_spans: failed_spans[skey] = (part, item, [], ftext) failed_spans[skey][2].append(span) - return failed_spans.values() + return list(failed_spans.values()) def _filter_message (self, msg, cat, envs): fmsg = msg if self.mfilter is not None: fmsg = MessageUnsafe(msg) self.mfilter(fmsg, cat, envs) return fmsg def _is_valid (self, match, mstart, mend, text, ventry, msg, cat, envs): # All keys within a validity entry must match for the # entry to match as whole. valid = True for key, value in ventry: bkey = key invert = False if key.startswith("!"): bkey = key[1:] invert = True if bkey == "env": match = envs.intersection(value) if invert: match = not match if not match: valid = False break elif bkey == "cat": match = cat.name in value if invert: match = not match if not match: valid = False break elif bkey == "catrx": match = bool(value.search(cat.name)) if invert: match = not match if not match: valid = False break elif bkey == "head": frx, vrx = value match = False for name, value in cat.header.field: match = frx.search(name) and vrx.search(value) if match: break if invert: match = not match if not match: valid = False break elif bkey == "span": found = value.search(match) is not None if invert: found = not found if not found: valid = False break elif bkey == "after": # Search up to the match to avoid need for lookaheads. afterMatches = value.finditer(text, 0, mstart) found = False for afterMatch in afterMatches: if afterMatch.end() == mstart: found = True break if invert: found = not found if not found: valid = False break elif bkey == "before": # Search from the match to avoid need for lookbehinds. 
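# E.g. a validity entry with a "before" pattern holds only if one of # that pattern's matches begins exactly where the flagged segment ends, # mirroring the "after" case above.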
beforeMatches = value.finditer(text, mend) found = False for beforeMatch in beforeMatches: if beforeMatch.start() == mend: found = True break if invert: found = not found if not found: valid = False break elif bkey == "ctx": match = False if msg.msgctxt: match = value.search(msg.msgctxt) if invert: match = not match if not match: valid = False break elif bkey == "msgid": match = False for msgid in (msg.msgid, msg.msgid_plural): if msgid is not None: match = value.search(msgid) if match: break if invert: match = not match if not match: valid = False break elif bkey == "msgstr": match = False for msgstr in msg.msgstr: match = value.search(msgstr) if match: break if invert: match = not match if not match: valid = False break elif bkey == "srcref": match = False for file, lno in msg.source: if value.search(file): match = True break if invert: match = not match if not match: valid = False break elif bkey == "comment": match = False all_cmnt = [] all_cmnt.extend(msg.manual_comment) all_cmnt.extend(msg.auto_comment) for cmnt in all_cmnt: if value.search(cmnt): match = True break if invert: match = not match if not match: valid = False break return valid def _parseRuleLine (lines, lno): """ Split a rule line into fields as a list of (name, value) pairs. If a field name is followed by '=' or '=""', the field value will be an empty string. If there is no equal sign, the value will be C{None}. If the line is the trigger pattern, the name of the first field is going to be the "*", and its value the keyword of the message part to be matched; the name of the second field is going to be the pattern itself, and its value the string of match modifiers. """ # Compose the line out of backslash continuations. line = lines[lno - 1] while line.endswith("\\\n"): line = line[:-2] if lno >= len(lines): break lno += 1 line += lines[lno - 1] llen = len(line) fields = [] p = 0 in_modifiers = False while p < llen: while line[p].isspace(): p += 1 if p >= llen: break if p >= llen or line[p] == "#": break if len(fields) == 0 and line[p] in ("[", "{"): # Shorthand trigger pattern. bropn = line[p] brcls, fname = {"{": ("}", "msgid"), "[": ("]", "msgstr")}[bropn] # Collect the pattern. # Look for the balanced closing bracket. p1 = p + 1 balance = 1 while balance > 0: p += 1 if p >= llen: break if line[p] == bropn: balance += 1 elif line[p] == brcls: balance -= 1 if balance > 0: raise _SyntaxError( _("@info", "Unbalanced '%(delim)s' in shorthand trigger pattern.", delim=bropn)) fields.append((_rule_start, fname)) fields.append((line[p1:p], "")) p += 1 in_modifiers = True elif len(fields) == 0 and line[p] == _rule_start: # Verbose trigger. p += 1 while p < llen and line[p].isspace(): p += 1 if p >= llen: raise _SyntaxError( _("@info", "Missing '%(kw)s' keyword in the rule trigger.", kw="match")) # Collect the match keyword. p1 = p while line[p].isalnum() or line[p] == "_": p += 1 if p >= llen: raise _SyntaxError( _("@info", "Malformed rule trigger.")) tkeyw = line[p1:p] fields.append((_rule_start, tkeyw)) if tkeyw in _trigger_msgparts: # Collect the pattern. while line[p].isspace(): p += 1 if p >= llen: raise _SyntaxError( _("@info", "No pattern after the trigger keyword '%(kw)s'.", kw=tkeyw)) quote = line[p] p1 = p + 1 p = _findEndQuote(line, p) fields.append((line[p1:p], "")) p += 1 # skip quote in_modifiers = True else: # Special trigger, go on reading fields. pass elif in_modifiers: # Modifiers after the trigger pattern.
p1 = p while not line[p].isspace(): p += 1 if p >= llen: break pattern, pmods = fields[-1] fields[-1] = (pattern, pmods + line[p1:p]) else: # Subdirective field. # Collect field name. p1 = p while not line[p].isspace() and line[p] != "=": p += 1 if p >= llen: break fname = line[p1:p] if not re.match(r"^!?[a-z][\w-]*$", fname): raise _SyntaxError( _("@info", "Invalid field name '%(field)s'.", field=fname)) if p >= llen or line[p].isspace(): fields.append((fname, None)) else: # Collect field value. p += 1 # skip equal-character if p >= llen or line[p].isspace(): fields.append((fname, "")) else: quote = line[p] p1 = p + 1 p = _findEndQuote(line, p) fvalue = line[p1:p] fields.append((fname, fvalue)) p += 1 # skip quote return fields, lno def _findEndQuote (line, pos=0): """ Find the end quote matching the quote at the given position in the line. The character at the C{pos} position is taken as the quote character. The closing quote can be escaped with a backslash inside the string, in which case the backslash is removed from the parsed string; a backslash in any other position is considered ordinary. @param line: the line to parse @type line: string @param pos: position of the opening quote @type pos: int @return: position of the closing quote @rtype: int """ quote = line[pos] epos = pos + 1 llen = len(line) string = "" while epos < llen: c = line[epos] if c == "\\": epos += 1 c2 = line[epos] if c2 != quote: string += c string += c2 elif c == quote: break else: string += c epos += 1 if epos == llen: raise _SyntaxError( _("@info", "Non-terminated quoted string '%(snippet)s'.", snippet=line[pos:])) return epos diff --git a/pology/sieve.py b/pology/sieve.py index ba750c2a..7ba72e65 100644 --- a/pology/sieve.py +++ b/pology/sieve.py @@ -1,257 +1,257 @@ # -*- coding: UTF-8 -*- """ Helpers for catalog sieves. Pology's C{posieve} script processes catalogs with "sieves": objects to which catalog entries are fed one by one, possibly with a finalization phase at the end. This module contains some common helpers which are used by many sieves. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import locale from pology import PologyError, _, n_ from pology.comments import manc_parse_flag_list class SieveError (PologyError): """ Base exception class for sieve errors with special meaning. """ pass class SieveMessageError (SieveError): """ Exception for single messages. If a sieve's C{process} method throws it, the client is allowed to send other messages from the same catalog to the sieve. """ pass class SieveCatalogError (SieveError): """ Exception for single catalogs. If a sieve's C{process} or C{process_header} method throws it, the client is not allowed to send other messages from the same catalog to the sieve, but can send messages from other catalogs. """ pass def parse_sieve_flags (msg): """ Extract sieve flags embedded in manual comments. Sieve flags are put into manual comments with the following syntax:: # |, flag1, flag2, ... Some sieves will define certain sieve flags by which their behavior can be altered on a particular message. @param msg: message to parse @type msg: Message @returns: parsed flags @rtype: set of strings """ return set(manc_parse_flag_list(msg, "|")) def add_param_lang (p, appx=None): """ Add C{lang} parameter to sieve parameters. @param appx: one or more trailing paragraphs for the parameter description @type appx: string """ desc = _("@info sieve parameter description", "The language of translation. " "If the user configuration or a catalog header specifies the language, " "this parameter takes precedence."
) if appx: desc = "%s\n\n%s" % (desc, appx) - p.add_param("lang", unicode, + p.add_param("lang", str, metavar=_("@info sieve parameter value placeholder", "CODE"), desc=desc) def add_param_env (p, appx=None): """ Add C{env} parameter to sieve parameters. @param appx: one or more trailing paragraphs for the parameter description @type appx: string """ desc = _("@info sieve parameter description", "The environment (language variation) of translation. " "If the user configuration or a catalog header specifies the environment, " "this parameter takes precedence. " "Several environments can be given as a comma-separated list." ) if appx: desc = "%s\n\n%s" % (desc, appx) - p.add_param("env", unicode, seplist=True, + p.add_param("env", str, seplist=True, metavar=_("@info sieve parameter value placeholder", "CODE"), desc=desc) def add_param_accel (p, appx=None): """ Add parameter C{accel} to sieve parameters. @param appx: one or more trailing paragraphs for the parameter description @type appx: string """ desc = _("@info sieve parameter description", "Character which is used as UI accelerator marker in text fields." ) if appx: desc = "%s\n\n%s" % (desc, appx) - p.add_param("accel", unicode, multival=True, + p.add_param("accel", str, multival=True, metavar=_("@info sieve parameter value placeholder", "CHAR"), desc=desc) def add_param_markup (p, appx=None): """ Add parameter C{markup} to sieve parameters. @param appx: one or more trailing paragraphs for the parameter description @type appx: string """ desc = _("@info sieve parameter description", "Markup that can be expected in text fields, as special keyword. " "Several markups can be given as a comma-separated list." ) if appx: desc = "%s\n\n%s" % (desc, appx) - p.add_param("markup", unicode, seplist=True, + p.add_param("markup", str, seplist=True, metavar=_("@info sieve parameter value placeholder", "KEYWORD"), desc=desc) def add_param_filter (p, intro=None): """ Add C{filter} parameter to sieve parameters. @param intro: first paragraph for the parameter description @type intro: string """ desc = _("@info sieve parameter description", "For a module pology.FOO which defines FOO() function, " "the hook specification is simply FOO. " "If the hook function is named BAR() instead of FOO(), then " "the hook specification is FOO/BAR. " "Language-specific hooks (pology.lang.LANG.FOO) are additionally " "preceded by the language code with colon, as LANG:FOO or LANG:FOO/BAR. " "\n\n" "If the function is actually a hook factory, the arguments for " "the factory are passed separated by tilde: LANG:FOO/BAR~ARGS " "(where LANG: and /BAR may be omitted under previous conditions). " "The ARGS string is a list of arguments as it would appear " "in the function call in Python code, omitting the parentheses. " "\n\n" "Several hooks can be given by repeating the parameter, " "in which case they are applied in the given order." ) if intro: desc = "%s\n\n%s" % (intro, desc) - p.add_param("filter", unicode, multival=True, + p.add_param("filter", str, multival=True, metavar=_("@info sieve parameter value placeholder", "HOOKSPEC"), desc=desc) def add_param_poeditors (p): """ Add parameters for opening messages in editors to sieve parameters. """ p.add_param("lokalize", bool, defval=False, desc=_("@info sieve parameter description", "Open catalogs on reported messages in Lokalize. " "Lokalize must be already running with the project " "that contains the sieved catalogs opened." )) def add_param_entdef (p): """ Add C{entdef} parameter to sieve parameters.
""" - p.add_param("entdef", unicode, multival=True, + p.add_param("entdef", str, multival=True, metavar="FILE", desc=_("@info sieve parameter discription; " "in the last line only 'entname' and 'entvalue' " "should be translated", "File defining the entities used in messages " "(parameter can be repeated to add more files). Entity file " "defines entities one per line, in the format:" "\n\n" "<!ENTITY entname 'entvalue'>" )) def add_param_spellcheck (p): """ Add parameters for spell checking to sieve parameters. """ add_param_lang(p, appx=_("@info sieve parameter discription", "The language determines which system dictionary, " "as well as internal word lists, to use for spell-checking. " "If the language is left undefined for a given catalog, " "it will be skipped and a warning may be output." )) add_param_env(p, appx=_("@info sieve parameter discription", "The environment determines which additional " "internal word lists to use for spell-checking. " "If the environment is left undefined for a given catalog, " "only environment-agnostic internal word lists will be used." )) add_param_accel(p) add_param_markup(p) - p.add_param("skip", unicode, + p.add_param("skip", str, metavar=_("@info sieve parameter value placeholder", "REGEX"), desc=_("@info sieve parameter discription", "Regular expression to eliminate from spell-checking words that match it." )) p.add_param("case", bool, defval=False, desc=_("@info sieve parameter discription", "Make matching patterns given as parameter values case-sensitive." )) add_param_filter(p, intro=_("@info sieve parameter discription", "The F1A or F3A/C hook through which to filter the translation " "before passing it to spell-checking." )) p.add_param("suponly", bool, defval=False, desc=_("@info sieve parameter discription", "Use only internal supplement word lists, and not the system dictionary." )) p.add_param("list", bool, defval=False, desc=_("@info sieve parameter discription", "Output only a simple sorted list of unknown words." )) add_param_poeditors(p) diff --git a/pology/spell.py b/pology/spell.py index 7f24cc28..cccd8539 100644 --- a/pology/spell.py +++ b/pology/spell.py @@ -1,576 +1,576 @@ # -*- coding: UTF-8 -*- """ Check spelling in text using different spell checkers. @author: Chusslove Illich (Часлав Илић) author: Javier Vinal (Javier Viñal) @license: GPLv3 """ import os import codecs import re import tempfile from pology import PologyError, datadir, _, n_ from pology.comments import manc_parse_flag_list, manc_parse_list import pology.config from pology.msgreport import report_on_msg from pology.report import warning, format_item_list # Pipe flag to manually prevent spellcheck for a particular message. flag_no_check_spell = "no-check-spell" # Embedded list of words manually declared valid for a particular message. elist_well_spelled = "well-spelled:" def check_spell (lang=None, encoding="UTF-8", variety=None, extopts={}, envs=None, suponly=False, maxsugg=5): """ Check spelling using Aspell [hook factory]. Aspell language is selected by the C{lang} parameter, which should be a language code of one of the installed spelling dictionaries. Text encoding used by the dictionary is provided by the C{encoding} parameter. If the dictionary comes in several varieties, a non-default one is selected using the C{variety} parameter. Any additional options from the set of Aspell configuration fields can be passed in as (name, value) dictionary by the C{extopts} parameter. 
Pology may contain internal supplemental dictionaries for the selected language in the C{lang/<lang>/spell/} directory, and these are automatically picked up. Any subdirectories in C{lang/<lang>/spell/} are considered to contain supplemental dictionaries in special "environments" (e.g. jargon, certain projects, etc.), and are not included by default. Such environments can be included by the C{envs} parameter, which is a list of relative paths added to the C{lang/<lang>/spell/} directory. All supplemental dictionaries from such paths are included, as well as from all their parent directories up to C{lang/<lang>/spell/} (this makes supplemental dictionaries hierarchical, e.g. environment C{foo/bar} is a child of C{foo}, and thus when C{foo/bar} is requested, both its supplements and those of C{foo} are used). If C{lang} is C{None}, then automatic detection of the language based on the catalog of the message is attempted (see catalog L{language()} method). The same is attempted for environments if C{env} is C{None} (see catalog L{environment()} method). Aspell's system dictionary can be completely excluded from the check by the C{suponly} parameter, in which case the check will use only internal supplemental dictionaries. Misspelled words are reported to stdout, with suggestions if available. The maximum number of suggestions to display is selected by the C{maxsugg} parameter; if negative, all suggestions are shown. Spell checking is performed by internally splitting text into words, and querying Aspell word by word. Splitting is performed in a simple fashion; it is assumed that text has been appropriately filtered down to plain text, e.g. that any XML-like markup and other literals have been removed (see L{pology.remove} for filtering possibilities). Spell checking can be skipped entirely on a message by issuing the C{no-check-spell} L{sieve flag}. Alternatively, only certain words may be declared well spelled by adding a manual comment starting with C{well-spelled:} and followed by a comma-separated list of words. Example:: # |, no-check-spell msgid "Aaaargh, gahhh, khh..." msgstr "" # well-spelled: Aaaargh, kh msgid "Aaaargh, kh, kh... I have been defeated...!" msgstr "" @param lang: language of spelling dictionary @type lang: string @param encoding: encoding used by the dictionary @type encoding: string @param variety: variety of dictionary @type variety: string @param extopts: additional options to send to Aspell @type extopts: dict @param envs: environments for supplemental dictionaries @type envs: list of strings @param suponly: whether to use only supplemental dictionaries @type suponly: bool @param maxsugg: maximum number of suggestions to show for a misspelled word @type maxsugg: int @return: type S3A hook @rtype: C{(text, msg, cat) -> numerr} """ provider = "aspell-raw" return _check_spell_w(provider, lang, encoding, variety, extopts, envs, suponly, maxsugg, False) def check_spell_sp (lang=None, encoding="UTF-8", variety=None, extopts={}, envs=None, suponly=False, maxsugg=5): """ Like L{check_spell}, except that erroneous spans are returned instead of reporting problems to stdout [hook factory]. @return: type V3A hook @rtype: C{(text, msg, cat) -> spans} """ provider = "aspell-raw" return _check_spell_w(provider, lang, encoding, variety, extopts, envs, suponly, maxsugg, True) def _check_spell_w (provider, lang, encoding, variety, extopts, envs, suponly, maxsugg, spanrep): """ Worker for C{check_spell*} hook factories.
""" # FIXME: It is said that no fancy word-splitting is done on the text, # but still, best to split it assuming plain text? wsplit_rx = re.compile("[^\W\d_]+", re.U) def wsplit (text, msg, cat): word_spans = [] for m in wsplit_rx.finditer(text): word, span = m.group(0), m.span() word_spans.append((word, span)) # ...could have been a single comprehension, but may need expansion. return word_spans # Resolve provider. if provider != "aspell-raw": enchant_cfg = pology.config.section("enchant") if not provider: provider = enchant_cfg.string("provider") if not provider: raise PologyError(_("@info", "Enchant provider not set.")) # Cache for constructed checkers. checkers = {} # The checker itself. def spcheck (text, msg, cat): # Check if new spell checker should be constructed. if lang is not None: clang = lang elif cat.language() is not None: clang = cat.language() elif provider != "aspell-raw": clang = enchant_cfg.string("language") else: clang = None if not clang: raise PologyError( _("@info", "Cannot determine language for catalog '%(file)s'.", file=cat.filename)) if envs is not None: cenvs = envs elif cat.environment() is not None: cenvs = cat.environment() elif provider != "aspell-raw": envs_str = enchant_cfg.string("environment") cenvs = envs_str.split(",") if envs_str else [] else: cenvs = [] ckey = (clang, tuple(cenvs)) if ckey not in checkers: if provider != "aspell-raw": checkers[ckey] = _construct_enchant(provider, clang, cenvs, encoding, variety, suponly) else: checkers[ckey] = _construct_aspell(clang, cenvs, encoding, variety, extopts, suponly) checker = checkers[ckey] # Prepare shortcut reports. if spanrep: defret = [] else: defret = 0 # Skip message if explicitly requested. if flag_no_check_spell in manc_parse_flag_list(msg, "|"): return defret # Split text into words and spans: [(word, (start, end)), ...] word_spans = wsplit(text, msg, cat) # Ignore words explicitly listed as good. ignored_words = set(manc_parse_list(msg, elist_well_spelled, ",")) word_spans = [x for x in word_spans if x[0] not in ignored_words] spans = [] for word, span in word_spans: encword = word.encode(encoding) if not checker.check(encword): encsuggs = checker.suggest(encword) maxsugg = 5 # limit to some reasonable number incmp = False if maxsugg > 0 and len(encsuggs) > maxsugg: encsuggs = encsuggs[:maxsugg] incmp = True suggs = [x.decode(encoding) for x in encsuggs] if maxsugg != 0 and suggs: fmtsuggs = format_item_list(suggs, incmp=incmp) snote = _("@info", "Unknown word '%(word)s' " "(suggestions: %(wordlist)s).", word=word, wordlist=fmtsuggs) else: snote = _("@info", "Unknown word '%(word)s'.", word=word) spans.append(span + (snote,)) if spanrep: return spans else: for span in spans: if span[2:]: report_on_msg(span[2], msg, cat) return len(spans) return spcheck # Construct Aspell checker for given langenv. def _construct_aspell (lang, envs, encoding, variety, extopts, suponly): # Get Pology's internal personal dictonary for this language. dictpath, temporary = _compose_personal_dict(lang, envs) if not suponly: # Prepare Aspell options. aopts = {} aopts["lang"] = lang aopts["encoding"] = encoding if variety: aopts["variety"] = variety if dictpath: aopts["personal-path"] = dictpath if extopts: aopts.update(extopts) - aopts = dict([(x, y.encode(encoding)) for x, y in aopts.items()]) + aopts = dict([(x, y.encode(encoding)) for x, y in list(aopts.items())]) # Create Aspell object. 
import pology.external.pyaspell as A try: - checker = A.Aspell(aopts.items()) + checker = A.Aspell(list(aopts.items())) - except A.AspellConfigError, e: + except A.AspellConfigError as e: raise PologyError( _("@info", "Aspell configuration error:\n%(msg)s", msg=e)) - except A.AspellError, e: + except A.AspellError as e: raise PologyError( _("@info", "Cannot initialize Aspell:\n%(msg)s", msg=e)) else: # Create simple internal checker that only checks against # internal supplemental dictionaries. if not dictpath: raise PologyError( _("@info", "No supplemental dictionaries found.")) checker = _QuasiSpell(dictpath, encoding) # Composited dictionary has been read by now; remove it if it is a temporary file. if temporary: os.unlink(dictpath) return checker # Collect all personal dictionaries found for given language/environment # and composite them into one file to pass to Aspell. # Environment is given as a relative subpath into the language directory; # a dictionary belongs to that environment if it is in the directory # pointed to by the subpath, or any of the parent directories. # Return the path to the composited file, or None if there were no dictionaries, # and whether the file is really a temporary composition or not. def _compose_personal_dict (lang, envs): # Collect all applicable dictionary files # (for a given environment, in its subdirectory and all above it). dictpaths = set() spell_root = os.path.join(datadir(), "lang", lang, "spell") for env in (envs or [""]): spell_sub = os.path.join(".", env) while spell_sub: spell_dir = os.path.join(spell_root, spell_sub) if os.path.isdir(spell_dir): for item in os.listdir(spell_dir): if item.endswith(".aspell"): dictpaths.add(os.path.join(spell_dir, item)) spell_sub = os.path.dirname(spell_sub) dictpaths = list(dictpaths) dictpaths.sort() if not dictpaths: return None, False # If only one dictionary found, Aspell can use it as-is. if len(dictpaths) == 1: return dictpaths[0], False # Composite all dictionary files into one temporary file. words = [] for dictpath in dictpaths: words.extend(_read_dict_file(dictpath)) tmpf = tempfile.NamedTemporaryFile() tmpf.close() try: tmpf = codecs.open(tmpf.name, "w", "UTF-8") tmpf.write("personal_ws-1.1 %s %d UTF-8\n" % (lang, len(words))) tmpf.writelines([x + "\n" for x in words]) tmpf.close() - except Exception, e: + except Exception as e: raise PologyError( _("@info", "Cannot create composited spelling dictionary " "in current working directory:\n%(msg)s", msg=e)) return tmpf.name, True # Read words from an Aspell personal dictionary. def _read_dict_file (filepath): # Parse the header for encoding. enc_def = "UTF-8" file = codecs.open(filepath, "r", enc_def) header = file.readline() m = re.search(r"^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s*", header) if not m: raise PologyError( _("@info", "Malformed header in dictionary file '%(file)s'.", file=filepath)) enc = m.group(4) # Reopen in correct encoding if not the default. if enc.lower() != enc_def.lower(): file.close() file = codecs.open(filepath, "r", enc) # Read words. words = [] for line in file: word = line.strip() if word: words.append(word) return words # Simple spell checker which reads Aspell's personal dictionary file.
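# It only has to mimic the subset of the checker interface used above, # i.e. the check() and suggest() methods; suggestions are never offered.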
class _QuasiSpell (object): def __init__ (self, dictpath, enc="UTF-8"): self._words = _read_dict_file(dictpath) self._enc = enc # of the raw text sent in for checking def check (self, encword): - word = str.decode(encword, self._enc) + word = encword.decode(self._enc) return ( word in self._words or word.lower() in self._words) def suggest (self, encword): return [] def check_spell_ec (provider=None, lang=None, encoding="UTF-8", variety=None, envs=None, suponly=False, maxsugg=5): """ Check spelling using Enchant [hook factory]. Enchant provider and language are selected by the C{lang} parameter, which should be a language code of one of the installed spelling dictionaries. Text encoding used by the dictionary is provided by the C{encoding} parameter. If the dictionary comes in several varieties, a non-default one is selected using the C{variety} parameter. If C{provider} is not given, an attempt is made to fetch it from the C{[enchant]/provider} user configuration field. Pology may contain internal supplemental dictionaries for the selected language in the C{lang/<lang>/spell/} directory, and these are automatically picked up. Any subdirectories in C{lang/<lang>/spell/} are considered to contain supplemental dictionaries in special "environments" (e.g. jargon, certain projects, etc.), and are not included by default. Such environments can be included by the C{envs} parameter, which is a list of relative paths added to the C{lang/<lang>/spell/} directory. All supplemental dictionaries from such paths are included, as well as from all their parent directories up to C{lang/<lang>/spell/} (this makes supplemental dictionaries hierarchical, e.g. environment C{foo/bar} is a child of C{foo}, and thus when C{foo/bar} is requested, both its supplements and those of C{foo} are used). If C{lang} is C{None}, then automatic detection of the language based on the catalog of the message is attempted (see catalog L{language()} method). The same is attempted for environments if C{env} is C{None} (see catalog L{environment()} method). If automatic detection of the language does not succeed, finally the C{[enchant]/language} user configuration field is consulted; for environments, the C{[enchant]/environment} field is consulted. The provider's system dictionary can be completely excluded from the check by the C{suponly} parameter, in which case the check will use only internal supplemental dictionaries. Misspelled words are reported to stdout, with suggestions if available. The maximum number of suggestions to display is selected by the C{maxsugg} parameter; if negative, all suggestions are shown. Spell checking is performed by internally splitting text into words, and querying the provider word by word. Splitting is performed in a simple fashion; it is assumed that text has been appropriately filtered down to plain text, e.g. that any XML-like markup and other literals have been removed (see L{pology.remove} for filtering possibilities). Spell checking can be skipped entirely on a message by issuing the C{no-check-spell} L{sieve flag}. Alternatively, only certain words may be declared well spelled by adding a manual comment starting with C{well-spelled:} and followed by a comma-separated list of words. Example:: # |, no-check-spell msgid "Aaaargh, gahhh, khh..." msgstr "" # well-spelled: Aaaargh, kh msgid "Aaaargh, kh, kh... I have been defeated...!"
msgstr "" @param provider: the spell-checking provider to use @type provider: string @param lang: language of spelling dictionary @type lang: string @param encoding: encoding used by the dictionary @type encoding: string @param variety: variety of dictionary @type variety: string @param envs: environments for supplemental dictionaries @type envs: list of strings @param suponly: whether to use only supplemental dictionaries @type suponly: bool @param maxsugg: maximum number of suggestions to show for a misspelled word @type maxsugg: int @return: type S3A hook @rtype: C{(text, msg, cat) -> numerr} """ extopts = {} return _check_spell_w(provider, lang, encoding, variety, extopts, envs, suponly, maxsugg, False) def check_spell_ec_sp (provider=None, lang=None, encoding="UTF-8", variety=None, envs=None, suponly=False, maxsugg=5): """ Like L{check_spell_ec}, except that erroneous spans are returned instead of reporting problems to stdout [hook factory]. @return: type V3A hook @rtype: C{(text, msg, cat) -> spans} """ extopts = {} return _check_spell_w(provider, lang, encoding, variety, extopts, envs, suponly, maxsugg, True) # Construct Enchant checker for given langenv. def _construct_enchant (provider, lang, envs, encoding, variety, suponly): # Get Pology's internal personal dictionary for this language. dictpath, temporary = _compose_personal_dict(lang, envs) if not suponly: try: import enchant except ImportError: pkgs = ["python-enchant"] raise PologyError(_("@info", "Python wrapper for Enchant not found, " "please install it (possible package names: " "%(pkglist)s).", pkglist=format_item_list(pkgs))) # Create Enchant broker. try: broker = enchant.Broker() - except Exception, e: + except Exception as e: raise PologyError( _("@info", "Cannot initialize Enchant:\n%(msg)s", msg=e)) # Find Enchant language. - e_langs = filter(broker.dict_exists, [variety, lang]) + e_langs = list(filter(broker.dict_exists, [variety, lang])) if e_langs: e_lang = e_langs[0] else: if variety is not None: raise PologyError( _("@info", "Language '%(lang)s' and variety '%(var)s' " "not known to Enchant.", lang=lang, var=variety)) else: raise PologyError( _("@info", "Language '%(lang)s' not known to Enchant.", lang=lang)) # Choose the provider for the selected language. try: broker.set_ordering((e_lang or "*"), provider) - except Exception, e: + except Exception as e: raise PologyError( _("@info", "Cannot configure Enchant for provider '%(pvd)s':\n%(msg)s", pvd=provider, msg=e)) # Create checker and test functionality. try: if dictpath is None: checker = enchant.Dict(e_lang, broker) else: checker = enchant.DictWithPWL(e_lang, dictpath, None, broker) checker.check(".") except: raise PologyError( _("@info", "Enchant test check for language '%(lang)s' failed.", lang=e_lang)) else: # Create simple internal checker that only checks against # internal supplemental dictionaries. if not dictpath: raise PologyError( _("@info", "No supplemental dictionaries found.")) checker = _QuasiSpell(dictpath, encoding) # Composited dictionary has been read by now; remove it if it is a temporary file. if temporary: os.unlink(dictpath) return checker diff --git a/pology/split.py b/pology/split.py index 7252d0c9..be52b492 100644 --- a/pology/split.py +++ b/pology/split.py @@ -1,206 +1,206 @@ # -*- coding: UTF-8 -*- """ Splitting message fields into syntactical elements.
@author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import re from pology.resolve import remove_accelerator _word_rxp = r"(?:\w+[']\w+|\w+)" _split_rx = re.compile(r"[^\w]+|%s" % _word_rxp, re.U) _split_rx_markup = re.compile(r"[^\w]*(<.*?>|&[\w.:-]+;|&#x?\d+;)[^\w<&]*" r"|[^\w]+|%s" % _word_rxp, re.U) _word_rx = re.compile(r"^\w", re.U) def split_text (text, markup=False, format=None): """ Split text into words and intersections. The text is split into lists of words and intersections (inter-word segments), such that there is always an intersection before the first and after the last word, even if empty. That is, there is always one more intersection than there are words. The text may contain C{<...>} tags, and be of a certain format supported by Gettext (e.g. C{c-format}). If specified, these elements may influence splitting. @param text: the text to split @type text: string @param markup: whether text contains markup tags @type markup: bool @param format: Gettext format flag @type format: None or string @returns: words and intersections @rtype: list of strings, list of strings """ if markup: split_rx = _split_rx_markup word_rx = _word_rx else: split_rx = _split_rx word_rx = _word_rx words = [] intrs = [] lastword = False for m in split_rx.finditer(text): seg = m.group(0) if word_rx.search(seg): if lastword and words: words[-1] += seg else: words.append(seg) lastword = True else: if not lastword and intrs: intrs[-1] += seg else: intrs.append(seg) lastword = False if lastword: - intrs.append(u"") + intrs.append("") if len(intrs) == len(words): - intrs.insert(0, u"") + intrs.insert(0, "") if format == "c-format": words, intrs = _mod_on_format_c(words, intrs) elif format == "qt-format": words, intrs = _mod_on_format_qt(words, intrs) return words, intrs _mf_c_rx = re.compile(r"(?:^|[^%])(% ?)$") def _mod_on_format_c (words, intrs): for i in range(len(words)): m = _mf_c_rx.search(intrs[i]) if m: dirst = m.group(1) intrs[i] = intrs[i][:-len(dirst)] words[i] = dirst + words[i] return words, intrs _mf_qt_rx = re.compile(r"^L?\d") def _mod_on_format_qt (words, intrs): for i in range(len(words)): if intrs[i].endswith("%") and _mf_qt_rx.search(words[i]): intrs[i] = intrs[i][:-1] words[i] = "%" + words[i] return words, intrs # Regexes for text removals to get proper words. # Second member of the tuple is the replacement string. _r_url_rx = (re.compile(r"[a-zA-Z0-9.+-]+://[^\s]*" r"|www\.[\w.-]{1,250}" r"|\b[\w.-]+\.[a-z]{2,3}\b" , re.I|re.U), "") _r_email_rx = (re.compile(r"\b[\w.-]+@[\w.-]+", re.U), "") _r_shvar_rx = (re.compile(r"\$(\w+|\{.*?\})", re.U), "") _r_shopt_rx = (re.compile(r"(^|[^\w])(--|-|/)[\w-]+", re.U), "") _r_tags_rx = (re.compile(r"<.*?>"), " ") _r_ents_rx = (re.compile(r"&[\w.:-]+;", re.U), " ") _r_numents_rx = (re.compile(r"&#x?\d+;"), " ") _r_digits_rx = (re.compile(r"\d+"), " ") _r_fmtd_c_rx = (re.compile(r"(?<!%)%[ +-]?\d*\.?\d*[a-z]", re.I), " ") _r_fmtd_qt_rx = (re.compile(r"(?<!%)%L?\d+"), " ") _remove_rxs = [ _r_email_rx, # before URLs _r_url_rx, _r_shvar_rx, _r_shopt_rx, _r_digits_rx, ] _remove_xml_rxs = [ _r_tags_rx, # before entities _r_ents_rx, _r_numents_rx, ] _remove_fmtd_rxs = [ _r_fmtd_c_rx + ("c",), _r_fmtd_qt_rx + ("qt",), ] _word_ok_rx = re.compile(r"^[^\W\d_]+$", re.U) def proper_words (text, markup=False, accels=None, format=None): """ Mine proper words out of the text. The text may contain XML-like markup (C{<...>} tags, entities...), or keyboard accelerator markers. It may also be of a certain format known to Gettext (e.g. C{c-format}). If specified, these elements may influence splitting. @param text: the text to split @type text: string @param markup: whether text contains markup tags @type markup: bool @param accels: accelerator characters to ignore @type accels: sequence @param format: Gettext format flag @type format: None or string @returns: proper words @rtype: list of strings """ # Remove markup. # (before format directives) if markup: for rem_rx, sub in _remove_xml_rxs: text = rem_rx.sub(sub, text) # Remove format directives.
# (before general non-words) if format: for rem_rx, sub, clng in _remove_fmtd_rxs: if format.startswith(clng + "-"): text = rem_rx.sub(sub, text) # Remove general known non-words. for rem_rx, sub in _remove_rxs: text = rem_rx.sub(sub, text) # Remove accelerators (must come after other replacements). text = remove_accelerator(text, accels, greedy=True) rwords = split_text(text)[0] words = [x for x in rwords if _word_ok_rx.search(x)] return words diff --git a/pology/subcmd.py b/pology/subcmd.py index 1b717f5a..0103631e 100644 --- a/pology/subcmd.py +++ b/pology/subcmd.py @@ -1,783 +1,784 @@ # -*- coding: UTF-8 -*- """ Handle subcommands and their parameters. Subcommands put the main command into different modes of operation. Main commands with subcommands are typical of package managers, version control systems, etc. This module provides a handler to conveniently load subcommands on demand, and a parser to extract and route parameters to them from the command line. The command line interface consists of the subcommand as a free parameter, and a special collector-option to collect parameters for the subcommand:: $ cmd -a -b -c \ # command and any usual options subcmd \ # subcommand -s foo \ # subcommand parameter 'foo', without value (flag) -s bar:xyz # subcommand parameter 'bar', with the value 'xyz' where C{-s} is the collector-option, repeated for as many subcommand parameters as needed. The collector-option can be freely positioned in the command line, before or after the subcommand name, and mixed with other options. The format of a subcommand parameter is either C{param} for flag parameters, C{param:value} for parameters taking a value, or C{param:value1,value2,...} for parameters taking a list of values. Instead of, or in addition to, using a comma-separated string to represent the list, some parameters can be repeated on the command line, and all the values collected to make the list. Several subcommands may be given too, in which case each subcommand parameter is routed to every subcommand which expects it. This means that all those subcommands should place the same semantics into the same-named parameter they are using. @note: For any of the methods in this module, the order of keyword parameters is not guaranteed. Always name them in calls. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ # NOTE: The original code for this module was taken from the Divergloss # glossary processor, and reduced and retouched for the needs in Pology. # Actually, the main reason for not using Divergloss' module directly # is to avoid dependency. import fnmatch import locale import os import re import sys from pology import PologyError, _, n_ from pology.colors import cjoin, cinterp from pology.fsops import term_width from pology.report import format_item_list from pology.wrap import wrap_text +from functools import reduce class ParamParser (object): """ Parser for subcommand parameters. """ def __init__ (self): """ Constructor. """ self._scviews = {} def add_subcmd (self, subcmd, desc=None): """ Add a subcommand for which the parameters may be added afterwards. Use double-newline in the description for splitting into paragraphs. The description can also be set later, using the C{set_desc} method of the subcommand view.
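        For example (a minimal sketch; the subcommand and parameter
        names are illustrative)::

            pp = ParamParser()
            sv = pp.add_subcmd("merge", "Merge translation catalogs.")
            sv.add_param("dry-run", bool, defval=False,
                         desc="Do not modify any files.")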
@param subcmd: subcommand name @type subcmd: string @param desc: description of the subcommand @type desc: string or C{None} @return: subcommand view @rtype: L{SubcmdView} """ if subcmd in self._scviews: raise SubcmdError( _("@info", "Trying to add subcommand '%(cmd)s' more than once.", cmd=subcmd)) self._scviews[subcmd] = SubcmdView(self, subcmd, desc) return self._scviews[subcmd] def get_view (self, subcmd): """ The view into previously defined subcommand. @param subcmd: subcommand name @type subcmd: string @return: subcommand view @rtype: L{SubcmdView} """ scview = self._scviews.get(subcmd, None) if scview is None: raise SubcmdError( _("@info", "Trying to get a view for an unknown subcommand '%(cmd)s'.", cmd=subcmd)) return scview def help (self, subcmds=None, wcol=None, stream=sys.stdout): """ Formatted help for subcommands. @param subcmds: subcommand names (all subcommands if C{None}) @type subcmds: list of strings @param wcol: column to wrap text at (<= 0 for no wrapping, C{None} for automatic according to output stream) @type wcol: int @param stream: intended output stream for the text @type stream: file @return: formatted help @rtype: string """ if subcmds is None: - subcmds = self._scviews.keys() + subcmds = list(self._scviews.keys()) subcmds.sort() fmts = [] for subcmd in subcmds: scview = self._scviews.get(subcmd, None) if scview is None: raise SubcmdError( _("@info", "Trying to get help for an unknown subcommand '%(cmd)s'.", cmd=subcmd)) fmts.append(scview.help(wcol, stream)) fmts.append("") return cjoin(fmts, "\n") def listcmd (self, subcmds=None, wcol=None, stream=sys.stdout): """ Formatted listing of subcommands with short descriptions. @param subcmds: subcommand names (all subcommands if C{None}) @type subcmds: list of strings @param wcol: column to wrap text at (<= 0 for no wrapping, C{None} for automatic according to output stream) @type wcol: int @param stream: intended output stream for the text @type stream: file @return: formatted listing @rtype: string """ if subcmds is None: - subcmds = self._scviews.keys() + subcmds = list(self._scviews.keys()) subcmds.sort() maxsclen = max([len(x) for x in subcmds]) ndsep = _("@item:intext splitter between a subcommand name " "and its description", " - ") flead = " " * 2 lead = flead + " " * (maxsclen + 3) if wcol is None: wcol = (term_width(stream=stream) or 80) - 1 fmts = [] for subcmd in subcmds: scview = self._scviews.get(subcmd, None) if scview is None: raise SubcmdError( _("@info", "Trying to include an unknown subcommand '%(cmd)s' " "into listing.", cmd=subcmd)) desc = scview.shdesc() if desc: name = cinterp("%%-%ds" % maxsclen, subcmd) s = name + ndsep + desc else: s = name lines = wrap_text(s, wcol=wcol, flead=flead, lead=lead, endl="") fmts.extend(lines) return cjoin(fmts, "\n") def cmdnames (self): """ Get the list of all defined subcommands by name. @returns: list of subcommands @rtype: [string] """ return sorted(self._scviews.keys()) def cmdviews (self): """ Get the list of all defined subcommand views. @returns: list of subcommand views @rtype: [L{SubcmdView}] """ return [x[1] for x in sorted(self._scviews.items())] def parse (self, rawpars, subcmds): """ Parse the list of parameters collected from the command line. If the command line had parameters specified as:: -sfoo -sbar:xyz -sbaz:10 then the function call should get the list:: rawpars=['foo', 'bar:xyz', 'baz:10'] Result of parsing will be a dictionary of objects by subcommand name, where each object has attributes named like subcommand parameters. 
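        For instance (a sketch, assuming a registered subcommand 'merge'
        which declares 'foo' as a flag and 'bar' as a string parameter)::

            params, nacc = pp.parse(['foo', 'bar:xyz'], ['merge'])
            params['merge'].foo    # True (flag given)
            params['merge'].bar    # 'xyz'
            nacc                   # parameters accepted by no subcommand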
If attribute name has not been explicitly defined for a parameter, its parameter name will be used; if not a valid identifier by itself, it will be normalized by replacing all troublesome characters with an underscore, collapsing contiguous underscore sequences to a single underscore, and prepending an 'x' if it does not start with a letter. If a parameter is parsed which is not accepted by any of the given subcommands, its name is added to list of non-accepted parameters, which is the second element of the return tuple. @param rawpars: raw parameters @type rawpars: list of strings @param subcmds: names of issued subcommands @type subcmds: list of strings @return: objects with parameters as attributes, and list of parameter names not accepted by any of subcommands @rtype: dict of objects by subcommand name and list of strings """ # Assure only registered subcommands have been issued. for subcmd in subcmds: if subcmd not in self._scviews: raise SubcmdError( _("@info", "Unregistered subcommand '%(cmd)s' issued.", cmd=subcmd)) # Parse all given parameters and collect their values. param_vals = dict([(x, {}) for x in subcmds]) nacc_params = [] for opstr in rawpars: lst = opstr.split(":", 1) lst += [None] * (2 - len(lst)) param, strval = lst param_accepted = False for subcmd in subcmds: scview = self._scviews[subcmd] if param not in scview._ptypes: # Current subcommand does not have this parameter, skip. continue if param in param_vals[subcmd] and not scview._multivals[param]: raise SubcmdError( _("@info", "Parameter '%(par)s' repeated more than once.", par=param)) ptype = scview._ptypes[param] if ptype is bool and strval is not None: raise SubcmdError( _("@info", "Parameter '%(par)s' is a flag, no value expected.", par=param)) if ptype is not bool and strval is None: raise SubcmdError( _("@info", "Value expected for parameter '%(par)s'.", par=param)) val = scview._defvals[param] if ptype is bool: val = not val val_lst = [] if strval is not None: if not scview._seplists[param]: try: val = ptype(strval) except: raise SubcmdError( _("@info", "Cannot convert value '%(val)s' to " "parameter '%(par)s' into expected " "type '%(type)s'.", val=strval, par=param, type=ptype)) val_lst = [val] else: tmplst = strval.split(",") try: val = [ptype(x) for x in tmplst] except: raise SubcmdError( _("@info", "Cannot convert value '%(val)s' to " "parameter '%(par)s' into list of " "elements of expected type '%(type)s'.", val=strval, par=param, type=ptype)) val_lst = val # Assure admissibility of parameter values. admvals = scview._admvals[param] if admvals is not None: for val in val_lst: if val not in admvals: raise SubcmdError( _("@info", "Value '%(val)s' to parameter '%(par)s' " "not in the admissible set: %(vallist)s.", val=strval, par=param, vallist=format_item_list(admvals))) param_accepted = True if scview._multivals[param] or scview._seplists[param]: if param not in param_vals[subcmd]: param_vals[subcmd][param] = [] param_vals[subcmd][param].extend(val_lst) else: param_vals[subcmd][param] = val if not param_accepted and param not in nacc_params: nacc_params.append(param) # Assure that all mandatory parameters have been supplied to each # issued subcommand, and set defaults for all optional parameters. for subcmd in subcmds: scview = self._scviews[subcmd] for param in scview._ptypes: if param in param_vals[subcmd]: # Option explicitly given, skip. 
continue if scview._mandatorys[param]: raise SubcmdError( _("@info", "Mandatory parameter '%(par)s' to subcommand " "'%(cmd)s' not issued.", par=param, cmd=subcmd)) param_vals[subcmd][param] = scview._defvals[param] # Create dictionary of parameter objects. class ParamsTemp (object): pass params = {} for subcmd in subcmds: scview = self._scviews[subcmd] params[subcmd] = ParamsTemp() - for param, val in param_vals[subcmd].iteritems(): + for param, val in param_vals[subcmd].items(): # Construct valid attribute name out of parameter name. to_attr_rx = re.compile(r"[^a-z0-9]+", re.I|re.U) attr = scview._attrnames[param] if not attr: attr = to_attr_rx.sub("_", param) if not attr[:1].isalpha(): attr = "x" + attr params[subcmd].__dict__[attr] = val return params, nacc_params class SubcmdView (object): """ The view of a particular subcommand in a parameter parser. """ def __init__ (self, parent, subcmd, desc=None, shdesc=None): """ Constructor. @param parent: the parent parameter parser. @type parent: L{ParamParser} @param subcmd: subcommand name @type subcmd: string @param desc: subcommand description @type desc: string @param shdesc: short subcommand description @type shdesc: string """ self._parent = parent self._subcmd = subcmd self._desc = desc self._shdesc = shdesc # Maps by parameter name. self._ptypes = {} self._mandatorys = {} self._defvals = {} self._admvals = {} self._multivals = {} self._seplists = {} self._metavars = {} self._descs = {} self._attrnames = {} # Parameter names in the order in which they were added. self._ordered = [] def set_desc (self, desc): """ Set description of the subcommand. """ self._desc = desc def set_shdesc (self, shdesc): """ Set short description of the subcommand. """ self._shdesc = shdesc def add_param (self, name, ptype, mandatory=False, attrname=None, defval=None, admvals=None, multival=False, seplist=False, metavar=None, desc=None): """ Define a parameter. A parameter is at minimum defined by its name and value type, and may be optional or mandatory. An optional parameter will be set to the supplied default value if not encountered during parsing. The default value must be of the given parameter type (in the sense of C{isinstance()}) or C{None}. A default value of C{None} can be used to check whether the parameter has been parsed at all. If the parameter type is boolean, then the default value has a special meaning: the parameter is always parsed without an argument (a flag), and its value will become the negation of the default value. If the parameter value is not arbitrary for the given type, the set of admissible values can be defined too. A parameter can be used to collect a list of values, in two ways, or both combined. One is by repeating the parameter several times with different values, and another by a single parameter value itself being a comma-separated list of values (in which case the values are parsed into elements of the requested type). For such parameters the default value should be a list too (or C{None}). For help purposes, a parameter may be given a description and a metavariable to represent its value. If the parameter being added to the current subcommand has the same name as a previously defined parameter of another subcommand, then the current parameter shares semantics with the old one. This means that the type and list nature of the current parameter must match those of the previous one (i.e. C{ptype}, C{multival}, and C{seplist} must have the same values). Double-newline in the description string splits text into paragraphs.
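        For example (a minimal sketch; names and values are illustrative)::

            sv.add_param("lang", str, defval="en", metavar="LANG",
                         desc="Language code of the catalogs.")
            sv.add_param("accel", str, multival=True, seplist=True,
                         desc="Accelerator marker characters.")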
@param name: parameter name @type name: string @param ptype: type of the expected argument @type ptype: type @param mandatory: whether parameter is mandatory @type mandatory: bool @param attrname: explicit name for the object attribute under which the parsed parameter value is stored (auto-derived if C{None}) @type attrname: string @param defval: default value for the argument @type defval: instance of C{ptype} or C{None} @param admvals: admissible values for the argument @type admvals: list of C{ptype} elements or C{None} @param multival: whether parameter can be repeated for list of values @type multival: bool @param seplist: whether parameter is a comma-separated list of values @type seplist: bool @param metavar: name for parameter's value @type metavar: string or C{None} @param desc: description of the parameter @type desc: string or C{None} """ param = name islist = multival or seplist if defval is not None and not islist and not isinstance(defval, ptype): raise SubcmdError( _("@info", "Trying to add parameter '%(par)s' to " "subcommand '%(cmd)s' with default value '%(val)s' " "different from its stated type '%(type)s'.", par=param, cmd=self._subcmd, val=defval, type=ptype)) if defval is not None and islist and not _isinstance_els(defval, ptype): raise SubcmdError( _("@info", "Trying to add parameter '%(par)s' to " "subcommand '%(cmd)s' with default value '%(val)s' " "which contains some elements different from their " "stated type '%(type)s'.", par=param, cmd=self._subcmd, val=defval, type=ptype)) if defval is not None and admvals is not None and defval not in admvals: raise SubcmdError( _("@info", "Trying to add parameter '%(par)s' to " "subcommand '%(cmd)s' with default value '%(val)s' " "not from the admissible set: %(vallist)s.", par=param, cmd=self._subcmd, val=defval, vallist=format_item_list(admvals))) if param in self._ptypes: raise SubcmdError( _("@info", "Trying to add parameter '%(par)s' to subcommand " "'%(cmd)s' more than once.", par=param, cmd=self._subcmd)) if islist and not isinstance(defval, (type(None), tuple, list)): raise SubcmdError( _("@info", "Parameter '%(par)s' to subcommand '%(cmd)s' " "is stated to be list-valued, but the default value " "is not given as a list or tuple.", par=param, cmd=self._subcmd)) general_ptype = None general_multival = None general_seplist = None - for scview in self._parent._scviews.itervalues(): + for scview in self._parent._scviews.values(): general_ptype = scview._ptypes.get(param) general_multival = scview._multivals.get(param) general_seplist = scview._seplists.get(param) if general_ptype is not None and ptype is not general_ptype: raise SubcmdError( _("@info", "Trying to add parameter '%(par)s' to " "subcommand '%(cmd)s' with '%(field)s' field " "different from the same parameter in other subcommands.", par=param, cmd=self._subcmd, field="ptype")) if general_multival is not None and multival != general_multival: raise SubcmdError( _("@info", "Trying to add parameter '%(par)s' to " "subcommand '%(cmd)s' with '%(field)s' field " "different from the same parameter in other subcommands.", par=param, cmd=self._subcmd, field="multival")) if general_seplist is not None and seplist != general_seplist: raise SubcmdError( _("@info", "Trying to add parameter '%(par)s' to " "subcommand '%(cmd)s' with '%(field)s' field " "different from the same parameter in other subcommands.", par=param, cmd=self._subcmd, field="seplist")) self._ptypes[param] = ptype self._mandatorys[param] = mandatory self._defvals[param] = defval self._admvals[param] = 
admvals self._multivals[param] = multival self._seplists[param] = seplist self._metavars[param] = metavar self._descs[param] = desc self._attrnames[param] = attrname self._ordered.append(param) def help (self, wcol=None, stream=sys.stdout): """ Formatted help for the subcommand. @param wcol: column to wrap text at (<= 0 for no wrapping, C{None} for automatic according to output stream) @type wcol: int @param stream: intended output stream for the text @type stream: file @return: formatted help @rtype: string """ # Split parameters into mandatory and optional. m_params = [] o_params = [] for param in self._ordered: if self._mandatorys[param]: m_params.append(param) else: o_params.append(param) # Format output. if wcol is None: wcol = (term_width(stream=stream) or 80) - 1 def fmt_wrap (text, indent=""): paras = text.split("\n\n") fmtparas = [] for para in paras: lines = wrap_text(para, wcol=wcol, flead=indent, lead=indent, endl="") fmtparas.append(cjoin(lines, "\n")) return cjoin(fmtparas, "\n\n") def fmt_par (param, indent=""): s = "" s += indent + " " + param ptype = self._ptypes[param] if ptype is bool: s += " "*1 +_("@item:intext indicator that the parameter " "is a flag", "[flag]") else: metavar = self._metavars[param] if metavar is None: metavar = _("@item:intext default placehodler for " "the parameter argument", "ARG") s += cinterp(":%s", metavar) defval = self._defvals[param] admvals = self._admvals[param] if ptype is not bool and defval is not None and str(defval): cpos = len(s) - s.rfind("\n") - 1 s += " "*1 + _("@item:intext default value for the argument", "[default %(arg)s=%(val)s]", arg=metavar, val=defval) if admvals is not None: s += "\n" + (" " * cpos) if ptype is not bool and admvals is not None: s += " "*1 + _("@item:intext admissible argument values", "[%(arg)s is one of: %(vallist)s]", arg=metavar, vallist=format_item_list(admvals)) s += "\n" desc = self._descs[param] if desc: fmt_desc = fmt_wrap(desc, indent + " ") s += fmt_desc ## Wrap current parameter with empty lines if ## the description spanned several lines. #if "\n\n" in fmt_desc: #s = "\n" + s + "\n" s += "\n" # empty line after description return s ls = [] ls += [" " + self._subcmd] ls += [" " + "=" * len(ls[-1].strip())] ls += [""] desc = self._desc if not desc: desc = _("@info", "No description available.") ls += [fmt_wrap(desc, " ")] if m_params: ls += [""] ls += [" " + _("@info", "Mandatory parameters:")] ls += [""] for param in m_params: ls += [fmt_par(param, " ")] if o_params: ls += [""] ls += [" " + _("@info", "Optional parameters:")] ls += [""] for param in o_params: ls += [fmt_par(param, " ")] return cjoin(ls, "\n").strip("\n") def name (self): """ Get subcommand name. @returns: subcommand name @rtype: string """ return self._subcmd def shdesc (self): """ Get short description of the subcommand. Short description was either explicitly provided on construction, or it is taken as the first sentence of the first paragraph of the full description. @return: short description @rtype: string """ if self._shdesc is not None: return self._shdesc else: p1 = self._desc.find("\n\n") if p1 < 0: p1 = len(self._desc) p2 = self._desc.find(". ") if p2 < 0: p2 = len(self._desc) shdesc = self._desc[:min(p1, p2)].strip() if shdesc.endswith("."): shdesc = shdesc[:-1] return shdesc def params (self, addcol=False): """ Get the list of subcommand parameters. 
@param addcol: append colon (C{:}) to non-flag parameters @type addcol: bool @returns: list of subcommand parameters @rtype: [string] """ - pnames = self._ptypes.keys() - fmtnames = dict(zip(pnames, pnames)) + pnames = list(self._ptypes.keys()) + fmtnames = dict(list(zip(pnames, pnames))) if addcol: for pname in pnames: if self._ptypes[pname] is not bool: fmtnames[pname] += ":" return [x[1] for x in sorted(fmtnames.items())] # Check if all elements in a list are instances of given type def _isinstance_els (lst, typ): return reduce(lambda x, y: x and isinstance(y, typ), lst, True) class SubcmdError (PologyError): """ Exception for errors on defining subcommands and parsing their parameters. """ def __init__ (self, msg): """ Constructor. All the parameters are made available as instance variables. @param msg: a description of what went wrong @type msg: string """ self.msg = msg PologyError.__init__(self, msg) diff --git a/pology/synder.py b/pology/synder.py index 49e3f363..62f01e72 100644 --- a/pology/synder.py +++ b/pology/synder.py @@ -1,1993 +1,1993 @@ # -*- coding: UTF-8 -*- """ Derive forms and properties of syntagmas by macro expansion. This module provides facilities for macro derivations on syntagmas. It consists of two elements: the text format for defining macro derivations, and the derivator class which reads and processes these definitions. The derivator class is documented within this module, while the syntax and semantics of syntagma derivations are documented in the user manual, at C{doc/user/lingo.docbook#sec-lgsynder}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import copy -import cPickle as pickle +import pickle as pickle import hashlib import locale import os import re from pology import PologyError, _, n_ from pology.fsops import str_to_unicode from pology.normalize import simplify from pology.report import warning, format_item_list from pology.resolve import first_to_upper, first_to_lower # ---------------------------------------- # Error handling. class SynderError (PologyError): def __init__ (self, message, code, source=None, pos=None): """ Constructor. All the parameters are made available as instance variables. @param message: description of what went wrong @type message: string @param code: numerical ID of the problem @type code: int @param source: name of the source in which the problem occured @type source: string @param pos: line or line and column in the source in which the problem occured @type pos: int or (int, int) """ self.message = message self.code = code self.source = source if isinstance(pos, tuple): self.line, self.col = pos else: self.line = pos self.col = None - PologyError.__init__(self, unicode(self)) + PologyError.__init__(self, str(self)) def __unicode__ (self): if self.source is None: s = _("@info context of error", "[synder-%(code)d]: %(msg)s", code=self.code, msg=self.message) elif self.line is None: s = _("@info context of error", "[synder-%(code)d] in %(source)s: %(msg)s", code=self.code, msg=self.message, source=self.source) elif self.col is None: s = _("@info context of error", "[synder-%(code)d] at %(source)s:%(line)d: %(msg)s", code=self.code, msg=self.message, source=self.source, line=self.line) else: s = _("@info context of error", "[synder-%(code)d] at %(source)s:%(line)d:%(col)d: %(msg)s", code=self.code, msg=self.message, source=self.source, line=self.line, col=self.col) - return unicode(s) + return str(s) # ---------------------------------------- # Caching. # Cache for file sources, by absolute path. 
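# For instance (an illustrative sketch of the caching behavior of the
# functions below; "foo.sd" is an assumed example file):
#
#     s1 = _parse_file("foo.sd")   # parsed, then cached by absolute path
#     s2 = _parse_file("foo.sd")   # same object, served from the cache
#     empty_source_cache()
#     s3 = _parse_file("foo.sd")   # parsed anew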
_parsed_sources = {} def empty_source_cache (): """ Clear all cached sources. When file with derivations is loaded, its parsed form is cached, such that future load instructions on that same path (e.g. when the path is included from another file) do not waste any extra time and memory. This function erases all sources from the cache, when loading files anew on future load instructions is desired. """ _parsed_sources.clear() # ---------------------------------------- # Parsing. _ch_escape = "\\" _ch_comment = "#" _ch_props = ":" _ch_env = "@" _ch_ksyn_hd = "|" _ch_prop_sep = "," _ch_pkey_sep = "&" _ch_pval = "=" _ch_exp = "|" _ch_cutprop = "!" _ch_termprop = "." _ch_remprop = "^" _ch_exp_mask = "~" _ch_exp_mask_pl = "." _ch_exp_kext = "%" _ch_exp_kext_pl = "*" _ch_exp_upc = "^" _ch_exp_lwc = "`" _ch_tag = "~" _ch_tag_sep = "&" _ch_grp_opn = "{" _ch_grp_cls = "}" _ch_inc = ">" _strict_ws = " \t\n" #set((" ", "\t", "\n")) _ch_nl = "\n" def _parse_string_w (instr, srcname): ctx = _ctx_void dobj = _SDSource(srcname) ctx_stack = [] pos = 0 bpos = (1, 1) while True: handler = _ctx_handlers[ctx] nctx, ndobj, descend, pos, bpos = handler(dobj, instr, pos, bpos) if nctx is not None: if descend: ctx_stack.append((ctx, dobj)) ctx, dobj = nctx, ndobj elif ctx_stack: ctx, dobj = ctx_stack.pop() else: return dobj _anonsrc_count = [0] def _parse_string (instr, srcname=None): # Try to return parsed source from cache. if srcname in _parsed_sources: return _parsed_sources[srcname] if srcname is None: srcname = _("@item automatic name for anonymous input stream", "<stream-%(num)s>", num=_anonsrc_count[0]).resolve("none") _anonsrc_count[0] += 1 source = _parse_string_w(instr, srcname) # Cache the source by name (before procesing includes). _parsed_sources[srcname] = source # Load included sources. source.incsources = _include_sources(source, source.incsources) return source def _parse_file (path): # Try to return parsed source from cache. apath = os.path.abspath(path) if apath in _parsed_sources: return _parsed_sources[apath] # Try to load parsed source from disk. source = _read_parsed_file(apath) if source: # Set attributes discarded on compiling. source.name = path # If still no hit, compile the file. if source is None: source = _compile_file_w(path) # Cache the source by absolute path (before procesing includes). _parsed_sources[apath] = source # Load included sources. source.incsources = _include_sources(source, source.incsources) return source def _compile_file_w (path, cpath=None): if cpath is None: cpath = path + _compfile_suff # Parse the file. ifs = open(path, "r") lines = ifs.readlines() ifs.close() m = re.search(r"^#\s+~~~\s+(\S+)\s+~~~\s*$", lines[0]) if lines else None enc = m and m.group(1) or "UTF-8" lines = [x.decode(enc) for x in lines] instr = "".join(lines) source = _parse_string_w(instr, path) # Write out parsed file. # Temporarily discard attributes relative to importing. iname = source.name source.name = None _write_parsed_file(source, path, cpath) source.name = iname return source def compile_file (path, cpath=None, doraise=False): """ Import file with derivations. If the compile file path C{cpath} is not given, it is constructed as C{path} plus standard extension suffix. If the file cannot be compiled, the behavior depends on C{doraise}. If C{doraise} is C{False}, a warning is reported to standard error; if C{doraise} is C{True}, an L{SynderError} exception is raised. 
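    For example (a minimal sketch; C{planets.sd} is an assumed input file)::

        compile_file("planets.sd")    # writes planets.sdc alongside
        compile_file("planets.sd", "/tmp/planets.sdc", doraise=True)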
@param path: the path to file to compile @type path: string @param cpath: the path to compiled file @type cpath: string @returns: C{True} if the file was successfully compiled @rtype: bool """ try: _compile_file_w(path, cpath) - except Exception, e: + except Exception as e: if doraise: raise else: warning(_("@info", "Derivation file '%(file)s' cannot be compiled " "due to the following error:\n" "%(msg)s", file=path, msg=str_to_unicode(str(e)))) return False else: return True def _include_sources (source, incpaths): incsources = [] incroot = os.path.dirname(os.path.abspath(source.name)) for incpath in incpaths: # If included path relative, make it relative to current source. if not incpath.startswith(os.path.sep): path = os.path.join(incroot, incpath) else: path = incpath if not os.path.isfile(path): # FIXME: Position of include directive in the file lost, # propagate it to this place to report error properly. raise SynderError( _("@info", "Included file '%(name)s' not found at '%(path)s'.", name=incpath, path=path), 1101, source.name) incsource = _parse_file(path) incsources.append(incsource) return incsources _compfile_suff = "c" _compfile_dver = "0003" _compfile_hlen = hashlib.md5().digest_size * 2 def _write_parsed_file (source, path, cpath=None): if cpath is None: cpath = path + _compfile_suff try: fhc = open(cpath, "wb") fh = open(path, "rb") except: return False # Write out data version and file hash. fhc.write(_compfile_dver) hasher = hashlib.md5 fhc.write(hashlib.md5(fh.read()).hexdigest() + "\n") pickle.dump(source, fhc, 2) # 0 for ASCII instead of binary fhc.close() return True def _read_parsed_file (path): cpath = path + _compfile_suff try: fhc = open(cpath, "rb") fh = open(path, "rb") except: return None # Check if data version and file hashes match. fdverc = fhc.read(len(_compfile_dver)) if fdverc != _compfile_dver: return None fhash = hashlib.md5(fh.read()).hexdigest() fhashc = fhc.read(_compfile_hlen + 1)[:-1] if fhash != fhashc: return None # Load the compiled source. source = pickle.load(fhc) return source # ---------------------------------------- # Parsing context handlers. 
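# Each handler below takes the current data object, the input string,
# the position within it, and the block position (line, column), and
# returns a tuple (nctx, ndobj, descend, pos, bpos). When nctx is not
# None, parsing switches to context nctx with data object ndobj, and
# the current context is pushed on the stack if descend is true; when
# nctx is None, parsing returns to the context popped from the stack,
# or finishes at top level (see _parse_string_w above).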
def _ctx_handler_void (source, instr, pos, bpos): obpos = bpos testsep = lambda c: (c not in _strict_ws and [""] or [None])[0] substr, sep, pos, bpos = _move_to_sep(instr, pos, bpos, testsep, wesc=False) if sep is not None: indent = instr[pos - bpos[1] + 1:pos] if instr[pos] == _ch_inc: return _ctx_inc, source, True, pos, bpos elif instr[pos] == _ch_env: if not source.derivs: raise SynderError( _("@info", "No derivation yet for which to start an environment."), 1002, source.name, bpos) if source.indenv is None: source.indenv = indent if indent != source.indenv: raise SynderError( _("@info", "Inconsistent indenting of environment head."), 1003, source.name, bpos) deriv = source.derivs[-1] env = _SDEnv(deriv, bpos) deriv.envs.append(env) return _ctx_env, env, True, pos, bpos else: if source.indderiv is None: source.indderiv = indent if indent != source.indderiv: raise SynderError( _("@info", "Inconsistent indenting of derivation head."), 1001, source.name, bpos) deriv = _SDDeriv(source, bpos) source.derivs.append(deriv) ksyn = _SDSyn(deriv, bpos) deriv.syns.append(ksyn) return _ctx_ksyn, ksyn, True, pos, bpos else: return None, None, False, pos, bpos _seps_ksyn = set((_ch_prop_sep, _ch_props, _ch_tag, _ch_nl)) def _ctx_handler_ksyn (ksyn, instr, pos, bpos): opos, obpos = pos, bpos testsep = lambda c: c in _seps_ksyn and c or None substr, sep, pos, bpos, isesc = _move_to_sep(instr, pos, bpos, testsep, repesc=True) substrls = substr.lstrip(_strict_ws) if ( not ksyn.segs and substrls.startswith(_ch_ksyn_hd) and not isesc[len(substr) - len(substrls)] ): ksyn.hidden = True substr = substr.lstrip()[len(_ch_ksyn_hd):] if substr or not ksyn.segs: ksyn.segs.append(_SDText(ksyn, obpos, substr)) if sep == _ch_props: deriv = ksyn.parent env = _SDEnv(deriv, bpos) deriv.envs.append(env) prop = _SDProp(env, bpos) env.props.append(prop) return _ctx_pkey, prop, False, pos, bpos elif sep == _ch_prop_sep: deriv = ksyn.parent ksyn = _SDSyn(deriv, bpos) deriv.syns.append(ksyn) return _ctx_ksyn, ksyn, False, pos, bpos elif sep == _ch_tag: tag = _SDTag(ksyn, bpos) ksyn.segs.append(tag) return _ctx_tag, tag, True, pos, bpos else: raise SynderError( _("@info", "Unexpected end of derivation head started at %(line)d:%(col)d.", line=obpos[0], col=obpos[1]), 1010, ksyn.parent.parent.name, bpos) def _ctx_handler_env (env, instr, pos, bpos): obpos = bpos testsep = lambda c: c == _ch_props and c or None substr, sep, pos, bpos = _move_to_sep(instr, pos, bpos, testsep) if sep == _ch_props: env.name = substr[len(_ch_env):] if not env.name: raise SynderError( _("@info", "Empty environment name."), 1021, env.parent.parent.name, obpos) for oenv in env.parent.envs[:-1]: if env.name == oenv.name: raise SynderError( _("@info", "Repeated environment name '%(env)s'.", env=oenv.name), 1022, env.parent.parent.name, obpos) prop = _SDProp(env, bpos) env.props.append(prop) return _ctx_pkey, prop, False, pos, bpos else: raise SynderError( _("@info", "Unexpected end of environment head started at %(line)d:%(col)d.", line=obpos[0], col=obpos[1]), 1020, env.parent.parent.name, bpos) _seps_pkey = set((_ch_pval, _ch_prop_sep, _ch_exp, _ch_tag, _ch_nl)) def _ctx_handler_pkey (prop, instr, pos, bpos): opos, obpos = pos, bpos testsep = lambda c: c in _seps_pkey and c or None substr, sep, pos, bpos = _move_to_sep(instr, pos, bpos, testsep) if sep == _ch_pval: substr = substr.strip() for rawkey in substr.split(_ch_pkey_sep): cut, terminal, canceling = [False] * 3 while rawkey.endswith((_ch_cutprop, _ch_termprop, _ch_remprop)): if 
rawkey.endswith(_ch_cutprop): cut = True rawkey = rawkey[:-len(_ch_cutprop)] elif rawkey.endswith(_ch_termprop): terminal = True rawkey = rawkey[:-len(_ch_termprop)] elif rawkey.endswith(_ch_remprop): canceling = True rawkey = rawkey[:-len(_ch_remprop)] key = _SDKey(prop, obpos, rawkey, cut, terminal, canceling) prop.keys.append(key) return _ctx_pval, prop, False, pos, bpos else: # Backtrack and go into value context. return _ctx_pval, prop, False, opos, obpos _seps_pval = set((_ch_prop_sep, _ch_exp, _ch_tag, _ch_nl)) def _ctx_handler_pval (prop, instr, pos, bpos): opos, obpos = pos, bpos testsep = lambda c: c in _seps_pval and c or None substr, sep, pos, bpos = _move_to_sep(instr, pos, bpos, testsep) if substr: prop.segs.append(_SDText(prop, obpos, substr)) if sep == _ch_prop_sep: env = prop.parent prop = _SDProp(env, bpos) env.props.append(prop) return _ctx_pkey, prop, False, pos, bpos elif sep == _ch_exp: exp = _SDExp(prop, bpos) prop.segs.append(exp) return _ctx_exp, exp, True, pos, bpos elif sep == _ch_tag: tag = _SDTag(prop, bpos) prop.segs.append(tag) return _ctx_tag, tag, True, pos, bpos else: return None, None, False, pos, bpos _seps_exp = set([_ch_prop_sep, _ch_exp] + list(_strict_ws)) def _ctx_handler_exp (exp, instr, pos, bpos): if instr[pos:pos + len(_ch_grp_opn)] == _ch_grp_opn: enclosed = True testsep = lambda c: c in (_ch_grp_cls, _ch_nl) and c or None else: enclosed = False testsep = lambda c: (c in _seps_exp and [""] or [None])[0] obpos = bpos substr, sep, pos, bpos, isesc = _move_to_sep(instr, pos, bpos, testsep, repesc=True) if enclosed and sep is None or sep == _ch_nl: raise SynderError( _("@info", "Unexpected end of expander started at %(line)d:%(col)d.", line=obpos[0], col=obpos[1]), 1050, exp.parent.parent.parent.parent.name, bpos) if enclosed: substr = substr[len(_ch_grp_opn):] p = substr.find(_ch_exp_kext) if p >= 0: exp.kext = substr[p + len(_ch_exp_kext):] substr = substr[:p] p = substr.find(_ch_exp_mask) if p >= 0: exp.mask = substr[p + len(_ch_exp_mask):] substr = substr[:p] if substr.startswith(_ch_exp_upc) and not isesc[0]: exp.caps = True substr = substr[len(_ch_exp_upc):] elif substr.startswith(_ch_exp_lwc) and not isesc[0]: exp.caps = False substr = substr[len(_ch_exp_lwc):] exp.ref = substr return None, None, False, pos, bpos _seps_tag = set([_ch_prop_sep, _ch_exp, _ch_tag] + list(_strict_ws)) def _ctx_handler_tag (tag, instr, pos, bpos): if instr[pos:pos + len(_ch_grp_opn)] == _ch_grp_opn: enclosed = True testsep = lambda c: c in (_ch_grp_cls, _ch_nl) and c or None else: enclosed = False testsep = lambda c: (c in _seps_exp and [""] or [None])[0] obpos = bpos substr, sep, pos, bpos = _move_to_sep(instr, pos, bpos, testsep) if enclosed and sep is None or sep == _ch_nl: raise SynderError( _("@info", "Unexpected end of tag started at %(line)d:%(col)d.", line=obpos[0], col=obpos[1]), 1050, tag.parent.parent.parent.parent.name, bpos) if enclosed: substr = substr[len(_ch_grp_opn):] tag.names = substr.split(_ch_tag_sep) return None, None, False, pos, bpos def _ctx_handler_inc (source, instr, pos, bpos): # Skip include directive. substr, sep, pos, bpos = _move_to_sep(instr, pos, bpos, lambda c: c) # Parse include path. obpos = bpos testsep = lambda c: c == _ch_nl and c or None substr, sep, pos, bpos = _move_to_sep(instr, pos, bpos, testsep) incpath = substr.strip() if not incpath: raise SynderError( _("@info", "Empty target path in inclusion directive."), 1100, source.name, obpos) # Add to included sources of this source.
# Temporarily store paths, to be resolved into full sources later. source.incsources.append(incpath) return None, None, False, pos, bpos # ---------------------------------------- # Parsing context IDs and handlers collected. # IDs and handlers must be in the same order, # as IDs are used to index handlers. ( _ctx_void, _ctx_ksyn, _ctx_env, _ctx_pkey, _ctx_pval, _ctx_exp, _ctx_tag, _ctx_inc, -) = range(8) +) = list(range(8)) _ctx_handlers = ( _ctx_handler_void, _ctx_handler_ksyn, _ctx_handler_env, _ctx_handler_pkey, _ctx_handler_pval, _ctx_handler_exp, _ctx_handler_tag, _ctx_handler_inc, ) # ---------------------------------------- # Parsing utilities. # Find the first separator admitted by the test function, # skipping over escaped characters, continued lines and comments. # Return substring to that point (without escapes, comments, line cont.), # separator, and new position and block position (line, column). # On request, also return list of escape indicators for each character # in the substring (True where character was escaped). # Separator test function takes single argument, the current character, # and returns None if it is not admitted as separator. # If end of input is reached without test function admitting a separator, # separator is reported as None; otherwise, separator is reported as # the return value from the test function. def _move_to_sep (instr, pos, bpos, testsep, wesc=True, repesc=False): opos = pos substr = [] isesc = [] sep = None while sep is None and pos < len(instr): c = instr[pos] if c == _ch_comment: p = instr.find(_ch_nl, pos) if p < 0: pos += len(instr) - pos else: pos = p elif wesc and c == _ch_escape: pos += 1 if pos < len(instr): if instr[pos] == _ch_nl: # line continuation pass # elif instr[pos] == _ch_ucode: # unicode hex else: substr.append(instr[pos]) isesc.append(True) pos += 1 else: sep = testsep(c) if sep is not None: pos += len(sep) else: substr.append(c) isesc.append(False) pos += 1 # Update block position (line, column). rawsubstr = instr[opos:pos] p = rawsubstr.rfind(_ch_nl) if p >= 0: bpos = (bpos[0] + rawsubstr.count(_ch_nl), len(rawsubstr) - p) else: bpos = (bpos[0], bpos[1] + len(rawsubstr)) ret = ("".join(substr), sep, pos, bpos) if repesc: ret = ret + (isesc,) return ret # ---------------------------------------- # Data structures. # Synder source. class _SDSource: def __init__ (self, name): # Name of the source (filename, etc). self.name = name # Derivations (SDDeriv). self.derivs = [] # Included sources (must be ordered). self.incsources = [] # Indentation for derivation and environments heads # (set on first parsed). self.indderiv = None self.indenv = None ## Global directives. #... def __unicode__ (self): return ( "============> %s\n" % self.name - + "\n".join(map(unicode, self.derivs))) + + "\n".join(map(str, self.derivs))) def __str__ (self): return self.__unicode__().encode(locale.getpreferredencoding()) # Derivation. class _SDDeriv: def __init__ (self, parent, pos): # Parent source and position in it. self.parent = parent self.pos = pos # Key syntagmas (SDProp). self.syns = [] # Environments (SDEnv). self.envs = [] def __unicode__ (self): return ( " -----> %d:%d\n" % self.pos - + " " + "\n ".join(map(unicode, self.syns)) + "\n" - + "\n".join(map(unicode, self.envs))) + + " " + "\n ".join(map(str, self.syns)) + "\n" + + "\n".join(map(str, self.envs))) def __str__ (self): return self.__unicode__().encode(locale.getpreferredencoding()) # Environment. 
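# Note on the printable forms of these data classes: on Python 3 only
# __str__ is consulted and it must return str, so the __str__ methods
# below, which encode __unicode__() into bytes, no longer fit. A
# minimal sketch of a Python 3 form (same content; assumes __unicode__
# already returns str after this port):
#
#     def __str__ (self):
#         return self.__unicode__()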
class _SDEnv: def __init__ (self, parent, pos, name=""): # Parent derivation and position in source. self.parent = parent self.pos = pos # Environment name. self.name = name # Properties (SDProp). self.props = [] def __unicode__ (self): return ( " @%s:%d:%d\n" % ((self.name,) + self.pos) - + "\n".join(map(unicode, self.props))) + + "\n".join(map(str, self.props))) def __str__ (self): return self.__unicode__().encode(locale.getpreferredencoding()) # Syntagma. class _SDSyn: def __init__ (self, parent, pos, hidden=False): # Parent derivation and position in source. self.parent = parent self.pos = pos # Visibility of the syntagma. self.hidden = hidden # Syntagma segments (SDText, SDTag). self.segs = [] def __unicode__ (self): return ( "{p:%d:%d|%s}=" % (self.pos + (self.hidden,)) - + u"".join(map(unicode, self.segs))) + + "".join(map(str, self.segs))) def __str__ (self): return self.__unicode__().encode(locale.getpreferredencoding()) # Property. class _SDProp: def __init__ (self, parent, pos): # Parent environment and position in source. self.parent = parent self.pos = pos # Keys (SDKey). self.keys = [] # Value segments (SDText, SDExp, SDTag). self.segs = [] def __unicode__ (self): return ( " %d:%d " % self.pos - + "k=" + u"".join(map(unicode, self.keys)) + " " - + "v=" + u"".join(map(unicode, self.segs))) + + "k=" + "".join(map(str, self.keys)) + " " + + "v=" + "".join(map(str, self.segs))) def __str__ (self): return self.__unicode__().encode(locale.getpreferredencoding()) # Property key. class _SDKey: def __init__ (self, parent, pos, name="", cut=False, terminal=False, canceling=False): # Parent property and position in source. self.parent = parent self.pos = pos # Key behaviors. self.name = name self.cut = cut self.terminal = terminal self.canceling = canceling def __unicode__ (self): return "{k:%d:%d:%s|%s&%s}" % (self.pos + (self.name, self.cut, self.terminal, self.canceling)) def __str__ (self): return self.__unicode__().encode(locale.getpreferredencoding()) # Expander. class _SDExp: def __init__ (self, parent, pos, ref=None, mask=None, caps=None, kext=None): # Parent property and position in source. self.parent = parent self.pos = pos # Reference, selection mask, capitalization, key extender. self.ref = ref self.mask = mask self.caps = caps self.kext = kext def __unicode__ (self): - return u"{e:%d:%d:%s|%s|%s|%s}" % (self.pos + (self.ref, self.mask, + return "{e:%d:%d:%s|%s|%s|%s}" % (self.pos + (self.ref, self.mask, self.caps, self.kext)) def __str__ (self): return self.__unicode__().encode(locale.getpreferredencoding()) # Tag. class _SDTag: def __init__ (self, parent, pos): # Parent property and position in source. self.parent = parent self.pos = pos # Names associated to this tag. self.names = [] def __unicode__ (self): - return u"{g:%d:%d:%s}" % (self.pos + ("+".join(self.names),)) + return "{g:%d:%d:%s}" % (self.pos + ("+".join(self.names),)) def __str__ (self): return self.__unicode__().encode(locale.getpreferredencoding()) # Text segment. class _SDText: def __init__ (self, parent, pos, text=""): # Parent property and position in source. self.parent = parent self.pos = pos # Text. self.text = text def __unicode__ (self): return "{t:%d:%d:%s}" % (self.pos + (self.text,)) def __str__ (self): return self.__unicode__().encode(locale.getpreferredencoding()) # ---------------------------------------- # High level access. class Synder (object): """ Derivator objects import sources of derivations and get queried for properties of syntagmas. 
Lookup can be done by derivation key and property key, but also by single compound key (serialization of the previous two), to have interface and behavior similar to built-in dictionaries. Basic usage is rather simple. If there are derivation files C{planets.sd} and C{moons.sd}, they can be used like this:: >>> sd = Synder() >>> sd.import_file("planets.sd") >>> sd.import_file("moons.sd") >>> >>> # Lookup of properties by derivation and property key. >>> sd.get2("Venus", "nom") Venera >>> sd.get2("Callisto", "nom") Kalisto >>> sd.get2("Foobar", "nom") None >>> # Lookup of properties by compound key. >>> sd["Venus-nom"] Venera >>> >>> # Iteration through properties by derivation keys. >>> for dkey in sd.dkeys(): print(sd.get2(dkey, "nom")) ... Venera Kalisto Merkur Jupiter … >>> # Iteration through properties by compound keys. >>> for ckey in sd: print(sd[ckey]) ... Venera Veneri Venerom … Merkuru Merkur Merkura … >>> # Querying for key syntagmas. >>> sd.syns("Venus") ['Venus'] >>> sd.syns("Iapetus") ['Iapetus', 'Japetus'] >>> sd.syns("Japetus") ['Iapetus', 'Japetus'] >>> >>> # Querying for property keys. >>> sd.pkeys("Venus") ['gen', 'acc', 'nom', 'dat', 'gender'] Syntax errors in derivation sources will raise L{SynderError} exceptions on import. Unresolvable conflicts in derivation keys will be reported as a warning on import, and the conflicted derivations will not be imported. Errors in expansions are not reported on import, but when the problematic derivation is queried; warnings are output, and C{None} (or the default value) is returned for all properties. """ def __init__ (self, env="", ckeysep="-", strictkey=False, dkeytf=None, dkeyitf=None, pkeytf=None, pkeyitf=None, pvaltf=None, ksyntf=None, envtf=None): """ Constructor of syntagma derivators. The default resolution of derivation key conflicts, as described in the module documentation, can be changed to strict resolution through the C{strictkey} parameter. If C{strictkey} is C{True}, all key syntagmas must be unique. Parameter C{env} is used to specify the environment from which the derivations are taken. In case no non-default environments have been used in derivations, C{env} is simply the empty string. Otherwise, it can be: - a string specifying a non-default environment - a tuple specifying an environment fallback chain - a tuple of tuples, specifying more than one environment chain (Lists can also be used instead of tuples.) If several environment fallback chains are given, when a property is requested they are tried in the order of specification, and the first yielded property is returned. It is also possible to combine properties from different environment chains in a custom way, by supplying a property value transformation function (C{pvaltf} parameter). Compound keys, for single-key lookups, are built by joining the derivation and property keys with a separator. This separator can be chosen through the C{ckeysep} parameter. The separator string can be contained inside a derivation key, but it must not be found inside any property key (the compound key is split from the back). A myriad of I{transformation functions} can be applied by the derivator object to imported derivations, through C{*tf} parameters. They are as follows (stating only default inputs, see below for more possibilities): - C{dkeytf}: applied to derivation key supplied on lookups (e.g. in L{get} or L{get2} methods). Takes the derivation key as parameter, returns either the derivation key or a tuple of the derivation key and another object.
- C{dkeyitf}: applied to all derivation keys on import. Same default input-output as C{dkeytf}. - C{pkeytf}: like C{dkeytf}, only working analogously on property key instead of derivation key. - C{pkeyitf}: like C{dkeyitf}, only working analogously on property key instead of derivation key. - C{pvaltf}: applied to tagged segments of property values. The input to this function is a list of lists by each environment fallback chain; the list for one environment chain consists of 2-tuples, each tuple having a list of tags as the first element, and a text segment as the second element. For example, if there is only one environment chain (e.g. C{env=""} or C{env=("someenv", "")}), and the property value is derived to be C{foo ~tag bar} in this environment, then the argument to the function will be C{[[([''], "foo "), (['tag'], " bar")]]}. If an environment chain yielded no property value, its element will be C{None} instead of a list of 2-tuples. The return value is the final property value string. Note that simplification will not be applied to this value afterwards, so if desired, L{simplify()} should be manually called inside the function. - C{ksyntf}: quite similar to C{pvaltf}, only applied to tagged segments of key syntagmas. The difference is that there are no multiple environments for key syntagmas, so the input value is just one list of tagged text segments (what would be the first element of the input list to C{pvaltf}). - C{envtf}: applied to environment fallback chain on lookups. Takes the original environment chain as argument, returns a new environment chain (in one of the forms acceptable as the C{env} parameter). Transformation functions can take more input arguments than the default described above, on demand. If a transformation function is supplied directly, e.g. C{pvaltf=somefunc}, it is sent default inputs. Extra inputs are requested by supplying instead a tuple, where the first element is the transformation function, and the following elements are predefined keywords of available extra inputs, e.g. C{pvaltf=(somefunc, "dkey", "pkrest")}. Available extra inputs by transformation function are: - C{dkeytf}: C{"self"} the derivation object. - C{pkeytf}: C{"self"}, C{"dkey"} the derivation key (original or that returned by C{dkeytf}), C{"dkrest"} the second object returned by C{dkeytf}. - C{pvaltf}: C{"self"}, C{"dkey"}, C{"pkey"} the property key (original or that returned by C{pkeytf}), C{"env"} the tuple of environment chains, C{"dkrest"}, C{"pkrest"} the second object returned by C{pkeytf}. - C{ksyntf}: C{"self"}, C{"dkey"}, C{"dkrest"}. - C{envtf}: C{"self"}, C{"dkey"}, C{"dkrest"}.
@param env: environment for derivations @type env: string, (string*), ((string*)*) @param ckeysep: derivation-property key separator in compound keys @type ckeysep: string @param strictkey: whether all key syntagmas must be unique to avoid conflicts @param dkeytf: transformation function for lookup derivation keys @param dkeyitf: transformation function for imported derivation keys @param pkeytf: transformation function for lookup property keys @param pkeyitf: transformation function for imported property keys @param pvaltf: transformation function for property values @param ksyntf: transformation function for key syntagmas """ self._env = self._normenv(env) self._ckeysep = ckeysep self._dkeytf = self._resolve_tf(dkeytf, ["self"]) self._dkeyitf = self._resolve_tf(dkeyitf, []) self._pkeytf = self._resolve_tf(pkeytf, ["dkey", "dkrest", "self"]) self._pkeyitf = self._resolve_tf(pkeyitf, []) self._pvaltf = self._resolve_tf(pvaltf, ["pkey", "dkey", "env", "dkrest", "pkrest", "self"]) self._ksyntf = self._resolve_tf(ksyntf, ["dkey", "dkrest", "self"]) self._envtf = self._resolve_tf(envtf, ["dkey", "dkrest", "self"]) self._strictkey = strictkey self._imported_srcnames = set() self._visible_srcnames = set() self._derivs_by_srcname = {} self._deriv_by_srcname_idkey = {} self._visible_deriv_by_dkey = {} self._props_by_deriv_env1 = {} self._raw_props_by_deriv_env1 = {} self._single_dkeys = set() def _normenv (self, env): if isinstance(env, (tuple, list)): - if not env or isinstance(env[0], basestring): + if not env or isinstance(env[0], str): env = (env,) else: env = ((env,),) return env def _resolve_tf (self, tfspec, kneargs): eaords = [0] if isinstance(tfspec, (tuple, list)): tf0, eargs = tfspec[0], list(tfspec[1:]) unkeargs = set(eargs).difference(kneargs) if unkeargs: raise SynderError( _("@info", "Unknown extra arguments for transformation function " "requested in derivator constructor: %(arglist)s", arglist=format_item_list(sorted(unkeargs)))) eaords.extend([kneargs.index(x) + 1 for x in eargs]) else: tf0 = tfspec if tf0 is None: return None def tf (*args): args0 = [args[x] for x in eaords] return tf0(*args0) return tf def import_string (self, string, ignhid=False): """ Import string with derivations. @param string: the string to parse @type string: string @param ignhid: also make hidden derivations visible if C{True} @type ignhid: bool @returns: number of newly imported visible derivations @rtype: int """ source = _parse_string(string) return self._process_import_visible(source, ignhid) def import_file (self, filename, ignhid=False): """ Import file with derivations. @param filename: the path to file to parse @type filename: string @param ignhid: also make hidden derivations visible if C{True} @type ignhid: bool @returns: number of newly imported visible derivations @rtype: int """ source = _parse_file(filename) return self._process_import_visible(source, ignhid) def _process_import_visible (self, source, ignhid): nnew = self._process_import(source) nvis = self._make_visible(source, ignhid) return (nvis, nnew) def _process_import (self, source): if source.name in self._imported_srcnames: return 0 self._imported_srcnames.add(source.name) iderivs = [] self._derivs_by_srcname[source.name] = iderivs idmap = {} self._deriv_by_srcname_idkey[source.name] = idmap # Construct wrapping derivations and file them by derivation keys. nadded = 0 for rawderiv in source.derivs: # Create wrapper derivation for the raw derivation.
deriv = self._Deriv(rawderiv, self._dkeyitf) # Eliminate internal key conflicts of this derivation. self._eliminate_conflicts(deriv, idmap, None, lambda x: x.idkeys) # Register internal derivation in this source. if deriv.idkeys: iderivs.append(deriv) for idkey in deriv.idkeys: idmap[idkey] = deriv nadded += 1 # Import included sources. for incsource in source.incsources: nadded += self._process_import(incsource) return nadded def _make_visible (self, source, ignhid): if source.name in self._visible_srcnames: return 0 self._visible_srcnames.add(source.name) nvis = 0 for deriv in self._derivs_by_srcname[source.name]: if not ignhid and all([x.hidden for x in deriv.base.syns]): continue # Eliminate external key conflicts of this derivation. self._eliminate_conflicts(deriv, self._visible_deriv_by_dkey, self._single_dkeys, lambda x: x.dkeys) # Register visible derivation in this source. if deriv.dkeys: self._single_dkeys.add(tuple(deriv.dkeys)[0]) for dkey in deriv.dkeys: self._visible_deriv_by_dkey[dkey] = deriv nvis += 1 return nvis class _Deriv: def __init__ (self, deriv, dkeyitf): self.base = deriv # Compute internal and external derivation keys from key syntagmas. self.idkeys = set() self.dkeys = set() for syn in deriv.syns: synt = "".join([x.text for x in syn.segs if isinstance(x, _SDText)]) idkey = simplify(synt) self.idkeys.add(idkey) dkeys = dkeyitf(idkey) if dkeyitf else idkey if dkeys is not None: if not isinstance(dkeys, (tuple, list)): dkeys = [dkeys] self.dkeys.update(dkeys) def _eliminate_conflicts (self, deriv, kmap, kskeys, keyf): to_remove_keys = set() to_remove_keys_other = {} for key in keyf(deriv): oderiv = kmap.get(key) if oderiv is not None: to_remove_keys.add(key) if oderiv not in to_remove_keys_other: to_remove_keys_other[oderiv] = set() to_remove_keys_other[oderiv].add(key) noconfres_oderivs = [] if self._strictkey or to_remove_keys == keyf(deriv): - noconfres_oderivs.extend(to_remove_keys_other.keys()) + noconfres_oderivs.extend(list(to_remove_keys_other.keys())) else: - for oderiv, keys in to_remove_keys_other.items(): + for oderiv, keys in list(to_remove_keys_other.items()): if keyf(oderiv) == keys: noconfres_oderivs.append(oderiv) if noconfres_oderivs: # Clear both internal and external keys. 
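# (The new derivation is eliminated entirely: this branch is reached
# under strict keying, when all of its keys are in conflict, or when
# resolving the conflict would strip some previously registered
# derivation of all of its keys.)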
deriv.dkeys.clear() deriv.idkeys.clear() eposf = lambda x: (x.base.parent.name, x.base.syns[0].pos[0]) noconfres_oderivs.sort(key=eposf) pos1 = "%s:%d" % eposf(deriv) pos2s = ["%s:%d" % eposf(x) for x in noconfres_oderivs] pos2s = "\n".join(pos2s) warning(_("@info", "Derivation at %(pos1)s eliminated due to " "key conflict with the following derivations:\n" "%(pos2list)s", pos1=pos1, pos2list=pos2s)) else: for key in to_remove_keys: keyf(deriv).remove(key) - for oderiv, keys in to_remove_keys_other.items(): + for oderiv, keys in list(to_remove_keys_other.items()): for key in keys: keyf(oderiv).remove(key) kmap.pop(key) if kskeys is not None and key in kskeys: kskeys.remove(key) kskeys.add(tuple(keyf(oderiv))[0]) def _resolve_dkey (self, dkey): dkrest = () if self._dkeytf: dkey = self._dkeytf(dkey, self) if isinstance(dkey, tuple): dkey, dkrest = dkey[0], dkey[1:] deriv = None if dkey is not None: deriv = self._visible_deriv_by_dkey.get(dkey) if deriv is None: dkey = None return dkey, dkrest, deriv def _resolve_pkey (self, pkey, dkey, dkrest): pkrest = () if self._pkeytf: pkey = self._pkeytf(pkey, dkey, dkrest, self) if isinstance(pkey, tuple): pkey, pkrest = pkey[0], pkey[1:] return pkey, pkrest def _resolve_env (self, env, dkey, dkrest): if self._envtf: env = self._envtf(env, dkey, dkrest, self) if env is not None: env = self._normenv(env) return env def get2 (self, dkey, pkey, defval=None): """ Get property value by derivation key and property key. @param dkey: derivation key @type dkey: string @param pkey: property key @type pkey: string @param defval: the value to return if the property does not exist @type defval: string @returns: the property value @rtype: string """ dkey, dkrest, deriv = self._resolve_dkey(dkey) if dkey is None: return defval pkey, pkrest = self._resolve_pkey(pkey, dkey, dkrest) if pkey is None: return defval env = self._resolve_env(self._env, dkey, dkrest) if env is None: return defval mtsegs = [] for env1 in env: tsegs = self._getprops(deriv, env1).get(pkey) mtsegs.append(tsegs) if self._pvaltf: pval = self._pvaltf(mtsegs, pkey, dkey, env, dkrest, pkrest, self) else: pval = None for tsegs in mtsegs: if tsegs is not None: pval = simplify("".join([x[0] for x in tsegs])) break return pval if pval is not None else defval def _getprops (self, deriv, env1): # Try to fetch derivation from cache. props = self._props_by_deriv_env1.get((deriv, env1)) if props is not None: return props # Construct raw derivation and extract key-value pairs. rprops = self._derive(deriv, env1) - props = dict([(x, self._simple_segs(y[0])) for x, y in rprops.items() + props = dict([(x, self._simple_segs(y[0])) for x, y in list(rprops.items()) if not y[1].canceling]) # Internally transform keys if requested. if self._pkeyitf: nprops = [] - for pkey, segs in props.items(): + for pkey, segs in list(props.items()): pkey = self._pkeyitf(pkey) if pkey is not None: nprops.append((pkey, segs)) props = dict(nprops) self._props_by_deriv_env1[(deriv, env1)] = props return props def _derive (self, deriv, env1): # Try to fetch raw derivation from cache. dprops = self._raw_props_by_deriv_env1.get((deriv, env1)) if dprops is not None: return dprops # Derivator core. 
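# Walk the environment fallback chain in reverse, from the most
# general environment to the most specific one; within each
# environment, combine literal text segments and expansions (_SDExp)
# into per-key segment lists, honoring the cut, terminal and canceling
# behaviors of property keys. Properties collected from environments
# later in the walk override those accumulated earlier.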
dprops = {} env = None envs_by_name = dict([(x.name, x) for x in deriv.base.envs]) for env0 in reversed(env1): env = envs_by_name.get(env0) if env is None: continue for prop in env.props: fsegs = [] cprops = dict([(simplify(x.name), ([], x)) for x in prop.keys]) ownpkeys = set(cprops.keys()) for seg in prop.segs: if isinstance(seg, _SDExp): eprops = self._expand(seg, deriv, env1) - if len(eprops) != 1 or eprops.keys()[0]: + if len(eprops) != 1 or list(eprops.keys())[0]: if cprops: for cpkey, csegskey in list(cprops.items()): if not csegskey[1].cut: esegskey = eprops.get(cpkey) if esegskey is not None: if not esegskey[1].cut: csegskey[0].extend(esegskey[0]) else: cprops.pop(cpkey) if not cprops: break - for epkey, esegskey in eprops.items(): + for epkey, esegskey in list(eprops.items()): if esegskey[1].cut: cprops[epkey] = esegskey if not cprops: break else: - for pkey, (esegs, key) in eprops.items(): + for pkey, (esegs, key) in list(eprops.items()): csegs = esegs[:] if not key.cut: csegs[:0] = fsegs cprops[pkey] = (csegs, key) else: - esegs = eprops.values()[0][0] + esegs = list(eprops.values())[0][0] if cprops: - for pkey, (csegs, key) in cprops.items(): + for pkey, (csegs, key) in list(cprops.items()): if not key.cut or pkey in ownpkeys: csegs.extend(esegs) else: fsegs.extend(esegs) elif cprops: - for pkey, (csegs, key) in cprops.items(): + for pkey, (csegs, key) in list(cprops.items()): if not key.cut or pkey in ownpkeys: csegs.append(seg) else: fsegs.append(seg) for pkey, (segs, key) in list(cprops.items()): if key.canceling and pkey in dprops: osegskey = dprops.get(pkey) if osegskey is not None and not osegskey[1].canceling: dprops.pop(pkey) cprops.pop(pkey) dprops.update(cprops) # Eliminate leading and trailing empty text segments. - map(self._trim_segs, [x[0] for x in dprops.values()]) + list(map(self._trim_segs, [x[0] for x in list(dprops.values())])) self._raw_props_by_deriv_env1[(deriv, env1)] = dprops return dprops def _expand (self, exp, pderiv, env1): # TODO: Discover circular expansion paths. # Fetch the derivation pointed to by the expansion. idkey = simplify(exp.ref) source = pderiv.base.parent deriv = self._deriv_by_srcname_idkey[source.name].get(idkey) if deriv is None: for isource in reversed(source.incsources): deriv = self._deriv_by_srcname_idkey[isource.name].get(idkey) if deriv is not None: break if deriv is None: raise SynderError( _("@info", "Expansion '%(ref)s' does not reference a known derivation.", ref=exp.ref, file=source.name, line=exp.pos[0]), 5010, source.name, exp.pos) # Derive the referenced derivation. props = self._derive(deriv, env1) # Drop terminal properties. nprops = [] - for pkey, (segs, key) in props.items(): + for pkey, (segs, key) in list(props.items()): if not key.terminal: nprops.append((pkey, (segs, key))) props = dict(nprops) # Apply expansion mask. if exp.mask is not None: # Eliminate all obtained keys not matching the mask. # Reduce by mask those that match. nprops = [] - for pkey, segskey in props.items(): + for pkey, segskey in list(props.items()): if len(pkey) != len(exp.mask): continue mpkey = "" for c, cm in zip(pkey, exp.mask): if cm != _ch_exp_mask_pl: if cm != c: mpkey = None break else: mpkey += c if mpkey is not None: nprops.append((mpkey, segskey)) props = dict(nprops) # Apply key extension. 
if exp.kext is not None: nprops = [] - for pkey, (segs, key) in props.items(): + for pkey, (segs, key) in list(props.items()): npkey = exp.kext.replace(_ch_exp_kext_pl, pkey) nprops.append((npkey, (segs, key))) props = dict(nprops) # Apply capitalization. if exp.caps is not None: chcaps = first_to_upper if exp.caps else first_to_lower nprops = [] - for pkey, (segs, key) in props.items(): + for pkey, (segs, key) in list(props.items()): chcapsed = False nsegs = [] for seg in segs: if ( not chcapsed and isinstance(seg, _SDText) and seg.text.strip() ): nseg = copy.copy(seg) nseg.text = chcaps(seg.text) chcapsed = True nsegs.append(nseg) else: nsegs.append(seg) nprops.append((pkey, (nsegs, key))) props = dict(nprops) if not props: raise SynderError( _("@info", "Expansion '%(ref)s' expands into nothing.", ref=exp.ref, file=source.name, line=exp.pos[0]), 5020, source.name, exp.pos) return props def _trim_segs (self, segs): for i0, di, stripf in ( - (0, 1, unicode.lstrip), - (len(segs) - 1, -1, unicode.rstrip), + (0, 1, str.lstrip), + (len(segs) - 1, -1, str.rstrip), ): i = i0 while i >= 0 and i < len(segs): if isinstance(segs[i], _SDText): segs[i].text = stripf(segs[i].text) if segs[i].text: break i += di def _simple_segs (self, segs): # Add sentries. if not segs: segs = [_SDText(None, None, "")] if not isinstance(segs[0], _SDTag): segs = [_SDTag(None, None)] + segs if not isinstance(segs[-1], _SDText): segs = segs + [_SDText(None, None, "")] # Construct simplified segments: [(text, [tagname...])...] tsegs = [] i = 0 while i < len(segs): # Tag names for the next piece of text. tags = segs[i].names # Join contiguous text segments into single plain text. i += 1 i0 = i while i < len(segs) and isinstance(segs[i], _SDText): i += 1 text = "".join([x.text for x in segs[i0:i]]) # Collect simplified segment. tsegs.append((text, tags)) return tsegs def get (self, ckey, defval=None): """ Get property value by compound key. @param ckey: compound key @type ckey: string @param defval: the value to return if the property does not exist @type defval: string @returns: the property value @rtype: string """ # Split the compound key into derivation and property keys. lst = ckey.rsplit(self._ckeysep, 1) if len(lst) < 2: return defval dkey, pkey = lst return self.get2(dkey, pkey, defval) def dkeys (self, single=False): """ Get list of all derivation keys. For derivations accessible through more than one derivation key, by default all of them are included in the result. If instead only a single random one of those keys is wanted (i.e. strictly one key per derivation), C{single} can be set to C{True}. @param single: whether to return a single key for each derivation @type single: bool @returns: list of derivation keys @rtype: [string*] """ if not single: - return self._visible_deriv_by_dkey.keys() + return list(self._visible_deriv_by_dkey.keys()) else: return self._single_dkeys def syns (self, dkey): """ Get list of key syntagmas by derivation key. Key syntagmas are always returned in the order in which they appear in the derivation. If no derivation is found for the given key, an empty list is returned.
@param dkey: derivation key @type dkey: string @returns: key syntagmas @rtype: [string*] """ dkey, dkrest, deriv = self._resolve_dkey(dkey) if dkey is None: return [] rsyns = [] for syn in deriv.base.syns: if not syn.hidden: tsegs = self._simple_segs(syn.segs) if self._ksyntf: rsyn = self._ksyntf(tsegs, dkey, dkrest, self) else: rsyn = simplify("".join([x[0] for x in tsegs])) if rsyn is not None: rsyns.append(rsyn) return rsyns def altdkeys (self, dkey): """ Get list of all derivation keys pointing to same entry as given key. @param dkey: derivation key @type dkey: string @returns: alternative derivation keys @rtype: [string*] """ dkey, dkrest, deriv = self._resolve_dkey(dkey) if dkey is None: return [] return deriv.dkeys def pkeys (self, dkey): """ Get set of property keys available for given derivation key. If no derivation is found for the given key, an empty set is returned. @param dkey: derivation key @type dkey: string @returns: property keys @rtype: set(string*) """ dkey, dkrest, deriv = self._resolve_dkey(dkey) if dkey is None: return set() env = self._resolve_env(self._env, dkey, dkrest) if env is None: return set() pkeys = set() for env1 in env: props = self._getprops(deriv, env1) - pkeys.update(props.keys()) + pkeys.update(list(props.keys())) return pkeys def props (self, dkey): """ Get dictionary of property values by property keys for given derivation key. If no derivation is found for the given key, an empty dictionary is returned. @param dkey: derivation key @type dkey: string @returns: property dictionary @rtype: {(string, string)*} """ # TODO: Implement more efficiently. props = dict([(x, self.get2(dkey, x)) for x in self.pkeys(dkey)]) return props def envs (self, dkey): """ Get list of all explicitly defined environments in given derivation. "Explicitly" means environments mentioned in the derivation itself, and not those inherited through expansions. @param dkey: derivation key @type dkey: string @returns: explicit environment names @rtype: [string*] """ dkey, dkrest, deriv = self._resolve_dkey(dkey) if dkey is None: return [] return [x.name for x in deriv.base.envs] def source_name (self, dkey): """ Get the name of the source in which the derivation is found. If no derivation is found for the given key, C{None} is returned. @param dkey: derivation key @type dkey: string @returns: name of the source @rtype: string """ dkey, dkrest, deriv = self._resolve_dkey(dkey) if dkey is None: return None srcname = deriv.base.parent.name.split(os.path.sep)[-1] srcname = srcname[:srcname.rfind(".")] return srcname def source_pos (self, dkey): """ Get the position in the source where the derivation is found. Position is a 3-tuple of file path, line and column numbers. If no derivation is found for the given key, C{None} is returned. @param dkey: derivation key @type dkey: string @returns: source position @rtype: (string, int, int) """ dkey, dkrest, deriv = self._resolve_dkey(dkey) if dkey is None: return None path = deriv.base.parent.name lno, cno = deriv.base.pos return path, lno, cno def keys (self): """ Get the list of all compound keys. @returns: compound keys @rtype: [string*] """ return list(self.iterkeys()) def values (self): """ Get the list of all property values. @returns: property values @rtype: [string*] """ return list(self.itervalues()) def items (self): """ Get the list of all pairs of compound keys and property values.
@returns: compound keys and property values @rtype: [(string, string)*] """ return list(self.iteritems()) def __contains__ (self, ckey): """ Check if the compound key is present in the derivator. @returns: C{True} if present, C{False} otherwise @rtype: bool """ return self.get(ckey) is not None def __getitem__ (self, ckey): """ Get property value by compound key, in dictionary notation. Like L{get}, but raises C{KeyError} if key is not found. @returns: property value @rtype: string """ res = self.get(ckey) if res is None: raise KeyError(ckey) return res def __iter__ (self): """ Iterate through all compound keys, in random order. @returns: iterator through compound keys @rtype: iterator(string) """ return self.iterkeys() def iterkeys (self): """ Iterate through all compound keys, in random order. @returns: iterator through compound keys @rtype: iterator(string) """ return self._Iterator(self._make_iter(lambda x: x)) def itervalues (self): """ Iterate through all property values, in random order. @returns: iterator through property values @rtype: iterator(string) """ return self._Iterator(self._make_iter(lambda x: self.get(x))) def iteritems (self): """ Iterate through all pairs of compound key and property value, in random order. @returns: iterator through (compound key, property value) pairs @rtype: iterator((string, string)) """ return self._Iterator(self._make_iter(lambda x: (x, self.get(x)))) class _Iterator (object): def __init__ (self, it): self._it = it def __iter__ (self): return self - def next (self): + def __next__ (self): return self._it() # expected to raise StopIteration on its own def _make_iter (self, keyf): it = iter(self._visible_deriv_by_dkey) gdat = [None, []] # dkey, pkeys def next (): while not gdat[1]: - gdat[0] = it.next() # will raise StopIteration + gdat[0] = it.__next__() # will raise StopIteration; builtin next() is shadowed by this inner function's name gdat[1] = self.pkeys(gdat[0]) dkey = gdat[0] pkey = gdat[1].pop() return keyf(dkey + self._ckeysep + pkey) return next def empty_pcache (self): self._props_by_deriv_env1 = {} self._raw_props_by_deriv_env1 = {} diff --git a/pology/tabulate.py b/pology/tabulate.py index fd9c22cf..13606bd0 100644 --- a/pology/tabulate.py +++ b/pology/tabulate.py @@ -1,198 +1,198 @@ # -*- coding: UTF-8 -*- """ Pretty-printing of tabular data. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import copy from pology.colors import ColorString, cjoin def tabulate (data, coln=None, rown=None, dfmt=None, space=" ", none="", rotated=False, colorize=False, indent="", colnra=False, rownra=False, colw=0): """ Tabulate data in plain text. All data fields can have missing trailing entries. They will be set to C{None} according to table extents. Examples:
- >>> print T.tabulate(data=((1, 4), (2, ), (3, 6)),
+ >>> print(T.tabulate(data=((1, 4), (2, ), (3, 6)),
... coln=("c1", "c2", "c3"), rown=("r1", "r2"),
- ... space=" ", none="-")
+ ... space=" ", none="-"))
-  c1 c2 c3
r1  1  2  3
r2  4  -  6
@param data: column entries (cells) by column @type data: [[string*]*] @param coln: column names @type coln: [string*] @param rown: row names @type rown: [string*] @param dfmt: format strings per column (e.g. C{"%+.2f"} for floats) @type dfmt: [string*] @param space: fill-in for spacing between cells @type space: string @param none: fill-in for displaying empty cells (i.e.
C{None}-valued) @type none: string @param rotated: whether the table should be transposed @type rotated: bool @param colorize: whether the table should have color highlighting @type colorize: bool @param indent: indent string for the whole table @type indent: string @param colnra: right align column names @type colnra: bool @param rownra: right align row names @type rownra: bool @param colw: minimal column width @type colw: integer @returns: plain text representation of the table (no trailing newline) @rtype: string/L{ColorString} """ # Make local copies, to be able to extend to table extents. _data = [] for col in data: _data.append(list(col)) _coln = None if coln: _coln = list(coln) _rown = None if rown: _rown = list(rown) _dfmt = None if dfmt: _dfmt = list(dfmt) # Calculate maximum row and column number. # ...look at data: nrows = 0 ncols = 0 for col in _data: if nrows < len(col): nrows = len(col) ncols += 1 # ...look at column and row names: if _coln is not None: if ncols < len(_coln): ncols = len(_coln) if _rown is not None: if nrows < len(_rown): nrows = len(_rown) # Index offsets due to column/row names. ro = 0 if _coln is not None: ro = 1 co = 0 if _rown is not None: co = 1 # Extend all missing table fields. # ...add columns: for c in range(len(_data), ncols): _data.append([]) # ...add rows: for col in _data: for r in range(len(col), nrows): col.append(None) # ...add column names: if _coln is not None: if _rown is not None: _coln.insert(0, none) # header corner for c in range(len(_coln), ncols + co): _coln.append(None) # ...add row names: if _rown is not None: if _coln is not None: _rown.insert(0, none) # header corner for r in range(len(_rown), nrows + ro): _rown.append(None) # ...add formats: if _dfmt is None: _dfmt = [] if _rown is not None: - _dfmt.insert(0, u"%s") # header corner + _dfmt.insert(0, "%s") # header corner for c in range(len(_dfmt), ncols + co): - _dfmt.append(u"%s") + _dfmt.append("%s") # Stringize data. # ...nice fat deep assembly of empty stringized table: - sdata = [[u"" for i in range(nrows + ro)] for j in range(ncols + co)] + sdata = [["" for i in range(nrows + ro)] for j in range(ncols + co)] # ...table body: for c in range(ncols): for r in range(nrows): if _data[c][r] is not None: sdata[c + co][r + ro] = _dfmt[c + co] % (_data[c][r],) else: sdata[c + co][r + ro] = none # ...column names: if _coln is not None: for c in range(ncols + co): if _coln[c] is not None: - sdata[c][0] = u"%s" % (_coln[c],) + sdata[c][0] = "%s" % (_coln[c],) # ...row names: if _rown is not None: for r in range(nrows + ro): if _rown[r] is not None: - sdata[0][r] = u"%s" % (_rown[r],) + sdata[0][r] = "%s" % (_rown[r],) # Rotate needed data for output. if rotated: _coln, _rown = _rown, _coln ncols, nrows = nrows, ncols co, ro = ro, co - sdata_r = [[u"" for i in range(nrows + ro)] for j in range(ncols + co)] + sdata_r = [["" for i in range(nrows + ro)] for j in range(ncols + co)] for c in range(ncols + co): for r in range(nrows + ro): sdata_r[c][r] = sdata[r][c] sdata = sdata_r # Calculate maximum lengths per screen column. maxlen = [colw] * (ncols + co) for c in range(ncols + co): for r in range(nrows + ro): l = len(sdata[c][r]) if maxlen[c] < l: maxlen[c] = l # Reformat strings to maximum length per column. 
for c in range(co, ncols + co): - lfmt = u"%" + str(maxlen[c]) + "s" + lfmt = "%" + str(maxlen[c]) + "s" for r in range(ro, nrows + ro): sdata[c][r] = lfmt % (sdata[c][r],) # ...but column names aligned as requested: if _coln is not None: if colnra: - lfmt = u"%" + str(maxlen[c]) + "s" + lfmt = "%" + str(maxlen[c]) + "s" else: - lfmt = u"%-" + str(maxlen[c]) + "s" + lfmt = "%-" + str(maxlen[c]) + "s" sdata[c][0] = lfmt % (sdata[c][0],) if colorize: sdata[c][0] = ColorString("%s") % sdata[c][0] # ...but row names aligned as requested: if _rown is not None: if rownra: - lfmt = u"%" + str(maxlen[0]) + "s" + lfmt = "%" + str(maxlen[0]) + "s" else: - lfmt = u"%-" + str(maxlen[0]) + "s" + lfmt = "%-" + str(maxlen[0]) + "s" for r in range(nrows + ro): sdata[0][r] = lfmt % (sdata[0][r],) if colorize: sdata[0][r] = ColorString("%s") % sdata[0][r] # Assemble the table. lines = [] for r in range(nrows + ro): cells = [] for c in range(ncols + co): cells.append(sdata[c][r]) lines.append(indent + cjoin(cells, space)) return cjoin(lines, "\n") diff --git a/pology/timeout.py b/pology/timeout.py index 7ee6f0e5..c78765df 100644 --- a/pology/timeout.py +++ b/pology/timeout.py @@ -1,55 +1,55 @@ # -*- coding: UTF-8 -*- """ A timeout decorator. Based on SIGALRM from an ActiveState Python recipe by Chris Wright, U{http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/307871}. @author: Sébastien Renard @license: GPLv3 """ import signal from pology import PologyError, _, n_ from pology.report import report class TimedOutException (PologyError): def __init__ (self, value="timed-out"): self.value = value - PologyError.__init__(str(self)) + PologyError.__init__(self, str(self)) def __str__ (self): return repr(self.value) def timed_out (timeout): def decorate (f): def handler (signum, frame): report(_("@info:progress", ">>>>> Operation timed out.")) raise TimedOutException() def new_f (*args, **kwargs): old = signal.signal(signal.SIGALRM, handler) signal.alarm(timeout) try: result = f(*args, **kwargs) finally: signal.alarm(0) signal.signal(signal.SIGALRM, old) return result - new_f.func_name = f.func_name + new_f.__name__ = f.__name__ return new_f return decorate diff --git a/pology/uiref.py b/pology/uiref.py index ad07507e..d8ea2940 100644 --- a/pology/uiref.py +++ b/pology/uiref.py @@ -1,812 +1,812 @@ # -*- coding: UTF-8 -*- """ Resolve UI references in translation by following through original texts. If PO files which are delivered are not the same PO files which are actually being translated, but there is a processing step involved to get the former from the latter, there is a possibility to automatically resolve references to user interface strings mentioned through messages (typical e.g. of documentation POs). Compared to hard-coding it, this enables referenced UI text to always be in sync with actual UI, without necessity for manual tracking of changes in the UI. See C{doc/user/lingo.docbook#sec-lguirefs} for details. @var default_headrefs: Default heads for explicit UI references. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ # NOTE: The implementation is tuned to look for and open as few as possible # UI catalogs, and as lazily as possible.
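An aside from the editor, to make the hook-factory contract below concrete: a minimal usage sketch, not part of the module. The catalog and directory names are hypothetical, and resolve_ui_docbook4() is one of the convenience factories defined further down in this file.

from pology.catalog import Catalog
from pology.uiref import resolve_ui_docbook4

# Build an F3C hook; "po/ui" is a hypothetical directory of UI catalogs.
hook = resolve_ui_docbook4(uicpaths=["po/ui"])

cat = Catalog("doc_index.po")  # hypothetical documentation catalog
for msg in cat:
    for i in range(len(msg.msgstr)):
        # Resolve UI references in each translated string.
        msg.msgstr[i] = hook(msg.msgstr[i], msg, cat)
cat.sync()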
import hashlib import os import re - from pology import _, n_ + from pology import PologyError, _, n_ from pology.catalog import Catalog from pology.remove import remove_accel_msg, remove_markup_msg from pology.colors import cjoin from pology.fsops import collect_catalogs, collect_catalogs_by_env from pology.getfunc import get_hook_ireq from pology.msgreport import warning_on_msg from pology.report import warning default_headrefs = ["~%"] def resolve_ui (headrefs=default_headrefs, tagrefs=[], uipathseps=[], uicpaths=None, uicpathenv=None, xmlescape=False, pfhook=None, mkeyw=None, invmkeyw=False, quiet=False, fdiralt={}): """ Resolve general UI references in translations [hook factory]. If UI catalogs are collected through the environment variable, a warning is issued if the given variable has not been set. Resolved UI text can be postprocessed by an F1A hook (C{(text)->text}). It can be given either as the hook function itself, or as a L{language request} string. If one or several markup keywords are given as C{mkeyw} parameter, UI reference resolution is skipped for catalogs which do not report one of the given keywords by their L{markup()} method. This match may be inverted by C{invmkeyw} parameter, i.e. to skip resolution for catalogs reporting one of given keywords. The list of UI path separators given by the C{uipathseps} parameter is ordered by priority, such that the first one found in the composite reference text is used to split it into componental UI references. If the UI reference contains a formatting directive/argument placeholder, and the UI reference is found in a message of the same format (e.g. a tooltip referencing another part of UI), then using the argument substitution syntax may make the message invalid for the C{msgfmt -c} check. In that case, an alternative directive start string can be given, which will mask it from C{msgfmt -c}. This is specified by C{fdiralt} parameter, as a dictionary of alternative (key) and normal (value) start strings. @param headrefs: heads for explicit UI references @type headrefs: list of strings @param tagrefs: XML-like tags which define implicit UI references @type tagrefs: list of strings @param uipathseps: separators in composited UI references @type uipathseps: list of strings @param uicpaths: paths to UI catalogs in the project (both files and directories can be given) @type uicpaths: list of strings @param uicpathenv: environment variable defining directories where UI catalogs may be found (colon-separated directory paths) @type uicpathenv: string @param xmlescape: whether to normalize UI text for XML @type xmlescape: bool @param pfhook: F1A hook to postprocess resolved UI text @type pfhook: function or string @param mkeyw: markup keywords for taking catalogs into account @type mkeyw: string or list of strings @param invmkeyw: whether to invert the meaning of C{mkeyw} parameter @type invmkeyw: bool @param quiet: whether to output warnings of failed resolutions @type quiet: bool @param fdiralt: alternative and normal start strings for masking formatting directives @type fdiralt: {string: string} @return: type F3C hook @rtype: C{(msgstr, msg, cat) -> msgstr} """ return _resolve_ui_w(headrefs, tagrefs, uipathseps, uicpaths, uicpathenv, xmlescape, pfhook, mkeyw, invmkeyw, quiet, fdiralt, modtext=True, spanrep=False) def check_ui (headrefs=default_headrefs, tagrefs=[], uipathseps=[], uicpaths=None, uicpathenv=None, xmlescape=False, mkeyw=None, invmkeyw=False, fdiralt={}): """ Check general UI references in translations [hook factory].
See L{resolve_ui} for description of parameters. @return: type V3C hook @rtype: C{(msgstr, msg, cat) -> spans} """ pfhook = None quiet = True return _resolve_ui_w(headrefs, tagrefs, uipathseps, uicpaths, uicpathenv, xmlescape, pfhook, mkeyw, invmkeyw, quiet, fdiralt, modtext=False, spanrep=True) _tagrefs_docbook4 = [ "guilabel", "guibutton", "guiicon", "guimenu", "guisubmenu", "guimenuitem", ] def resolve_ui_docbook4 (headrefs=default_headrefs, uicpaths=None, uicpathenv=None, pfhook=None, mkeyw=None, quiet=False): """ Resolve UI references in Docbook 4.x translations [hook factory]. A convenience hook which fixes some of the parameters to L{resolve_ui} to match implicit UI references and formatting needs for Docbook POs. @return: type F3C hook @rtype: C{(msgstr, msg, cat) -> msgstr} """ tagrefs = _tagrefs_docbook4 uipathseps = [] xmlescape = True invmkeyw = False fdiralt = {} return _resolve_ui_w(headrefs, tagrefs, uipathseps, uicpaths, uicpathenv, xmlescape, pfhook, mkeyw, invmkeyw, quiet, fdiralt, modtext=True, spanrep=False) def check_ui_docbook4 (headrefs=default_headrefs, uicpaths=None, uicpathenv=None, mkeyw=None): """ Check UI references in Docbook 4.x translations [hook factory]. A convenience resolver which fixes some of the parameters to L{check_ui} to match implicit UI references and formatting needs for Docbook POs. @return: type V3C hook @rtype: C{(msgstr, msg, cat) -> spans} """ tagrefs = _tagrefs_docbook4 uipathseps = [] xmlescape = True invmkeyw = False pfhook = None quiet = True fdiralt = {} return _resolve_ui_w(headrefs, tagrefs, uipathseps, uicpaths, uicpathenv, xmlescape, pfhook, mkeyw, invmkeyw, quiet, fdiralt, modtext=False, spanrep=True) _tagrefs_kde4 = [ "interface", ] def resolve_ui_kde4 (headrefs=default_headrefs, uipathseps=None, uicpaths=None, uicpathenv=None, pfhook=None, mkeyw=None, quiet=False): """ Resolve UI references in KDE4 UI translations [hook factory]. A convenience resolver which fixes some of the parameters to L{resolve_ui} to match implicit UI references and formatting needs for KDE4 UI POs. If C{uipathseps} is C{None}, separators known to the KUIT C{<interface>} tag will be used automatically. C{fdiralt} is set to C{{"%~": "%"}}. @return: type F3C hook @rtype: C{(msgstr, msg, cat) -> msgstr} """ tagrefs = _tagrefs_kde4 if uipathseps is None: uipathseps = ["->"] xmlescape = True invmkeyw = False fdiralt = {"%~": "%"} return _resolve_ui_w(headrefs, tagrefs, uipathseps, uicpaths, uicpathenv, xmlescape, pfhook, mkeyw, invmkeyw, quiet, fdiralt, modtext=True, spanrep=False) def check_ui_kde4 (headrefs=default_headrefs, uipathseps=None, uicpaths=None, uicpathenv=None, mkeyw=None): """ Check UI references in KDE4 UI translations [hook factory]. A convenience resolver which fixes some of the parameters to L{check_ui} to match implicit UI references and formatting needs for KDE4 UI POs. If C{uipathseps} is C{None}, separators known to the KUIT C{<interface>} tag will be used automatically. C{fdiralt} is set to C{{"%~": "%"}}. @return: type V3C hook @rtype: C{(msgstr, msg, cat) -> spans} """ tagrefs = _tagrefs_kde4 if uipathseps is None: uipathseps = ["->"] xmlescape = True invmkeyw = False pfhook = None quiet = True fdiralt = {"%~": "%"} return _resolve_ui_w(headrefs, tagrefs, uipathseps, uicpaths, uicpathenv, xmlescape, pfhook, mkeyw, invmkeyw, quiet, fdiralt, modtext=False, spanrep=True) def _resolve_ui_w (headrefs, tagrefs, uipathseps, uicpaths, uicpathenv, xmlescape, pfhook, mkeyw, invmkeyw, quiet, fdiralt, modtext, spanrep): """ Worker for resolver factories.
""" # Convert sequences into sets, for fast membership checks. if not isinstance(tagrefs, set): tagrefs = set(tagrefs) if not isinstance(headrefs, set): headrefs = set(headrefs) if not isinstance(uipathseps, set): uipathseps = set(uipathseps) # Markup keywords should remain None if not a sequence or string. if mkeyw is not None: - if isinstance(mkeyw, basestring): + if isinstance(mkeyw, str): mkeyw = [mkeyw] mkeyw = set(mkeyw) # Construct post-filtering hook. if pfhook is None: pfhook = lambda x: x - elif isinstance(pfhook, basestring): + elif isinstance(pfhook, str): pfhook = get_hook_ireq(pfhook) # ...else assume it is already a hook function. # Regular expressions for finding and extracting UI references. # Add a never-match expression to start regexes for all reference types, # so that it can be applied even if the category has no entries. rxflags = re.U|re.I # - by tags rxstr = r"<\s*(%s)\b.*?>" % "|".join(list(tagrefs) + ["\x04"]) uiref_start_tag_rx = re.compile(rxstr, rxflags) uiref_extract_tag_rx = {} for tag in tagrefs: rxstr = r"<\s*(%s)\b.*?>(.*?)(<\s*/\s*\1\s*>)" % tag uiref_extract_tag_rx[tag] = re.compile(rxstr, rxflags) # - by heads rxstr = r"(%s)" % "|".join(list(headrefs) + ["\x04"]) uiref_start_head_rx = re.compile(rxstr, rxflags) uiref_extract_head_rx = {} for head in headrefs: rxstr = r"%s(.)(.*?)\1" % head uiref_extract_head_rx[head] = re.compile(rxstr, rxflags) # Lazy-evaluated data. ldata = {} # Function to split text by UI references, into list of tuples with # the text segment preceeding the reference as first element, # the reference as second element, and span indices of the reference # against complete text as the third and fourth elements; # trailing text segment has None as reference, and invalid span. # "Blah foo blah ~%/bar/ blah." -> # [("Blah ", "foo", 9, 12), (" blah ", "bar", 26, 29), # (" blah.", None, -1, -1)] def split_by_uiref (text, msg, cat, errspans): rsplit = [] ltext = len(text) p = 0 while True: mt = uiref_start_tag_rx.search(text, p) if mt: pt = mt.start() else: pt = ltext mh = uiref_start_head_rx.search(text, p) if mh: ph = mh.start() else: ph = ltext if pt < ph: # Tagged UI reference. tag = mt.group(1) m = uiref_extract_tag_rx[tag].search(text, pt) if not m: errmsg = _("@info \"tag\" is a tag in HTML/XML context", "Non-terminated UI reference by tag '%(tag)s'.", tag=tag) errspans.append(mt.span() + (errmsg,)) if not spanrep and not quiet: warning_on_msg(errmsg, msg, cat) break uirefpath = m.group(2) pe = m.end() - len(m.group(3)) ps = pe - len(uirefpath) elif ph < pt: # Headed UI reference. head = mh.group(1) m = uiref_extract_head_rx[head].search(text, ph) if not m: errmsg = _("@info \"head\" is the leading part of " "UI reference, e.g. '~%' in '~%/Save All/'", "Non-terminated UI reference by " "head '%(head)s'.", head=head) errspans.append(mh.span() + (errmsg,)) if not spanrep and not quiet: warning_on_msg(errmsg, msg, cat) break uirefpath = m.group(2) ps, pe = m.span() else: # Both positions equal, meaning end of text. break ptext_uiref = _split_uirefpath(text[p:ps], uirefpath, uipathseps) for ptext, uiref in ptext_uiref: rsplit.append((ptext, uiref, ps, pe)) p = pe # Trailing segment (or everything after an error). rsplit.append((text[p:], None, -1, -1)) return rsplit # Function to resolve given UI reference # (part that needs to be under closure). 
def resolve_single_uiref (uiref, msg, cat, resolver_helper): if ldata.get("uicpaths") is None: ldata["uicpaths"] = _collect_ui_catpaths(uicpaths, uicpathenv) if ldata.get("actcatfile") != cat.filename: ldata["actcatfile"] = cat.filename ldata["normcats"] = _load_norm_ui_cats(cat, ldata["uicpaths"], xmlescape) normcats = ldata["normcats"] hookcl_f3c = lambda uiref: resolver_helper(uiref, msg, cat, True, False) hookcl_v3c = lambda uiref: resolver_helper(uiref, msg, cat, False, True) uiref_res, errmsgs = _resolve_single_uiref(uiref, normcats, hookcl_f3c, hookcl_v3c, fdiralt) uiref_res = pfhook(uiref_res) return uiref_res, errmsgs # The resolver itself, in two parts. def resolver_helper (msgstr, msg, cat, modtext, spanrep): errspans = [] tsegs = [] if ( mkeyw is None or (not invmkeyw and mkeyw.intersection(cat.markup() or set())) or (invmkeyw and not mkeyw.intersection(cat.markup() or set())) ): rsplit = split_by_uiref(msgstr, msg, cat, errspans) for ptext, uiref, start, end in rsplit: tsegs.append(ptext) if uiref is not None: uiref_res, errmsgs = resolve_single_uiref(uiref, msg, cat, resolver_helper) tsegs.append(uiref_res) errspans.extend([(start, end, x) for x in errmsgs]) if not spanrep and not quiet: for errmsg in errmsgs: warning_on_msg(errmsg, msg, cat) else: tsegs.append(msgstr) if modtext: # F3C hook return "".join(tsegs) elif spanrep: # V3C hook return errspans else: # S3C hook return len(errspans) def resolver (msgstr, msg, cat): return resolver_helper(msgstr, msg, cat, modtext, spanrep) return resolver def _collect_ui_catpaths (uicpaths, uicpathenv): all_uicpaths = [] if uicpathenv is not None: all_uicpaths.extend(collect_catalogs_by_env(uicpathenv)) if uicpaths is not None: all_uicpaths.extend(collect_catalogs(uicpaths)) # Convert into dictionary by catalog name. # If there are several catalogs with the same name among paths, # store them under that name in undefined order. uicpath_dict = {} for uicpath in all_uicpaths: catname = os.path.basename(uicpath) p = catname.rfind(".") if p >= 0: catname = catname[:p] if catname not in uicpath_dict: uicpath_dict[catname] = [] uicpath_dict[catname].append(uicpath) return uicpath_dict # Cache for normalized UI catalogs. # Mapping by normalization options and catalog name. _norm_cats_cache = {} def _load_norm_ui_cats (cat, uicpaths, xmlescape): # Construct list of catalogs, by catalog name, from which this # catalog may draw UI strings. # The list should be ordered by decreasing priority, # used to resolve references in face of duplicates over catalogs. catnames = [] # - catalogs listed in some header fields # NOTE: Mention in module docstring when adding/removing fields. afnames = ( "X-Associated-UI-Catalogs-H", "X-Associated-UI-Catalogs", "X-Associated-UI-Catalogs-L", ) for afname in afnames: for field in cat.header.select_fields(afname): # Field value is a list of catalog names. lststr = field[1] # Remove any summit-merging comments. p = lststr.find("~~") if p >= 0: lststr = lststr[:p] catnames.extend(lststr.split()) # - the catalog itself, if among UI catalog paths and not explicitly given if cat.name in uicpaths and not cat.name in catnames: catnames.insert(0, cat.name) # highest priority # Make catalog names unique, preserving order. uniq_catnames = [] for catname in catnames: if catname not in uniq_catnames: uniq_catnames.append(catname) # Open and normalize UI catalogs. # Cache catalogs for performance.
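# (Editor's note, added: the cache key pairs the xmlescape flag with the catalog path, since the same UI catalog may need to be normalized both with and without XML escaping.)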
uicats = [] chkeys = set() for catname in uniq_catnames: catpaths = uicpaths.get(catname) if not catpaths: warning(_("@info", "UI catalog '%(catname1)s' associated to '%(catname2)s' " "is not among known catalog paths.", catname1=catname, catname2=cat.name)) continue for catpath in catpaths: chkey = (xmlescape, catpath) chkeys.add(chkey) uicat = _norm_cats_cache.get(chkey) if uicat is None: uicat_raw = Catalog(catpath, monitored=False) uicat = _norm_ui_cat(uicat_raw, xmlescape) _norm_cats_cache[chkey] = uicat uicats.append(uicat) # Remove previous catalogs not reused by this call. # TODO: Better strategy for removing from cache. for chkey in set(_norm_cats_cache.keys()).difference(chkeys): - #print "Removing normalized UI catalog '%s'..." % list(chkey) + #print("Removing normalized UI catalog '%s'..." % list(chkey)) del _norm_cats_cache[chkey] return uicats def _norm_ui_cat (cat, xmlescape): norm_cat = Catalog("", create=True, monitored=False) norm_cat.filename = cat.filename + "~norm" # Normalize messages and collect them by normalized keys. msgs_by_normkey = {} for msg in cat: if msg.obsolete: continue orig_msgkey = (msg.msgctxt, msg.msgid) remove_markup_msg(msg, cat) # before accelerator removal remove_accel_msg(msg, cat) # after markup removal normkey = (msg.msgctxt, msg.msgid) if normkey not in msgs_by_normkey: msgs_by_normkey[normkey] = [] msgs_by_normkey[normkey].append((msg, orig_msgkey)) - for msgs in msgs_by_normkey.values(): + for msgs in list(msgs_by_normkey.values()): # If there are several messages with same normalized key and # different translations, add extra disambiguations to context. # These disambiguations must not depend on message ordering. if len(msgs) > 1: # Check equality of translations. - msgstr0 = u"" + msgstr0 = "" for msg, d1 in msgs: if msg.translated: if not msgstr0: msgstr0 = msg.msgstr[0] elif msgstr0 != msg.msgstr[0]: msgstr0 = None break if msgstr0 is None: # disambiguation necessary tails = set() for msg, (octxt, omsgid) in msgs: if msg.msgctxt is None: - msg.msgctxt = u"" + msg.msgctxt = "" - tail = hashlib.md5(omsgid).hexdigest() + tail = hashlib.md5(omsgid.encode("UTF-8")).hexdigest() # md5 requires bytes in Python 3 n = 4 # minimum size of the disambiguation tail while tail[:n] in tails: n += 1 if n > len(tail): raise PologyError( _("@info", "Hash function has returned same result " "for two different strings.")) tails.add(tail[:n]) msg.msgctxt += "~" + tail[:n] else: # all messages have same translation, use first msgs = msgs[:1] # Escape text fields. if xmlescape: for msg, d1 in msgs: if msg.msgctxt: msg.msgctxt = _escape_to_xml(msg.msgctxt) msg.msgid = _escape_to_xml(msg.msgid) if msg.msgid_plural: msg.msgid_plural = _escape_to_xml(msg.msgid_plural) for i in range(len(msg.msgstr)): msg.msgstr[i] = _escape_to_xml(msg.msgstr[i]) # Add normalized messages to normalized catalog. for msg, d1 in msgs: if msg.msgctxt or msg.msgid: norm_cat.add_last(msg) return norm_cat def _escape_to_xml (text): text = text.replace("&", "&amp;") # must be first text = text.replace("<", "&lt;") text = text.replace(">", "&gt;") return text _ts_fence = "|/|" def _resolve_single_uiref (uitext, uicats, hookcl_f3c, hookcl_v3c, fdiralt): errmsgs = [] # Determine context separator in the reference. # If the arcane one is not present, use normal. ctxsep = _uiref_ctxsep2 if ctxsep not in uitext: ctxsep = _uiref_ctxsep # Return verbatim if requested (starts with two context separators). if uitext.startswith(ctxsep * 2): return uitext[len(ctxsep) * 2:], errmsgs # Split into msgctxt and msgid.
has_msgctxt = False msgctxt = None msgid = uitext if ctxsep in uitext: lst = uitext.split(ctxsep) if len(lst) > 2: rep = "..." + ctxsep + ctxsep.join(lst[2:]) errmsgs.append(_("@info \"tail\" is the trailing remainder of " "a UI reference string after parsing", "Superfluous tail '%(str)s' in " "UI reference '%(ref)s'.", str=rep, ref=uitext)) msgctxt, msgid = lst[:2] if not msgctxt: # FIXME: What about context with existing, but empty context? msgctxt = None has_msgctxt = True # msgctxt may be None while has_msgctxt is True. # This distinction is important when deciding between two msgids, # one having no context and one having a context. # Split any arguments from msgid. args = [] argsep = _uiref_argsep2 if _uiref_argsep2 not in msgid: argsep = _uiref_argsep if argsep in msgid: lst = msgid.split(argsep) msgid = lst[0] args_raw = lst[1:] for arg_raw in args_raw: alst = arg_raw.split(_uiref_argplsep) if len(alst) == 2: single = False if alst[0].startswith(_uiref_argsrepl): alst[0] = alst[0][1:] single = True - for fdalt, fdnorm in fdiralt.items(): + for fdalt, fdnorm in list(fdiralt.items()): if alst[0].startswith(fdalt): plhold = alst[0].replace(fdalt, fdnorm, 1) if single: msgid = msgid.replace(alst[0], plhold, 1) else: msgid = msgid.replace(alst[0], plhold) alst[0] = plhold # Argument itself may contain UI references. local_errspans = hookcl_v3c(alst[1]) if local_errspans: errmsgs.extend([x[-1] for x in local_errspans]) else: alst[1] = hookcl_f3c(alst[1]) alst.append(single) args.append(alst) else: errmsgs.append(_("@info", "Invalid argument specification '%(arg)s' " "in UI reference '%(ref)s'.", arg=arg_raw, ref=uitext)) # Try to find unambiguous match to msgctxt/msgid. rmsg = None rcat = None for uicat in uicats: if has_msgctxt: msgs = uicat.select_by_key(msgctxt, msgid) if not msgs: # Also try as if the context were regular expression. msgs = uicat.select_by_key_match(msgctxt, msgid, exctxt=False, exid=True, case=False) else: msgs = uicat.select_by_msgid(msgid) if len(msgs) == 1: rmsg = msgs[0] rcat = uicat break # If unambiguous match found. if rmsg is not None: # If the message is translated, use its translation, # otherwise use original and report. if rmsg.translated: ruitext = rmsg.msgstr[0] else: ruitext = msgid errmsgs.append(_("@info", "UI reference '%(ref)s' not translated " "at %(file)s:%(line)d(#%(entry)d).", ref=uitext, file=rcat.filename, line=rmsg.refline, entry=rmsg.refentry)) # If no unambiguous match found, collect all the approximate ones, # report and use the original UI text. 
else: ruitext = msgid approx = [] for uicat in uicats: nmsgs = uicat.select_by_msgid_fuzzy(msgid) for nmsg in nmsgs: if nmsg.translated: approx1 = _("@item condensed display of text and " "its translation; they should stand out " "well, hence the {{...}} wrapping", "{{%(text)s}}={{%(translation)s}} " "at %(file)s:%(line)d(#%(entry)d)", text=_to_uiref(nmsg), translation=nmsg.msgstr[0], file=uicat.filename, line=nmsg.refline, entry=nmsg.refentry) else: approx1 = _("@item condensed display of text without " "translation; it should stand out " "well, hence the {{...}} wrapping", "{{%(text)s}}=(untranslated) " "at %(file)s:%(line)d(#%(entry)d)", text=_to_uiref(nmsg), file=uicat.filename, line=nmsg.refline, entry=nmsg.refentry) approx.append(approx1) if approx: errmsgs.append(_("@info", "UI reference '%(ref)s' cannot be resolved; " "close matches:\n" "%(matches)s", ref=uitext, matches=cjoin(approx, "\n"))) else: errmsgs.append(_("@info", "UI reference '%(ref)s' cannot be resolved.", ref=uitext)) # Strip scripted part if any. p = ruitext.find(_ts_fence) if p >= 0: ruitext = ruitext[:p] # Replace any provided arguments. for plhold, value, single in args: if plhold in ruitext: if single: ruitext = ruitext.replace(plhold, value, 1) else: ruitext = ruitext.replace(plhold, value) else: errmsgs.append(_("@info", "Placeholder '%(plhold)s' not found in resolved " "UI reference text '%(text)s' " "to reference '%(ref)s'.", plhold=plhold, text=ruitext, ref=uitext)) return ruitext, errmsgs # Special tokens used in UI references. -_uiref_ctxsep = u"|" # normal context separator -_uiref_ctxsep2 = u"¦" # arcane context separator (fallback) -_uiref_argsep = u"^" # normal argument separator -_uiref_argsep2 = u"ª" # arcane argument separator (fallback) -_uiref_argplsep = u":" # placeholder separator in arguments -_uiref_argsrepl = u"!" # placeholder start to indicate single replacement +_uiref_ctxsep = "|" # normal context separator +_uiref_ctxsep2 = "¦" # arcane context separator (fallback) +_uiref_argsep = "^" # normal argument separator +_uiref_argsep2 = "ª" # arcane argument separator (fallback) +_uiref_argplsep = ":" # placeholder separator in arguments +_uiref_argsrepl = "!" # placeholder start to indicate single replacement # Present message from a normalized catalog in reference format, # suitable for inserting as a reference. def _to_uiref (nmsg): uiref = nmsg.msgid if nmsg.msgctxt: # Use arcane separator if the msgid or msgctxt contain normal one. ctxsep = _uiref_ctxsep if ctxsep in uiref or ctxsep in nmsg.msgctxt: ctxsep = _uiref_ctxsep2 uiref = nmsg.msgctxt + ctxsep + uiref elif _uiref_ctxsep in nmsg.msgid: # If the msgid contains normal separator, add one arcane separator # in front of it to indicate empty context. uiref = _uiref_ctxsep * 2 + uiref # TODO: Analyze format directives to add dummy arguments? return uiref # Split UI reference path as [(ptext, ref1), (sep, ref2), (sep, ref3), ...] def _split_uirefpath (ptext, uirefpath, uipathseps): p = -1 for sep in uipathseps: p = uirefpath.find(sep) if p >= 0: break if p < 0: return [(ptext, uirefpath)] else: rsplit = uirefpath.split(sep) - return zip([ptext] + [sep] * (len(rsplit) - 1), rsplit) + return list(zip([ptext] + [sep] * (len(rsplit) - 1), rsplit)) diff --git a/pology/vcs.py b/pology/vcs.py index bfdc116a..d8428e5f 100644 --- a/pology/vcs.py +++ b/pology/vcs.py @@ -1,1201 +1,1201 @@ # -*- coding: UTF-8 -*- """ Version control operations. Collections of PO files are frequently kept under some sort of version control. 
This module provides typical version control operations, abstracted across various version control systems. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import os import re import shutil import tempfile from pology import PologyError, _, n_ from pology.escape import escape_sh from pology.fsops import collect_system, system_wd, unicode_to_str, join_ncwd from pology.report import report, warning _vcskeys_by_pkey = {} _vcstypes_by_akey = {} def _register_vcs (): register = ( # First keyword is primary. (("none", "noop", "dummy"), VcsNoop), (("svn", "subversion"), VcsSubversion), (("git",), VcsGit), ) for vcskeys, vcstype in register: _vcskeys_by_pkey[vcskeys[0]] = vcskeys _vcstypes_by_akey.update([(x, vcstype) for x in vcskeys]) def available_vcs (flat=False): """ Get keywords of all available version control systems. Some VCS have more than one keyword identifying them. If C{flat} is C{False}, a dictionary with primary keyword per VCS as keys, and tuple of all alternatives (including the main keyword) as values, is returned. If C{flat} is C{True}, all keywords are returned in a flat list. @return: VCS keywords, as dictionary by primary or as a flat list of all @rtype: {(string, string)*} or [string*] """ if flat: - return _vcstypes_by_akey.keys() + return list(_vcstypes_by_akey.keys()) else: return _vcskeys_by_pkey.copy() def make_vcs (vcskey): """ Factory for version control systems. Desired VCS is identified by a keyword. Currently available: - dummy noop (C{none}, C{noop}, C{dummy}) - Subversion (C{svn}, C{subversion}) - Git (C{git}) @param vcskey: keyword identifier of the VCS @type vcskey: string @return: version control object @rtype: instance of L{VcsBase} """ nkey = vcskey.strip().lower() vcstype = _vcstypes_by_akey.get(nkey) if not vcstype: raise PologyError( _("@info", "Unknown version control system requested by key '%(key)s'.", key=vcskey)) return vcstype() class VcsBase (object): """ Abstract base for VCS objects. """ def add (self, paths, repadd=False): """ Add paths to version control. It depends on the particular VCS what adding means, but in general it should be the point where the subsequent L{commit()} on the same path will record addition in the repository history. Also a single path can be given instead of sequence of paths. Actually added paths may be different from input paths, e.g. if an input path is already version controlled, or input path's parent directory was added as well. List of added paths can be requested with C{repadd} parameter, and it will become the second element of return value. @param paths: paths to add @type paths: list of strings or string @param repadd: whether to report which paths were actually added @type repadd: bool @return: C{True} if addition successful, possibly list of added paths @rtype: bool or (bool, [string*]) """ raise PologyError( _("@info", "Selected version control system does not define adding.")) def remove (self, path): """ Remove path from version control and from disk. It depends on the particular VCS what removing means, but in general it should be the point where the subsequent L{commit()} on the same path will record removal in the repository history. @param path: path to remove @type path: string @return: C{True} if removal successful @rtype: bool """ raise PologyError( _("@info", "Selected version control system does not define removing.")) def move (self, spath, dpath): """ Move versioned file or directory within the repository.
It depends on the particular VCS what moving means, but in general it should be the point where the subsequent L{commit()} on source and destination path (or their common parent directory) will record the move in the repository history. @param spath: source path @type spath: string @param dpath: destination path @type dpath: string @return: C{True} if moving successful @rtype: bool """ raise PologyError( _("@info", "Selected version control system does not define moving.")) def revision (self, path): """ Get current revision ID of the path. @param path: path to query for revision @type path: string @return: revision ID @rtype: string """ raise PologyError( _("@info", "Selected version control system does not define " "revision query.")) def is_clear (self, path): """ Check if the path is in clear state. Clear state means none of: not version-controlled, modified, added... @param path: path to check the state of @type path: string @return: C{True} if clear @rtype: bool """ raise PologyError( _("@info", "Selected version control system does not define state query.")) def is_versioned (self, path): """ Check if path is under version control. @param path: path to check @type path: string @return: C{True} if versioned @rtype: bool """ raise PologyError( _("@info", "Selected version control system does not define " "checking whether a path is version controlled.")) def export (self, path, rev, dstpath, rewrite=None): """ Export a versioned file or directory. Makes a copy of versioned file or directory pointed to by local path C{path}, in the revision C{rev}, to destination C{dstpath}. If C{rev} is C{None}, the clean version of C{path} according to current local repository state is copied to C{dstpath}. Final repository path, as determined from C{path}, can be filtered through an external function C{rewrite} before being used. The function takes as arguments the path and revision strings. This can be useful, for example, to reroute remote repository URL. @param path: path of the versioned file or directory in local repository @type path: string @param rev: revision to export @type rev: string or C{None} @param dstpath: file path to export to @type dstpath: string @param rewrite: function to filter resolved repository path @type rewrite: (string, string)->string or None @return: C{True} if fetching succeeded, C{False} otherwise @rtype: bool """ raise PologyError( _("@info", "Selected version control system does not define " "fetching of a versioned path.")) def commit (self, paths, message=None, msgfile=None, incparents=True): """ Commit paths to the repository. Paths can include any number of files and directories. Also a single path string can be given instead of a sequence. It depends on the particular VCS what committing means, but in general it should be the earliest level at which modifications are recorded in the repository history. Commit message can be given either directly, through C{message} parameter, or read from a file with path given by C{msgfile}. If both C{message} and C{msgfile} are given, C{message} takes precedence and C{msgfile} is ignored. If the commit message is not given, VCS should ask for one as usual (pop an editor window, or whatever the user has configured). Some VCS require that the parent directory of a path to be committed has been committed itself or included in the commit list if not. If that is the case, C{incparents} parameter determines if this function should assure that non-committed parents are included into the commit list too. 
This may be expensive to check, so it is good to disable it if all parents are known to be committed or included in the input paths. @param paths: paths to commit @type paths: list of strings or string @param message: commit message @type message: string @param msgfile: path to file with the commit message @type msgfile: string @param incparents: whether to automatically include non-committed parents in the commit list @type incparents: bool @return: C{True} if committing succeeded, C{False} otherwise @rtype: bool """ raise PologyError( _("@info", "Selected version control system does not define " "committing of paths.")) def log (self, path, rev1=None, rev2=None): """ Get revision log of the path. Revision log entry consists of revision ID, committer name, date string, and commit message. Except the revision ID, any of these may be empty strings, depending on the particular VCS. The log is ordered from earliest to newest revision. A section of entries between revisions C{rev1} (inclusive) and C{rev2} (exclusive) can be returned instead of the whole log. If C{rev1} is C{None}, selected IDs start from the first in the log. If C{rev2} is C{None}, selected IDs end with the last in the log. If either C{rev1} or C{rev2} is not C{None} and does not exist in the path's log, or the path is not versioned, empty log is returned. @param path: path to query for revisions @type path: string @param rev1: entries starting from this revision (inclusive) @type rev1: string @param rev2: entries up to this revision (exclusive) @type rev2: string @return: revision ID, committer name, date string, commit message @rtype: [(string*4)*] """ raise PologyError( _("@info", "Selected version control system does not define " "revision history query.")) def to_commit (self, path): """ Get paths which need to be committed within the given path. Input path can be either a file or directory. If it is a directory, it depends on VCS whether it will only report files within it that need to be committed, or subdirectories too (including the given directory). @param path: path to query for non-committed paths @type path: string @return: non-committed paths @rtype: [string*] """ raise PologyError( _("@info", "Selected version control system does not define " "listing of non-committed paths.")) def diff (self, path, rev1=None, rev2=None): """ Get diff between revisions of the given path. Unified diff is computed and reported as list of 2-tuples, where the first element is a tag, and the second the payload. For tags C{" "}, C{"+"}, and C{"-"}, the payload is the line (without newline) which was equal, added or removed, respectively. Payload for tag C{":"} is the path of the diffed file, and for C{"@"} the 4-tuple of old start line, old number of lines, new start line, and new number of lines, which are represented by the following difference segment. Diffs can be requested between specific revisions. If both C{rev1} and C{rev2} are C{None}, diff is taken from last known commit to working copy. If only C{rev2} is C{None}, diff is taken from C{rev1} to working copy. @param path: path to query for modified lines @type path: string @param rev1: diff from this revision @type rev1: string @param rev2: diff to this revision @type rev2: string @return: tagged unified diff @rtype: [(string, string or (int, int, int, int))*] """ raise PologyError( _("@info", "Selected version control system does not define diffing.")) def revert (self, path): """ Revert a versioned file or directory.
The path is reverted to the clean version of itself according to current local repository state. @param path: path of the versioned file or directory in local repository @type path: string @return: C{True} if reverting succeeded, C{False} otherwise @rtype: bool """ raise PologyError( _("@info", "Selected version control system does not define " "reverting a versioned path.")) class VcsNoop (VcsBase): """ VCS: Dummy VCS which performs only file system operations. """ def add (self, paths, repadd=False): # Base override. return True if not repadd else [True, paths] def remove (self, path): # Base override. if os.path.isdir(path): shutil.rmtree(path) else: os.remove(path) return True def move (self, spath, dpath): # Base override. shutil.move(spath, dpath) return True def revision (self, path): # Base override. return "" def is_clear (self, path): # Base override. return True def is_versioned (self, path): # Base override. return True def export (self, path, rev, dstpath, rewrite=None): # Base override. if rev is not None: return False try: - os.shutil.copyfile(path, dstpath) + shutil.copyfile(path, dstpath) except: return False return True def commit (self, paths, message=None, msgfile=None, incparents=True): # Base override. return True def log (self, path, rev1=None, rev2=None): # Base override. return [] def to_commit (self, path): # Base override. return [] def revert (self, path): # Base override. return True class VcsSubversion (VcsBase): """ VCS: Subversion. """ def __init__ (self): # Environment to cancel any localization in output of operations, # for methods which need to parse the output. self._env = os.environ.copy() self._env["LC_ALL"] = "C" def add (self, paths, repadd=False): # Base override. - if isinstance(paths, basestring): + if isinstance(paths, str): paths = [paths] if not paths: return True tmppath = _temp_paths_file(paths) res = collect_system(["svn", "add", "--force", "--parents", "--targets", tmppath], env=self._env) success = (res[2] == 0) os.remove(tmppath) if repadd: apaths = [] for line in res[0].split("\n"): if line.startswith("A"): apaths.append(line[1:].strip()) return success, apaths else: return success def remove (self, path): # Base override. if collect_system(["svn", "remove", path])[2] != 0: return False return True def move (self, spath, dpath): # Base override. if collect_system(["svn", "move", "--parents", self._ep(spath), dpath])[2] != 0: return False return True def revision (self, path): # Base override. res = collect_system(["svn", "info", self._ep(path)], env=self._env) rx = re.compile(r"^Last Changed Rev: *([0-9]+)", re.I) revid = "" for line in res[0].split("\n"): m = rx.search(line) if m: revid = m.group(1) break return revid def is_clear (self, path): # Base override. res = collect_system(["svn", "status", path], env=self._env) clear = not re.search(r"^\S", res[0]) return clear def is_versioned (self, path): # Base override. res = collect_system(["svn", "info", self._ep(path)], env=self._env) if res[-1] != 0: return False rx = re.compile(r"^Repository", re.I) for line in res[0].split("\n"): if rx.search(line): return True return False def export (self, path, rev, dstpath, rewrite=None): # Base override.
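# (Editor's note, added: with rev None the pristine working-copy version is exported via 'svn export -r BASE'; otherwise the repository URL is read from 'svn info' output, optionally passed through rewrite(), and exported at the requested revision.)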
if rev is None: res = collect_system(["svn", "export", "--force", self._ep(path), "-r", "BASE", dstpath]) if res[-1] != 0: return False return True res = collect_system(["svn", "info", self._ep(path)], env=self._env) if res[-1] != 0: return False rx = re.compile(r"^URL:\s*(\S+)", re.I) rempath = None for line in res[0].split("\n"): m = rx.search(line) if m: rempath = m.group(1) break if not rempath: return False if rewrite: rempath = rewrite(rempath, rev) if collect_system(["svn", "export", "--force", self._ep(rempath), "-r", rev, dstpath])[-1] != 0: return False return True def commit (self, paths, message=None, msgfile=None, incparents=True): # Base override. - if isinstance(paths, basestring): + if isinstance(paths, str): paths = [paths] if not paths: return True if incparents: # Move up any path that needs its parent committed too. paths_mod = [] for path in paths: path_mod = path while True: path_mod_up = os.path.dirname(path_mod) if self.revision(path_mod_up): break elif not path_mod_up or not self.is_versioned(path_mod_up): # Simply let Subversion complain. path_mod = path break else: path_mod = path_mod_up paths_mod.append(path_mod) paths = paths_mod cmdline = ["svn", "commit"] if message is not None: cmdline += ["-m", message] elif msgfile is not None: cmdline += ["-F", msgfile] tmppath = _temp_paths_file(paths) cmdline += ["--targets", tmppath] # Do not use collect_system(), user may need to input stuff. cmdstr = " ".join(map(escape_sh, cmdline)) success = (os.system(unicode_to_str(cmdstr)) == 0) os.remove(tmppath) return success def log (self, path, rev1=None, rev2=None): # Base override. res = collect_system(["svn", "log", self._ep(path)], env=self._env) if res[-1] != 0: return [] rev = "" - next_rev, next_cmsg = range(2) + next_rev, next_cmsg = list(range(2)) entries = [] next = -1 for line in res[0].strip().split("\n"): if line.startswith("----------"): if rev: cmsg = "\n".join(cmsg).strip("\n") entries.append((rev, user, dstr, cmsg)) cmsg = [] next = next_rev elif next == next_rev: lst = line.split("|") rev, user, dstr = [x.strip() for x in lst[:3]] rev = rev[1:] # strip initial "r" next = next_cmsg elif next == next_cmsg: cmsg += [line] entries.reverse() return _crop_log(entries, rev1, rev2) def to_commit (self, path): # Base override. res = collect_system(["svn", "status", path], env=self._env) if res[-1] != 0: return [] ncpaths = [] for line in res[0].split("\n"): if line[:1] in ("A", "M"): path = line[1:].strip() ncpaths.append(path) return ncpaths def diff (self, path, rev1=None, rev2=None): # Base override.
if rev1 is not None and rev2 is not None: rspec = ["-r", "%s:%s" % (rev1, rev2)] elif rev1 is not None: rspec = ["-r", "%s" % rev1] elif rev2 is not None: raise PologyError( _("@info \"Subversion\" is a version control system", "Subversion cannot diff from working copy " "to a named revision.")) else: rspec = [] res = collect_system(["svn", "diff"] + rspec + [path], env=self._env) if res[-1] != 0: warning(_("@info", "Subversion reports it cannot diff path '%(path)s':\n" "%(msg)s", path=path, msg=res[1])) return [] udiff = [] nskip = 0 for line in res[0].split("\n"): if nskip > 0: nskip -= 1 continue if line.startswith("Index:"): udiff.append((":", line[line.find(":") + 1:].strip())) nskip = 3 elif line.startswith("@@"): m = re.search(r"-(\d+),(\d+) *\+(\d+),(\d+)", line) spans = tuple(map(int, m.groups())) if m else (0, 0, 0, 0) udiff.append(("@", spans)) elif line.startswith(" "): udiff.append((" ", line[1:])) elif line.startswith("-"): udiff.append(("-", line[1:])) elif line.startswith("+"): udiff.append(("+", line[1:])) return udiff def revert (self, path): # Base override. res = collect_system(["svn", "revert", "-R", path], env=self._env) if res[-1] != 0: warning(_("@info", "Subversion reports it cannot revert path '%(path)s':\n" "%(msg)s", path=path, msg=res[1])) return False return True def _ep (self, path): #if "@" in os.path.basename(os.path.normpath(path)): if "@" in path: path = "%s@" % path return path class VcsGit (VcsBase): """ VCS: Git. """ def __init__ (self): # Environment to cancel any localization in output of operations, # for methods which need to parse the output. self._env = os.environ.copy() self._env["LC_ALL"] = "C" def _gitroot (self, paths): single = False - if isinstance(paths, basestring): + if isinstance(paths, str): paths = [paths] single = True # Take first path as referent. path = os.path.abspath(paths[0]) root = None if os.path.isfile(path): pdir = os.path.dirname(path) else: pdir = path while True: gitpath = os.path.join(pdir, ".git") if os.path.isdir(gitpath): root = pdir break pdir_p = pdir pdir = os.path.dirname(pdir) if pdir == pdir_p: break if root is None: raise PologyError( _("@info \"Git\" is a version control system", "Cannot find Git repository for '%(path)s'.", path=path)) rpaths = [] for path in paths: path = os.path.abspath(path) path = path[len(root) + len(os.path.sep):] rpaths.append(path) if single: return root, rpaths[0] else: return root, rpaths def add (self, paths, repadd=False): # Base override. - if isinstance(paths, basestring): + if isinstance(paths, str): paths = [paths] if not paths: return True root, paths = self._gitroot(paths) success = True apaths = [] for path in paths: if collect_system(["git", "add", path], wdir=root)[2] != 0: success = False break apaths.append(path) return success if not repadd else [success, apaths] def remove (self, path): # Base override. if os.path.isdir(path): warning(_("@info", "Git cannot remove directories (tried on '%(path)s').", path=path)) return False root, path = self._gitroot(path) if collect_system(["git", "rm", path], wdir=root)[2] != 0: return False return True def move (self, spath, dpath): # Base override. root1, spath = self._gitroot(spath) root2, dpath = self._gitroot(dpath) if root1 != root2: warning(_("@info", "Trying to move paths between different repositories.")) return False if collect_system(["git", "mv", spath, dpath], wdir=root1)[2] != 0: return False return True def revision (self, path): # Base override.
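# (Editor's illustration.) revision() returns the identifier of the last commit that touched the path, parsed out of "git log" output; e.g. vcs.revision("po/fr/foo.po") would yield a full hash such as "9fceb02..." (hypothetical value).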
root, path = self._gitroot(path) res = collect_system(["git", "log", path], wdir=root, env=self._env) rx = re.compile(r"^commit\s*([0-9abcdef]+)", re.I) revid = "" for line in res[0].split("\n"): m = rx.search(line) if m: revid = m.group(1) break return revid def is_clear (self, path): # Base override. root, path = self._gitroot(path) res = collect_system(["git", "status", path], wdir=root, env=self._env) rx = re.compile(r"\bmodified:\s*(\S.*)", re.I) for line in res[0].split("\n"): m = rx.search(line) if m: mpath = m.group(1) if os.path.isfile(path): if mpath == path: return False else: if not path or mpath[len(path):].startswith(os.path.sep): return False return True def is_versioned (self, path): # Base override. root, path = self._gitroot(path) if not path: return True res = collect_system(["git", "status"], wdir=root, env=self._env) rx = re.compile(r"untracked.*?:", re.I) m = rx.search(res[0]) if m: for line in res[0][m.end():].split("\n"): line = line.lstrip("#").strip() if line == path: return False return True def export (self, path, rev, dstpath, rewrite=None): # Base override. root, path = self._gitroot(path) ret = True if rev is None: rev = "HEAD" if rewrite: path = rewrite(path, rev) # FIXME: Better temporary location. # FIXME: Use list command lines (so must replace piping too). tarpdir = "/tmp" tarbdir = "git-archive-tree%d" % os.getpid() res = collect_system(" git archive --prefix=%s/ %s %s " "| (cd %s && tar xf -)" % (tarbdir, rev, path, tarpdir), wdir=root) if res[2] == 0: tardir = os.path.join(tarpdir, tarbdir) tarpath = os.path.join(tardir, path) try: shutil.move(tarpath, dstpath) except Exception: ret = False if os.path.isdir(tardir): shutil.rmtree(tardir) else: ret = False return ret def commit (self, paths, message=None, msgfile=None, incparents=True): # Base override. - if isinstance(paths, basestring): + if isinstance(paths, str): paths = [paths] if not paths: return True opaths = paths root, paths = self._gitroot(paths) # Check if all paths are versioned. # Add to index any modified paths that have not been added. for opath in opaths: if not self.is_versioned(opath): warning(_("@info", "Git cannot commit non-versioned path '%(path)s'.", path=opath)) return False if os.path.exists(opath) and not self.add(opath): warning(_("@info", "Git cannot add path '%(path)s' to index.", path=opath)) return False # Reset all paths in index which have not been given to commit. ipaths = self._paths_to_commit(root) rpaths = list(set(ipaths).difference(paths)) if rpaths: warning(_("@info", "Git is resetting paths in index which are " "not to be committed.")) cmdline = "git reset %s" % " ".join(rpaths) system_wd(unicode_to_str(cmdline), root) # ...seems to return != 0 even if it did what it was told to. # Commit the index. cmdline = ["git", "commit"] if message is not None: cmdline += ["-m", message] elif msgfile is not None: cmdline += ["-F", msgfile] # Do not use collect_system(), user may need to input stuff. cmdstr = " ".join(map(escape_sh, cmdline)) if system_wd(unicode_to_str(cmdstr), root) != 0: return False return True def log (self, path, rev1=None, rev2=None): # Base override.
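# (Editor's note.) log() parses "git log" output into (revid, author, date, message) tuples; the entries.reverse() below means the returned list runs from oldest to newest before optional cropping by rev1/rev2. A sketch:
#     for revid, author, date, cmsg in vcs.log("po/fr/foo.po"):
#         print(revid[:8], author, date)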
root, path = self._gitroot(path) res = collect_system(["git", "log", path], wdir=root, env=self._env) if res[-1] != 0: return [] rev = "" - next_auth, next_date, next_cmsg = range(3) + next_auth, next_date, next_cmsg = list(range(3)) next = -1 entries = [] lines = res[0].split("\n") for i in range(len(lines) + 1): if i < len(lines): line = lines[i] if i == len(lines) or line.startswith("commit"): if rev: cmsg = "\n".join(cmsg).strip("\n") entries.append((rev, user, dstr, cmsg)) rev = line[line.find(" ") + 1:].strip() cmsg = [] next = next_auth elif next == next_auth: user = line[line.find(":") + 1:].strip() next = next_date elif next == next_date: dstr = line[line.find(":") + 1:].strip() next = next_cmsg elif next == next_cmsg: cmsg += [line[4:]] entries.reverse() return _crop_log(entries, rev1, rev2) def to_commit (self, path): # Base override. root, path = self._gitroot(path) ncpaths = self._paths_to_commit(root, path or ".") ncpaths = [join_ncwd(root, p) for p in ncpaths] return ncpaths def _paths_to_commit (self, root, path=None): if path: cmdline = ["git", "status", path] else: cmdline = ["git", "status"] res = collect_system(cmdline, wdir=root, env=self._env) sect_rx = re.compile(r"^(?:# )?(\S.*):$", re.I) file_rx = re.compile(r"^#?\s+.*\w:\s*(.+?)\s*$", re.I) inlist = False ipaths = [] for line in res[0].split("\n"): m = sect_rx.search(line) if m: mstr = m.group(1) if ( mstr.endswith("to be committed") # git 1.6.x or mstr.endswith("but not updated") # git 1.7.x or mstr.endswith("not staged for commit") # git 1.7.x ): inlist = True else: break if not inlist: continue m = file_rx.search(line) if m: ipaths.append(m.group(1)) return ipaths def diff (self, path, rev1=None, rev2=None): # Base override. root, path = self._gitroot(path) if rev1 is not None and rev2 is not None: rspec = ["%s..%s" % (rev1, rev2)] elif rev1 is not None: rspec = ["%s" % rev1] elif rev2 is not None: raise PologyError( _("@info", "Git cannot diff from non-staged paths to a commit.")) else: rspec = [] res = collect_system(["git", "diff"] + rspec + [path], wdir=root, env=self._env) if res[-1] != 0: warning(_("@info", "Git reports it cannot diff path '%(path)s':\n" "%(msg)s", path=path, msg=res[1])) return [] udiff = [] nskip = 0 for line in res[0].split("\n"): if nskip > 0: nskip -= 1 continue if line.startswith("diff"): m = re.search(r"a/(.*?) *b/", line) udiff.append((":", m.group(1) if m else "")) nskip = 3 elif line.startswith("@@"): m = re.search(r"-(\d+),(\d+) *\+(\d+),(\d+)", line) spans = tuple(map(int, m.groups())) if m else (0, 0, 0, 0) udiff.append(("@", spans)) elif line.startswith(" "): udiff.append((" ", line[1:])) elif line.startswith("-"): udiff.append(("-", line[1:])) elif line.startswith("+"): udiff.append(("+", line[1:])) return udiff def revert (self, path): # Base override.
root, path = self._gitroot(path) res = collect_system(["git", "checkout", path], wdir=root, env=self._env) if res[-1] != 0: warning(_("@info", "Git reports it cannot revert path '%(path)s':\n" "%(msg)s", path=path, msg=res[1])) return False return True def _crop_log (entries, rev1, rev2): start = 0 if rev1 is not None: while start < len(entries): if entries[start][0] == rev1: break start += 1 end = len(entries) if rev2 is not None: while end > 0: end -= 1 if entries[end][0] == rev2: break return entries[start:end] def _temp_paths_file (paths): content = unicode_to_str("\n".join(paths) + "\n") tmpf, tmppath = tempfile.mkstemp() os.write(tmpf, content) os.close(tmpf) return tmppath _register_vcs() diff --git a/pology/wrap.py b/pology/wrap.py index 371a1b41..40f648a4 100644 --- a/pology/wrap.py +++ b/pology/wrap.py @@ -1,567 +1,567 @@ # -*- coding: UTF-8 -*- """ Text wrapping, with special handling for typical texts in PO files. Wrapping turns out to be quite a non-trivial matter. Gettext itself implements an intricate wrapping algorithm from the Unicode consortium, with its own tweaks, which is hard to beat in any simpler way. Thus, do not be surprised if the wrapping quality offered by this module does not meet your exact needs. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import re import unicodedata from pology import PologyError, _, n_ # Regex for splitting C{<...>} into tag name and few other elements. _tag_split_rx = re.compile(r"^\s*<\s*(/?)\s*(\w+)[^/>]*(/?)\s*>\s*$") # Characters for "natural" breaks where to wrap the text. -_natbr_after = u".,;/-)]}" -_natbr_before = u"%({[" +_natbr_after = ".,;/-)]}" +_natbr_before = "%({[" # Strings at which the text should be wrapped before or after. _prebr = ("|/|",) _postbr = (("\\n", "\\\\n"), "|/|") # |/| is the Transcript fence, should break both before and after. # Tags for normal breaking (after the closed tag) _tagbr_normal = ( # HTML "p", "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol", "li", "table", "th", "td", "tr", "center", "blockquote", "pre", "dd", "dl", "dt", # KUIT "title", "subtitle", "para", "list", "item", # Docbook "calloutlist", "glosslist", "itemizedlist", "orderedlist", "segmentedlist", "simplelist", "variablelist", "listitem", "seglistitem", "varlistentry", ) # Tags usually closed in-place in strict XML, break before and after. _tagbr_inplace = ( # HTML "br", "hr", # KUIT "nl", ) def _tag_split (tag): """ Split tag statement into tag name and a state string. State is one of "open" (C{<...>}), "close" (C{</...>}), or "inplace" (C{<.../>}). @param tag: the tag proper, C{<...>} @type tag: string @returns: tag name and state @rtype: string, string """ m = _tag_split_rx.match(tag) if m: if m.group(1): state = "close" elif m.group(3): state = "inplace" else: state = "open" return m.group(2), state else: return "", "" def wrap_text (text, wcol=79, lead="", trail="", flead=None, femp=False, natbr="", natbr2="", prebr=(), postbr=(), tagbr=(), tagbr2=(), wcolmin=0, midbr=True, remtrws=False, endl="\n"): """ Wrap text into lines. Wrapping behavior and positions can be controlled by several parameters. Trailing and leading strings can be added to each wrapped line, including a special lead for the first line. If wrapping column is given as less or equal to zero, the lines are split only at unconditional breaks. This is a very general wrapping function, see the more specialized ones in this module for practical use with PO message elements.
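Example (editor's sketch, not in the original docstring; exact break points depend on the text and parameters):: lines = wrap_text(some_text, wcol=30, natbr=".,;") # -> list of lines, each terminated with endl ("\n" by default)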
@param text: the text to wrap @type text: string @param wcol: column to wrap after @type wcol: int @param lead: prefix for each line @type lead: string @param trail: suffix for each line @type trail: string @param flead: special lead for the first line. Normal lead is used if this is given as C{None} @type flead: C{None} or string @param femp: C{True} to leave the first line empty if the complete text would not fit into it, C{False} for normal use of the first line @type femp: bool @param natbr: characters other than space to naturally break at @type natbr: string @param natbr2: characters other than space to naturally break at, also taking the breaking character to the next line @type natbr2: string @param prebr: character sequences to unconditionally break before @type prebr: (string*) @param postbr: character sequences to unconditionally break after @type postbr: (string*) @param tagbr: tag names to break before opening and after closing @type tagbr: (string*) @param tagbr2: tag names to always break after (like
C{<br/>}) @type tagbr2: (string*) @param wcolmin: minimal column to allow natural breaks after @type wcolmin: int @param midbr: C{True} to allow break in the middle of a word if no usual break found before C{wcol} has been exceeded @type midbr: bool @param remtrws: whether to strictly remove any trailing whitespace in wrapped lines (otherwise trailing whitespace may be left in under certain conditions) @type remtrws: bool @param endl: line end marker for each line @type endl: string @returns: wrapped lines @rtype: [string*] """ if flead is None: flead = lead rlentext = len(text) atoms = _atomize(text)[:-1] # strip sentry vlenlead = _atomize(lead)[-1][2] vlentrail = _atomize(trail)[-1][2] vlenflead = _atomize(flead)[-1][2] if wcol > 0 and vlenlead + vlentrail + 1 >= wcol: raise PologyError( _("@info", "Wrapping is too tight, cannot fit leading and trailing text.")) lines = [] # list of lines nlines = 0 lenatoms = len(atoms) p = 0 # position into original text by atoms vtext = "".join(x[0] for x in atoms) vposs = tuple(x[2] for x in atoms) rvposs = tuple(x[6] for x in atoms) while p < lenatoms: # Determine effective wrapping column for this line. ewcol = wcol - 1 - vlentrail # -1 for newline character if nlines == 0: clead = flead ewcol -= vlenflead else: clead = lead ewcol -= vlenlead # Find where to wrap. atbr = False # immediate break found pl = 0 # position into current line ple = 0 # apparent position into current line pl_ok = 0 # last good position into current line (where wrap was fine) ple_ok = 0 # last good apparent position into current line pvseg, pvlen = "", 0 while ( p + pl < lenatoms and (ple <= ewcol or wcol <= 0 or (not midbr and pl_ok == 0)) and not atbr ): if pl > 0: pvseg, pvlen = atoms[p + pl - 1][:2] cvseg, cvlen = atoms[p + pl][:2] if postbr or tagbr or tagbr2: # condition for optimization backvtext = vtext[rvposs[p]:rvposs[p + pl]] if prebr or tagbr: # condition for optimization forevtext = vtext[rvposs[p + pl]:] # Immediate breaks allowed only after # at least one visually non-empty atom. if vposs[p + pl] > vposs[p]: # Check for an immediate break by sequence. for br in postbr: if not isinstance(br, tuple): if backvtext.endswith(br): atbr = True; break else: br1, br2 = br if ( backvtext.endswith(br1) and not backvtext.endswith(br2) ): atbr = True; break if atbr: break for br in prebr: if forevtext.startswith(br): atbr = True; break if atbr: break # Check for an immediate break by tag. if tagbr or tagbr2: if backvtext.endswith(">"): pt = backvtext.rfind("<", 0, -1) if pt >= 0: tag, state = _tag_split(backvtext[pt:]) if ( (tag in tagbr2) or ( tag in tagbr and state in ("close", "inplace")) ): atbr = True; break if tagbr: if forevtext.startswith("<"): pt = forevtext.find(">", 1) if pt >= 0: tag, state = _tag_split(forevtext[:pt+1]) if tag in tagbr and state == "open": atbr = True; break # Check for valid natural break. if ( pvseg in " " or (cvseg != " " and pvseg in natbr and cvseg not in natbr) or cvseg in natbr2 ): pl_ok = pl ple_ok = ple ple += pvlen pl += 1 # If not unconditional break, still enough text, and break possible. if not atbr and ple > ewcol and ewcol > 0: # Don't allow too short natural break. if ple_ok > wcolmin: pl = pl_ok ple = ple_ok # Backstep any segments still too much if mid-word break allowed. if midbr: while pl > 1 and ple > ewcol: pl -= 1 ple -= atoms[p + pl][1] # Never break after non-final backslash.
if p + pl < lenatoms: while pl > 1 and atoms[p + pl - 1][0] == "\\": pl -= 1 ple -= atoms[p + pl][1] if ( nlines == 0 and ((femp and p + pl < lenatoms) or (ewcol <= 0 and wcol > 0)) ): # leaving first line empty lines.append(clead + trail) pl = 0 else: p1 = atoms[p][4] p2 = atoms[p + pl][4] if p + pl < lenatoms else rlentext lines.append(clead + text[p1:p2] + trail) nlines += 1 p += pl if lenatoms == 0: # in case no text given, main loop did not run lines.append(flead + trail) for i in range(len(lines)): # postprocess # Strip trailing whitespace if no trailing string or removal is forced. if not trail or remtrws: # Do not remove trailing whitespace which is part of leading string, # unless removal is forced. clead = "" if not remtrws: if i == 0: clead = flead else: clead = lead tmp = lines[i][len(clead):] lines[i] = clead + tmp.rstrip() if endl: lines[i] += endl return lines def _atomize (text): """ Split text into atomic segments and compute their visual and raw widths. Returns list of tuples (visual segment, visual length, visual position, raw length, raw position, raw visual length, raw visual position). The list always ends with zero-visual length segment, so that it is not empty even if the text is empty, and that last atom's positions are visual and raw lengths of the string. """ atoms = [] - isuc = isinstance(text, unicode) + isuc = isinstance(text, str) vsegf = getattr(text, "visual_segment", None) rpos = 0 vpos = 0 rvpos = 0 rlentext = len(text) while rpos < rlentext: rlen = 0 if vsegf: vseg, rlen = vsegf(rpos) if rlen == 0: vseg, rlen = text[rpos], 1 vlen = len(vseg) rvlen = vlen if isuc and vlen: for c in vseg: if unicodedata.east_asian_width(c) in ("W", "F"): vlen += 1 # 1 = 2 minus (1 already counted) atoms.append((vseg, vlen, vpos, rlen, rpos, rvlen, rvpos)) vpos += vlen rpos += rlen rvpos += rvlen atoms.append((type(text)(""), 0, vpos, 0, rpos, 0, rvpos)) return atoms def wrap_field (field, text, preseq=""): """ Wrap fields in PO messages. This function can be sent as parameter to L{Message} and L{Catalog} methods and constructors. @param field: the field keyword (C{"msgctxt"}, C{"msgid"}, ...) @type field: string @param text: the text of the field @type text: string @param preseq: the prefix to field keyword, usually for previous-value (C{"#|"}) and obsolete (C{"#~"}) fields @type preseq: string @returns: wrapped field lines (each ends with a newline) @rtype: list of strings """ return wrap_text(text, 79, flead=preseq+field+" \"", lead=preseq+"\"", trail="\"", natbr=_natbr_after, natbr2=_natbr_before, prebr=_prebr, postbr=_postbr, femp=True, wcolmin=39) def wrap_field_unwrap (field, text, preseq=""): """ Wrap fields in PO messages at unconditional breaks (no column-wrapping). This function can be sent as parameter to L{Message} and L{Catalog} methods and constructors. The parameters and return values are as for L{wrap_field}. @see: L{wrap_field} """ return wrap_text(text, 0, flead=preseq+field+" \"", lead=preseq+"\"", trail="\"", prebr=_prebr, postbr=_postbr, femp=True) def wrap_comment (ctype, text): """ Wrap comments in PO messages. @param ctype: the comment type (C{"# "}, C{"#:"}, C{"#."}, ...) @type ctype: string @param text: the text of the comment @type text: string @returns: wrapped comment lines (each ends with a newline) @rtype: list of strings """ return wrap_text(text, 79, lead="#"+ctype+" ", femp=False, midbr=False, remtrws=True) # midbr is False in order to prevent e.g. very long source references # being forced split in the middle. 
# remtrws is True in order to remove the trailing space in empty comments. def wrap_comment_unwrap (ctype, text): """ Wrap comments in PO messages at unconditional breaks (no column-wrapping). The parameters and return values are as for L{wrap_comment}. @see: L{wrap_comment} """ return wrap_text(text, 0, lead="#"+ctype+" ", femp=False, remtrws=True) def wrap_field_fine (field, text, preseq=""): """ Wrap fields in PO messages, including breaks at selected markup elements. This function can be sent as parameter to L{Message} and L{Catalog} methods and constructors. The parameters and return values are as for L{wrap_field}. @see: L{wrap_field} """ return wrap_text(text, 79, flead=preseq+field+" \"", lead=preseq+"\"", trail="\"", natbr=_natbr_after, natbr2=_natbr_before, prebr=_prebr, postbr=_postbr, tagbr=_tagbr_normal, tagbr2=_tagbr_inplace, femp=True) def wrap_field_fine_unwrap (field, text, preseq=""): """ Wrap fields in PO messages, including breaks at selected markup elements, but only at unconditional breaks (no column-wrapping). This function can be sent as parameter to L{Message} and L{Catalog} methods and constructors. The parameters and return values are as for L{wrap_field}. @see: L{wrap_field} """ return wrap_text(text, 0, flead=preseq+field+" \"", lead=preseq+"\"", trail="\"", prebr=_prebr, postbr=_postbr, tagbr=_tagbr_normal, tagbr2=_tagbr_inplace, femp=True) def select_field_wrapper (wrapkw): """ Select wrap function for PO message fields based on keywords. Wrap function is selected by specifying a sequence of keywords, from the following set: - C{"basic"}: wrapping on column count - C{"fine"}: wrapping on logical breaks (such as C{
<p>} or C{<para>} tags) Wrapping on newline characters is always engaged. If C{wrapkw} is given as C{None}, C{"basic"} only is assumed. @param wrapkw: wrapping keywords @type wrapkw: sequence of strings or C{None} @returns: wrapping function @rtype: (string, string, string?)->[string] @see: L{wrap_field} """ if wrapkw is None: wrapkw = ["basic"] if "basic" in wrapkw: if "fine" in wrapkw: wrapf = wrap_field_fine else: wrapf = wrap_field else: if "fine" in wrapkw: wrapf = wrap_field_fine_unwrap else: wrapf = wrap_field_unwrap return wrapf def select_field_wrapping (cfgsec=None, cat=None, cmlopt=None): """ Select wrapping keywords for PO message fields based on various inputs. There are three possible sources of wrapping information: - a user configuration section, possibly containing wrapping fields - the catalog to which the wrapping should be applied, possibly defining wrapping in its header - command line options for wrapping This function will examine these three sources with increasing priority, and return a tuple of applicable L{wrapping keywords}. Any of these sources can also be omitted; if all are omitted, C{("basic",)} is returned. @param cfgsec: a section of user configuration @type cfgsec: L{section} @param cat: the catalog to be wrapped @type cat: L{Catalog} @param cmlopt: command line options @type cmlopt: optparse.ConfigParser @returns: wrapping keywords @rtype: (string*) @see: L{select_field_wrapper} """ # Default wrapping. wrapping = ["basic"] # Helper to remove and add wrapping types. def waddrem (add, wtype): if add is False and wtype in wrapping: wrapping.remove(wtype) elif add is True and wtype not in wrapping: wrapping.append(wtype) # Restrict wrapping in following priority of overrides. # - configuration if cfgsec is not None: waddrem(cfgsec.boolean("wrap", None), "basic") waddrem(cfgsec.boolean("fine-wrap", None), "fine") # - catalog wrapping_cat = cat.wrapping() if cat is not None else None if wrapping_cat is not None: waddrem("basic" in wrapping_cat, "basic") waddrem("fine" in wrapping_cat, "fine") # - command line if cmlopt is not None: waddrem(cmlopt.do_wrap, "basic") waddrem(cmlopt.do_fine_wrap, "fine") return tuple(sorted(wrapping)) diff --git a/scripts/create_rules_from_koffice_autocorrect.py b/scripts/create_rules_from_koffice_autocorrect.py index aa443592..1ad2cc4d 100755 --- a/scripts/create_rules_from_koffice_autocorrect.py +++ b/scripts/create_rules_from_koffice_autocorrect.py @@ -1,57 +1,57 @@ #!/usr/bin/env python # -*- coding: UTF-8 -*- """ Create Pology rules from the KOffice KWord autocorrect xml file. This script is intended to be run standalone. Usage:: python create_rules_from_koffice_autocorrect.py <autocorrect_file> <rule_file> @author: Sébastien Renard @license: GPLv3 """ import re import sys from codecs import open import locale def main(): locale.setlocale(locale.LC_ALL, "") if len(sys.argv)!=3: usage() #TODO: check file is readable kofficeFile=open(sys.argv[1], "r", "utf-8") #TODO: check path is writable ruleFile=open(sys.argv[2], "w", "utf-8") # Regexp to find autocorrect items regexp=re.compile('<item find="(.*)" replace="(.*)" />') #Header ruleFile.write("# Generated rules from KOffice autocorrect file\n") ruleFile.write("# by the KOffice project (http://www.koffice.org)\n") ruleFile.write("# License: GPLv3\n\n") #TODO: exceptions should be in a separated file, not hardcoded.
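# (Editor's illustration; the item pattern above was reconstructed from its use and should be treated as an assumption.) A KOffice autocorrect entry looks like
#     <item find="teh" replace="the" />
# so match.group(1) is the string to find and match.group(2) its replacement.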
exceptions=["http:/", "http://", "etc...", "language"] for line in kofficeFile: match=regexp.match(line.strip()) if match: find=match.group(1) replace=match.group(2) if find not in exceptions: ruleFile.write(u'[&lwb;%s&rwb;]\nhint="%s => %s (d\'après le fichier de correction de KOffice)"\n\n' % (find, find, replace)) #Footer ruleFile.write("\n#End of rule file\n") ruleFile.close() def usage(): - print "\t%s <autocorrect_file> <rule_file>" % sys.argv[0] + print("\t%s <autocorrect_file> <rule_file>" % sys.argv[0]) sys.exit(1) if __name__ == '__main__': main() diff --git a/scripts/normalize-aspell-word-list.py b/scripts/normalize-aspell-word-list.py index 1f94da35..73026075 100755 --- a/scripts/normalize-aspell-word-list.py +++ b/scripts/normalize-aspell-word-list.py @@ -1,136 +1,136 @@ #!/usr/bin/env python # -*- coding: UTF-8 -*- """ Organize dictionary file: - sort entries - remove duplicates - update header This script is intended to be run standalone. Usage:: python normalize-aspell-word-list.py [-r|--remove-invalid] DICTFILE... @author: Sébastien Renard @license: GPLv3 """ import locale from codecs import open from os.path import abspath, basename import re import sys try: import fallback_import_paths except: pass from pology import _, n_ from pology.report import report, error def main(): locale.setlocale(locale.LC_ALL, "") # FIXME: Use pology.colors.ColorOptionParser. reminv=False paths=[] for arg in sys.argv[1:]: if arg.startswith("-"): if arg in ("-r", "--remove-invalid"): reminv = True else: error(_("@info", "Unknown option '%(opt)s' in command line.", opt=arg)) else: paths.append(arg) if len(paths)<1: usage() for path in paths: organize(path, reminv) def organize (dictPath, reminv=False): report(dictPath) dictEncDefault = "UTF-8" dictFile=open(dictPath, "r", dictEncDefault) # Parse the header for language and encoding. header=dictFile.readline() m=re.search(r"^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s*", header) if not m: error(_("@info", "Malformed header of the dictionary file '%(file)s'.", file=dictPath)) dictType, dictLang, numWords, dictEnc=m.groups() expDictType = "personal_ws-1.1" if dictType != expDictType: dictType = expDictType report(" " + _("@item:inlist", "dictionary type changed to '%(dtype)s'", dtype=expDictType)) # Reopen in correct encoding if not the default. if dictEnc.lower() != dictEncDefault.lower(): dictFile.close() dictFile=open(dictPath, "r", dictEnc) # Read all words and eliminate duplicates. words=set() - validCharacters=re.compile(ur"^[\w\d\'・-]+$", re.UNICODE) + validCharacters=re.compile(r"^[\w\d\'・-]+$", re.UNICODE) lno = 0 for word in dictFile: lno += 1 word=word.strip() if not word or word.startswith("personal_ws"): continue if word in words: report(" " + _("@item:inlist", "duplicate removed: %(word)s", word=word)) elif not validCharacters.match(word): if not reminv: report(" " + _("@item:inlist", "*** invalid word at %(line)s: %(word)s", line=lno, word=word)) words.add(word) else: report(" " + _("@item:inlist", "invalid word removed: %(word)s", word=word)) else: words.add(word) dictFile.close() words=list(words) numWords=len(words) # Sort the list according to current locale, ignoring case. words.sort(key=lambda w: locale.strxfrm(w.lower())) # Write back the updated dictionary.
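# (Editor's note.) The rewritten file leads with the standard aspell personal-dictionary header produced by the format string below, e.g. with illustrative values: "personal_ws-1.1 fr 1234 UTF-8", followed by one word per line.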
dictFile=open(dictPath, "w", dictEnc) dictFile.write("%s %s %d %s\n" % (dictType, dictLang, numWords, dictEnc)) dictFile.write("\n".join(words)) dictFile.write("\n") dictFile.close() report(" " + n_("@item:inlist", "written %(num)d word", "written %(num)d words", num=len(words))) def usage(): report(_("@info", "Usage: %(cmd)s [-r|--remove-invalid] DICTFILE...", cmd=basename(sys.argv[0]))) sys.exit(1) if __name__ == '__main__': main() diff --git a/scripts/poascribe.py b/scripts/poascribe.py index e84579b0..e01c33ba 100755 --- a/scripts/poascribe.py +++ b/scripts/poascribe.py @@ -1,1526 +1,1526 @@ #!/usr/bin/env python # -*- coding: UTF-8 -*- try: import fallback_import_paths except: pass import datetime import locale import os import re import sys from tempfile import NamedTemporaryFile import time from pology import PologyError, version, _, n_, t_ from pology.ascript import collect_ascription_associations from pology.ascript import collect_ascription_history from pology.ascript import collect_ascription_history_segment from pology.ascript import ascription_equal, merge_modified from pology.ascript import ascribe_modification, ascribe_review from pology.ascript import first_non_fuzzy, has_tracked_parts from pology.ascript import make_ascription_selector from pology.ascript import AscPoint from pology.catalog import Catalog from pology.header import Header, TZInfo, format_datetime from pology.message import Message, MessageUnsafe from pology.gtxtools import msgfmt from pology.colors import ColorOptionParser, cjoin import pology.config as pology_config from pology.diff import msg_ediff, msg_ediff_to_new from pology.diff import editprob from pology.fsops import str_to_unicode, unicode_to_str from pology.fsops import collect_paths_cmdline, collect_catalogs from pology.fsops import mkdirpath, join_ncwd from pology.fsops import exit_on_exception from pology.getfunc import get_hook_ireq from pology.merge import merge_pofile from pology.monitored import Monlist from pology.msgreport import warning_on_msg, report_msg_content from pology.msgreport import report_msg_to_lokalize from pology.report import report, error, format_item_list from pology.report import init_file_progress from pology.stdcmdopt import add_cmdopt_incexc, add_cmdopt_filesfrom from pology.tabulate import tabulate # Wrapping in ascription catalogs. _ascwrapping = ["fine"] # Flag used to mark diffed messages. # NOTE: All diff flags should start with 'ediff', as some other scripts # only need to check if any of them is present. -_diffflag = u"ediff" -_diffflag_tot = u"ediff-total" -_diffflag_ign = u"ediff-ignored" +_diffflag = "ediff" +_diffflag_tot = "ediff-total" +_diffflag_ign = "ediff-ignored" # Flags used to explicitly mark messages as reviewed or unreviewed. -_revdflags = (u"reviewed", u"revd", u"rev") # synonyms -_urevdflags = (u"unreviewed", u"nrevd", u"nrev") # synonyms +_revdflags = ("reviewed", "revd", "rev") # synonyms +_urevdflags = ("unreviewed", "nrevd", "nrev") # synonyms # Comment used to show ascription chain in messages marked for review. _achncmnt = "~ascto:" # String used to separate tags to review flags. _flagtagsep = "/" _diffflags = (_diffflag, _diffflag_tot, _diffflag_ign) _all_flags = _diffflags + _revdflags + _urevdflags _all_flags = sorted(_all_flags, key=lambda x: (-len(x), x)) # ...this order is necessary for proper |-linking in regexes. _all_cmnts = (_achncmnt,) # Datetime at the moment the script is started. 
_dt_start = datetime.datetime(*(time.localtime()[:6] + (0, TZInfo()))) def main (): locale.setlocale(locale.LC_ALL, "") mode_spec = ( ("status", ("st",)), ("commit", ("co", "ci", "mo")), ("diff", ("di",)), ("purge", ("pu",)), ("history", ("hi",)), ) mode_allnames = set() mode_tolong = {} for name, syns in mode_spec: mode_allnames.add(name) mode_allnames.update(syns) mode_tolong[name] = name mode_tolong.update((s, name) for s in syns) known_editors = { "lokalize": report_msg_to_lokalize, } # Setup options and parse the command line. usage = _("@info command usage", "%(cmd)s MODE [OPTIONS] [PATHS...]", cmd="%prog") desc = _("@info command description", "Keep track of who, when, and how, has translated, modified, " "or reviewed messages in a collection of PO files.") ver = _("@info command version", - u"%(cmd)s (Pology) %(version)s\n" - u"Copyright © 2008, 2009, 2010 " - u"Chusslove Illich (Часлав Илић) <%(email)s>", + "%(cmd)s (Pology) %(version)s\n" + "Copyright © 2008, 2009, 2010 " + "Chusslove Illich (Часлав Илић) <%(email)s>", cmd="%prog", version=version(), email="caslav.ilic@gmx.net") opars = ColorOptionParser(usage=usage, description=desc, version=ver) opars.add_option( "-a", "--select-ascription", metavar=_("@info command line value placeholder", "SELECTOR[:ARGS]"), action="append", dest="aselectors", default=None, help=_("@info command line option description", "Select a message from ascription history by this selector. " "Can be repeated, in which case the message is selected " "if all selectors match it.")) opars.add_option( "-A", "--min-adjsim-diff", metavar=_("@info command line value placeholder", "RATIO"), action="store", dest="min_adjsim_diff", default=None, help=_("@info command line option description", "Minimum adjusted similarity between two versions of a message " "needed to show the embedded difference. " "Range 0.0-1.0, where 0 means always to show the difference, " "and 1 never to show it; a convenient range is 0.6-0.8. " "When the difference is not shown, the '%(flag)s' flag is " "added to the message.", flag=_diffflag_ign)) opars.add_option( "-b", "--show-by-file", action="store_true", dest="show_by_file", default=False, help=_("@info command line option description", "Next to global summary, also present results by file.")) opars.add_option( "-C", "--no-vcs-commit", action="store_false", dest="vcs_commit", default=None, help=_("@info command line option description", "Do not commit catalogs to version control " "(when version control is used).")) opars.add_option( "-d", "--depth", metavar=_("@info command line value placeholder", "LEVEL"), action="store", dest="depth", default=None, help=_("@info command line option description", "Consider ascription history up to this level into the past.")) opars.add_option( "-D", "--diff-reduce-history", metavar=_("@info command line value placeholder", "SPEC"), action="store", dest="diff_reduce_history", default=None, help=_("@info command line option description", "Reduce each message in history to a part of the difference " "from the first earlier modification: to added, removed, or " "equal segments. 
" "The value begins with one of the characters 'a', 'r', or 'e', " "followed by substring that will be used to separate " "selected difference segments in resulting messages " "(if this substring is empty, space is used).")) opars.add_option( "-F", "--filter", metavar=_("@info command line value placeholder", "NAME"), action="append", dest="filters", default=None, help=_("@info command line option description", "Pass relevant message text fields through a filter before " "matching or comparing them (relevant in some modes). " "Can be repeated to add several filters.")) opars.add_option( "-G", "--show-filtered", action="store_true", dest="show_filtered", default=False, help=_("@info command line option description", "When operating under a filter, also show filtered versions " "of whatever is shown in original (e.g. in diffs).")) opars.add_option( "-k", "--keep-flags", action="store_true", dest="keep_flags", default=False, help=_("@info command line option description", "Do not remove review-significant flags from messages " "(possibly convert them as appropriate).")) opars.add_option( "-m", "--message", metavar=_("@info command line value placeholder", "TEXT"), action="store", dest="message", default=None, help=_("@info command line option description", "Version control commit message for original catalogs, " "when %(opt)s is in effect.", opt="-c")) opars.add_option( "-o", "--open-in-editor", metavar=("|".join(sorted(known_editors))), action="store", dest="po_editor", default=None, help=_("@info command line option description", "Open selected messages in one of the supported PO editors.")) opars.add_option( "-L", "--max-fraction-select", metavar=_("@info command line value placeholder", "FRACTION"), action="store", dest="max_fraction_select", default=None, help=_("@info command line option description", "Select messages in a catalog only if the total number " "of selected messages in that catalog would be at most " "the given fraction (0.0-1.0) of total number of messages.")) opars.add_option( "-s", "--selector", metavar=_("@info command line value placeholder", "SELECTOR[:ARGS]"), action="append", dest="selectors", default=None, help=_("@info command line option description", "Consider only messages matched by this selector. " "Can be repeated, in which case the message is selected " "if all selectors match it.")) opars.add_option( "-t", "--tag", metavar=_("@info command line value placeholder", "TAG"), action="store", dest="tags", default=None, help=_("@info command line option description", "Tag to add or consider in ascription records. 
" "Several tags may be given separated by commas.")) opars.add_option( "-u", "--user", metavar=_("@info command line value placeholder", "USER"), action="store", dest="user", default=None, help=_("@info command line option description", "User in whose name the operation is performed.")) opars.add_option( "-U", "--update-headers", action="store_true", dest="update_headers", default=None, help=_("@info command line option description", "Update headers in catalogs which contain modifications " "before committing them, with user's translator information.")) opars.add_option( "-v", "--verbose", action="store_true", dest="verbose", default=False, help=_("@info command line option description", "Output more detailed progress info.")) opars.add_option( "-w", "--write-modified", metavar=_("@info command line value placeholder", "FILE"), action="store", dest="write_modified", default=None, help=_("@info command line option description", "Write paths of all original catalogs modified by " "ascription operations into the given file.")) opars.add_option( "-x", "--externals", metavar=_("@info command line value placeholder", "PYFILE"), action="append", dest="externals", default=[], help=_("@info command line option description", "Collect optional functionality from an external Python file " "(selectors, etc).")) opars.add_option( "--all-reviewed", action="store_true", dest="all_reviewed", default=False, help=_("@info command line option description", "Ascribe all messages as reviewed on commit, " "overriding any existing review elements. " "Tags given by %(opt)s apply. " "This should not be done in day-to-day practice; " "the primary use is initial review ascription.", opt="--tag")) add_cmdopt_filesfrom(opars) add_cmdopt_incexc(opars) (options, free_args) = opars.parse_args(str_to_unicode(sys.argv[1:])) # Parse operation mode and its arguments. if len(free_args) < 1: error(_("@info", "Operation mode not given.")) rawmodename = free_args.pop(0) modename = mode_tolong.get(rawmodename) if modename is None: flatmodes = ["/".join((x[0],) + x[1]) for x in mode_spec] error(_("@info", "Unknown operation mode '%(mode)s' " "(known modes: %(modelist)s).", mode=rawmodename, modelist=format_item_list(flatmodes))) # For options not issued, read values from user configuration. # Configuration values can also be issued by mode using # C{afield/amode = value} syntax, which takes precedence over # general fields (e.g. C{filters/review} vs. C{filters}). cfgsec = pology_config.section("poascribe") for optname, getvalf, defval in ( ("aselectors", cfgsec.strdlist, []), ("vcs-commit", cfgsec.boolean, True), ("po-editor", cfgsec.string, None), ("filters", cfgsec.strslist, []), ("min-adjsim-diff", cfgsec.real, 0.0), ("selectors", cfgsec.strdlist, []), ("tags", cfgsec.string, ""), ("user", cfgsec.string, None), ("update-headers", cfgsec.boolean, False), ("diff-reduce-history", cfgsec.string, None), ("max-fraction-select", cfgsec.real, 1.01), ): uoptname = optname.replace("-", "_") if getattr(options, uoptname) is None: for fldname in ("%s/%s" % (optname, modename), optname): fldval = getvalf(fldname, None) if fldval is not None: break if fldval is None: fldval = defval setattr(options, uoptname, fldval) # Convert options to non-string types. 
def valconv_editor (edkey): msgrepf = known_editors.get(edkey) if msgrepf is None: error(_("@info", "PO editor '%(ed)s' is not among " "the supported editors: %(edlist)s.", ed=edkey, edlist=format_item_list(sorted(known_editors)))) return msgrepf def valconv_tags (cstr): return set(x.strip() for x in cstr.split(",")) for optname, valconv in ( ("max-fraction-select", float), ("min-adjsim-diff", float), ("po-editor", valconv_editor), ("tags", valconv_tags), ): uoptname = optname.replace("-", "_") valraw = getattr(options, uoptname, None) if valraw is not None: try: value = valconv(valraw) except TypeError: error(_("@info", "Value '%(val)s' to option '%(opt)s' is of wrong type.", val=valraw, opt=("--" + optname))) setattr(options, uoptname, value) # Collect any external functionality. for xmod_path in options.externals: collect_externals(xmod_path) # Create history filter if requested, store it in options. options.hfilter = None options.sfilter = None if options.filters: hfilters = [] for hspec in options.filters: hfilters.append(get_hook_ireq(hspec, abort=True)) def hfilter_composition (text): for hfilter in hfilters: text = hfilter(text) return text options.hfilter = hfilter_composition if options.show_filtered: options.sfilter = options.hfilter # Create specification for reducing historical messages to diffs. options.addrem = None if options.diff_reduce_history: options.addrem = options.diff_reduce_history if options.addrem[:1] not in ("a", "e", "r"): error(_("@info", "Value '%(val)s' to option '%(opt)s' must start " "with '%(char1)s', '%(char2)s', or '%(char3)s'.", val=options.addrem, opt="--diff-reduce-history", char1="a", char2="e", char3="r")) # Create selectors if any explicitly given. selector = None if options.selectors: selector = make_ascription_selector(options.selectors) aselector = None if options.aselectors: aselector = make_ascription_selector(options.aselectors, hist=True) # Assemble operation mode. needuser = False canselect = False canaselect = False class _Mode: pass mode = _Mode() mode.name = modename if 0: pass elif mode.name == "status": mode.execute = status mode.selector = selector or make_ascription_selector(["any"]) canselect = True elif mode.name == "commit": mode.execute = commit mode.selector = selector or make_ascription_selector(["any"]) needuser = True canselect = True elif mode.name == "diff": mode.execute = diff mode.selector = selector or make_ascription_selector(["modar"]) mode.aselector = aselector canselect = True canaselect = True elif mode.name == "purge": mode.execute = purge mode.selector = selector or make_ascription_selector(["any"]) canselect = True elif mode.name == "history": mode.execute = history mode.selector = selector or make_ascription_selector(["any"]) canselect = True else: error(_("@info", "Unhandled operation mode '%(mode)s'.", mode=mode.name)) mode.user = None if needuser: if not options.user: error(_("@info", "Operation mode '%(mode)s' requires a user " "to be specified.", mode=mode.name)) mode.user = options.user if not canselect and selector: error(_("@info", "Operation mode '%(mode)s' does not accept selectors.", mode=mode.name)) if not canaselect and aselector: error(_("@info", "Operation mode '%(mode)s' does not accept history selectors.", mode=mode.name)) # Collect list of catalogs supplied through command line. # If none supplied, assume current working directory. 
catpaths = collect_paths_cmdline(rawpaths=free_args, incnames=options.include_names, incpaths=options.include_paths, excnames=options.exclude_names, excpaths=options.exclude_paths, filesfrom=options.files_from, elsecwd=True, respathf=collect_catalogs, abort=True) # Split catalogs into lists by ascription config, # and link them to their ascription catalogs. aconfs_catpaths = collect_ascription_associations(catpaths) assert_review_tags(aconfs_catpaths, options.tags) # Execute operation. mode.execute(options, aconfs_catpaths, mode) # Write out list of modified original catalogs if requested. if options.write_modified and _modified_cats: lfpath = options.write_modified f = open(lfpath, "w", encoding="utf-8") f.write("\n".join(sorted(_modified_cats)) + "\n") f.close() report(_("@info", "Paths of modified catalogs written to '%(file)s'.", file=lfpath)) def vcs_commit_catalogs (aconfs_catpaths, user, message=None, onabortf=None): report(_("@info:progress VCS is acronym for \"version control system\"", ">>>>> VCS is committing catalogs:")) # Attach paths to each distinct config, to commit them all at once. aconfs = [] catpaths_byconf = {} for aconf, catpaths in aconfs_catpaths: if aconf not in catpaths_byconf: catpaths_byconf[aconf] = [] aconfs.append(aconf) for catpath, acatpath in catpaths: catpaths_byconf[aconf].append(catpath) if os.path.isfile(acatpath): catpaths_byconf[aconf].append(acatpath) # Commit by config. for aconf in aconfs: cmsg = message cmsgfile = None if not cmsg: cmsg = aconf.commitmsg if not cmsg: cmsgfile, cmsgfile_orig = get_commit_message_file_path(user) else: cmsg += " " + fmt_commit_user(user) added, apaths = aconf.vcs.add(catpaths_byconf[aconf], repadd=True) if not added: if onabortf: onabortf() error(_("@info", "VCS reports that some catalogs cannot be added.")) cpaths = sorted(set(map(join_ncwd, catpaths_byconf[aconf] + apaths))) if not aconf.vcs.commit(cpaths, message=cmsg, msgfile=cmsgfile, incparents=False): if onabortf: onabortf() if not cmsgfile: error(_("@info", "VCS reports that some catalogs cannot be committed.")) else: os.remove(cmsgfile) error(_("@info", "VCS reports that some catalogs cannot be committed " "(commit message preserved in '%(file)s').", file=cmsgfile_orig)) if cmsgfile: os.remove(cmsgfile) os.remove(cmsgfile_orig) def fmt_commit_user (user): return "[>%s]" % user def get_commit_message_file_path (user): while True: tfmt = time.strftime("%Y-%m-%d-%H-%M-%S") prefix = "poascribe-commit-message" ext = "txt" fpath = "%s-%s.%s" % (prefix, tfmt, ext) fpath_asc = "%s-%s-asc.%s" % (prefix, tfmt, ext) if not os.path.isfile(fpath) and not os.path.isfile(fpath_asc): break edcmd = None if not edcmd: edcmd = os.getenv("ASC_EDITOR") if not edcmd: edcmd = pology_config.section("poascribe").string("editor") if not edcmd: edcmd = os.getenv("EDITOR") if not edcmd: edcmd = "/usr/bin/vi" cmd = "%s %s" % (edcmd, fpath) if os.system(cmd): error(_("@info", "Error from editor command '%(cmd)s' for commit message.", cmd=cmd)) if not os.path.isfile(fpath): error(_("@info", "Editor command '%(cmd)s' did not produce a file.", cmd=cmd)) cmsg = open(fpath, "r").read() if not cmsg.endswith("\n"): cmsg += "\n" fmt_user = unicode_to_str(fmt_commit_user(user)) if cmsg.count("\n") == 1: cmsg = cmsg[:-1] + " " + fmt_user + "\n" else: cmsg += fmt_user + "\n" fh = open(fpath_asc, "w") fh.write(cmsg) fh.close() return fpath_asc, fpath def assert_mode_user (aconfs_catpaths, mode): for aconf, catpaths in aconfs_catpaths: if mode.user not in aconf.users: error(_("@info", "User
'%(user)s' not defined in '%(file)s'.", user=mode.user, file=aconf.path)) def assert_review_tags (aconfs_catpaths, tags): for aconf, catpaths in aconfs_catpaths: for tag in tags: if tag not in aconf.revtags: error(_("@info", "Review tag '%(tag)s' not defined in '%(file)s'.", tag=tag, file=aconf.path)) def assert_syntax (aconfs_catpaths, onabortf=None): checkf = msgfmt(options=["--check"]) numerr = 0 for aconf, catpaths in aconfs_catpaths: for catpath, acatpath in catpaths: numerr += checkf(catpath) if numerr: if onabortf: onabortf() error(_("@info", "Invalid syntax in some files, see the reports above. " "Ascription aborted.")) return numerr def setup_progress (aconfs_catpaths, addfmt): acps = [y[0] for x in aconfs_catpaths for y in x[1]] return init_file_progress(acps, addfmt=addfmt) # Exclusive states of a message, as reported by Message.state(). _st_tran = "T" _st_fuzzy = "F" _st_untran = "U" _st_otran = "OT" _st_ofuzzy = "OF" _st_ountran = "OU" _all_states = ( _st_tran, _st_fuzzy, _st_untran, _st_otran, _st_ofuzzy, _st_ountran, ) def status (options, aconfs_catpaths, mode): # Count ascribed and unascribed messages through catalogs. counts_a = dict([(x, {}) for x in _all_states]) counts_na = dict([(x, {}) for x in _all_states]) upprog = setup_progress(aconfs_catpaths, t_("@info:progress", "Examining state: %(file)s")) for aconf, catpaths in aconfs_catpaths: for catpath, acatpath in catpaths: upprog(catpath) # Open current and ascription catalog. cat = Catalog(catpath, monitored=False) acat = Catalog(acatpath, create=True, monitored=False) # Count ascribed and non-ascribed by original catalog. nselected = 0 for msg in cat: purge_msg(msg) ahist = collect_ascription_history( msg, acat, aconf, hfilter=options.hfilter, addrem=options.addrem, nomrg=True) if ahist[0].user is None and not has_tracked_parts(msg): continue # pristine if not mode.selector(msg, cat, ahist, aconf): continue # not selected counts = ahist[0].user is None and counts_na or counts_a st = msg.state() if catpath not in counts[st]: counts[st][catpath] = 0 counts[st][catpath] += 1 nselected += 1 # Count non-ascribed by ascription catalog. for amsg in acat: if amsg not in cat: ast = amsg.state() st = None if ast == _st_tran: st = _st_otran elif ast == _st_fuzzy: st = _st_ofuzzy elif ast == _st_untran: st = _st_ountran if st: if catpath not in counts_na[st]: counts_na[st][catpath] = 0 counts_na[st][catpath] += 1 # Cancel counts if maximum selection fraction exceeded. if float(nselected) / len(cat) > options.max_fraction_select: for counts in (counts_a, counts_na): for st in _all_states: if catpath in counts[st]: counts[st].pop(catpath) upprog() # Some general data for tabulation of output. coln = [_("@title:column translated messages", "msg/t"), _("@title:column fuzzy messages", "msg/f"), _("@title:column untranslated messages", "msg/u"), _("@title:column obsolete translated messages", "msg/ot"), _("@title:column obsolete fuzzy messages", "msg/of"), _("@title:column obsolete untranslated messages", "msg/ou")] none="-" # NOTE: When reporting, do not show anything if there are # neither ascribed nor non-ascribed messages selected. # If there are some ascribed and none non-ascribed, # show only the row for ascribed. # However, if there are some non-ascribed but none ascribed, # still show the row for ascribed, to not accidentally confuse # non-ascribed for ascribed. # Report totals. 
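# (Editor's note.) counts_a and counts_na map each message state to a dict {catalog path: count}; the totals computed below sum the per-catalog counts per state, e.g. counts_a["T"] == {"po/foo.po": 12} gives totals_a["T"] == 12 (hypothetical numbers).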
totals_a, totals_na = {}, {} for totals, counts in ((totals_a, counts_a), (totals_na, counts_na)): - for st, cnt_per_cat in counts.items(): + for st, cnt_per_cat in list(counts.items()): totals[st] = sum(cnt_per_cat.values()) # See previous NOTE. if sum(totals_a.values()) > 0 or sum(totals_na.values()) > 0: rown = [_("@title:row number of ascribed messages", "ascribed")] data = [[totals_a[x] or None] for x in _all_states] if sum(totals_na.values()) > 0: rown.append(_("@title:row number of unascribed messages", "unascribed")) for i in range(len(_all_states)): data[i].append(totals_na[_all_states[i]] or None) report(tabulate(data=data, coln=coln, rown=rown, none=none, colorize=True)) # Report counts per catalog if requested. if options.show_by_file: catpaths = set() for counts in (counts_a, counts_na): - catpaths.update(sum([x.keys() for x in counts.values()], [])) + catpaths.update(sum([list(x.keys()) for x in list(counts.values())], [])) catpaths = sorted(catpaths) if catpaths: coln.insert(0, _("@title:column", "catalog")) coln.insert(1, _("@title:column state (asc/nasc)", "st")) data = [[] for x in _all_states] for catpath in catpaths: cc_a = [counts_a[x].get(catpath, 0) for x in _all_states] cc_na = [counts_na[x].get(catpath, 0) for x in _all_states] # See previous NOTE. if sum(cc_a) > 0 or sum(cc_na) > 0: data[0].append(catpath) data[1].append( _("@item:intable number of ascribed messages", "asc")) for datac, cc in zip(data[2:], cc_a): datac.append(cc or None) if sum(cc_na) > 0: data[0].append("^^^") data[1].append( _("@item:intable number of unascribed messages", "nasc")) for datac, cc in zip(data[2:], cc_na): datac.append(cc or None) if any(data): dfmt = ["%%-%ds" % max([len(x) for x in catpaths])] report("-") report(tabulate(data=data, coln=coln, dfmt=dfmt, none=none, colorize=True)) # FIXME: Factor out into message module. _fields_current = ( "msgctxt", "msgid", "msgid_plural", ) _fields_previous = ( "msgctxt_previous", "msgid_previous", "msgid_plural_previous", ) def msg_to_previous (msg, copy=True): if msg.fuzzy and msg.msgid_previous is not None: pmsg = MessageUnsafe(msg) if copy else msg for fcurr, fprev in zip(_fields_current, _fields_previous): setattr(pmsg, fcurr, pmsg.get(fprev)) pmsg.unfuzzy() return pmsg def restore_reviews (aconfs_catpaths, revspecs_by_catmsg): upprog = setup_progress(aconfs_catpaths, t_("@info:progress", "Restoring reviews: %(file)s")) nrestored = 0 for aconf, catpaths in aconfs_catpaths: for catpath, acatpath in catpaths: upprog(catpath) revels_by_msg = revspecs_by_catmsg.get(catpath) if revels_by_msg: cat = Catalog(catpath, monitored=True) for msgref, revels in sorted(revels_by_msg.items()): msg = cat[msgref - 1] revtags, unrevd, revok = revels restore_review_flags(msg, revtags, unrevd) nrestored += 1 sync_and_rep(cat, shownmod=False) if aconf.vcs.is_versioned(acatpath): aconf.vcs.revert(acatpath) # ...no else: because revert may cause the file # not to be versioned any more. if not aconf.vcs.is_versioned(acatpath): os.remove(acatpath) if nrestored > 0: report(n_("@info:progress", "===== Review elements restored to %(num)d message.", "===== Review elements restored to %(num)d messages.", num=nrestored)) def restore_review_flags (msg, revtags, unrevd): for tag in revtags: flag = _revdflags[0] if tag: flag += _flagtagsep + tag msg.flag.add(flag) if unrevd: msg.flag.add(_urevdflags[0]) return msg def commit (options, aconfs_catpaths, mode): assert_mode_user(aconfs_catpaths, mode) # Ascribe modifications and reviews. 
upprog = setup_progress(aconfs_catpaths, t_("@info:progress", "Ascribing: %(file)s")) revels = {} counts = dict([(x, [0, 0]) for x in _all_states]) aconfs_catpaths_ascmod = [] aconf_by_catpath = {} for aconf, catpaths in aconfs_catpaths: aconfs_catpaths_ascmod.append((aconf, [])) for catpath, acatpath in catpaths: upprog(catpath) res = commit_cat(options, aconf, mode.user, catpath, acatpath, mode.selector) ccounts, crevels, catmod = res - for st, (nmod, nrev) in ccounts.items(): + for st, (nmod, nrev) in list(ccounts.items()): counts[st][0] += nmod counts[st][1] += nrev revels[catpath] = crevels if catmod: aconfs_catpaths_ascmod[-1][1].append((catpath, acatpath)) aconf_by_catpath[catpath] = aconf upprog() onabortf = lambda: restore_reviews(aconfs_catpaths_ascmod, revels) # Assert that all reviews were good. unknown_revtags = [] for catpath, revels1 in sorted(revels.items()): aconf = aconf_by_catpath[catpath] for msgref, (revtags, unrevd, revok) in sorted(revels1.items()): if not revok: onabortf() error("Ascription aborted due to earlier warnings.") assert_syntax(aconfs_catpaths_ascmod, onabortf=onabortf) # ...must be done after committing, to have all review elements purged coln = [_("@title:column number of modified messages", "modified")] rown = [] data = [[]] for st, stlabel in ( (_st_tran, _("@title:row number of translated messages", "translated")), (_st_fuzzy, _("@title:row number of fuzzy messages", "fuzzy")), (_st_untran, _("@title:row number of untranslated messages", "untranslated")), (_st_otran, _("@title:row number of obsolete translated messages", "obsolete/t")), (_st_ofuzzy, _("@title:row number of obsolete fuzzy messages", "obsolete/f")), (_st_ountran, _("@title:row number of obsolete untranslated messages", "obsolete/u")), ): if counts[st][1] > 0 and len(coln) < 2: coln.append(_("@title:column number of reviewed messages", "reviewed")) data.append([]) if counts[st][0] > 0 or counts[st][1] > 0: rown.append(stlabel) data[0].append(counts[st][0] or None) if len(coln) >= 2: data[1].append(counts[st][1] or None) if rown: report(_("@info:progress", "===== Ascription summary:")) report(tabulate(data, coln=coln, rown=rown, none="-", colorize=True)) if options.vcs_commit: vcs_commit_catalogs(aconfs_catpaths, mode.user, message=options.message, onabortf=onabortf) # ...not configs_catpaths_ascmod, as non-ascription relevant # modifications may exist (e.g. new pristine catalog added). 
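# (Editor's sketch of a typical review cycle with these modes, assuming a configured ascription setup:)
#     poascribe diff PATHS...             # embed differences into messages for review
#     ... review and edit the catalogs ...
#     poascribe commit -u USER PATHS...   # ascribe modifications and reviews, then commit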
def diff (options, aconfs_catpaths, mode): upprog = setup_progress(aconfs_catpaths, t_("@info:progress", "Diffing for review: %(file)s")) ndiffed = 0 for aconf, catpaths in aconfs_catpaths: for catpath, acatpath in catpaths: upprog(catpath) ndiffed += diff_cat(options, aconf, catpath, acatpath, mode.selector, mode.aselector) upprog() if ndiffed > 0: report(n_("@info:progress", "===== %(num)d message diffed for review.", "===== %(num)d messages diffed for review.", num=ndiffed)) def purge (options, aconfs_catpaths, mode): upprog = setup_progress(aconfs_catpaths, t_("@info:progress", "Purging review elements: %(file)s")) npurged = 0 for aconf, catpaths in aconfs_catpaths: for catpath, acatpath in catpaths: upprog(catpath) npurged += purge_cat(options, aconf, catpath, acatpath, mode.selector) upprog() if npurged > 0: if not options.keep_flags: report(n_("@info:progress", "===== Review elements purged from %(num)d message.", "===== Review elements purged from %(num)d messages.", num=npurged)) else: report(n_("@info:progress", "===== Review elements purged from %(num)d message " "(flags kept).", "===== Review elements purged from %(num)d messages " "(flags kept).", num=npurged)) return npurged def history (options, aconfs_catpaths, mode): upprog = setup_progress(aconfs_catpaths, t_("@info:progress", "Computing histories: %(file)s")) nshown = 0 for aconf, catpaths in aconfs_catpaths: for catpath, acatpath in catpaths: upprog(catpath) nshown += history_cat(options, aconf, catpath, acatpath, mode.selector) upprog() if nshown > 0: report(n_("@info:progress", "===== Histories computed for %(num)d message.", "===== Histories computed for %(num)d messages.", num=nshown)) def commit_cat (options, aconf, user, catpath, acatpath, stest): # Open current catalog and ascription catalog. # Monitored, for removal of review elements. cat = Catalog(catpath, monitored=True) acat = prep_write_asc_cat(acatpath, aconf) revtags_ovr = None if options.all_reviewed: revtags_ovr = options.tags # Collect unascribed messages, but ignoring pristine ones # (those which are both untranslated and without history). # Collect and purge any review elements. # Check if any modification cannot be due to merging # (if header update is requested). mod_msgs = [] rev_msgs = [] revels_by_msg = {} counts = dict([(x, [0, 0]) for x in _all_states]) counts0 = counts.copy() any_nonmerges = False prev_msgs = [] check_mid_msgs = [] for msg in cat: mod, revtags, unrevd = purge_msg(msg) if mod: revels_by_msg[msg.refentry] = [revtags, unrevd, True] ahist = collect_ascription_history(msg, acat, aconf) # after purging # Do not ascribe anything if the message is new and untranslated. if ( ahist[0].user is None and len(ahist) == 1 and not has_tracked_parts(msg) ): continue # Possibly ascribe review only if the message passes the selector. if stest(msg, cat, ahist, aconf) and (mod or revtags_ovr): if revtags_ovr: revtags = revtags_ovr unrevd = False if revtags and not unrevd: # unreviewed flag overrides rev_msgs.append((msg, revtags)) counts[msg.state()][1] += 1 # Check and record if review tags are not valid. unknown_revtags = revtags.difference(aconf.revtags) if unknown_revtags: revels_by_msg[msg.refentry][-1] = False tagfmt = format_item_list(sorted(unknown_revtags)) warning_on_msg(_("@info", "Unknown review tags: %(taglist)s.", taglist=tagfmt), msg, cat) # Ascribe modification regardless of the selector. 
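# Aside: a sketch of the "pristine" test used in commit_cat() above, with
# plain data standing in for Pology objects. A message is skipped when it
# was never ascribed (single history point without a user) and carries
# nothing worth tracking; has_tracked_parts() checks more than the
# translation/fuzzy state, so the test below is a simplification.
def is_pristine_sketch (ahist, msgstr, fuzzy):
    never_ascribed = len(ahist) == 1 and ahist[0]["user"] is None
    has_tracked = bool(msgstr.strip()) or fuzzy
    return never_ascribed and not has_tracked

assert is_pristine_sketch([{"user": None}], "", False)
assert not is_pristine_sketch([{"user": None}], "Datei", False)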
if ahist[0].user is None: mod_msgs.append(msg) counts[msg.state()][0] += 1 if options.update_headers and not any_nonmerges: if len(ahist) == 1 or not merge_modified(ahist[1].msg, msg): any_nonmerges = True # Record that reconstruction of the post-merge message # should be tried if this message has no prior history # but it is not pristine (it may be that the translator # has merged the catalog and updated fuzzy messages in one step, # without committing the catalog right after merging). if len(ahist) == 1: check_mid_msgs.append(msg) # Collect latest historical version of the message, # in case reconstruction of post-merge messages is needed. if ahist[0].user is not None or len(ahist) > 1: pmsg = ahist[1 if ahist[0].user is None else 0].msg prev_msgs.append(pmsg) # Collect non-obsolete ascribed messages that no longer have # original counterpart, to ascribe as obsolete. # If reconstruction of post-merge messages is needed, # also collect latest historical versions. cat.sync_map() # in case key fields were purged for amsg in acat: if amsg not in cat: ast = amsg.state() st = None if ast == _st_tran: st = _st_otran elif ast == _st_fuzzy: st = _st_ofuzzy elif ast == _st_untran: st = _st_ountran if st or check_mid_msgs: msg = collect_ascription_history_segment(amsg, acat, aconf)[0].msg if check_mid_msgs: prev_msgs.append(msg) if st: msg.obsolete = True mod_msgs.append(msg) counts[st][0] += 1 # Shortcut if nothing to do, because sync_and_rep later are expensive. if not mod_msgs and not revels_by_msg: # No messages to commit. return counts0, revels_by_msg, False # Construct post-merge messages. mod_mid_msgs = [] if check_mid_msgs and not acat.created(): mid_cat = create_post_merge_cat(cat, prev_msgs) for msg in check_mid_msgs: mid_msg = mid_cat.get(msg) if ( mid_msg is not None and mid_msg.fuzzy and not ascription_equal(mid_msg, msg) ): mod_mid_msgs.append(mid_msg) # Ascribe modifications. for mid_msg in mod_mid_msgs: # ascribe post-merge before actual ascribe_modification(mid_msg, user, _dt_start, acat, aconf) for msg in mod_msgs: ascribe_modification(msg, user, _dt_start, acat, aconf) # Ascribe reviews. for msg, revtags in rev_msgs: ascribe_review(msg, user, _dt_start, revtags, acat, aconf) # Update header if requested and translator's modifications detected. if options.update_headers and any_nonmerges: cat.update_header(project=cat.name, title=aconf.title, name=aconf.users[user].name, email=aconf.users[user].email, teamemail=aconf.teamemail, langname=aconf.langteam, langcode=aconf.langcode, plforms=aconf.plforms) nmod = [len(mod_msgs)] if len(rev_msgs) > 0: nmod.append(len(rev_msgs)) catmod = False if sync_and_rep(cat, nmod=nmod): catmod = True if asc_sync_and_rep(acat, shownmod=False, nmod=[0]): catmod = True return counts, revels_by_msg, catmod def diff_cat (options, aconf, catpath, acatpath, stest, aselect): cat = Catalog(catpath, monitored=True) acat = Catalog(acatpath, create=True, monitored=False) # Select messages for diffing. msgs_to_diff = [] for msg in cat: purge_msg(msg) ahist = collect_ascription_history( msg, acat, aconf, hfilter=options.hfilter, addrem=options.addrem, nomrg=True) # Makes no sense to review pristine messages. if ahist[0].user is None and not has_tracked_parts(msg): continue sres = stest(msg, cat, ahist, aconf) if not sres: continue msgs_to_diff.append((msg, ahist, sres)) # Cancel selection if maximum fraction exceeded. if float(len(msgs_to_diff)) / len(cat) > options.max_fraction_select: msgs_to_diff = [] if not msgs_to_diff: return 0 # Diff selected messages. 
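# Aside: the fraction gate used by diff_cat() above (and by history_cat()
# further below) cancels the whole selection when it covers too much of
# the catalog, presumably to catch over-broad selectors. A runnable
# sketch of that rule:
def gate_selection_sketch (selected, total, max_fraction):
    if total and len(selected) / total > max_fraction:
        return []
    return selected

assert gate_selection_sketch(list(range(90)), 100, 0.8) == []
assert len(gate_selection_sketch(list(range(10)), 100, 0.8)) == 10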
diffed_msgs = [] tagfmt = _flagtagsep.join(options.tags) for msg, ahist, sres in msgs_to_diff: # Try to select ascription to differentiate from. # (Note that ascription indices returned by selectors are 1-based.) i_asc = None if aselect: asres = aselect(msg, cat, ahist, aconf) i_asc = (asres - 1) if asres else None elif not isinstance(sres, bool): # If there is no ascription selector, but basic selector returned # an ascription index, use first earlier non-fuzzy for diffing. i_asc = sres - 1 i_asc = first_non_fuzzy(ahist, i_asc + 1) # Differentiate and flag. amsg = i_asc is not None and ahist[i_asc].msg or None if amsg is not None: if editprob(amsg.msgid, msg.msgid) > options.min_adjsim_diff: msg_ediff(amsg, msg, emsg=msg, pfilter=options.sfilter) flag = _diffflag else: # If too great a difference, add special flag and do not diff. flag = _diffflag_ign else: # If no previous ascription selected, add special flag. flag = _diffflag_tot if tagfmt: flag += _flagtagsep + tagfmt msg.flag.add(flag) # Add ascription chain comment. ascfmts = [] i_from = (i_asc - 1) if i_asc is not None else len(ahist) - 1 for i in range(i_from, -1, -1): a = ahist[i] shtype = {AscPoint.ATYPE_MOD: "m", AscPoint.ATYPE_REV: "r"}[a.type] if a.tag: ascfmt = "%s:%s(%s)" % (a.user, shtype, a.tag) else: ascfmt = "%s:%s" % (a.user, shtype) ascfmts.append(ascfmt) - achnfmt = u"%s %s" % (_achncmnt, " ".join(ascfmts)) + achnfmt = "%s %s" % (_achncmnt, " ".join(ascfmts)) msg.auto_comment.append(achnfmt) diffed_msgs.append(msg) sync_and_rep(cat) # Open in the PO editor if requested. if options.po_editor: for msg in diffed_msgs: options.po_editor(msg, cat, report=_("@info note on selected message", "Selected for review.")) return len(diffed_msgs) _subreflags = "|".join(_all_flags) _subrecmnts = "|".join(_all_cmnts) _any_to_purge_rx = re.compile(r"^\s*(#,.*\b(%s)|#\.\s*(%s))" % (_subreflags, _subrecmnts), re.M|re.U) # Quickly check if it may be that some messages in the PO file # have review elements (diffs, flags). def may_have_revels (catpath): return bool(_any_to_purge_rx.search(open(catpath).read())) def purge_cat (options, aconf, catpath, acatpath, stest): if not may_have_revels(catpath): return 0 cat = Catalog(catpath, monitored=True) acat = Catalog(acatpath, create=True, monitored=False) # Select messages to purge. msgs_to_purge = [] for msg in cat: cmsg = MessageUnsafe(msg) purge_msg(cmsg) ahist = collect_ascription_history( cmsg, acat, aconf, hfilter=options.hfilter, addrem=options.addrem, nomrg=True) if not stest(cmsg, cat, ahist, aconf): continue msgs_to_purge.append(msg) # Does observing options.max_fraction_select make sense for purging? ## Cancel selection if maximum fraction exceeded. #if float(len(msgs_to_purge)) / len(cat) > options.max_fraction_select: #msgs_to_purge = [] # Purge selected messages. npurged = 0 for msg in msgs_to_purge: res = purge_msg(msg, keepflags=options.keep_flags) mod, revtags, unrevd = res if mod: npurged += 1 sync_and_rep(cat) return npurged def history_cat (options, aconf, catpath, acatpath, stest): cat = Catalog(catpath, monitored=False) acat = Catalog(acatpath, create=True, monitored=False) # Select messages for which to compute histories. msgs_to_hist = [] for msg in cat: purge_msg(msg) ahist = collect_ascription_history( msg, acat, aconf, hfilter=options.hfilter, addrem=options.addrem, nomrg=True) if not stest(msg, cat, ahist, aconf): continue msgs_to_hist.append((msg, ahist)) # Cancel selection if maximum fraction exceeded. 
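# Aside: may_have_revels() above avoids parsing catalogs that cannot
# contain review elements by scanning the raw PO text for flag/comment
# markers first. The same cheap-prefilter idea, runnable with stand-in
# marker lists (the real ones are _all_flags/_all_cmnts):
import re

_flags_demo = ("ediff", "reviewed")
_cmnts_demo = ("ediff:",)
_rx_demo = re.compile(r"^\s*(#,.*\b(%s)|#\.\s*(%s))"
                      % ("|".join(_flags_demo), "|".join(_cmnts_demo)),
                      re.M|re.U)

def may_have_markers_demo (text):
    return bool(_rx_demo.search(text))

assert may_have_markers_demo('#, fuzzy, ediff\nmsgid "x"\n')
assert not may_have_markers_demo('msgid "x"\nmsgstr "y"\n')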
if float(len(msgs_to_hist)) / len(cat) > options.max_fraction_select: msgs_to_hist = [] # Compute histories for selected messages. for msg, ahist in msgs_to_hist: unasc = ahist[0].user is None if unasc: ahist.pop(0) hlevels = len(ahist) if options.depth is not None: hlevels = int(options.depth) if ahist[0].user is None: hlevels += 1 if hlevels > len(ahist): hlevels = len(ahist) hinfo = [] if hlevels > 0: hinfo += [_("@info:progress", ">>> History follows:")] hfmt = "%%%dd" % len(str(hlevels)) for i in range(hlevels): a = ahist[i] if a.type == AscPoint.ATYPE_MOD: anote = _("@item:intable", "#%(pos)d " "modified by %(user)s on %(date)s", pos=a.pos, user=a.user, date=a.date) elif a.type == AscPoint.ATYPE_REV: if not a.tag: anote = _("@item:intable", "#%(pos)d " "reviewed by %(user)s on %(date)s", pos=a.pos, user=a.user, date=a.date) else: anote = _("@item:intable", "#%(pos)d " "reviewed (%(tag)s) by %(user)s on %(date)s", pos=a.pos, user=a.user, tag=a.tag, date=a.date) else: warning_on_msg( _("@info", "Unknown ascription type '%(type)s' found in history.", type=a.type), msg, cat) continue hinfo += [anote] if not a.type == AscPoint.ATYPE_MOD: # Nothing more to show if this ascription is not modification. continue i_next = i + 1 if i_next == len(ahist): # Nothing more to show at end of history. continue dmsg = MessageUnsafe(a.msg) nmsg = ahist[i_next].msg if dmsg != nmsg: msg_ediff(nmsg, dmsg, emsg=dmsg, pfilter=options.sfilter, colorize=True) dmsgfmt = dmsg.to_string(force=True, wrapf=cat.wrapf()).rstrip("\n") hindent = " " * (len(hfmt % 0) + 2) hinfo += [hindent + x for x in dmsgfmt.split("\n")] hinfo = cjoin(hinfo, "\n") if unasc or msg.fuzzy: pmsg = None i_nfasc = first_non_fuzzy(ahist) if i_nfasc is not None: pmsg = ahist[i_nfasc].msg elif msg.fuzzy and msg.msgid_previous is not None: pmsg = msg_to_previous(msg) if pmsg is not None: for fprev in _fields_previous: setattr(msg, fprev, None) msg_ediff(pmsg, msg, emsg=msg, pfilter=options.sfilter, colorize=True) report_msg_content(msg, cat, note=(hinfo or None), delim=("-" * 20)) return len(msgs_to_hist) _revflags_rx = re.compile(r"^(%s)(?: */(.*))?" % "|".join(_all_flags), re.I) def purge_msg (msg, keepflags=False): modified = False # Remove review flags. diffed = False revtags = set() unrevd = False for flag in list(msg.flag): # modified inside m = _revflags_rx.search(flag) if m: sflag = m.group(1) tagstr = m.group(2) or "" tags = [x.strip() for x in tagstr.split(_flagtagsep)] if sflag not in _urevdflags: revtags.update(tags) if sflag == _diffflag: # ...must not check ...in _diffflags because with # those other flags there is actually no diff. diffed = True else: unrevd = True msg.flag.remove(flag) modified = True # Remove review comments. i = 0 while i < len(msg.auto_comment): cmnt = msg.auto_comment[i].strip() if cmnt.startswith(_all_cmnts): msg.auto_comment.pop(i) modified = True else: i += 1 # Remove any leftover previous fields. 
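# Aside: a sketch of the review-flag syntax handled by _revflags_rx and
# purge_msg() above: a known keyword, optionally followed by "/" and a
# separator-joined list of tags. The keyword list and the "," separator
# here are illustrative stand-ins for _all_flags and _flagtagsep.
import re

_known_demo = ("reviewed", "ediff")
_revflags_demo = re.compile(r"^(%s)(?: */(.*))?" % "|".join(_known_demo), re.I)

def parse_review_flag_demo (flag):
    m = _revflags_demo.search(flag)
    if not m:
        return None
    tags = [x.strip() for x in (m.group(2) or "").split(",")]
    return m.group(1), tags

assert parse_review_flag_demo("reviewed/ui, docs") == ("reviewed", ["ui", "docs"])
assert parse_review_flag_demo("no-such-flag") is None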
if msg.translated: for fprev in _fields_previous: if msg.get(fprev) is not None: setattr(msg, fprev, None) modified = True if diffed: msg_ediff_to_new(msg, rmsg=msg) if keepflags: restore_review_flags(msg, revtags, unrevd) return modified, revtags, unrevd def prep_write_asc_cat (acatpath, aconf): if not os.path.isfile(acatpath): return init_asc_cat(acatpath, aconf) else: return Catalog(acatpath, monitored=True, wrapping=_ascwrapping) def init_asc_cat (acatpath, aconf): acat = Catalog(acatpath, create=True, monitored=True, wrapping=_ascwrapping) ahdr = acat.header - ahdr.title = Monlist([u"Ascription shadow for %s.po" % acat.name]) + ahdr.title = Monlist(["Ascription shadow for %s.po" % acat.name]) - translator = u"Ascriber" + translator = "Ascriber" if aconf.teamemail: - author = u"%s <%s>" % (translator, aconf.teamemail) + author = "%s <%s>" % (translator, aconf.teamemail) else: - author = u"%s" % translator + author = "%s" % translator ahdr.author = Monlist([author]) - ahdr.copyright = u"Copyright same as for the original catalog." - ahdr.license = u"License same as for the original catalog." - ahdr.comment = Monlist([u"===== DO NOT EDIT MANUALLY ====="]) + ahdr.copyright = "Copyright same as for the original catalog." + ahdr.license = "License same as for the original catalog." + ahdr.comment = Monlist(["===== DO NOT EDIT MANUALLY ====="]) - ahdr.set_field(u"Project-Id-Version", unicode(acat.name)) - ahdr.set_field(u"Report-Msgid-Bugs-To", unicode(aconf.teamemail or "")) - ahdr.set_field(u"PO-Revision-Date", format_datetime(_dt_start)) - ahdr.set_field(u"Content-Type", u"text/plain; charset=UTF-8") - ahdr.set_field(u"Content-Transfer-Encoding", u"8bit") + ahdr.set_field("Project-Id-Version", str(acat.name)) + ahdr.set_field("Report-Msgid-Bugs-To", str(aconf.teamemail or "")) + ahdr.set_field("PO-Revision-Date", format_datetime(_dt_start)) + ahdr.set_field("Content-Type", "text/plain; charset=UTF-8") + ahdr.set_field("Content-Transfer-Encoding", "8bit") if aconf.teamemail: ltr = "%s <%s>" % (translator, aconf.teamemail) else: ltr = translator - ahdr.set_field(u"Last-Translator", unicode(ltr)) + ahdr.set_field("Last-Translator", str(ltr)) if aconf.langteam: if aconf.teamemail: - tline = u"%s <%s>" % (aconf.langteam, aconf.teamemail) + tline = "%s <%s>" % (aconf.langteam, aconf.teamemail) else: tline = aconf.langteam - ahdr.set_field(u"Language-Team", unicode(tline)) + ahdr.set_field("Language-Team", str(tline)) else: ahdr.remove_field("Language-Team") if aconf.langcode: - ahdr.set_field(u"Language", unicode(aconf.langcode)) + ahdr.set_field("Language", str(aconf.langcode)) else: ahdr.remove_field("Language") if aconf.plforms: - ahdr.set_field(u"Plural-Forms", unicode(aconf.plforms)) + ahdr.set_field("Plural-Forms", str(aconf.plforms)) else: - ahdr.remove_field(u"Plural-Forms") + ahdr.remove_field("Plural-Forms") return acat def update_asc_hdr (acat): - acat.header.set_field(u"PO-Revision-Date", format_datetime(_dt_start)) + acat.header.set_field("PO-Revision-Date", format_datetime(_dt_start)) def create_post_merge_cat (cat, prev_msgs): # Prepare previous catalog based on ascription catalog. prev_cat = Catalog("", create=True, monitored=False) prev_cat.header = Header(cat.header) for prev_msg in prev_msgs: prev_cat.add_last(prev_msg) tmpf1 = NamedTemporaryFile(prefix="pology-merged-", suffix=".po") prev_cat.filename = tmpf1.name prev_cat.sync() # Prepare template based on current catalog. 
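# Aside: create_post_merge_cat() reconstructs the state a translator saw
# right after msgmerge: the latest historical messages are written out as
# the "old" catalog, the current catalog is reduced to a bare template,
# and the two are merged. A sketch of the template step just below, with
# plain dicts instead of Catalog/MessageUnsafe:
def make_template_sketch (messages):
    tmpl = []
    for msg in messages:
        if msg.get("obsolete"):
            continue  # obsolete entries do not enter a template
        t = dict(msg)
        t["msgstr"] = ""  # counterpart of tmpl_msg.clear()
        t["fuzzy"] = False
        tmpl.append(t)
    return tmpl

cat_demo = [{"msgid": "File", "msgstr": "Datei"},
            {"msgid": "Old", "msgstr": "Alt", "obsolete": True}]
assert make_template_sketch(cat_demo) == [
    {"msgid": "File", "msgstr": "", "fuzzy": False}]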
tmpl_cat = Catalog("", create=True, monitored=False) tmpl_cat.header = Header(cat.header) for msg in cat: if not msg.obsolete: tmpl_msg = MessageUnsafe(msg) tmpl_msg.clear() tmpl_cat.add_last(tmpl_msg) tmpf2 = NamedTemporaryFile(prefix="pology-template-", suffix=".pot") tmpl_cat.filename = tmpf2.name tmpl_cat.sync() # Merge previous catalog using current catalog as template. mid_cat = merge_pofile(prev_cat.filename, tmpl_cat.filename, getcat=True, monitored=False, quiet=True) return mid_cat _modified_cats = [] def sync_and_rep (cat, shownmod=True, nmod=None): if shownmod and nmod is None: nmod = [0] for msg in cat: if msg.modcount: nmod[0] += 1 modified = cat.sync() if nmod and sum(nmod) > 0: # DO NOT check instead modified == True if shownmod: nmodfmt = "/".join("%d" % x for x in nmod) report("%s (%s)" % (cat.filename, nmodfmt)) else: report("%s" % cat.filename) _modified_cats.append(cat.filename) return modified def asc_sync_and_rep (acat, shownmod=True, nmod=None): if acat.modcount: update_asc_hdr(acat) mkdirpath(os.path.dirname(acat.filename)) return sync_and_rep(acat, shownmod=shownmod, nmod=nmod) # ----------------------------------------------------------------------------- if __name__ == "__main__": exit_on_exception(main) diff --git a/scripts/poediff.py b/scripts/poediff.py index 4d395391..1c4dd13c 100755 --- a/scripts/poediff.py +++ b/scripts/poediff.py @@ -1,511 +1,511 @@ #!/usr/bin/env python # -*- coding: UTF-8 -*- """ Create embedded diffs of PO files. Documented in C{doc/user/diffpatch.docbook#sec-dppatch}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import filecmp import locale import os import shutil import sys try: import fallback_import_paths except: pass from pology import version, _, n_, t_ from pology.catalog import Catalog from pology.message import MessageUnsafe from pology.colors import ColorOptionParser, set_coloring_globals, cjoin import pology.config as pology_config from pology.fsops import str_to_unicode, collect_catalogs from pology.fsops import exit_on_exception from pology.diff import msg_ediff from pology.report import error, warning, report, format_item_list from pology.report import list_options from pology.report import init_file_progress from pology.stdcmdopt import add_cmdopt_colors from pology.vcs import available_vcs, make_vcs from pology.internal.poediffpatch import MPC, EDST from pology.internal.poediffpatch import msg_eq_fields, msg_copy_fields from pology.internal.poediffpatch import msg_clear_prev_fields from pology.internal.poediffpatch import diff_cats, diff_hdrs from pology.internal.poediffpatch import init_ediff_header from pology.internal.poediffpatch import get_msgctxt_for_headers from pology.internal.poediffpatch import cats_update_effort def main (): locale.setlocale(locale.LC_ALL, "") # Get defaults for command line options from global config. cfgsec = pology_config.section("poediff") def_do_merge = cfgsec.boolean("merge", True) # Setup options and parse the command line. 
usage = _("@info command usage", "%(cmd)s [OPTIONS] FILE1 FILE2\n" "%(cmd)s [OPTIONS] DIR1 DIR2\n" "%(cmd)s -c VCS [OPTIONS] [PATHS...]", cmd="%prog") desc = _("@info command description", "Create embedded diffs of PO files.") ver = _("@info command version", - u"%(cmd)s (Pology) %(version)s\n" - u"Copyright © 2009, 2010 " - u"Chusslove Illich (Часлав Илић) <%(email)s>", + "%(cmd)s (Pology) %(version)s\n" + "Copyright © 2009, 2010 " + "Chusslove Illich (Часлав Илић) <%(email)s>", cmd="%prog", version=version(), email="caslav.ilic@gmx.net") showvcs = list(set(available_vcs()).difference(["none"])) showvcs.sort() opars = ColorOptionParser(usage=usage, description=desc, version=ver) opars.add_option( "-b", "--skip-obsolete", action="store_true", dest="skip_obsolete", default=False, help=_("@info command line option description", "Do not diff obsolete messages.")) opars.add_option( "-c", "--vcs", metavar=_("@info command line value placeholder", "VCS"), dest="version_control", help=_("@info command line option description", "Paths are under version control by given VCS; " "can be one of: %(vcslist)s.", vcslist=format_item_list(showvcs))) opars.add_option( "--list-options", action="store_true", dest="list_options", default=False, help=_("@info command line option description", "List the names of available options.")) opars.add_option( "--list-vcs", action="store_true", dest="list_vcs", default=False, help=_("@info command line option description", "List the keywords of known version control systems.")) opars.add_option( "-n", "--no-merge", action="store_false", dest="do_merge", default=def_do_merge, help=_("@info command line option description", "Do not try to indirectly pair messages by merging catalogs.")) opars.add_option( "-o", "--output", metavar=_("@info command line value placeholder", "POFILE"), dest="output", help=_("@info command line option description", "Output diff catalog to a file instead of stdout.")) opars.add_option( "-p", "--paired-only", action="store_true", dest="paired_only", default=False, help=_("@info command line option description", "When two directories are diffed, ignore catalogs which " "are not present in both directories.")) opars.add_option( "-q", "--quiet", action="store_true", dest="quiet", default=False, help=_("@info command line option description", "Do not display any progress info.")) opars.add_option( "-Q", "--quick", action="store_true", dest="quick", default=False, help=_("@info command line option description", "Equivalent to %(opt)s.", opt="-bns")) opars.add_option( "-r", "--revision", metavar=_("@info command line value placeholder", "REV1[:REV2]"), dest="revision", help=_("@info command line option description", "Revision from which to diff to current working copy, " "or from first to second revision (if VCS is given).")) opars.add_option( "-s", "--strip-headers", action="store_true", dest="strip_headers", default=False, help=_("@info command line option description", "Do not diff headers and do not write out the top header " "(resulting output cannot be used as patch).")) opars.add_option( "-U", "--update-effort", action="store_true", dest="update_effort", default=False, help=_("@info command line option description", "Instead of outputting the diff, calculate and output " "an estimate of the effort that was needed to update " "the translation from old to new paths. 
" "Ignores %(opt1)s and %(opt1)s options.", opt1="-b", opt2="-n")) add_cmdopt_colors(opars) (op, free_args) = opars.parse_args(str_to_unicode(sys.argv[1:])) if op.list_options: report(list_options(opars)) sys.exit(0) if op.list_vcs: report("\n".join(showvcs)) sys.exit(0) # Could use some speedup. try: import psyco psyco.full() except ImportError: pass set_coloring_globals(ctype=op.coloring_type, outdep=(not op.raw_colors)) if op.quick: op.do_merge = False op.skip_obsolete = True op.strip_headers = True # Create VCS. vcs = None if op.version_control: if op.version_control not in available_vcs(flat=True): error_wcl(_("@info", "Unknown VCS '%(vcs)s' selected.", vcs=op.version_control)) vcs = make_vcs(op.version_control) # Sanity checks on paths. paths = free_args if not vcs: if len(paths) != 2: error_wcl(_("@info", "Exactly two paths are needed for diffing.")) for path in paths: if not os.path.exists(path): error_wcl("path does not exist: %s" % path) p1, p2 = paths if (not ( (os.path.isfile(p1) and (os.path.isfile(p2))) or (os.path.isdir(p1) and (os.path.isdir(p2)))) ): error_wcl(_("@info", "Both paths must be either files or directories.")) else: # Default to current working dir if no paths given. paths = paths or ["."] for path in paths: if not os.path.exists(path): error_wcl(_("@info", "Path '%(path)s' does not exist.", path=path)) if not vcs.is_versioned(path): error_wcl(_("@info", "Path '%(path)s' is not under version control.", path=path)) # Collect and pair PO files in given paths. # Each pair specification is in the form of # ((path1, path2), (vpath1, vpath2)) # where path* are the real paths, and vpath* the visual paths to be # presented in diff output. if not vcs: fpairs = collect_file_pairs(paths[0], paths[1], op.paired_only) pspecs = [(x, x) for x in fpairs] else: lst = op.revision and op.revision.split(":", 1) or [] if len(lst) > 2: error_wcl(_("@info", "Too many revisions given: %(revlist)s.", revspec=format_item_list(lst))) elif len(lst) == 2: revs = lst # diff between revisions elif len(lst) == 1: revs = [lst[0], None] # diff from revision to working copy else: revs = ["", None] # diff from head to working copy # Replace original paths with modified/added catalogs. paths_nc = [] for path in paths: for path in vcs.to_commit(path): if path.endswith(".po") or path.endswith(".pot"): paths_nc.append(path) paths = paths_nc paths.sort() pspecs = collect_pspecs_from_vcs(vcs, paths, revs, op.paired_only) if not op.update_effort: ecat, ndiffed = diff_pairs(pspecs, op.do_merge, colorize=(not op.output), shdr=op.strip_headers, noobs=op.skip_obsolete, quiet=op.quiet) if ndiffed > 0: hmsgctxt = ecat.header.get_field_value(EDST.hmsgctxt_field) lines = [] msgs = list(ecat) if not op.strip_headers: msgs.insert(0, ecat.header.to_msg()) for msg in msgs: if op.strip_headers and msg.msgctxt == hmsgctxt: sepl = [] sepl += [msg.manual_comment[0]] sepl += msg.msgid.split("\n")[:2] lines.extend(["# %s\n" % x for x in sepl]) lines.append("\n") else: lines.extend(msg.to_lines(force=True, wrapf=ecat.wrapf())) diffstr = cjoin(lines)[:-1] # remove last newline if op.output: file = open(op.output, "w") file.write(diffstr.encode(ecat.encoding())) file.close() else: report(diffstr) else: updeff = pairs_update_effort(pspecs, quiet=op.quiet) ls = [] for kw, desc, val, fmtval in updeff: ls.append(_("@info", "%(quantity)s: %(value)s", quantity=desc, value=fmtval)) report("\n".join(ls)) # Clean up. 
cleanup_tmppaths() def diff_pairs (pspecs, merge, colorize=False, wrem=True, wadd=True, shdr=False, noobs=False, quiet=False): # Create diffs of messages. # Note: Headers will be collected and diffed after all messages, # to be able to check if any decoration to their message keys is needed. wrappings = {} ecat = Catalog("", create=True, monitored=False) hspecs = [] ndiffed = 0 update_progress = None if len(pspecs) > 1 and not quiet: update_progress = init_file_progress([vp[1] for fp, vp in pspecs], addfmt=t_("@info:progress", "Diffing: %(file)s")) for fpaths, vpaths in pspecs: upprogf = None if update_progress: upprogf = lambda: update_progress(vpaths[1]) upprogf() # Quick check if files are binary equal. if fpaths[0] and fpaths[1] and filecmp.cmp(*fpaths): continue cats = [] for fpath in fpaths: try: cats.append(Catalog(fpath, create=True, monitored=False)) except: error_wcl(_("@info", "Cannot parse catalog '%(file)s'.", file=fpath), norem=[fpath]) tpos = len(ecat) cndiffed = diff_cats(cats[0], cats[1], ecat, merge, colorize, wrem, wadd, noobs, upprogf) hspecs.append(([not x.created() and x.header or None for x in cats], vpaths, tpos, cndiffed)) ndiffed += cndiffed # Collect and count wrapping policy used for to-catalog. wrapping = cats[1].wrapping() if wrapping not in wrappings: wrappings[wrapping] = 0 wrappings[wrapping] += 1 if update_progress: update_progress() # Find appropriate length of context for header messages. hmsgctxt = get_msgctxt_for_headers(ecat) init_ediff_header(ecat.header, hmsgctxt=hmsgctxt) # Create diffs of headers. # If some of the messages were diffed, # header must be added even if there is no difference. incpos = 0 for hdrs, vpaths, pos, cndiffed in hspecs: ehmsg, anydiff = diff_hdrs(hdrs[0], hdrs[1], vpaths[0], vpaths[1], hmsgctxt, ecat, colorize) if anydiff or cndiffed: ecat.add(ehmsg, pos + incpos) incpos += 1 # Add diffed headers to total count only if header stripping not in effect. if not shdr: ndiffed += incpos # Set the most used wrapping policy for the ediff catalog. if wrappings: - wrapping = sorted(wrappings.items(), key=lambda x: x[1])[-1][0] + wrapping = sorted(list(wrappings.items()), key=lambda x: x[1])[-1][0] ecat.set_wrapping(wrapping) if wrapping is not None: - ecat.header.set_field(u"X-Wrapping", u", ".join(wrapping)) + ecat.header.set_field("X-Wrapping", ", ".join(wrapping)) return ecat, ndiffed # Collect and pair catalogs as list [(fpath1, fpath2)]. # Where a pair cannot be found, empty string is given for path # (unless paired_only is True, when non-paired catalogs are ignored). def collect_file_pairs (dpath1, dpath2, paired_only): if os.path.isfile(dpath1): return [(dpath1, dpath2)] - bysub1, bysub2 = map(collect_and_split_fpaths, (dpath1, dpath2)) + bysub1, bysub2 = list(map(collect_and_split_fpaths, (dpath1, dpath2))) # Try to pair files by subdirectories. # FIXME: Can and should anything smarter be done? 
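# Aside: the ediff catalog above adopts the wrapping policy seen most
# often among the "to" catalogs; the sort-and-take-last over wrappings
# is a majority vote. The same idea, slightly more directly (ties aside):
def most_common_sketch (counts):
    # counts maps wrapping policy -> number of catalogs using it.
    return max(counts.items(), key=lambda kv: kv[1])[0] if counts else None

assert most_common_sketch({("basic",): 3, None: 1}) == ("basic",)
assert most_common_sketch({}) is None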
fpairs = [] - subdirs = list(set(bysub1.keys() + bysub2.keys())) + subdirs = list(set(list(bysub1.keys()) + list(bysub2.keys()))) subdirs.sort() for subdir in subdirs: flinks1 = bysub1.get(subdir, {}) flinks2 = bysub2.get(subdir, {}) - filenames = list(set(flinks1.keys() + flinks2.keys())) + filenames = list(set(list(flinks1.keys()) + list(flinks2.keys()))) filenames.sort() for filename in filenames: fpath1 = flinks1.get(filename, "") fpath2 = flinks2.get(filename, "") if not paired_only or (fpath1 and fpath2): fpairs.append((fpath1, fpath2)) return fpairs # Collect all catalog paths in given root, and construct mapping # {subdir: {filename: path}}, where subdir is relative to root. def collect_and_split_fpaths (dpath): dpath = dpath.rstrip(os.path.sep) + os.path.sep fpaths = collect_catalogs(dpath) bysub = {} for fpath in fpaths: if not fpath.startswith(dpath): error_wcl(_("@info", "Internal problem with path collection (200).")) subdir = os.path.dirname(fpath[len(dpath):]) if subdir not in bysub: bysub[subdir] = {} bysub[subdir][os.path.basename(fpath)] = fpath return bysub def collect_pspecs_from_vcs (vcs, paths, revs, paired_only): pspecs = [] # FIXME: Use tempfile module. expref = "/tmp/poediff-export-" exind = 0 for path in paths: expaths = {} for rev in revs: if rev is None: expaths[rev] = path else: expath = expref + "%d-%d-%s" % (os.getpid(), exind, rev) exind += 1 if os.path.isfile(path): expath += ".po" if not vcs.export(path, rev or None, expath): error_wcl(_("@info", "Cannot export path '%(path)s' " "in revision '%(rev)s'.", path=path, rev=rev)) record_tmppath(expath) expaths[rev] = expath expaths = [os.path.normpath(expaths[x]) for x in revs] fpairs = collect_file_pairs(expaths[0], expaths[1], paired_only) for fpair in fpairs: fpaths = [] vpaths = [] for fpath, expath, rev in zip(fpair, expaths, revs): if rev is not None: if not fpath: fpath_m = "" elif os.path.isdir(path): fpath_m = fpath[len(expath) + len(os.path.sep):] fpath_m = os.path.join(path, fpath_m) else: fpath_m = path rev_m = rev or vcs.revision(path) vpath = fpath_m + EDST.filerev_sep + rev_m else: vpath = fpath fpaths.append(fpath) vpaths.append(vpath) pspecs.append((fpaths, vpaths)) return pspecs def pairs_update_effort (pspecs, quiet=False): update_progress = None if len(pspecs) > 1 and not quiet: update_progress = init_file_progress([vp[1] for fp, vp in pspecs], addfmt=t_("@info:progress", "Diffing: %(file)s")) nntw_total = 0.0 for fpaths, vpaths in pspecs: upprogf = None if update_progress: upprogf = lambda: update_progress(vpaths[1]) upprogf() # Quick check if files are binary equal. if fpaths[0] and fpaths[1] and filecmp.cmp(*fpaths): continue cats = [] for fpath in fpaths: try: cats.append(Catalog(fpath, create=True, monitored=False)) except: error_wcl(_("@info", "Cannot parse catalog '%(file)s'.", file=fpath), norem=[fpath]) nntw = cats_update_effort(cats[0], cats[1], upprogf) nntw_total += nntw if update_progress: update_progress() updeff = [ ("nntw", _("@item", "nominal newly translated words"), nntw_total, "%.0f" % nntw_total), ] return updeff # Cleanup of temporary paths. 
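# Aside: a runnable sketch of the pairing rule in collect_file_pairs()
# above: catalogs are matched by (subdir, filename), and a missing side
# is recorded as "" unless paired_only drops the pair. Plain dicts stand
# in for the {subdir: {filename: path}} maps built above.
def pair_sketch (bysub1, bysub2, paired_only=False):
    pairs = []
    for subdir in sorted(set(bysub1) | set(bysub2)):
        f1, f2 = bysub1.get(subdir, {}), bysub2.get(subdir, {})
        for name in sorted(set(f1) | set(f2)):
            p1, p2 = f1.get(name, ""), f2.get(name, "")
            if not paired_only or (p1 and p2):
                pairs.append((p1, p2))
    return pairs

a = {"ui": {"app.po": "old/ui/app.po"}}
b = {"ui": {"app.po": "new/ui/app.po", "new.po": "new/ui/new.po"}}
assert pair_sketch(a, b) == [("old/ui/app.po", "new/ui/app.po"),
                             ("", "new/ui/new.po")]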
_tmppaths = set() def record_tmppath (path): _tmppaths.add(path) def cleanup_tmppaths (norem=set()): for path in _tmppaths: if path in norem: continue if os.path.isfile(path): os.unlink(path) elif os.path.isdir(path): shutil.rmtree(path) def error_wcl (msg, norem=set()): if not isinstance(norem, set): norem = set(norem) cleanup_tmppaths(norem) error(msg) if __name__ == '__main__': exit_on_exception(main, cleanup_tmppaths) diff --git a/scripts/poepatch.py b/scripts/poepatch.py index 59da4207..f3bf879d 100755 --- a/scripts/poepatch.py +++ b/scripts/poepatch.py @@ -1,824 +1,825 @@ #!/usr/bin/env python # -*- coding: UTF-8 -*- """ Patch PO files from an embedded diff. Documented in C{doc/user/diffpatch.docbook#sec-dpdiff}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ try: import fallback_import_paths except: pass import sys import os import locale import re from tempfile import NamedTemporaryFile from pology import version, _, n_ from pology.colors import ColorOptionParser from pology.report import error, warning, report from pology.msgreport import error_on_msg, warning_on_msg import pology.config as pology_config from pology.fsops import str_to_unicode, mkdirpath, collect_catalogs from pology.fsops import exit_on_exception from pology.catalog import Catalog from pology.message import Message, MessageUnsafe from pology.header import Header from pology.diff import msg_ediff, msg_ediff_to_new, msg_ediff_to_old from pology.internal.poediffpatch import MPC, EDST from pology.internal.poediffpatch import msg_eq_fields, msg_copy_fields from pology.internal.poediffpatch import msg_clear_prev_fields from pology.internal.poediffpatch import diff_cats from pology.internal.poediffpatch import init_ediff_header from pology.internal.poediffpatch import get_msgctxt_for_headers +from functools import reduce -_flag_ediff = u"ediff" -_flag_ediff_to_cur = u"%s-to-cur" % _flag_ediff -_flag_ediff_to_new = u"%s-to-new" % _flag_ediff -_flag_ediff_no_match = u"%s-no-match" % _flag_ediff +_flag_ediff = "ediff" +_flag_ediff_to_cur = "%s-to-cur" % _flag_ediff +_flag_ediff_to_new = "%s-to-new" % _flag_ediff +_flag_ediff_no_match = "%s-no-match" % _flag_ediff _flags_all = ( _flag_ediff, _flag_ediff_to_cur, _flag_ediff_to_new, _flag_ediff_no_match, ) def main (): locale.setlocale(locale.LC_ALL, "") # Get defaults for command line options from global config. cfgsec = pology_config.section("poepatch") def_do_merge = cfgsec.boolean("merge", True) # Setup options and parse the command line. 
usage = _("@info command usage", "%(cmd)s [OPTIONS] [OPTIONS] < EDIFF\n" "%(cmd)s -u [OPTIONS] PATHS...", cmd="%prog") desc = _("@info command description", "Apply embedded diff of PO files as patch.") ver = _("@info command version", - u"%(cmd)s (Pology) %(version)s\n" - u"Copyright © 2009, 2010 " - u"Chusslove Illich (Часлав Илић) <%(email)s>", + "%(cmd)s (Pology) %(version)s\n" + "Copyright © 2009, 2010 " + "Chusslove Illich (Часлав Илић) <%(email)s>", cmd="%prog", version=version(), email="caslav.ilic@gmx.net") opars = ColorOptionParser(usage=usage, description=desc, version=ver) opars.add_option( "-a", "--aggressive", action="store_true", dest="aggressive", default=False, help=_("@info command line option description", "Apply every message to its paired message in the target file, " "irrespective of whether its non-pairing parts match too.")) opars.add_option( "-d", "--directory", metavar=_("@info command line value placeholder", "DIR"), dest="directory", help=_("@info command line option description", "Prepend this directory path to any resolved target file path.")) opars.add_option( "-e", "--embed", action="store_true", dest="embed", default=False, help=_("@info command line option description", "Instead of applying resolved newer version of the message, " "add the full embedded diff into the target file.")) opars.add_option( "-i", "--input", metavar=_("@info command line value placeholder", "FILE"), dest="input", help=_("@info command line option description", "Read the patch from the given file instead of standard input.")) opars.add_option( "-n", "--no-merge", action="store_false", dest="do_merge", default=def_do_merge, help=_("@info command line option description", "Do not try to indirectly pair messages by merging catalogs.")) opars.add_option( "-p", "--strip", metavar=_("@info command line value placeholder", "NUM"), dest="strip", help=_("@info command line option description", "Strip the smallest prefix containing NUM leading slashes from " "each file name found in the ediff file (like in patch(1)). " "If not given, only the base name of each file is taken.")) opars.add_option( "-u", "--unembed", action="store_true", dest="unembed", default=False, help=_("@info command line option description", "Instead of applying a patch, resolve all embedded differences " "in given paths to newer versions of messages.")) (op, free_args) = opars.parse_args(str_to_unicode(sys.argv[1:])) # Could use some speedup. try: import psyco psyco.full() except ImportError: pass if not op.unembed: if free_args: error(_("@info", "Too many arguments in command line: %(argspec)s", argspec=" ".join(free_args))) if op.strip and not op.strip.isdigit(): error(_("@info", "Option %(opt)s expects a positive integer value.", opt="--strip")) apply_ediff(op) else: paths = [] for path in free_args: if not os.path.exists(path): warning(_("@info", "Path '%(path)s' does not exist.", path=path)) if os.path.isdir(path): paths.extend(collect_catalogs(path)) else: paths.append(path) for path in paths: unembed_ediff(path) def apply_ediff (op): # Read the ediff PO. 
dummy_stream_path = "" if op.input: if not os.path.isfile(op.input): error(_("@info", "Path '%(path)s' is not a file or does not exist.", path=op.input)) edfpath = op.input readfh = None else: edfpath = dummy_stream_path readfh = sys.stdin try: ecat = Catalog(edfpath, monitored=False, readfh=readfh) except: error(_("@info ediff is shorthand for \"embedded difference\"", "Error reading ediff '%(file)s'.", file=edfpath)) # Split ediff by diffed catalog into original and new file paths, # header message, and ordinary messages. hmsgctxt = ecat.header.get_field_value(EDST.hmsgctxt_field) if hmsgctxt is None: error(_("@info", "Header field '%(field)s' is missing in the ediff.", field=EDST.hmsgctxt_field)) edsplits = [] cehmsg = None - smsgid = u"\x00" + smsgid = "\x00" ecat.add_last(MessageUnsafe(dict(msgctxt=hmsgctxt, msgid=smsgid))) # sentry for emsg in ecat: if emsg.msgctxt == hmsgctxt: if cehmsg: # Record previous section. edsplits.append((fpaths, cehmsg, cemsgs)) if emsg.msgid == smsgid: # end sentry, avoid parsing below break # Mine original and new file paths out of header. fpaths = [] for fpath in emsg.msgid.split("\n")[:2]: # Strip leading "+ "/"- " fpath = fpath[2:] # Convert to planform path separators. fpath = re.sub(r"/+", os.path.sep, fpath) # Remove revision indicator. p = fpath.find(EDST.filerev_sep) if p >= 0: fpath = fpath[:p] # Strip path and append directory as requested. if op.strip: preflen = int(op.strip) lst = fpath.split(os.path.sep, preflen) if preflen + 1 == len(lst): fpath = lst[preflen] else: fpath = os.path.basename(fpath) else: fpath = os.path.basename(fpath) if op.directory and fpath: fpath = os.path.join(op.directory, fpath) # All done. fpaths.append(fpath) cehmsg = emsg cemsgs = [] else: cemsgs.append(emsg) # Prepare catalog for rejects and merges. rcat = Catalog("", create=True, monitored=False, wrapping=ecat.wrapping()) init_ediff_header(rcat.header, hmsgctxt=hmsgctxt, extitle="rejects") # Apply diff to catalogs. for fpaths, ehmsg, emsgs in edsplits: # Open catalog for patching. fpath1, fpath2 = fpaths if fpath1: # Diff from an existing catalog, open it. if not os.path.isfile(fpath1): warning(_("@info", "Path '%(path)s' is not a file or does not exist, " "skipping it.", path=fpath1)) continue try: cat = Catalog(fpath1) except: warning(_("@info", "Error reading catalog '%(file)s', skipping it.", file=fpath1)) continue elif fpath2: # New catalog added in diff, create it (or open if it exists). try: mkdirpath(os.path.dirname(fpath2)) cat = Catalog(fpath2, create=True) if cat.created(): cat.set_wrapping(ecat.wrapping()) except: if os.path.isfile(fpath2): warning(_("@info", "Error reading catalog '%(file)s', skipping it.", file=fpath1)) else: warning(_("@info", "Cannot create catalog '%(file)s', skipping it.", file=fpath2)) continue else: error(_("@info", "Both catalogs in ediff indicated not to exist.")) # Do not try to patch catalog with embedded differences # (i.e. previously patched using -e). if cat.header.get_field_value(EDST.hmsgctxt_field) is not None: warning(_("@info", "Catalog '%(file)s' already contains " "embedded differences, skipping it.", file=cat.filename)) continue # Do not try to patch catalog if the patch contains # unresolved split differences. if reduce(lambda r, x: r or _flag_ediff_to_new in x.flag, emsgs, False): warning(_("@info", "Patch for catalog '%(file)s' contains unresolved " "split differences, skipping it.", file=cat.filename)) continue # Patch the catalog. 
rejected_ehmsg = patch_header(cat, ehmsg, ecat, op) rejected_emsgs_flags = patch_messages(cat, emsgs, ecat, op) any_rejected = rejected_ehmsg or rejected_emsgs_flags if fpath2 or any_rejected: created = cat.created() if cat.sync(): if not created: if any_rejected and op.embed: report(_("@info:progress E is for \"with embedding\"", "Partially patched (E): %(file)s", file=cat.filename)) elif any_rejected: report(_("@info:progress", "Partially patched: %(file)s", file=cat.filename)) elif op.embed: report(_("@info:progress E is for \"with embedding\"", "Patched (E): %(file)s", file=cat.filename)) else: report(_("@info:progress", "Patched: %(file)s", file=cat.filename)) else: if op.embed: report(_("@info:progress E is for \"with embedding\"", "Created (E): %(file)s", file=cat.filename)) else: report(_("@info:progress", "Created: %(file)s", file=cat.filename)) else: pass #report("unchanged: %s" % cat.filename) else: os.unlink(fpath1) report(_("@info:progress", "Removed: %(file)s", file=fpath1)) # If there were any rejects and reembedding is not in effect, # record the necessary to present them. if any_rejected and not op.embed: if not rejected_ehmsg: # Clean header diff. ehmsg.manual_comment = ehmsg.manual_comment[:1] - ehmsg.msgstr[0] = u"" + ehmsg.msgstr[0] = "" rcat.add_last(ehmsg) for emsg, flag in rejected_emsgs_flags: # Reembed to avoid any conflicts. msg1, msg2, msg1_s, msg2_s = resolve_diff_pair(emsg) emsg = msg_ediff(msg1_s, msg2_s, emsg=msg2_s, ecat=rcat, enoctxt=hmsgctxt) if flag: emsg.flag.add(flag) rcat.add_last(emsg) # If there were any rejects, write them out. if len(rcat) > 0: # Construct paths for embedded diffs of rejects. rsuff = "rej" if ecat.filename != dummy_stream_path: rpath = ecat.filename p = rpath.rfind(".") if p < 0: p = len(rpath) rpath = rpath[:p] + (".%s" % rsuff) + rpath[p:] else: rpath = "stdin.%s.po" % rsuff rcat.filename = rpath rcat.sync(force=True, noobsend=True) report(_("@info:progress file to which rejected parts of the patch " "have been written to", "*** Rejects: %(file)s", file=rcat.filename)) # Patch application types. -_pt_merge, _pt_insert, _pt_remove = range(3) +_pt_merge, _pt_insert, _pt_remove = list(range(3)) def patch_messages (cat, emsgs, ecat, options): # It may happen that a single message from original catalog # is paired with more than one from the diff # (e.g. single old translated message going into two new fuzzy). # Therefore paired messages must be tracked, to know if patched # message can be merged into the existing, or it must be inserted. pmsgkeys = set() # Triplets for splitting directly unapplicable patches into two. # Delay building of triplets until needed for the first time. striplets_pack = [None] def striplets (): if striplets_pack[0] is None: striplets_pack[0] = build_splitting_triplets(emsgs, cat, options) return striplets_pack[0] # Check whether diffs apply, and where and how if they do. rejected_emsgs_flags = [] patch_specs = [] for emsg in emsgs: pspecs = msg_apply_diff(cat, emsg, ecat, pmsgkeys, striplets) for pspec in pspecs: emsg_m, flag = pspec[:2] if flag == _flag_ediff or options.embed: patch_specs.append(pspec) if flag != _flag_ediff: rejected_emsgs_flags.append((emsg_m, flag)) # Sort accepted patches by position of application. patch_specs.sort(key=lambda x: x[3]) # Add accepted patches to catalog. incpos = 0 for emsg, flag, typ, pos, msg1, msg2, msg1_s, msg2_s in patch_specs: if pos is not None: pos += incpos if options.embed: # Embedded diff may conflict one of the messages in catalog. 
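# Aside: patch_messages() here applies accepted patches in position order
# while tracking an offset (incpos): each insertion shifts later target
# positions by +1 and each removal by -1. The same bookkeeping, sketched
# on a plain list:
def apply_ops_sketch (items, ops):
    # ops: (position, "insert"/"remove", value), applied in position order.
    off = 0
    for pos, typ, val in sorted(ops, key=lambda o: o[0]):
        if typ == "insert":
            items.insert(pos + off, val)
            off += 1
        elif typ == "remove":
            del items[pos + off]
            off -= 1
    return items

assert apply_ops_sketch(["a", "b", "c"],
                        [(1, "insert", "x"), (2, "remove", None)]) \
       == ["a", "x", "b"]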
# Make a new diff of special messages, # and embed them either into existing message in catalog, # or into new message. if typ == _pt_merge: tmsg = cat[pos] tpos = pos else: tmsg = MessageUnsafe(msg2 or {}) tpos = None emsg = msg_ediff(msg1_s, msg2_s, emsg=tmsg, ecat=cat, eokpos=tpos) if 0:pass elif typ == _pt_merge: if not options.embed: cat[pos].set_inv(msg2) else: cat[pos].flag.add(flag) elif typ == _pt_insert: if not options.embed: cat.add(Message(msg2), pos) else: cat.add(Message(emsg), pos) cat[pos].flag.add(flag) incpos += 1 elif typ == _pt_remove: if pos is None: continue if not options.embed: cat.remove(pos) incpos -= 1 else: cat[pos].flag.add(flag) else: error_on_msg(_("@info", "Unknown patch type %(type)s.", type=typ), emsg, ecat) return rejected_emsgs_flags def msg_apply_diff (cat, emsg, ecat, pmsgkeys, striplets): msg1, msg2, msg1_s, msg2_s = resolve_diff_pair(emsg) # Try to select existing message from the original messages. # Order is important, should try first new, then old # (e.g. if an old fuzzy was resolved to new after diff was made). msg = None if msg2 and msg2 in cat: msg = cat[msg2] elif msg1 and msg1 in cat: msg = cat[msg1] patch_specs = [] # Try to apply the patch. if msg_patchable(msg, msg1, msg2): # Patch can be directly applied. if msg1 and msg2: if msg.key not in pmsgkeys: typ = _pt_merge pos = cat.find(msg) pmsgkeys.add(msg.key) else: typ = _pt_insert pos, weight = cat.insertion_inquiry(msg2) elif msg2: # patch adds a message if msg: typ = _pt_merge pos = cat.find(msg) pmsgkeys.add(msg.key) else: typ = _pt_insert pos, weight = cat.insertion_inquiry(msg2) elif msg1: # patch removes a message if msg: typ = _pt_remove pos = cat.find(msg) pmsgkeys.add(msg.key) else: typ = _pt_remove pos = None # no position to remove from else: # Cannot happen. error_on_msg(_("@info", "Neither the old nor the new message " "in the diff is indicated to exist."), emsg, ecat) patch_specs.append((emsg, _flag_ediff, typ, pos, msg1, msg2, msg1_s, msg2_s)) else: # Patch cannot be applied directly, # try to split into old-to-current and current-to-new diffs. split_found = False if callable(striplets): striplets = striplets() # delayed creation of splitting triplets for i in range(len(striplets)): m1_t, m1_ts, m2_t, m2_ts, m_t, m_ts1, m_ts2 = striplets[i] if msg1.inv == m1_t.inv and msg2.inv == m2_t.inv: striplets.pop(i) # remove to not slow further searches split_found = True break if split_found: # Construct new corresponding diffs. em_1c = msg_ediff(m1_ts, m_ts1, emsg=MessageUnsafe(m_t)) em_c2 = msg_ediff(m_ts2, m2_ts, emsg=MessageUnsafe(m2_t)) # Current-to-new can be merged or inserted, # and old-to-current is then inserted just before it. if m_t.key not in pmsgkeys: typ = _pt_merge pos = cat.find(m_t) pmsgkeys.add(m_t.key) else: typ = _pt_insert pos, weight = cat.insertion_inquiry(m2_t) # Order of adding patch specs here important for rejects file. patch_specs.append((em_1c, _flag_ediff_to_cur, _pt_insert, pos, m1_t, m_t, m1_ts, m_ts1)) patch_specs.append((em_c2, _flag_ediff_to_new, typ, pos, m_t, m2_t, m_ts2, m2_ts)) # The patch is totally rejected. # Will be inserted if reembedding requested, so compute insertion. 
if not patch_specs: typ = _pt_insert if msg2 is not None: pos, weight = cat.insertion_inquiry(msg2) else: pos = len(cat) patch_specs.append((emsg, _flag_ediff_no_match, typ, pos, msg1, msg2, msg1_s, msg2_s)) return patch_specs def msg_patchable (msg, msg1, msg2): # Check for cases where current message does not match old or new, # but there is a transformation that can also be cleanly merged. msg_m = msg if 0: pass # Old and new are translated, but current is fuzzy and has previous fields. # Transform current to its previous state, from which it may have become # fuzzy by merging with templates. elif ( msg and msg.fuzzy and msg.key_previous is not None and msg1 and not msg1.fuzzy and msg2 and not msg2.fuzzy ): msg_m = MessageUnsafe(msg) msg_copy_fields(msg, msg_m, MPC.prevcurr_fields) msg_clear_prev_fields(msg_m) msg_m.fuzzy = False # Old is None, new is translated, and current is untranslated. # Add translation of new to current, since it may have been added as # untranslated after merging with templates. elif msg and msg.untranslated and not msg1 and msg2 and msg2.translated: msg_m = MessageUnsafe(msg) msg_copy_fields(msg2, msg_m, ["msgstr"]) if msg1 and msg2: return msg and msg_m.inv in (msg1.inv, msg2.inv) elif msg2: return not msg or msg_m.inv == msg2.inv elif msg1: return not msg or msg_m.inv == msg1.inv else: return not msg def resolve_diff_pair (emsg): # Recover old and new message according to diff. # Resolve into copies of ediff message, to preserve non-inv parts. emsg1 = MessageUnsafe(emsg) msg1_s = msg_ediff_to_old(emsg1, rmsg=emsg1) emsg2 = MessageUnsafe(emsg) msg2_s = msg_ediff_to_new(emsg2, rmsg=emsg2) # Resolve any special pairings. msg1, msg2 = msg1_s, msg2_s if not msg1_s or not msg2_s: # No special cases if either message non-existent. pass # Cases f-nf-*. elif msg1_s.fuzzy and not msg2_s.fuzzy: # Case f-nf-ecc. if ( msg2_s.key_previous is None and not msg_eq_fields(msg1_s, msg2_s, MPC.curr_fields) ): msg1 = MessageUnsafe(msg1_s) msg_copy_fields(msg1_s, msg1, MPC.currprev_fields) msg_copy_fields(msg2_s, msg1, MPC.curr_fields) # Case f-nf-necc. elif msg2_s.key_previous is not None: msg1 = MessageUnsafe(msg1_s) msg2 = MessageUnsafe(msg2_s) msg_copy_fields(msg2_s, msg1, MPC.prevcurr_fields) msg_clear_prev_fields(msg2) # Cases nf-f-*. elif not msg1_s.fuzzy and msg2_s.fuzzy: # Case nf-f-ecp. if ( msg1_s.key_previous is None and not msg_eq_fields(msg1_s, msg2_s, MPC.curr_fields) ): msg2 = MessageUnsafe(msg2_s) msg_copy_fields(msg1_s, msg2, MPC.currprev_fields) # Case nf-f-necp. elif msg1_s.key_previous is not None: msg1 = MessageUnsafe(msg1_s) msg2 = MessageUnsafe(msg2_s) msg_copy_fields(msg1_s, msg2, MPC.prev_fields) msg_clear_prev_fields(msg1) return msg1, msg2, msg1_s, msg2_s def build_splitting_triplets (emsgs, cat, options): # Create catalogs of old and new messages. cat1 = Catalog("", create=True, monitored=False) cat2 = Catalog("", create=True, monitored=False) for emsg in emsgs: msg1, msg2, msg1_s, msg2_s = resolve_diff_pair(emsg) if msg1: cat1.add_last(msg1) if msg2: cat2.add_last(msg2) # Make headers same, to avoid any diffs there. cat1.header = cat.header cat2.header = cat.header # Write created catalogs to disk if # msgmerge may be used on files during diffing. 
if options.do_merge: tmpfs = [] # to avoid garbage collection until the function returns for tcat, tsuff in ((cat1, "1"), (cat2, "2")): tmpf = NamedTemporaryFile(prefix="poepatch-split-%s-" % tsuff, suffix=".po") tmpfs.append(tmpf) tcat.filename = tmpf.name tcat.sync(force=True) # Create the old-to-current and current-to-new diffs. ecat_1c = Catalog("", create=True, monitored=False) diff_cats(cat1, cat, ecat_1c, options.do_merge, wadd=False, wrem=False) ecat_c2 = Catalog("", create=True, monitored=False) diff_cats(cat, cat2, ecat_c2, options.do_merge, wadd=False, wrem=False) # Mine splitting triplets out of diffs. sdoublets_1c = {} for emsg in ecat_1c: m1_t, m_t, m1_ts, m_ts1 = resolve_diff_pair(emsg) sdoublets_1c[m_t.key] = [m1_t, m1_ts, m_t, m_ts1] sdoublets_c2 = {} for emsg in ecat_c2: m_t, m2_t, m_ts2, m2_ts = resolve_diff_pair(emsg) sdoublets_c2[m_t.key] = [m_t, m_ts2, m2_t, m2_ts] common_keys = set(sdoublets_1c).intersection(sdoublets_c2) striplets = [] for key in common_keys: m1_t, m1_ts, m_t, m_ts1 = sdoublets_1c[key] m_t, m_ts2, m2_t, m2_ts = sdoublets_c2[key] striplets.append((m1_t, m1_ts, m2_t, m2_ts, m_t, m_ts1, m_ts2)) return striplets def patch_header (cat, ehmsg, ecat, options): if not ehmsg.msgstr[0]: # no header diff, only metadata return None ehmsg_clean = clear_header_metadata(ehmsg) # Create reduced headers. hmsg1 = msg_ediff_to_old(ehmsg_clean) hmsg2 = msg_ediff_to_new(ehmsg_clean) hmsg = not cat.created() and cat.header.to_msg() or None hdrs = [] for m in (hmsg, hmsg1, hmsg2): h = m is not None and reduce_header_fields(Header(m)) or None hdrs.append(h) rhdr, rhdr1, rhdr2 = hdrs # Decide if the header can be cleanly patched. clean = False if not rhdr: clean = rhdr1 or rhdr2 else: clean = (rhdr1 and rhdr == rhdr1) or (rhdr2 and rhdr == rhdr2) if clean: if not options.embed: if hmsg2: cat.header = Header(hmsg2) else: # Catalog will be removed if no messages are rejected, # and otherwise the header should stay as-is. pass else: if cat.created(): cat.header = Header(hmsg2) ehmsg = MessageUnsafe(ehmsg) ehmsg.flag.add(_flag_ediff) hmsgctxt = get_msgctxt_for_headers(cat) ehmsg.msgctxt = hmsgctxt cat.header.set_field(EDST.hmsgctxt_field, hmsgctxt) cat.add(Message(ehmsg), 0) return None else: return ehmsg # Clear header diff message of metadata. # A copy of the message is returned. def clear_header_metadata (ehmsg): ehmsg = MessageUnsafe(ehmsg) ehmsg.manual_comment.pop(0) ehmsg.msgctxt = None - ehmsg.msgid = u"" + ehmsg.msgid = "" return ehmsg # Remove known unimportant fields from the header, # to ignore them on comparisons. def reduce_header_fields (hdr): rhdr = Header(hdr) for field in ( "POT-Creation-Date", "PO-Revision-Date", "Last-Translator", "X-Generator", ): rhdr.remove_field(field) return rhdr def unembed_ediff (path, all=False, old=False): try: cat = Catalog(path) except: warning(_("@info", "Error reading catalog '%(file)s', skipping it.", file=path)) return hmsgctxt = cat.header.get_field_value(EDST.hmsgctxt_field) if hmsgctxt is not None: cat.header.remove_field(EDST.hmsgctxt_field) uehmsg = None unembedded = {} for msg in cat: ediff_flag = None for flag in _flags_all: if flag in msg.flag: ediff_flag = flag msg.flag.remove(flag) if not ediff_flag and not all: continue if ediff_flag in (_flag_ediff_no_match, _flag_ediff_to_new): # Throw away fully rejected embeddings, i.e. reject the patch. # For split-difference embeddings, throw away the current-to-new; # this effectively rejects the patch, which is safest thing to do. 
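# Aside: a minimal sketch of the header comparison trick used by
# patch_header() and reduce_header_fields() above: volatile fields are
# dropped on both sides so that two headers compare equal when only
# timestamps or translator credits differ. Plain dicts stand in for
# Header objects.
_VOLATILE_DEMO = ("POT-Creation-Date", "PO-Revision-Date",
                  "Last-Translator", "X-Generator")

def reduced_sketch (hdr):
    return {k: v for k, v in hdr.items() if k not in _VOLATILE_DEMO}

h1 = {"Project-Id-Version": "app 1.0", "PO-Revision-Date": "2009-01-01"}
h2 = {"Project-Id-Version": "app 1.0", "PO-Revision-Date": "2010-05-05"}
assert reduced_sketch(h1) == reduced_sketch(h2)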
cat.remove_on_sync(msg) elif hmsgctxt is not None and msg.msgctxt == hmsgctxt: if uehmsg: warning_on_msg(_("@info", "Unembedding results in duplicate header, " "previous header at %(line)d(#%(entry)d); " "skipping it.", line=uehmsg.refline, entry=uehmsg.refentry), msg, cat) return msg_ediff_to_x = not old and msg_ediff_to_new or msg_ediff_to_old hmsg = msg_ediff_to_x(clear_header_metadata(msg)) if hmsg.msgstr and hmsg.msgstr[0]: cat.header = Header(hmsg) cat.remove_on_sync(msg) uehmsg = msg else: msg1, msg2, msg1_s, msg2_s = resolve_diff_pair(msg) tmsg = (not old and (msg2,) or (msg1,))[0] if tmsg is not None: if tmsg.key in unembedded: msg_p = unembedded[tmsg.key] warning_on_msg(_("@info", "Unembedding results in " "duplicate message, previous message " "at %(line)d(#%(entry)d); skipping it.", line=msg_p.refline, entry=msg_p.refentry), msg, cat) return msg.set(Message(msg2)) unembedded[tmsg.key] = msg else: cat.remove_on_sync(msg) if cat.sync(): report(_("@info:progress", "Unembedded: %(file)s", file=cat.filename)) if __name__ == '__main__': exit_on_exception(main) diff --git a/scripts/pomtrans.py b/scripts/pomtrans.py index ced81e3e..538cc016 100755 --- a/scripts/pomtrans.py +++ b/scripts/pomtrans.py @@ -1,528 +1,528 @@ #!/usr/bin/env python # -*- coding: UTF-8 -*- """ Perform machine translation of PO files. Documented in C{doc/user/lingo.docbook#sec-lgmtrans}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ try: import fallback_import_paths except: pass import locale import subprocess import sys import os from pology import datadir, version, _, n_ from pology.catalog import Catalog from pology.colors import ColorOptionParser import pology.config as pology_config from pology.entities import read_entities from pology.fsops import collect_catalogs, collect_system from pology.fsops import str_to_unicode from pology.fsops import exit_on_exception from pology.message import MessageUnsafe from pology.remove import remove_accel_msg from pology.report import report, error, warning from pology.resolve import resolve_entities_simple def main (): locale.setlocale(locale.LC_ALL, "") # Get defaults for command line options from global config. cfgsec = pology_config.section("pomtrans") showservs = list() showservs.sort() # Setup options and parse the command line. usage = _("@info command usage", "%(cmd)s [OPTIONS] TRANSERV PATHS...", cmd="%prog") desc = _("@info command description", "Perform machine translation of PO files.") ver = _("@info command version", - u"%(cmd)s (Pology) %(version)s\n" - u"Copyright © 2009, 2010 " - u"Chusslove Illich (Часлав Илић) <%(email)s>", + "%(cmd)s (Pology) %(version)s\n" + "Copyright © 2009, 2010 " + "Chusslove Illich (Часлав Илић) <%(email)s>", cmd="%prog", version=version(), email="caslav.ilic@gmx.net") opars = ColorOptionParser(usage=usage, description=desc, version=ver) opars.add_option( "-a", "--accelerator", dest="accel", metavar=_("@info command line value placeholder", "CHAR"), help=_("@info command line option description", "Accelerator marker character used in messages. 
" "Detected from catalogs if not given.")) opars.add_option( "-c", "--parallel-compendium", dest="parcomp", metavar=_("@info command line value placeholder", "FILE"), help=_("@info command line option description", "Translate from translation to another language, " "found in compendium file at the given path.")) opars.add_option( "-l", "--list-transervs", action="store_true", dest="list_transervs", default=False, help="List available translation services.") opars.add_option( "-m", "--flag-%s" % _flag_mtrans, action="store_true", dest="flag_mtrans", default=False, help=_("@info command line option description", "Add '%(flag)s' flag to translated messages.", flag=_flag_mtrans)) opars.add_option( "-M", "--translation-mode", dest="tmode", metavar=_("@info command line value placeholder", "MODE"), help=_("@info command line option description", "Translation mode for the chosen translation service. " "Overrides the default translation mode constructed " "based on source and target language. " "Mode string format is translation service dependent.")) opars.add_option( "-n", "--no-fuzzy-flag", action="store_false", dest="flag_fuzzy", default=True, help=_("@info command line option description", "Do not add '%(flag)s' flag to translated messages.", flag="fuzzy")) opars.add_option( "-p", "--parallel-catalogs", dest="parcats", metavar=_("@info command line value placeholder", "SEARCH:REPLACE"), help=_("@info command line option description", "Translate from translation to another language " "found in parallel catalogs. " "For given target catalog path, the path to parallel catalog " "is constructed by replacing once SEARCH with REPLACE.")) opars.add_option( "-s", "--source-lang", dest="slang", metavar=_("@info command line value placeholder", "LANG"), help=_("@info command line option description", "Source language code. " "Detected from catalogs if not given.")) opars.add_option( "-t", "--target-lang", dest="tlang", metavar=_("@info command line value placeholder", "LANG"), help=_("@info command line option description", "Target language code. " "Detected from catalogs if not given.")) opars.add_option( "-T", "--transerv-bin", dest="transerv_bin", metavar=_("@info command line value placeholder", "PATH"), help=_("@info command line option description", "Custom path to translation service executable " "(where applicable).")) opars.add_option( "-d", "--data-directory", dest="data_directory", metavar=_("@info command line value placeholder", "FOLDER"), help=_("@info command line option description", "Custom path to a translation data directory (where applicable).")) (op, free_args) = opars.parse_args(str_to_unicode(sys.argv[1:])) # Could use some speedup. try: import psyco psyco.full() except ImportError: pass if op.list_transervs: report("\n".join(sorted(_known_transervs.keys()))) sys.exit(0) if len(free_args) < 1: error(_("@info", "Translation service not specified.")) transervkey = free_args.pop(0) if transervkey not in _known_transervs: error(_("@info", "Translation service '%(serv)s' not known.", serv=transervkey)) tsbuilder_wopts = _known_transervs[transervkey] tsbuilder = lambda slang, tlang: tsbuilder_wopts(slang, tlang, op) paths = free_args if not op.parcomp and not op.parcats: translate_direct(paths, tsbuilder, op) else: translate_parallel(paths, tsbuilder, op) def translate_direct (paths, tsbuilder, options): transervs = {} catpaths = collect_catalogs(paths) for catpath in catpaths: # Collect messages and texts to translate. 
cat = Catalog(catpath) if options.accel is not None: # force explicitly given accelerator cat.set_accelerator(options.accel) texts = [] msgs = [] for msg in cat: if to_translate(msg, options): msgf = MessageUnsafe(msg) remove_accel_msg(msgf, cat) texts.append(msgf.msgid) if msg.msgid_plural is not None: texts.append(msgf.msgid_plural) msgs.append(msg) # Translate collected texts. slang = options.slang or "en" transerv = get_transerv(slang, options.tlang, cat, cat, tsbuilder) texts_tr = transerv.translate(texts) if texts else [] if texts_tr is None: warning(_("@info", "Translation service failure on '%(file)s'.", file=catpath)) continue for i, text in enumerate(texts_tr): text = reduce_for_encoding(text, cat.encoding()) texts_tr[i] = text # Put translated texts into messages. singlepls = cat.plural_indices_single() for msg in msgs: msgid_tr = texts_tr.pop(0) if msg.msgid_plural is not None: msgid_plural_tr = texts_tr.pop(0) if msgid_tr: if msg.msgid_plural is not None: for i in range(len(msg.msgstr)): if i in singlepls: msg.msgstr[i] = msgid_tr else: msg.msgstr[i] = msgid_plural_tr else: msg.msgstr[0] = msgid_tr decorate(msg, options) sync_rep(cat, msgs) def translate_parallel (paths, tsbuilder, options): pathrepl = options.parcats comppath = options.parcomp slang = options.slang tlang = options.tlang ccat = None if comppath is not None: if not os.path.isfile(comppath): error(_("@info", "Compendium '%(file)s' does not exist.", file=comppath)) ccat = Catalog(comppath, monitored=False) if pathrepl is not None: lst = pathrepl.split(":") if len(lst) != 2: error(_("@info", "Invalid search and replace specification '%(spec)s'.", spec=pathrepl)) pathsrch, pathrepl = lst catpaths = collect_catalogs(paths) for catpath in catpaths: # Open parallel catalog if it exists. pcat = None if pathrepl is not None: pcatpath = catpath.replace(pathsrch, pathrepl, 1) if catpath == pcatpath: error(_("@info", "Parallel catalog and target catalog are same files " "for '%(file)s'.", file=catpath)) if os.path.isfile(pcatpath): pcat = Catalog(pcatpath, monitored=False) # If there is neither the parallel catalog nor the compendium, # skip processing current target catalog. if not pcat and not ccat: continue # Collect messages and texts to translate. cat = Catalog(catpath) pmsgs, psmsgs, ptexts = [], [], [] cmsgs, csmsgs, ctexts = [], [], [] for msg in cat: if to_translate(msg, options): # Priority: parallel catalog, then compendium. for scat, msgs, smsgs, texts in ( (pcat, pmsgs, psmsgs, ptexts), (ccat, cmsgs, csmsgs, ctexts), ): if scat and msg in scat: smsg = scat[msg] if smsg.translated: msgs.append(msg) smsgs.append(smsg) texts.extend(smsg.msgstr) break # Translate collected texts. texts_tr = [] for texts, scat in ((ptexts, pcat), (ctexts, ccat)): transerv = get_transerv(slang, tlang, scat, cat, tsbuilder) texts_tr.append(transerv.translate(texts) if texts else []) if texts_tr[-1] is None: texts_tr = None break if texts_tr is None: warning(_("@info", "Translation service failure on '%(file)s'.", file=catpath)) continue ptexts_tr, ctexts_tr = texts_tr # Put translated texts into messages. # For plural messages, assume 1-1 match to parallel language. 
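    # (Illustration: if the target language declares more plural forms than
    # the parallel language, the highest parallel form is reused for the
    # extra indices; e.g. parallel msgstr[0..1] filling target msgstr[0..2]
    # assigns ctexts[1] to both msgstr[1] and msgstr[2], via the
    # "i < len(ctexts)" fallback below.)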
    for msgs, smsgs, texts in (
        (pmsgs, psmsgs, ptexts_tr),
        (cmsgs, csmsgs, ctexts_tr),
    ):
        for msg, smsg in zip(msgs, smsgs):
            ctexts = []
            for i in range(len(smsg.msgstr)):
                text = texts.pop(0)
                text = reduce_for_encoding(text, cat.encoding())
                ctexts.append(text)
            for i in range(len(msg.msgstr)):
                msg.msgstr[i] = i < len(ctexts) and ctexts[i] or ctexts[-1]
            decorate(msg, options)

    sync_rep(cat, pmsgs + cmsgs)


def to_translate (msg, options):

    return msg.untranslated


-_flag_mtrans = u"mtrans"
+_flag_mtrans = "mtrans"

def decorate (msg, options):

    msg.unfuzzy() # clear any previous fuzzy stuff
    if options.flag_fuzzy:
        msg.fuzzy = True
    if options.flag_mtrans:
        msg.flag.add(_flag_mtrans)


# Cache of translation services by (source, target) language pair.
_transervs = {}

# Return translation service for (slang, tlang) pair.
# If the service was not created yet, create it and cache it.
# If slang or tlang are None, use target language of corresponding catalog.
def get_transerv (slang, tlang, scat, tcat, tsbuilder):

    if not slang:
        slang = scat.header.get_field_value("Language")
        if not slang:
            error(_("@info",
                    "Cannot determine language of source catalog '%(file)s'.",
                    file=scat.filename))
    if not tlang:
        tlang = tcat.header.get_field_value("Language")
        if not tlang:
            error(_("@info",
                    "Cannot determine language of target catalog '%(file)s'.",
                    file=tcat.filename))

    trdir = (slang, tlang)
    if trdir not in _transervs:
        _transervs[trdir] = tsbuilder(slang, tlang)

    return _transervs[trdir]


def sync_rep (cat, mmsgs):

    if cat.sync():
        report("! %s (%s)" % (cat.filename, len(mmsgs)))


def reduce_for_encoding (text, enc):

    while True:
        try:
            text.encode(enc)
-        except UnicodeEncodeError, e:
-            start, end = e[2], e[3]
+        except UnicodeEncodeError as e:
+            # A Python 3 exception is not indexable;
+            # use its start/end attributes instead.
+            start, end = e.start, e.end
            text = text[:start] + ("?" * (end - start)) + text[end:]
        finally:
            break

    return text


# ----------------------------------------
# Apertium -- a free/open-source machine translation platform
# http://www.apertium.org/

class Translator_apertium (object):

    def __init__ (self, slang, tlang, options):

        cmdpath = options.transerv_bin or "apertium"
        try:
            subprocess.call(cmdpath, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
        except OSError:
            error(_("@info Apertium is machine translation software",
                    "Apertium executable not found at '%(path)s'.",
                    path=cmdpath))

        if options.tmode is not None:
            mode = options.tmode
        else:
            mode = "%s-%s" % (slang, tlang)

-        optional_parameters = u""
+        optional_parameters = ""
        if options.data_directory:
-            optional_parameters = u"-d %s" % options.data_directory
+            optional_parameters = "-d %s" % options.data_directory

-        self.cmdline = u"%s -u -f html-noent %s %s" % (
+        self.cmdline = "%s -u -f html-noent %s %s" % (
            cmdpath, optional_parameters, mode)

        entpath = os.path.join(datadir(), "spec", "html.entities")
        self.htmlents = read_entities(entpath)

    def translate (self, texts):

        # Serialize texts to send to Apertium in one go.
        # Separate texts with an inplace tag followed by dot,
        # to have each text interpreted as standalone sentence.
        # FIXME: Any way to really translate each text in turn,
        # without it being horribly slow?
        # NOTE: The separator literals below were mangled in this rendering
        # of the patch (the inline markup was swallowed); the values given
        # are a plausible reconstruction of "an inplace tag followed by dot",
        # not verified against the original source.
        sep0 = "<br>."
        sep1 = "<br>"
        sep2 = "."
        sep = None
        nsep = 0
        while not sep: # determine shortest acceptable separator
            sep = sep0 + sep1 * nsep + sep2
            for text in texts:
                if sep in text:
                    sep = None
                    nsep += 1
                    break
        stext = sep.join(texts)

        # Translate empty string to test language pair.
        # Otherwise, if a lot of text is sent and language pair not good,
        # Apertium may just signal broken pipe.
        res = collect_system(self.cmdline, instr="")
        if res[2] != 0:
            warning(_("@info",
                      "Executing Apertium failed:\n%(output)s",
                      output=res[0]))
            # ...really res[0], error is output to stdout. Tsk.
            return None

        res = collect_system(self.cmdline, instr=stext)
        if res[2] != 0:
            warning(_("@info",
                      "Executing Apertium failed:\n%(output)s",
                      output=res[0]))
            # ...really res[0], error is output to stdout. Tsk.
            return None

        texts_tr = res[0].split(sep)
        if len(texts_tr) != len(texts):
            warning(_("@info",
                      "Apertium reported wrong number of translations, "
                      "%(num1)d instead of %(num2)d.",
                      num1=len(texts_tr), num2=len(texts)))
            return None

        texts_tr = [resolve_entities_simple(x, self.htmlents) for x in texts_tr]

        return texts_tr


# ----------------------------------------
# Google Translate
# http://translate.google.com
# Communication code derived from py-gtranslate library
# http://code.google.com/p/py-gtranslate/
# Updated for v2.0 API by Víctor R. Rodríguez Domínguez
# http://vrdominguez.es

class Translator_google (object):

    def __init__ (self, slang, tlang, options):

        if options.tmode is not None:
            (self.lang_in, self.lang_out) = options.tmode.split('|')
        else:
            self.lang_in = slang
            self.lang_out = tlang

        self.apikey = pology_config.section("pomtrans").string("google-api-key")

    def translate (self, texts):

-        import urllib
+        import urllib.request, urllib.parse, urllib.error
        try:
            import simplejson
        except:
            error(_("@info",
                    "Python module '%(mod)s' not available. "
                    "Try installing the '%(pkg)s' package.",
                    mod="simplejson", pkg="python-simplejson"))

        baseurl = "https://www.googleapis.com/language/translate/v2"
        baseparams = (("key", self.apikey), ("source", self.lang_in),
                      ("target", self.lang_out), ("target", "json"))

        texts_tr = []
        for text in texts:
            params = baseparams + (("q", text.encode("utf8")),)
-            parfmt = "&".join(["%s=%s" % (p, urllib.quote_plus(v))
+            parfmt = "&".join(["%s=%s" % (p, urllib.parse.quote_plus(v))
                               for p, v in params])
            execurl = "%s?%s" % (baseurl, parfmt)
            try:
-                res = simplejson.load(urllib.FancyURLopener().open(execurl))
-                text_tr = unicode(res["data"]["translations"][0]["translatedText"])
+                res = simplejson.load(urllib.request.FancyURLopener().open(execurl))
+                text_tr = str(res["data"]["translations"][0]["translatedText"])
            except:
-                text_tr = u""
+                text_tr = ""
            texts_tr.append(text_tr)

        return texts_tr


# ----------------------------------------
# Collect defined translation services by name.

_known_transervs = {}

def _init ():

    tspref = "Translator_"
-    for locvar, locval in globals().items():
+    for locvar, locval in list(globals().items()):
        if locvar.startswith(tspref):
            _known_transervs[locvar[len(tspref):]] = locval

_init()


if __name__ == '__main__':
    exit_on_exception(main)

diff --git a/scripts/porewrap.py b/scripts/porewrap.py
index 0bbe56b7..84b26ee0 100755
--- a/scripts/porewrap.py
+++ b/scripts/porewrap.py
@@ -1,90 +1,90 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

"""
Rewrap message strings in PO files.

Documented in C{doc/user/misctools.docbook#sec-mirewrap}.
@author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import locale import os import sys try: import fallback_import_paths except: pass from pology import version, _, n_ from pology.catalog import Catalog from pology.colors import ColorOptionParser import pology.config as pology_config from pology.fsops import collect_paths_cmdline, collect_catalogs from pology.fsops import exit_on_exception from pology.report import report, error from pology.stdcmdopt import add_cmdopt_filesfrom, add_cmdopt_wrapping from pology.wrap import select_field_wrapping def main (): locale.setlocale(locale.LC_ALL, "") # Get defaults for command line options from global config. cfgsec = pology_config.section("porewrap") # Setup options and parse the command line. usage = _("@info command usage", "%(cmd)s [options] POFILE...", cmd="%prog") desc = _("@info command description", "Rewrap message strings in PO files.") ver = _("@info command version", - u"%(cmd)s (Pology) %(version)s\n" - u"Copyright © 2007, 2008, 2009, 2010 " - u"Chusslove Illich (Часлав Илић) <%(email)s>", + "%(cmd)s (Pology) %(version)s\n" + "Copyright © 2007, 2008, 2009, 2010 " + "Chusslove Illich (Часлав Илић) <%(email)s>", cmd="%prog", version=version(), email="caslav.ilic@gmx.net") opars = ColorOptionParser(usage=usage, description=desc, version=ver) opars.add_option( "-v", "--verbose", action="store_true", dest="verbose", default=False, help=_("@info command line option description", "More detailed progress information.")) add_cmdopt_wrapping(opars) add_cmdopt_filesfrom(opars) (op, fargs) = opars.parse_args() if len(fargs) < 1 and not op.files_from: error(_("@info", "No input files given.")) # Could use some speedup. try: import psyco psyco.full() except ImportError: pass # Assemble list of files. fnames = collect_paths_cmdline(rawpaths=fargs, filesfrom=op.files_from, respathf=collect_catalogs, abort=True) # Rewrap all catalogs. for fname in fnames: if op.verbose: report(_("@info:progress", "Rewrapping: %(file)s", file=fname)) cat = Catalog(fname, monitored=False) wrapping = select_field_wrapping(cfgsec, cat, op) cat.set_wrapping(wrapping) cat.sync(force=True) if __name__ == '__main__': exit_on_exception(main) diff --git a/scripts/poselfmerge.py b/scripts/poselfmerge.py index b2f44b9e..b1f4f19c 100755 --- a/scripts/poselfmerge.py +++ b/scripts/poselfmerge.py @@ -1,191 +1,191 @@ #!/usr/bin/env python # -*- coding: UTF-8 -*- """ Merge PO file with itself or compendium, to produce fuzzy matches on similar messages. Documented in C{doc/user/misctools.docbook#sec-miselfmerge}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import locale import os import shutil import sys try: import fallback_import_paths except: pass from pology import version, _, n_ from pology.catalog import Catalog from pology.message import MessageUnsafe from pology.colors import ColorOptionParser import pology.config as pology_config from pology.fsops import collect_paths_cmdline, collect_catalogs from pology.fsops import exit_on_exception from pology.merge import merge_pofile from pology.report import report, error from pology.stdcmdopt import add_cmdopt_filesfrom, add_cmdopt_wrapping from pology.wrap import select_field_wrapping def main (): locale.setlocale(locale.LC_ALL, "") # Get defaults for command line options from global config. 
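    # (Illustration, with hypothetical values: these defaults come from a
    # [poselfmerge] section of the Pology user configuration, e.g.
    #   [poselfmerge]
    #   min-words-exact = 5
    #   min-adjsim-fuzzy = 0.7
    #   fuzzy-exact = yes
    #   rebase-fuzzies = yes
    # Command line options of the same meaning override these fields.)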
    cfgsec = pology_config.section("poselfmerge")
    def_minwnex = cfgsec.integer("min-words-exact", 0)
    def_minasfz = cfgsec.real("min-adjsim-fuzzy", 0.0)
    def_fuzzex = cfgsec.boolean("fuzzy-exact", False)
    def_refuzz = cfgsec.boolean("rebase-fuzzies", False)

    # Setup options and parse the command line.
    usage = _("@info command usage",
        "%(cmd)s [options] POFILE...",
        cmd="%prog")
    desc = _("@info command description",
        "Merge PO file with itself or compendium, "
        "to produce fuzzy matches on similar messages.")
    ver = _("@info command version",
-        u"%(cmd)s (Pology) %(version)s\n"
-        u"Copyright © 2009, 2010 "
-        u"Chusslove Illich (Часлав Илић) <%(email)s>",
+        "%(cmd)s (Pology) %(version)s\n"
+        "Copyright © 2009, 2010 "
+        "Chusslove Illich (Часлав Илић) <%(email)s>",
        cmd="%prog", version=version(), email="caslav.ilic@gmx.net")

    opars = ColorOptionParser(usage=usage, description=desc, version=ver)
    opars.add_option(
        "-A", "--min-adjsim-fuzzy",
        metavar=_("@info command line value placeholder", "RATIO"),
        action="store", dest="min_adjsim_fuzzy", default=def_minasfz,
        help=_("@info command line option description",
               "On fuzzy matches, the minimum adjusted similarity "
               "to accept the match, or else the message is left untranslated. "
               "Range is 0.0-1.0, where 0 means always to accept the match, "
               "and 1 never to accept; a practical range is 0.6-0.8."))
    opars.add_option(
        "-b", "--rebase-fuzzies",
        action="store_true", dest="rebase_fuzzies", default=def_refuzz,
        help=_("@info command line option description",
               "Before merging, clear those fuzzy messages whose predecessor "
               "(determined by previous fields) is still in the catalog."))
    opars.add_option(
        "-C", "--compendium",
        metavar=_("@info command line value placeholder", "POFILE"),
        action="append", dest="compendiums", default=[],
        help=_("@info command line option description",
               "Catalog with existing translations, to additionally use for "
               "direct and fuzzy matches. Can be repeated."))
    opars.add_option(
        "-v", "--verbose",
        action="store_true", dest="verbose", default=False,
        help=_("@info command line option description",
               "More detailed progress information."))
    opars.add_option(
        "-W", "--min-words-exact",
        metavar=_("@info command line value placeholder", "NUMBER"),
        action="store", dest="min_words_exact", default=def_minwnex,
        help=_("@info command line option description",
               "When using compendium, in case of exact match, "
               "minimum number of words that original text must have "
               "to accept translation without making it fuzzy. "
               "Zero means to always accept an exact match."))
    opars.add_option(
        "-x", "--fuzzy-exact",
        action="store_true", dest="fuzzy_exact", default=def_fuzzex,
        help=_("@info command line option description",
               "When using compendium, make all exact matches fuzzy."))
    add_cmdopt_wrapping(opars)
    add_cmdopt_filesfrom(opars)

    (op, fargs) = opars.parse_args()

    if len(fargs) < 1 and not op.files_from:
        error(_("@info", "No input files given."))

    # Could use some speedup.
    try:
        import psyco
        psyco.full()
    except ImportError:
        pass

    # Convert non-string options to needed types.
    try:
        op.min_words_exact = int(op.min_words_exact)
    except:
        error(_("@info",
                "Value to option %(opt)s must be an integer number, "
                "given '%(val)s' instead.",
                opt="--min-words-exact", val=op.min_words_exact))
    try:
        op.min_adjsim_fuzzy = float(op.min_adjsim_fuzzy)
    except:
        error(_("@info",
                "Value to option %(opt)s must be a real number, "
                "given '%(val)s' instead.",
-                opt="--min-adjsim-fuzzy", val=op.min_ajdsim_fuzzy))
+                opt="--min-adjsim-fuzzy", val=op.min_adjsim_fuzzy))

    # Assemble list of files.
fnames = collect_paths_cmdline(rawpaths=fargs, filesfrom=op.files_from, respathf=collect_catalogs, abort=True) # Self-merge all catalogs. for fname in fnames: if op.verbose: report(_("@info:progress", "Self-merging: %(file)s", file=fname)) self_merge_pofile(fname, op.compendiums, op.fuzzy_exact, op.min_words_exact, op.min_adjsim_fuzzy, op.rebase_fuzzies, cfgsec, op) def self_merge_pofile (catpath, compendiums=[], fuzzex=False, minwnex=0, minasfz=0.0, refuzzy=False, cfgsec=None, cmlopt=None): # Create temporary files for merging. ext = ".tmp-selfmerge" catpath_mod = catpath + ext if ".po" in catpath: potpath = catpath.replace(".po", ".pot") + ext else: potpath = catpath + ".pot" + ext shutil.copyfile(catpath, catpath_mod) shutil.copyfile(catpath, potpath) # Open catalog for pre-processing. cat = Catalog(potpath, monitored=False) # Decide wrapping policy. wrapping = select_field_wrapping(cfgsec, cat, cmlopt) # From the dummy template, clean all active messages and # remove all obsolete messages. for msg in cat: if msg.obsolete: cat.remove_on_sync(msg) else: msg.clear() cat.sync() # Merge with dummy template. merge_pofile(catpath_mod, potpath, update=True, wrapping=wrapping, cmppaths=compendiums, fuzzex=fuzzex, minwnex=minwnex, minasfz=minasfz, refuzzy=refuzzy, abort=True) # Overwrite original with temporary catalog. shutil.move(catpath_mod, catpath) os.unlink(potpath) if __name__ == '__main__': exit_on_exception(main) diff --git a/scripts/posieve.py b/scripts/posieve.py index 084ac5e5..0bff5b6b 100755 --- a/scripts/posieve.py +++ b/scripts/posieve.py @@ -1,616 +1,616 @@ #!/usr/bin/env python # -*- coding: UTF-8 -*- """ Sieve messages in collections of PO files. Reference documentation in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ try: import fallback_import_paths except: pass import glob import imp import locale import os import re import sys from pology import datadir, version, _, n_, t_ from pology.catalog import Catalog, CatalogSyntaxError from pology.colors import ColorOptionParser, set_coloring_globals import pology.config as pology_config from pology.escape import escape_sh from pology.fsops import str_to_unicode, unicode_to_str from pology.fsops import collect_catalogs, collect_system from pology.fsops import build_path_selector, collect_paths_from_file from pology.fsops import collect_paths_cmdline from pology.fsops import exit_on_exception from pology.msgreport import report_on_msg, warning_on_msg, error_on_msg from pology.report import error, warning, report, encwrite from pology.report import init_file_progress from pology.report import list_options from pology.report import format_item_list from pology.stdcmdopt import add_cmdopt_filesfrom, add_cmdopt_incexc from pology.stdcmdopt import add_cmdopt_colors from pology.subcmd import ParamParser from pology.sieve import SieveMessageError, SieveCatalogError def main (): locale.setlocale(locale.LC_ALL, "") # Get defaults for command line options from global config. cfgsec = pology_config.section("posieve") def_do_skip = cfgsec.boolean("skip-on-error", True) def_msgfmt_check = cfgsec.boolean("msgfmt-check", False) def_skip_obsolete = cfgsec.boolean("skip-obsolete", False) # Setup options and parse the command line. usage = _("@info command usage", "%(cmd)s [OPTIONS] SIEVE [POPATHS...]", cmd="%prog") desc = _("@info command description", "Apply sieves to PO paths, which may be either single PO files or " "directories to search recursively for PO files. 
" "Some of the sieves only examine PO files, while others " "modify them as well. " "The first non-option argument is the sieve name; " "a list of several comma-separated sieves can be given too.") ver = _("@info command version", - u"%(cmd)s (Pology) %(version)s\n" - u"Copyright © 2007, 2008, 2009, 2010 " - u"Chusslove Illich (Часлав Илић) <%(email)s>", + "%(cmd)s (Pology) %(version)s\n" + "Copyright © 2007, 2008, 2009, 2010 " + "Chusslove Illich (Часлав Илић) <%(email)s>", cmd="%prog", version=version(), email="caslav.ilic@gmx.net") opars = ColorOptionParser(usage=usage, description=desc, version=ver) opars.add_option( "-a", "--announce-entry", action="store_true", dest="announce_entry", default=False, help=_("@info command line option description", "Announce that header or message is just about to be sieved.")) opars.add_option( "-b", "--skip-obsolete", action="store_true", dest="skip_obsolete", default=def_skip_obsolete, help=_("@info command line option description", "Do not sieve obsolete messages.")) opars.add_option( "-c", "--msgfmt-check", action="store_true", dest="msgfmt_check", default=def_msgfmt_check, help=_("@info command line option description", "Check catalogs by %(cmd)s and skip those which do not pass.", cmd="msgfmt -c")) opars.add_option( "-u", "--single-entry", metavar=_("@info command line value placeholder", "ENTRY_NUMBER"), action="store", dest="single_entry", default=0, help=_("@info command line option description", "Only perform the check on this ENTRY_NUMBER.")) opars.add_option( "--force-sync", action="store_true", dest="force_sync", default=False, help=_("@info command line option description", "Force rewriting of all messages, whether modified or not.")) opars.add_option( "-H", "--help-sieves", action="store_true", dest="help_sieves", default=False, help=_("@info command line option description", "Show help for applied sieves.")) opars.add_option( "--issued-params", action="store_true", dest="issued_params", default=False, help=_("@info command line option description", "Show all issued sieve parameters " "(from command line and user configuration).")) opars.add_option( "-l", "--list-sieves", action="store_true", dest="list_sieves", default=False, help=_("@info command line option description", "List available internal sieves.")) opars.add_option( "--list-options", action="store_true", dest="list_options", default=False, help=_("@info command line option description", "List the names of available options.")) opars.add_option( "--list-sieve-names", action="store_true", dest="list_sieve_names", default=False, help=_("@info command line option description", "List the names of available internal sieves.")) opars.add_option( "--list-sieve-params", action="store_true", dest="list_sieve_params", default=False, help=_("@info command line option description", "List the parameters known to issued sieves.")) opars.add_option( "-m", "--output-modified", metavar=_("@info command line value placeholder", "FILE"), action="store", dest="output_modified", default=None, help=_("@info command line option description", "Output names of modified files into FILE.")) opars.add_option( "--no-skip", action="store_false", dest="do_skip", default=def_do_skip, help=_("@info command line option description", "Do not try to skip catalogs which signal errors.")) opars.add_option( "--no-sync", action="store_false", dest="do_sync", default=True, help=_("@info command line option description", "Do not write any modifications to catalogs.")) opars.add_option( "-q", "--quiet", 
action="store_true", dest="quiet", default=False, help=_("@info command line option description", "Do not display any progress info " "(does not influence sieves themselves).")) opars.add_option( "-s", metavar=_("@info command line value placeholder", "NAME[:VALUE]"), action="append", dest="sieve_params", default=[], help=_("@info command line option description", "Pass a parameter to sieves.")) opars.add_option( "-S", metavar=_("@info command line value placeholder", "NAME[:VALUE]"), action="append", dest="sieve_no_params", default=[], help=_("@info command line option description", "Remove a parameter to sieves " "(e.g. if it was issued through user configuration).")) opars.add_option( "-v", "--verbose", action="store_true", dest="verbose", default=False, help=_("@info command line option description", "Output more detailed progress information.")) add_cmdopt_filesfrom(opars) add_cmdopt_incexc(opars) add_cmdopt_colors(opars) (op, free_args) = opars.parse_args(str_to_unicode(sys.argv[1:])) if op.list_options: report(list_options(opars)) sys.exit(0) if len(free_args) < 1 and not (op.list_sieves or op.list_sieve_names): error(_("@info", "No sieve to apply given.")) op.raw_sieves = [] op.raw_paths = [] if len(free_args) > 2 and op.single_entry != 0: error(_("@info", "With single entry mode, you can only give one input file.")) if len(free_args) >= 1: op.raw_sieves = free_args[0] op.raw_paths = free_args[1:] # Could use some speedup. try: import psyco psyco.full() except ImportError: pass set_coloring_globals(ctype=op.coloring_type, outdep=(not op.raw_colors)) # Dummy-set all internal sieves as requested if sieve listing required. sieves_requested = [] if op.list_sieves or op.list_sieve_names: # Global sieves. modpaths = glob.glob(os.path.join(datadir(), "sieve", "[a-z]*.py")) modpaths.sort() for modpath in modpaths: sname = os.path.basename(modpath)[:-3] # minus .py sname = sname.replace("_", "-") sieves_requested.append(sname) # Language-specific sieves. modpaths = glob.glob(os.path.join(datadir(), "lang", "*", "sieve", "[a-z]*.py")) modpaths.sort() for modpath in modpaths: sname = os.path.basename(modpath)[:-3] # minus .py sname = sname.replace("_", "-") lang = os.path.basename(os.path.dirname(os.path.dirname(modpath))) sieves_requested.append(lang + ":" + sname) # No need to load and setup sieves if only listing sieve names requested. if op.list_sieve_names: report("\n".join(sieves_requested)) sys.exit(0) # Load sieve modules from supplied names in the command line. if not sieves_requested: sieves_requested = op.raw_sieves.split(",") sieve_modules = [] for sieve_name in sieves_requested: # Resolve sieve file. if not sieve_name.endswith(".py"): # One of internal sieves. if ":" in sieve_name: # Language-specific internal sieve. lang, name = sieve_name.split(":") sieve_path_base = os.path.join("lang", lang, "sieve", name) else: sieve_path_base = os.path.join("sieve", sieve_name) sieve_path_base = sieve_path_base.replace("-", "_") + ".py" sieve_path = os.path.join(datadir(), sieve_path_base) else: # Sieve name is its path. sieve_path = sieve_name try: sieve_file = open(unicode_to_str(sieve_path)) # ...unicode_to_str because of exec below. except IOError: error(_("@info", "Cannot load sieve '%(file)s'.", file=sieve_path)) # Load file into new module. 
        sieve_mod_name = "sieve_" + str(len(sieve_modules))
        sieve_mod = imp.new_module(sieve_mod_name)
-        exec sieve_file in sieve_mod.__dict__
+        # exec() in Python 3 takes a string or code object, not an open file;
+        # read the file and execute its contents.
+        exec(sieve_file.read(), sieve_mod.__dict__)
        sieve_file.close()
        sys.modules[sieve_mod_name] = sieve_mod # to avoid garbage collection
        sieve_modules.append((sieve_name, sieve_mod))
        if not hasattr(sieve_mod, "Sieve"):
            error(_("@info",
                    "Module '%(file)s' does not define %(classname)s class.",
                    file=sieve_path, classname="Sieve"))

    # Setup sieves (description, known parameters...)
    pp = ParamParser()
    snames = []
    for name, mod in sieve_modules:
        scview = pp.add_subcmd(name)
        if hasattr(mod, "setup_sieve"):
            mod.setup_sieve(scview)
        snames.append(name)

    # If info on sieves requested, report and exit.
    if op.list_sieves:
        report(_("@info", "Available internal sieves:"))
        report(pp.listcmd(snames))
        sys.exit(0)
    elif op.list_sieve_params:
        params = set()
        for scview in pp.cmdviews():
            params.update(scview.params(addcol=True))
        report("\n".join(sorted(params)))
        sys.exit(0)
    elif op.help_sieves:
        report(_("@info", "Help for sieves:"))
        report("")
        report(pp.help(snames))
        sys.exit(0)

    # Prepare sieve parameters for parsing.
    sieve_params = list(op.sieve_params)
    # - append parameters according to configuration
    sieve_params.extend(read_config_params(pp.cmdviews(), sieve_params))
    # - remove parameters according to command line
    if op.sieve_no_params:
        sieve_params_mod = []
        for parspec in sieve_params:
            if parspec.split(":", 1)[0] not in op.sieve_no_params:
                sieve_params_mod.append(parspec)
        sieve_params = sieve_params_mod

    # If assembly of issued parameters requested, report and exit.
    if op.issued_params:
        escparams = []
        for parspec in sieve_params:
            if ":" in parspec:
                param, value = parspec.split(":", 1)
                escparam = "%s:%s" % (param, escape_sh(value))
            else:
                escparam = parspec
            escparams.append(escparam)
        fmtparams = " ".join(["-s%s" % x for x in sorted(escparams)])
        if fmtparams:
            report(fmtparams)
        sys.exit(0)

    # Parse sieve parameters.
    sparams, nacc_params = pp.parse(sieve_params, snames)
    if nacc_params:
        error(_("@info",
                "Parameters not accepted by any of issued subcommands: "
                "%(paramlist)s.",
                paramlist=format_item_list(nacc_params)))

    # ========================================
    # FIXME: Think of something less ugly.
    # Add as special parameter to each sieve:
    # - root paths from which the catalogs are collected
    # - whether destination independent coloring is in effect
    # - test function for catalog selection
    root_paths = []
    if op.raw_paths:
        root_paths.extend(op.raw_paths)
    if op.files_from:
        for ffpath in op.files_from:
            root_paths.extend(collect_paths_from_file(ffpath))
    if not op.raw_paths and not op.files_from:
        root_paths = ["."]
    is_cat_included = build_path_selector(incnames=op.include_names,
                                          incpaths=op.include_paths,
                                          excnames=op.exclude_names,
                                          excpaths=op.exclude_paths)
-    for p in sparams.values():
+    for p in list(sparams.values()):
        p.root_paths = root_paths
        p.raw_colors = op.raw_colors
        p.is_cat_included = is_cat_included
    # ========================================

    # Create sieves.
    sieves = []
    for name, mod in sieve_modules:
        sieves.append(mod.Sieve(sparams[name]))

    # Get the message monitoring indicator from the sieves.
    # Monitor unless all sieves have requested otherwise.
    use_monitored = False
    for sieve in sieves:
        if getattr(sieve, "caller_monitored", True):
            use_monitored = True
            break
    if op.verbose and not use_monitored:
        report(_("@info:progress", "--> Not monitoring messages."))

    # Get the sync indicator from the sieves.
# Sync unless all sieves have requested otherwise, # and unless syncing is disabled globally in command line. do_sync = False for sieve in sieves: if getattr(sieve, "caller_sync", True): do_sync = True break if not op.do_sync: do_sync = False if op.verbose and not do_sync: report(_("@info:progress", "--> Not syncing after sieving.")) # Open in header-only mode if no sieve has message processor. # Categorize sieves by the presence of message/header processors. use_headonly = True header_sieves = [] header_sieves_last = [] message_sieves = [] for sieve in sieves: if hasattr(sieve, "process"): use_headonly = False message_sieves.append(sieve) if hasattr(sieve, "process_header"): header_sieves.append(sieve) if hasattr(sieve, "process_header_last"): header_sieves_last.append(sieve) if op.verbose and use_headonly: report(_("@info:progress", "--> Opening catalogs in header-only mode.")) # Collect catalog paths. fnames = collect_paths_cmdline(rawpaths=op.raw_paths, incnames=op.include_names, incpaths=op.include_paths, excnames=op.exclude_names, excpaths=op.exclude_paths, filesfrom=op.files_from, elsecwd=True, respathf=collect_catalogs, abort=True) if op.do_skip: errwarn = warning errwarn_on_msg = warning_on_msg else: errwarn = error errwarn_on_msg = error_on_msg # Prepare inline progress indicator. if not op.quiet: update_progress = init_file_progress(fnames, addfmt=t_("@info:progress", "Sieving: %(file)s")) # Sieve catalogs. modified_files = [] for fname in fnames: if op.verbose: report(_("@info:progress", "Sieving %(file)s...", file=fname)) elif not op.quiet: update_progress(fname) if op.msgfmt_check: d1, oerr, ret = collect_system(["msgfmt", "-o", "/dev/null", "-c", fname]) if ret != 0: oerr = oerr.strip() errwarn(_("@info:progress", "%(file)s: %(cmd)s check failed:\n" "%(msg)s", file=fname, cmd="msgfmt -c", msg=oerr)) warning(_("@info:progress", "Skipping catalog due to syntax check failure.")) continue try: cat = Catalog(fname, monitored=use_monitored, headonly=use_headonly, single_entry=int(op.single_entry)) - except CatalogSyntaxError, e: + except CatalogSyntaxError as e: errwarn(_("@info:progress", "%(file)s: Parsing failed: %(msg)s", file=fname, msg=e)) warning(_("@info:progress", "Skipping catalog due to parsing failure.")) continue skip = False # First run all header sieves. if header_sieves and op.announce_entry: report(_("@info:progress", "Sieving header of %(file)s...", file=fname)) for sieve in header_sieves: try: ret = sieve.process_header(cat.header, cat) - except SieveCatalogError, e: + except SieveCatalogError as e: errwarn(_("@info:progress", "%(file)s:header: Sieving failed: %(msg)s", file=fname, msg=e)) skip = True break if ret not in (None, 0): break if skip: warning(_("@info:progress", "Skipping catalog due to header sieving failure.")) continue # Then run all message sieves on each message, # unless processing only the header. 
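        # (Illustration of the sieve interface driven below; a hypothetical
        # counting sieve, in the same shape as the ones under sieve/:
        #
        #   class Sieve (object):
        #       def __init__ (self, params):
        #           self.count = 0
        #       def process (self, msg, cat):    # called once per message here
        #           if msg.translated:
        #               self.count += 1
        #       def finalize (self):             # called after all catalogs
        #           report("Translated: %d" % self.count)
        #
        # The process_header/process_header_last hooks are optional,
        # as the hasattr() checks above show.)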
        if not use_headonly:
            for msg in cat:
                if op.skip_obsolete and msg.obsolete:
                    continue
                if not op.quiet:
                    update_progress(fname)
                if op.announce_entry:
                    report(_("@info:progress",
                             "Sieving %(file)s:%(line)d(#%(entry)d)...",
                             file=fname, line=msg.refline, entry=msg.refentry))
                for sieve in message_sieves:
                    try:
                        ret = sieve.process(msg, cat)
-                    except SieveMessageError, e:
+                    except SieveMessageError as e:
                        errwarn_on_msg(_("@info:progress",
                                         "Sieving failed: %(msg)s",
                                         msg=e), msg, cat)
                        break
-                    except SieveCatalogError, e:
+                    except SieveCatalogError as e:
                        errwarn_on_msg(_("@info:progress",
                                         "Sieving failed: %(msg)s",
                                         msg=e), msg, cat)
                        skip = True
                        break
                    if ret not in (None, 0):
                        break
                if skip:
                    break
            if skip:
                warning(_("@info:progress",
                          "Skipping catalog due to message sieving failure."))
                continue

        # Finally run all header-last sieves.
        if header_sieves_last and op.announce_entry:
            report(_("@info:progress",
                     "Sieving header (after messages) in %(file)s...",
                     file=fname))
        for sieve in header_sieves_last:
            try:
                ret = sieve.process_header_last(cat.header, cat)
-            except SieveCatalogError, e:
+            except SieveCatalogError as e:
                errwarn(_("@info:progress",
                          "%(file)s:header: Sieving (after messages) "
                          "failed: %(msg)s",
                          file=fname, msg=e))
                skip = True
                break
            if ret not in (None, 0):
                break
        if skip:
            warning(_("@info:progress",
                      "Skipping catalog due to header sieving "
                      "(after messages) failure."))
            continue

        if do_sync and cat.sync(op.force_sync):
            if op.verbose:
                report(_("@info:progress leading ! is a shorthand "
                         "state indicator",
                         "! (MODIFIED) %(file)s",
                         file=fname))
            elif not op.quiet:
                report(_("@info:progress leading ! is a shorthand "
                         "state indicator",
                         "! %(file)s",
                         file=fname))
            modified_files.append(fname)

    if not op.quiet:
        update_progress() # clear last progress line, if any

    for sieve in sieves:
        if hasattr(sieve, "finalize"):
            try:
                sieve.finalize()
-            except SieveCatalogError, e:
+            except SieveCatalogError as e:
                warning(_("@info:progress",
                          "Finalization failed: %(msg)s",
                          msg=e))

    if op.output_modified:
        ofh = open(op.output_modified, "w")
        ofh.write("\n".join(modified_files) + "\n")
-        ofh.close
+        ofh.close()


def read_config_params (scviews, cmdline_parspecs):

    # Collect parameters defined in the config.
    cfgsec = pology_config.section("posieve")
    pref = "param-"
    config_params = []
    for field in cfgsec.fields():
        if field.startswith(pref):
            parspec = field[len(pref):]
            only_sieves = None
            inverted = False
            if "/" in parspec:
                param, svspec = parspec.split("/", 1)
                if svspec.startswith("~"):
                    inverted = True
                    svspec = svspec[1:]
                only_sieves = set(svspec.split(","))
            else:
                param = parspec
            if "." in param:
                param, d1 = param.split(".", 1)
            config_params.append((field, param, only_sieves, inverted))

    if not config_params:
        return []

    # Collect parameters known to issued sieves and issued in command line.
    sieves = set([x.name() for x in scviews])
    acc_raw_params = set(sum([x.params(addcol=True) for x in scviews], []))
    acc_params = set([x.rstrip(":") for x in acc_raw_params])
    acc_flag_params = set([x for x in acc_raw_params if not x.endswith(":")])
    cmd_params = set([x.split(":", 1)[0] for x in cmdline_parspecs])

    # Select parameters based on issued sieves.
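    # (Illustration, hypothetical fields in the forms parsed above, within
    # the [posieve] configuration section:
    #   param-lang = fr                      issued to any sieve accepting it
    #   param-accel/stats,check-rules = &    only for the listed sieves
    #   param-accel/~stats = &               all issued sieves except 'stats'
    #   param-lang.2 = de                    '.N' suffix distinguishes duplicates
    # A parameter already given with -s on the command line is not re-issued
    # from the configuration.)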
sel_params = [] for field, param, only_sieves, inverted in config_params: if param in acc_params and param not in cmd_params: if only_sieves is not None: overlap = bool(sieves.intersection(only_sieves)) add_param = overlap if not inverted else not overlap else: add_param = True if add_param: if param in acc_flag_params: if cfgsec.boolean(field): sel_params.append(param) else: sel_params.append("%s:%s" % (param, cfgsec.string(field))) return sel_params if __name__ == '__main__': exit_on_exception(main) diff --git a/scripts/posummit.py b/scripts/posummit.py index e711eb81..83fcc5d5 100755 --- a/scripts/posummit.py +++ b/scripts/posummit.py @@ -1,2930 +1,2930 @@ #!/usr/bin/env python # -*- coding: UTF-8 -*- import copy from difflib import SequenceMatcher import filecmp import hashlib import imp import locale import os import re import shutil import sys import time +from functools import reduce try: import fallback_import_paths except: pass from pology import version, _, n_, t_, PologyError from pology.ascript import collect_ascription_associations from pology.ascript import collect_ascription_history from pology.ascript import make_ascription_selector from pology.catalog import Catalog from pology.header import Header, format_datetime from pology.message import Message, MessageUnsafe from pology.colors import ColorOptionParser from pology.fsops import str_to_unicode, unicode_to_str from pology.fsops import mkdirpath, assert_system, collect_system from pology.fsops import getucwd, join_ncwd from pology.fsops import collect_paths_cmdline, build_path_selector from pology.fsops import exit_on_exception from pology.merge import merge_pofile from pology.monitored import Monpair, Monlist from pology.msgreport import report_on_msg from pology.report import report, error, warning, format_item_list from pology.report import init_file_progress from pology.stdcmdopt import add_cmdopt_incexc, add_cmdopt_filesfrom from pology.vcs import make_vcs from pology.wrap import select_field_wrapping SUMMIT_ID = "+" # must not start with word-character (\w) def main (): locale.setlocale(locale.LC_ALL, "") # Setup options and parse the command line. 
usage = _("@info command usage", "\n" " %(cmd)s [OPTIONS] CFGFILE LANG OPMODE [PARTIAL...]\n" " (if there is no '%(cfgfile)s' file in a parent directory)\n" " %(cmd)s [OPTIONS] OPMODE [PARTIAL...]\n" " (if there is a '%(cfgfile)s' file in a parent directory)", cmd="%prog", cfgfile="summit-config") desc = _("@info command description", "Translate PO files spread across different branches " "in a unified fashion.") ver = _("@info command version", - u"%(cmd)s (Pology) %(version)s\n" - u"Copyright © 2007, 2008, 2009, 2010 " - u"Chusslove Illich (Часлав Илић) <%(email)s>", + "%(cmd)s (Pology) %(version)s\n" + "Copyright © 2007, 2008, 2009, 2010 " + "Chusslove Illich (Часлав Илић) <%(email)s>", cmd="%prog", version=version(), email="caslav.ilic@gmx.net") opars = ColorOptionParser(usage=usage, description=desc, version=ver) opars.add_option( "-a", "--asc-filter", action="store", dest="asc_filter", default=None, help=_("@info command line option description", "Apply a non-default ascription filter on scatter.")) opars.add_option( "--create", action="store_true", dest="create", default=False, help=_("@info command line option description", "Allow creation of new summit catalogs.")) opars.add_option( "--force", action="store_true", dest="force", default=False, help=_("@info command line option description", "Force some operations that are normally not advised.")) opars.add_option( "-q", "--quiet", action="store_true", dest="quiet", default=False, help=_("@info command line option description", "Output less detailed progress info.")) opars.add_option( "-v", "--verbose", action="store_true", dest="verbose", default=False, help=_("@info command line option description", "Output more detailed progress info")) add_cmdopt_filesfrom(opars) add_cmdopt_incexc(opars) options, free_args = opars.parse_args(str_to_unicode(sys.argv[1:])) # Look for the config file through parent directories. parent = getucwd() cfgpath = None while True: for cfgname in ("summit-config",): cfgpath1 = os.path.join(parent, cfgname) if os.path.isfile(cfgpath1): cfgpath = cfgpath1 break if cfgpath: break pparent = parent parent = os.path.dirname(parent) if parent == pparent: break # If config file not found, expect it and language as arguments. if not cfgpath: if len(free_args) < 1: error(_("@info", "Summit configuration file neither found " "as '%(cfgfile)s' in parent directories, " "nor given in command line.", cfgfile="summit-config")) cfgpath = free_args.pop(0) if not os.path.isfile(cfgpath): error(_("@info", "Summit configuration file '%(file)s' does not exist.", file=cfgpath)) if len(free_args) < 1: error(_("@info", "Language code not given.")) lang = free_args.pop(0) else: lang = None # ...will be read from config file. if len(free_args) < 1: error(_("@info", "Operation mode not given.")) opmodes = free_args.pop(0).split(",") opmodes_uniq = [] for opmode in opmodes: if opmode not in opmodes_uniq: if opmode not in ("gather", "scatter", "merge", "deps"): error(_("@info", "Unknown operation mode '%(mode)s'.", mode=opmode)) opmodes_uniq.append(opmode) opmodes = opmodes_uniq # Could use some speedup. try: import psyco psyco.full() except ImportError: pass # Read project definition. project = Project(lang, opmodes, options) project.include(cfgpath) # In summit-over-templates mode, determine if templates are dynamic. project.templates_dynamic = ( project.over_templates and not project.summit.get("topdir_templates")) # If config file was found in parent directories, # it should have defined the language itself. 
# Otherwise, its language is set to language given in command line. if not lang: if not project.lang: error(_("@info", "Language code not set in configuration file.")) lang = project.lang else: project.lang = lang # In summit-over-templates mode, derive special project data # for implicitly gathering templates on merge. if project.templates_dynamic and "merge" in project.opmodes: project.toptions = copy.copy(options) project.toptions.quiet = True project.tproject = Project(project.templates_lang, ["gather"], project.toptions) project.tproject.include(cfgpath) project.tproject.templates_dynamic = False project.tproject.summit_version_control = "none" project.tproject.summit_wrap = False # performance project.tproject.summit_fine_wrap = False # performance tpd = project.tproject.summit.get("topdir_templates") if tpd is None: # FIXME: Portability. tpd = "/tmp/summit-templates-%d" % os.getpid() project.tproject.summit["topdir"] = tpd for tb in project.tproject.branches: tbpd = tb.get("topdir_templates") if tbpd is not None: tb["topdir"] = tbpd project.tproject.lang = project.templates_lang project.tproject = derive_project_data(project.tproject, project.toptions, project.summit["topdir"]) project.summit["topdir_templates"] = tpd # Explicit gathering in summit-over-templates mode # may be useful to check if gathering works. # Make some adjustments for this to go smoothly. if ( project.templates_dynamic and "gather" in project.opmodes and project.lang == project.templates_lang ): options.create = True project.summit["topdir"] = project.summit["topdir_templates"] project.summit_version_control = "none" # Derive project data. project = derive_project_data(project, options) # Collect partial processing specs and inclusion-exclusion test. specargs, ffself = collect_paths_cmdline(rawpaths=free_args, filesfrom=options.files_from, getsel=True, abort=True) options.partspecs, options.partbids = collect_partspecs(project, specargs) if not options.files_from: # If there was no from-file input and no partial processing specs # were collected, indicate operation on the whole summit. if not options.partspecs: options.partspecs = None if not options.partbids: options.partbids = None cmdself = build_path_selector(incnames=options.include_names, incpaths=options.include_paths, excnames=options.exclude_names, excpaths=options.exclude_paths) options.selcatf = lambda x: cmdself(x) and ffself(x) # Invoke the appropriate operations on collected bundles. 
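    # (For example, an OPMODE argument of "merge,scatter" issues both modes;
    # duplicates were dropped above, and the modes now run in the given order.)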
    for opmode in opmodes:
        if options.verbose:
            report(_("@info:progress",
                     "-----> Processing mode: %(mode)s", mode=opmode))
        if opmode == "gather":
            summit_gather(project, options)
        elif opmode == "scatter":
            summit_scatter(project, options)
        elif opmode == "merge":
            summit_merge(project, options)
        elif opmode == "deps":
            summit_deps(project, options)


class Project (object):

    def __init__ (self, lang, opmodes, options):

        self.__dict__.update({
            "lang" : lang,
            "opmodes" : opmodes,
            "options" : options,
            "summit" : "",
            "branches" : [],
            "mappings" : [],
            "subdir_mappings" : [],
            "subdir_precedence" : [],
            "over_templates" : False,
            "templates_lang" : "templates",
            "summit_wrap" : False,
            "summit_fine_wrap" : True,
            "summit_fuzzy_merging" : True,
            "branches_wrap" : True,
            "branches_fine_wrap" : True,
            "branches_fuzzy_merging" : True,
            "version_control" : "",
            "summit_version_control" : "",
            "branches_version_control" : "",
            "hook_on_scatter_msgstr" : [],
            "hook_on_scatter_msg" : [],
            "hook_on_scatter_cat" : [],
            "hook_on_scatter_file" : [],
            "hook_on_scatter_branch": [],
            "hook_on_gather_msg" : [],
            "hook_on_gather_msg_branch" : [],
            "hook_on_gather_cat" : [],
            "hook_on_gather_cat_branch" : [],
            "hook_on_gather_file" : [],
            "hook_on_gather_file_branch" : [],
            "hook_on_merge_msg" : [],
            "hook_on_merge_head" : [],
            "hook_on_merge_cat" : [],
            "hook_on_merge_file" : [],
            "header_propagate_fields" : [],
            "header_skip_fields_on_scatter" : [],
            "vivify_on_merge" : False,
            "vivify_w_translator" : "Simulacrum",
            "vivify_w_langteam" : "Nevernessian",
            "vivify_w_language" : "",
            "vivify_w_charset" : "UTF-8",
            "vivify_w_plurals" : "",
            "compendium_on_merge" : "",
            "compendium_fuzzy_exact" : False,
            "compendium_min_words_exact" : 0,
            "merge_min_adjsim_fuzzy" : 0.0,
            "merge_rebase_fuzzy" : False,
            "scatter_min_completeness" : 0.0,
            "scatter_acc_completeness" : 0.0,
            "ascription_filters" : [],
            "ascription_history_filter" : None,
        })
        self.__dict__["locked"] = False

        self.inclusion_trail = []

    def __setattr__ (self, att, val):

        # TODO: Do extensive checks.
        if self.locked and att not in self.__dict__:
            error(_("@info",
                    "Unknown summit configuration field '%(field)s'.",
                    field=att))
        self.__dict__[att] = val

    def relpath (self, path):

        rootdir = os.path.dirname(self.inclusion_trail[-1])
        if not os.path.isabs(path):
            path = join_ncwd(rootdir, path)

        return path

    # FIXME: Temporary for backward compatibility, remove at some point.
    def resolve_path_rooted (self, path):

        return self.relpath(path)

    def include (self, path):

        path = os.path.abspath(path)
        if path in self.inclusion_trail:
            error(_("@info",
                    "Circular inclusion of '%(file)s' attempted "
                    "in summit configuration.",
                    file=path))
        self.inclusion_trail.append(path)
        self.locked = True
-        exec open(unicode_to_str(path)) in {"S" : self}
+        # exec() in Python 3 takes a string or code object, not an open file;
+        # read the configuration file and execute its contents.
+        exec(open(unicode_to_str(path)).read(), {"S" : self})
        self.locked = False
        self.inclusion_trail.pop()


def derive_project_data (project, options, nwgrefpath=None):

    p = project # shortcut

    # Create summit object from summit dictionary.
    class Summit: pass
    s = Summit()
    sd = p.summit
    s.id = SUMMIT_ID
    s.by_lang = False
    s.topdir = sd.pop("topdir", None)
    s.topdir_templates = sd.pop("topdir_templates", None)
    # Assert that there are no misnamed keys in the dictionary.
    if sd:
        error(_("@info",
                "Unknown keys in summit configuration: %(keylist)s.",
-                keylist=format_item_list(sd.keys())))
+                keylist=format_item_list(list(sd.keys()))))

    # Assert that all necessary fields in summit specification exist.
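    # (Illustration, hypothetical paths: a minimal summit configuration file,
    # as exec'd by Project.include() above, could read
    #
    #   S.lang = "ca"
    #   S.summit = dict(topdir="summit")
    #   S.branches = [
    #       dict(id="trunk", topdir="branches/trunk"),
    #       dict(id="stable", topdir="branches/stable"),
    #   ]
    #
    # Leaving out the summit "topdir" is exactly what the check below rejects.)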
if s.topdir is None: error(_("@info", "Top directory not set in summit configuration.")) s.split_path = None # needed only on some checks later p.summit = s # Create branch objects from branch dictionaries. class Branch: pass branches = [] for bd in p.branches: b = Branch() branches.append(b) b.id = bd.pop("id", None) b.topdir = bd.pop("topdir", None) b.topdir_templates = bd.pop("topdir_templates", None) # If operation is performed on templates and branch template directory # is defined, override plain branch directory with it. if p.lang == p.templates_lang and b.topdir_templates is not None: b.topdir = b.topdir_templates b.by_lang = bd.pop("by_lang", False) if b.by_lang and isinstance(b.by_lang, bool): b.by_lang = project.lang # If separate templates directory is not defined in by-language mode, # set it to same as catalogs directory. if b.by_lang and b.topdir_templates is None: b.topdir_templates = b.topdir b.scatter_create_filter = bd.pop("scatter_create_filter", None) b.skip_version_control = bd.pop("skip_version_control", False) # FIXME: merge_locally retained for backward compatibility, # replace at some point with b.merge = bd.pop("merge", False). b.merge = bd.pop("merge", None) if b.merge is None: b.merge = bd.pop("merge_locally", False) b.split_path, b.join_path = bd.pop("transform_path", (None, None)) b.insert_nosim = bd.pop("insert_nosim", False) # Assemble include-exclude functions. includes = bd.pop("includes", []) excludes = bd.pop("excludes", []) def regex_to_func (rxstr): try: rx = re.compile(rxstr, re.U) except: error(_("@info", "Invalid regular expression '%(regex)s' " "in include-exclude specification " "of branch '%(branch)s'.", branch=b.id, regex=rxstr)) return lambda x: bool(rx.search(x)) def chain_tests (tests): testfs = [] for test in tests: - if isinstance(test, basestring): + if isinstance(test, str): testfs.append(regex_to_func(test)) elif callable(test): testfs.append(test) else: error(_("@info", "Invalid test type '%(type)s' " "in include-exclude specification " "of branch '%(branch)s'.", branch=b.id, type=type(test))) return lambda x: reduce(lambda s, y: s or y(x), testfs, False) if includes: includef = chain_tests(includes) if excludes: excludef = chain_tests(excludes) if includes and excludes: b.ignored = lambda x: not includef(x) or excludef(x) elif includes: b.ignored = lambda x: not includef(x) elif excludes: b.ignored = lambda x: excludef(x) else: b.ignored = lambda x: False # Assert that there are no misnamed keys in the dictionary. if bd: error(_("@info", "Unknown keys in specification of branch '%(branch)s': " "%(keylist)s.", - branch=b.id, keylist=format_item_list(bd.keys()))) + branch=b.id, keylist=format_item_list(list(bd.keys())))) p.branches = branches # Assert that all necessary fields in branch specifications exist. p.branch_ids = [] for branch in p.branches: if branch.id is None: error(_("@info", "Branch with undefined ID.")) if branch.id in p.branch_ids: error(_("@info", "Non-unique branch ID '%(branch)s'.", branch=branch.id)) p.branch_ids.append(branch.id) if branch.topdir is None: error(_("@info", "Top directory not set for branch '%(branch)s'.", branch=branch.id)) # Dictionary of branches by branch id. p.bdict = dict([(x.id, x) for x in p.branches]) # Create version control operators if given. 
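    # (For instance, setting S.version_control in the configuration to a name
    # understood by make_vcs(), e.g. "svn", yields both operators below,
    # unless the summit- or branches-specific fields override it; the name is
    # passed lowercased.)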
p.summit_vcs = None p.branches_vcs = None if p.summit_version_control: p.summit_vcs = make_vcs(p.summit_version_control.lower()) if p.branches_version_control: p.branches_vcs = make_vcs(p.branches_version_control.lower()) if p.version_control: if p.summit_vcs is None: p.summit_vcs = make_vcs(p.version_control.lower()) if p.branches_vcs is None: p.branches_vcs = make_vcs(p.version_control.lower()) # Decide wrapping policies. class D: pass dummyopt = D() dummyopt.do_wrap = p.summit_wrap dummyopt.do_fine_wrap = p.summit_fine_wrap p.summit_wrapping = select_field_wrapping(cmlopt=dummyopt) dummyopt.do_wrap = p.branches_wrap dummyopt.do_fine_wrap = p.branches_fine_wrap p.branches_wrapping = select_field_wrapping(cmlopt=dummyopt) # Decide the extension of catalogs. if p.over_templates and p.lang == p.templates_lang: catext = ".pot" else: catext = ".po" # Collect catalogs from branches. p.catalogs = {} for b in p.branches: p.catalogs[b.id] = collect_catalogs(b.topdir, catext, b.by_lang, b.ignored, b.split_path, project, options) # ...and from the summit. p.catalogs[SUMMIT_ID] = collect_catalogs(p.summit.topdir, catext, None, None, None, project, options) if ( p.lang == p.templates_lang and "gather" in p.opmodes and nwgrefpath is not None ): # Also add summit templates which do not actually exist, # but are going to be created on gather without warnings, # by reflecting the catalogs found in the given path. refcats = collect_catalogs(nwgrefpath, ".po", None, None, None, project, options) - for name, spec in refcats.iteritems(): + for name, spec in refcats.items(): if name not in p.catalogs[SUMMIT_ID]: path, subdir = spec[0] # all summit catalogs unique tpath = join_ncwd(p.summit.topdir, subdir, name + ".pot") p.catalogs[SUMMIT_ID][name] = [(tpath, subdir)] # Resolve ascription filter. project.ascription_filter = None for afname, afspec in project.ascription_filters: if options.asc_filter is None or afname == options.asc_filter: - if isinstance(afspec, basestring): + if isinstance(afspec, str): afcall = make_ascription_selector([afspec]) elif isinstance(afspec, (tuple, list)): afcall = make_ascription_selector(afspec) elif callable(afspec): afcall = afspec else: error(_("@info", "Unknown type of definition for " "ascription filter '%(filt)s'.", filt=afname)) project.ascription_filter = afcall break if options.asc_filter is not None and project.ascription_filter is None: error(_("@info", "Summit configuration does not define " "ascription filter '%(filt)s'.", filt=options.asc_filter)) # Link summit and ascription catalogs. if project.ascription_filter: - tmp0 = [(x, y[0][0]) for x, y in p.catalogs[SUMMIT_ID].items()] + tmp0 = [(x, y[0][0]) for x, y in list(p.catalogs[SUMMIT_ID].items())] tmp1 = [x[0] for x in tmp0] tmp2 = collect_ascription_associations([x[1] for x in tmp0]) - tmp3 = zip([tmp2[0][0]] * len(tmp1), [x[1] for x in tmp2[0][1]]) - p.aconfs_acatpaths = dict(zip(tmp1, tmp3)) + tmp3 = list(zip([tmp2[0][0]] * len(tmp1), [x[1] for x in tmp2[0][1]])) + p.aconfs_acatpaths = dict(list(zip(tmp1, tmp3))) # Assure that summit catalogs are unique. - for name, spec in p.catalogs[SUMMIT_ID].items(): + for name, spec in list(p.catalogs[SUMMIT_ID].items()): if len(spec) > 1: fstr = "\n".join([x[0] for x in spec]) error(_("@info", "Non-unique summit catalog '%(name)s', found as:\n" "%(filelist)s", name=name, filelist=fstr)) # At scatter in summit-over-static-templates mode, add to the collection # of branch catalogs any that should be newly created. 
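    # (Illustration, hypothetical catalog names: a mapping entry has the form
    # (branch_id, branch_name, summit_name...), e.g.
    #   S.mappings = [
    #       ("stable", "konqueror", "konqueror", "libkonq"),
    #   ]
    # i.e. the branch catalog konqueror.po corresponds to the two summit
    # catalogs named konqueror and libkonq.)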
p.add_on_scatter = {} if ( p.over_templates and p.lang != p.templates_lang and "scatter" in p.opmodes): # Go through all mappings and collect branch names mapped to # summit catalogs per branch id and summit name, and vice versa. mapped_summit_names = {} mapped_branch_names = {} for mapping in p.mappings: branch_id = mapping[0] branch_name = mapping[1] summit_names = mapping[2:] if not branch_id in mapped_summit_names: mapped_summit_names[branch_id] = {} if not branch_id in mapped_branch_names: mapped_branch_names[branch_id] = {} for summit_name in summit_names: if not summit_name in mapped_summit_names[branch_id]: mapped_summit_names[branch_id][summit_name] = [] mapped_summit_names[branch_id][summit_name].append(branch_name) if not branch_name in mapped_branch_names[branch_id]: mapped_branch_names[branch_id][branch_name] = [] mapped_branch_names[branch_id][branch_name].append(summit_name) # Go through all branches. bt_cache = {} for branch in p.branches: # Skip this branch if no templates. if not branch.topdir_templates: continue # Collect all templates for this branch. branch_templates = bt_cache.get(branch.topdir_templates) if branch_templates is None: branch_templates = collect_catalogs(branch.topdir_templates, ".pot", branch.by_lang, branch.ignored, branch.split_path, project, options) bt_cache[branch.topdir_templates] = branch_templates # Go through all summit catalogs. for summit_name in p.catalogs[SUMMIT_ID]: # Collect names of any catalogs in this branch mapped to # the current summit catalog. branch_names = [] if ( branch.id in mapped_summit_names and summit_name in mapped_summit_names[branch.id]): branch_names = mapped_summit_names[branch.id][summit_name] # Unconditionally add summit name as one possible branch name, # since otherwise a mapped branch catalog could shadow # a direct branch catalog. branch_names.append(summit_name) # For each collected branch name, check if there are some # branch templates for which the corresponding branch path # does not exist and (in case of explicit mapping) whether # all summit catalogs needed for scattering are available. # If this is the case, set missing paths for scattering. for branch_name in branch_names: if ( branch_name in branch_templates - and all(map(lambda x: x in p.catalogs[SUMMIT_ID], - mapped_branch_names.get(branch.id, {}) - .get(branch_name, []))) + and all([x in p.catalogs[SUMMIT_ID] for x in mapped_branch_names.get(branch.id, {}) .get(branch_name, [])]) ): # Assemble all branch catalog entries. for template in branch_templates[branch_name]: # Compose the branch catalog subdir and path. subdir = template[1] if branch.join_path: subpath = branch.join_path(branch_name, subdir, branch.by_lang) elif branch.by_lang: subpath = os.path.join(subdir, branch_name, branch.by_lang + ".po") else: subpath = os.path.join(subdir, branch_name + ".po") path = join_ncwd(branch.topdir, subpath) # Skip this catalog if excluded from creation on # scatter, by filter on catalog name and subdir # (False -> excluded). scf = branch.scatter_create_filter if scf and not scf(branch_name, subdir): continue # If not there already, add this path # to branch catalog entry, # and record later initialization from template. brcats = p.catalogs[branch.id].get(branch_name) if brcats is None: brcats = [] p.catalogs[branch.id][branch_name] = brcats if (path, subdir) not in brcats: brcats.append((path, subdir)) p.add_on_scatter[path] = template[0] # In summit-over-dynamic-templates mode, # automatic vivification of summit catalogs must be active.
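# The two mapping tables built above are nested multimaps
# (branch id -> summit name -> branch names, and the inverse).
# With collections.defaultdict the same construction reads more directly;
# a sketch with hypothetical names:

from collections import defaultdict

def _build_mapping_views_sketch (mappings):
    # Each mapping is (branch_id, branch_name, summit_name_1, ...).
    summit_to_branch = defaultdict(lambda: defaultdict(list))
    branch_to_summit = defaultdict(lambda: defaultdict(list))
    for branch_id, branch_name, *summit_names in mappings:
        for summit_name in summit_names:
            summit_to_branch[branch_id][summit_name].append(branch_name)
            branch_to_summit[branch_id][branch_name].append(summit_name)
    return summit_to_branch, branch_to_summit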
if p.templates_dynamic: p.vivify_on_merge = True # At merge in summit-over-templates mode, # if automatic vivification of summit catalogs requested, # add to the collection of summit catalogs any that should be created. p.add_on_merge = {} if ( p.over_templates and p.lang != p.templates_lang and "merge" in p.opmodes and (p.vivify_on_merge or options.create) ): # Collect all summit templates. if not p.templates_dynamic: summit_templates = collect_catalogs(p.summit.topdir_templates, ".pot", None, None, None, project, options) else: summit_templates = p.tproject.catalogs[SUMMIT_ID] # Go through all summit templates, recording missing summit catalogs. - for name, spec in summit_templates.iteritems(): + for name, spec in summit_templates.items(): tpath, tsubdir = spec[0] # all summit catalogs unique if name not in p.catalogs[SUMMIT_ID]: # Compose the summit catalog path. spath = join_ncwd(p.summit.topdir, tsubdir, name + ".po") # Add this file to summit catalog entries. p.catalogs[SUMMIT_ID][name] = [(spath, tsubdir)] # Record later initialization from template. p.add_on_merge[spath] = tpath # Convenient dictionary views of mappings. # - direct: branch_id->branch_name->summit_name # - part inverse: branch_id->summit_name->branch_name # - full inverse: summit_name->branch_id->branch_name p.direct_map = {} p.part_inverse_map = {} p.full_inverse_map = {} # Initialize mappings by branch before the main loop for direct mappings, # because an explicit mapping may name a branch before it was processed # in the main loop. for branch_id in p.branch_ids: p.direct_map[branch_id] = {} for branch_name in p.catalogs[branch_id]: p.direct_map[branch_id][branch_name] = [] # Add direct mappings. # - explicit for mapping in p.mappings: branch_id, branch_name = mapping[:2] if ( "gather" in p.opmodes and ( branch_id not in p.catalogs or branch_name not in p.catalogs[branch_id]) ): warning(_("@info", "No branch catalog corresponding to mapping %(mapping)s " "set by the summit configuration.", mapping=("('%s', '%s', ...)" % (branch_id, branch_name)))) continue summit_names = mapping[2:] p.direct_map[branch_id][branch_name] = summit_names # - implicit for branch_id in p.branch_ids: for branch_name in p.catalogs[branch_id]: if p.direct_map[branch_id][branch_name] == []: p.direct_map[branch_id][branch_name].append(branch_name) # Convert subdir mappings into dictionary by branch ID and subdir. p.subdir_map = {} for bid, bsubdir, ssubdir in p.subdir_mappings: p.subdir_map[(bid, bsubdir)] = ssubdir # Collect missing summit catalogs. needed_additions = [] for branch_id in p.branch_ids: for branch_name in p.catalogs[branch_id]: summit_names = p.direct_map[branch_id][branch_name] for summit_name in summit_names: if summit_name not in p.catalogs[SUMMIT_ID]: # Compose the path for the missing summit catalog. # Default the subdir to that of the current branch, # as it is the primary branch for this catalog. # Or use explicit subdir mapping if given. branch_path, branch_subdir = \ p.catalogs[branch_id][branch_name][0] dmkey = (branch_id, branch_subdir) summit_subdir = p.subdir_map.get(dmkey) or branch_subdir summit_path = join_ncwd(p.summit.topdir, summit_subdir, summit_name + catext) if "gather" in p.opmodes: if options.create: # Add summit catalog into list of existing catalogs; # it will be created for real on gather. 
p.catalogs[SUMMIT_ID][summit_name] = [ (summit_path, summit_subdir)] else: needed_additions.append((branch_path, summit_path)) elif "scatter" in p.opmodes: needed_additions.append((branch_path, summit_path)) # Initialize inverse mappings. # - part inverse: for branch_id in p.branch_ids: p.part_inverse_map[branch_id] = {} for summit_name in p.catalogs[SUMMIT_ID]: p.part_inverse_map[branch_id][summit_name] = [] # - full inverse: for summit_name in p.catalogs[SUMMIT_ID]: p.full_inverse_map[summit_name] = {} for branch_id in p.branch_ids: p.full_inverse_map[summit_name][branch_id] = [] # Add existing inverse mappings. for branch_id in p.branch_ids: for branch_name in sorted(p.catalogs[branch_id]): for summit_name in p.direct_map[branch_id][branch_name]: if summit_name in p.full_inverse_map: # - part inverse: pinv = p.part_inverse_map[branch_id][summit_name] if branch_name not in pinv: pinv.append(branch_name) # - full inverse: finv = p.full_inverse_map[summit_name][branch_id] if branch_name not in finv: finv.append(branch_name) # Collect superfluous summit catalogs. needed_removals = [] for summit_name in p.catalogs[SUMMIT_ID]: src_branch_ids = [] for branch_id in project.branch_ids: if project.full_inverse_map[summit_name][branch_id]: src_branch_ids.append(branch_id) if not src_branch_ids: if "gather" in p.opmodes: if not options.create: summit_path = p.catalogs[SUMMIT_ID][summit_name][0][0] needed_removals.append(summit_path) # Create function to assign precedence to a subdirectory. p.subdir_precedence = [os.path.normpath(sd) for sd in p.subdir_precedence] def calc_subdir_precedence (subdir): for i, test_subdir in enumerate(p.subdir_precedence): ltsd = len(test_subdir) if ( subdir.startswith(test_subdir) and subdir[ltsd:ltsd + 1] in ("", os.path.sep) ): return i return len(p.subdir_precedence) p.calc_subdir_precedence = calc_subdir_precedence # Collect summit catalogs that should be moved. needed_moves = [] for summit_name in p.catalogs[SUMMIT_ID]: branch_subdirs = [] for branch_id in p.full_inverse_map[summit_name]: for branch_name in p.full_inverse_map[summit_name][branch_id]: branch_subdirs_1 = [] for bpath, bsubdir in p.catalogs[branch_id][branch_name]: dmkey = (branch_id, bsubdir) branch_subdirs_1.append(p.subdir_map.get(dmkey) or bsubdir) branch_subdirs.extend(branch_subdirs_1) if branch_subdirs: branch_subdirs = list(set(branch_subdirs)) - subdir_precs = map(p.calc_subdir_precedence, branch_subdirs) + subdir_precs = list(map(p.calc_subdir_precedence, branch_subdirs)) precs_subdirs = sorted(zip(subdir_precs, branch_subdirs)) branch_subdirs_sel = [sd for pr, sd in precs_subdirs if pr == precs_subdirs[0][0]] summit_subdir = p.catalogs[SUMMIT_ID][summit_name][0][1] if summit_subdir not in branch_subdirs_sel: summit_path = p.catalogs[SUMMIT_ID][summit_name][0][0] dpaths = [] for bsubdir in branch_subdirs_sel: dpath = join_ncwd(p.summit.topdir, bsubdir, summit_name + catext) dpaths.append(dpath) if "gather" in p.opmodes: if not options.create: needed_moves.append((summit_path, dpaths)) # If catalog creation is not allowed, # complain about needed additions, removals, and moves. 
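# calc_subdir_precedence above ranks a subdirectory by the first configured
# prefix it falls under, with the extra character check ensuring that
# "core" matches "core/libs" but not "corelibs". The same logic in
# isolation, over hypothetical data:

import os

def _subdir_precedence_sketch (subdir, precedence=("core", "extra")):
    for i, test_subdir in enumerate(precedence):
        ltsd = len(test_subdir)
        if (subdir.startswith(test_subdir)
            and subdir[ltsd:ltsd + 1] in ("", os.path.sep)):
            return i
    return len(precedence)

# sorted(["extra/ui", "misc", "core/libs"], key=_subdir_precedence_sketch)
# -> ["core/libs", "extra/ui", "misc"]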
if needed_additions or needed_removals or needed_moves: if needed_additions: fmtlist = "\n".join("%s --> %s" % x for x in sorted(needed_additions)) warning(_("@info", "Some branch catalogs have no " "associated summit catalog " "(expected summit path given):\n" "%(filelist)s", filelist=fmtlist)) if needed_removals: fmtlist = "\n".join(sorted(needed_removals)) warning(_("@info", "Some summit catalogs have no " "associated branch catalogs:\n" "%(filelist)s", filelist=fmtlist)) if needed_moves: fmtlist = "\n".join("%s --| %s" % (x, " | ".join(y)) for x, y in sorted(needed_moves)) warning(_("@info", "Some summit catalogs should be " "moved to another subdirectory:\n" "%(filelist)s", filelist=fmtlist)) if "gather" in p.opmodes: error(_("@info", "Halting because catalog creation is not allowed " "(consider issuing %(opt)s option).", opt="--create")) # Fill in defaults for missing fields in hook specs. for attr in p.__dict__: if attr.startswith("hook_"): p.__dict__[attr] = hook_fill_defaults(p.__dict__[attr]) return p def split_path_in_project (project, path): if os.path.isfile(path): if not path.endswith((".po", ".pot")): error(_("@info", "Non-PO file '%(file)s' given as catalog.", file=path)) splits = [] for b in [project.summit] + project.branches: broot = os.path.abspath(b.topdir) apath = os.path.abspath(path) if apath.startswith(broot + os.path.sep) or apath == broot: subpath = apath[len(broot + os.path.sep):] # Split the path into catalog name and subdirectory. if os.path.isfile(apath): if b.split_path: catname, subdir = b.split_path(subpath) else: subdir = os.path.dirname(subpath) basename = os.path.basename(subpath) catname = basename[:basename.rfind(".")] if b.by_lang: # If this is by-language mode, # catalog path can be split only if of proper language, # and subdirectory and catalog name should backtrack. if catname != b.by_lang: continue catname = os.path.basename(subdir) subdir = os.path.dirname(subdir) elif os.path.isdir(apath): if b.split_path: catname = None dummy_subpath = os.path.join(subpath, "__dummy__.po") subdir = b.split_path(dummy_subpath)[1] else: subdir = subpath catname = None if b.by_lang: # If this is a leaf directory in by-language mode, # then actually a catalog has been selected, # and subdirectory and catalog name should backtrack. apath2 = os.path.join(subdir, b.by_lang + ".po") if os.path.isfile(apath2): catname = os.path.basename(subdir) subdir = os.path.dirname(subdir) # Collect the splitting. # Catalog name being None means that a subdirectory is selected, # and if subdirectory too is None, the whole branch is selected. if not catname and not subdir: subdir = None splits.append((b.id, subdir, catname)) if not splits: error(_("@info", "Path '%(path)s' is not covered by the summit configuration.", path=path)) return splits def collect_partspecs (project, specargs): partbids = [] partspecs = {} for specarg in specargs: # If the partial specification is a valid path, # convert it to operation target. 
optargets = [] if os.path.exists(specarg): splits = split_path_in_project(project, specarg) for bid, breldir, catname in splits: if catname: optarget = bid + ":" + catname elif breldir: optarget = bid + ":" + breldir + os.path.sep else: optarget = bid + ":" optargets.append(optarget) else: optargets = [specarg] for optarget in optargets: lst = optarget.split(":", 1) if len(lst) < 2: fdname, = lst bid = None else: bid, fdname = lst if bid not in project.branch_ids and bid != SUMMIT_ID: error(_("@info", "Branch '%(branch)s' is not defined " "in the summit configuration.", branch=bid)) if bid and bid not in partbids: partbids.append(bid) if fdname: bsid = bid or SUMMIT_ID if bsid not in partspecs: partspecs[bsid] = [] partspecs[bsid].append(fdname) return partspecs, partbids # Fill in defaults for missing fields in hook specs. def hook_fill_defaults (specs): new_specs = [] for spec in specs: call = spec[0] branch_rx = r"" if len(spec) > 1: branch_rx = spec[1] name_rx = r"" if len(spec) > 2: name_rx = spec[2] new_specs.append((call, branch_rx, name_rx)) return new_specs # Each catalog is represented by a dictionary entry: the key is the catalog # name, the value is the list of tuples of file path and subdirectory # relative to top (list in case there are several same-named catalogs in # different subdirectories). def collect_catalogs (topdir, catext, by_lang, ignored, split_path, project, options): catalogs = {} topdir = os.path.normpath(topdir) for root, dirs, files in os.walk(topdir): for file in files: catn = "" if file.endswith(catext): if not by_lang: fpath = os.path.abspath(os.path.join(root, file)) if split_path: catn, spath = split_path(fpath[len(topdir) + 1:]) else: catn = file[0:file.rfind(".")] spath = root[len(topdir) + 1:] elif file == by_lang + ".po" or catext == ".pot": fpath = os.path.abspath(os.path.join(root, file)) if split_path: catn, spath = split_path(fpath[len(topdir) + 1:]) else: catn = os.path.basename(root) spath = os.path.dirname(root)[len(topdir) + 1:] if catn: if not ignored or not ignored(fpath): if catn not in catalogs: catalogs[catn] = [] fpath = join_ncwd(fpath) spath = os.path.normpath(spath) catalogs[catn].append((fpath, spath)) - for catpaths in catalogs.values(): + for catpaths in list(catalogs.values()): catpaths.sort(key=lambda x: x[0]) return catalogs def summit_gather (project, options): if ( project.over_templates and project.lang != project.templates_lang and not options.force): error(_("@info", "Gathering catalogs is normally not allowed " "in summit-over-static-templates mode. " "If this is the initial creation of summit catalogs, " "or externally injected branch catalogs need to be gathered, " "run with options %(opts)s.", opts="--create --force")) elif ( project.templates_dynamic and project.lang == project.templates_lang and not options.force): warning(_("@info", "Gathering templates is superfluous in " "summit-over-templates mode. " "If this is done to check whether gathering works, " "to suppress this message run with option %(opt)s.", opt="--force")) # Collect names of summit catalogs to gather. summit_names = select_summit_names(project, options) # Setup progress indicator. upprog = lambda x=None: x if not options.verbose: catpaths = [project.catalogs[SUMMIT_ID][x][0][0] for x in summit_names] upprog = init_file_progress(catpaths, addfmt=t_("@info:progress", "Gathering: %(file)s")) # Gather all selected catalogs.
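# collect_catalogs above produces the central data structure of the script:
# catalog name -> list of (file path, subdir relative to top); a list,
# because equally named catalogs may live in several subdirectories.
# A stripped-down sketch of the walk, without by-language and split-path
# handling (hypothetical name):

import os

def _collect_catalogs_sketch (topdir, catext=".po"):
    catalogs = {}
    topdir = os.path.normpath(topdir)
    for root, dirs, files in os.walk(topdir):
        for fname in files:
            if fname.endswith(catext):
                name = fname[:fname.rfind(".")]
                subdir = os.path.normpath(root[len(topdir) + 1:] or ".")
                catalogs.setdefault(name, []).append(
                    (os.path.join(root, fname), subdir))
    for spec in catalogs.values():
        spec.sort(key=lambda x: x[0])  # stable order for later priority
    return catalogs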
for name in summit_names: catpath = project.catalogs[SUMMIT_ID][name][0][0] if options.verbose: report(_("@info:progress", "Gathering %(file)s...", file=catpath)) upprogc = lambda: upprog(catpath) summit_gather_single(name, project, options, update_progress=upprogc) upprog() def summit_scatter (project, options): if project.over_templates and project.lang == project.templates_lang: error(_("@info", "Scattering not possible on '%(lang)s' " "in summit-over-templates mode.", lang=project.templates_lang)) scatter_specs = [] # Select branches to scatter to. if not options.partbids or SUMMIT_ID in options.partbids: branch_ids = project.branch_ids else: branch_ids = options.partbids # Collect catalogs to scatter through all selected branches. for branch_id in branch_ids: branch_catalogs = select_branch_catalogs(branch_id, project, options) for branch_name, branch_path, branch_subdir in branch_catalogs: # Collect names of all the summit catalogs which this branch # catalog supplies messages to. summit_names = project.direct_map[branch_id][branch_name] # Collect paths of selected summit catalogs. summit_paths = [] for summit_name in summit_names: if not summit_name in project.catalogs[SUMMIT_ID]: # Warning pertinent to this situation will have # been issued earlier, so just skip it here. #warning(_("@info", #"Missing summit catalog " #"for branch catalog '%(file)s'.", #file=branch_path)) continue summit_paths.append( project.catalogs[SUMMIT_ID][summit_name][0][0]) # There may be no summit catalogs for this branch catalog. # The warning about this condition has been issued earlier, # just skip the branch catalog here. if summit_paths: scatter_specs.append((branch_id, branch_name, branch_subdir, branch_path, summit_paths)) # Dummy entry to indicate branch switch. scatter_specs.append((branch_id, None, None, None, None)) # Setup progress indicator. upprog = lambda x=None: x if not options.verbose: catpaths = [x[3] for x in scatter_specs if x[1]] upprog = init_file_progress(catpaths, addfmt=t_("@info:progress", "Scattering: %(file)s")) # Scatter to branch catalogs. for scatter_spec in scatter_specs: branch_id, catpath = scatter_spec[0], scatter_spec[3] if catpath is not None: if options.verbose: report(_("@info:progress", "Scattering %(file)s...", file=catpath)) upprogc = lambda: upprog(catpath) summit_scatter_single(*(scatter_spec + (project, options, upprogc))) else: # Apply post-scatter hooks. if options.verbose: report(_("@info:progress", "Applying post-hook to branch %(branch)s...", branch=branch_id)) exec_hook_branch(branch_id, project.hook_on_scatter_branch) upprog() def summit_merge (project, options): if project.over_templates and project.lang == project.templates_lang: error(_("@info", "Merging not possible on '%(lang)s' in " "summit-over-templates mode.", lang=project.templates_lang)) merge_specs = [] # Select branches to merge. if not options.partbids: branch_ids = project.branch_ids + [SUMMIT_ID] else: branch_ids = options.partbids # Setup merging in summit. if SUMMIT_ID in branch_ids and project.summit.topdir_templates: branch_ids.remove(SUMMIT_ID) # Collect names of summit catalogs to merge. summit_names = select_summit_names(project, options) # Collect template catalogs to use. if not project.templates_dynamic: template_catalogs = collect_catalogs(project.summit.topdir_templates, ".pot", None, None, None, project, options) else: template_catalogs = project.tproject.catalogs[SUMMIT_ID] # Collect data for summit catalogs to merge. 
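# The dummy entries appended to scatter_specs above are in-band sentinels:
# the scatter loop processes real entries one by one and, on reaching a
# sentinel, knows that every catalog of that branch has been handled, so
# the per-branch post-hook may fire exactly once. The pattern in isolation
# (hypothetical names):

def _scatter_with_sentinels_sketch (specs, scatter_one, branch_done):
    for branch_id, payload in specs:
        if payload is not None:
            scatter_one(branch_id, payload)
        else:
            branch_done(branch_id)  # sentinel: branch completed

# _scatter_with_sentinels_sketch(
#     [("b1", "a.po"), ("b1", "b.po"), ("b1", None)],
#     lambda b, c: print("scatter", b, c),
#     lambda b: print("post-hook", b))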
for name in summit_names: summit_path, summit_subdir = project.catalogs[SUMMIT_ID][name][0] if name not in template_catalogs: warning(_("@info", "No template for summit catalog '%(file)s'.", file=summit_path)) continue template_path = template_catalogs[name][0][0] merge_specs.append((SUMMIT_ID, name, summit_subdir, summit_path, template_path, project.summit_wrapping, project.summit_fuzzy_merging)) # Setup merging in branches. for branch_id in branch_ids: branch = project.bdict[branch_id] # Skip branch if local merging not desired, or no templates defined. if (not branch.merge or branch.topdir_templates is None): continue # Collect branch catalogs to merge. branch_catalogs = select_branch_catalogs(branch_id, project, options) # Collect template catalogs to use. template_catalogs = collect_catalogs(branch.topdir_templates, ".pot", branch.by_lang, branch.ignored, branch.split_path, project, options) # Collect data for branch catalogs to merge. for name, branch_path, branch_subdir in branch_catalogs: if not os.path.isfile(branch_path): # Catalog has been selected due to another operation mode, # which can create catalogs from scratch. continue if not name in template_catalogs: warning(_("@info", "No template for branch catalog '%(file)s'.", file=branch_path)) continue exact = False for template_path, template_subdir in template_catalogs[name]: if template_subdir == branch_subdir: exact = True break if not exact: warning(_("@info", "No exact template for branch catalog '%(file)s'.", file=branch_path)) continue merge_specs.append((branch_id, name, branch_subdir, branch_path, template_path, project.branches_wrapping, project.branches_fuzzy_merging)) # Setup progress indicator. upprog = lambda x=None: x if not options.verbose: catpaths = [x[3] for x in merge_specs] upprog = init_file_progress(catpaths, addfmt=t_("@info:progress", "Merging: %(file)s")) # Merge catalogs. for merge_spec in merge_specs: catpath = merge_spec[3] if options.verbose: report(_("@info:progress", "Merging %(file)s...", file=catpath)) upprogc = lambda: upprog(catpath) summit_merge_single(*(merge_spec + (project, options, upprogc))) upprog() # Remove template tree in summit-over-dynamic-templates mode. if project.templates_dynamic: shutil.rmtree(project.tproject.summit.topdir) def summit_deps (project, options): # Collect names of summit catalogs for which to report dependencies. summit_names = select_summit_names(project, options) # Report dependencies for all selected catalogs. for summit_name in summit_names: if summit_name not in project.catalogs[SUMMIT_ID]: # May happen if there are some missing summit catalogs # to current branch catalogs, i.e. gather has not been made. continue summit_path = project.catalogs[SUMMIT_ID][summit_name][0][0] branch_paths = [] for branch_id in project.branch_ids: for branch_name in project.full_inverse_map[summit_name][branch_id]: for branch_path, d1 in project.catalogs[branch_id][branch_name]: branch_paths.append(branch_path) fmtbpaths = " ".join(branch_paths) if options.verbose: actype = _("@item:intext action performed on a catalog", "depends") report(": (%s) %s %s" % (actype, summit_path, fmtbpaths)) else: report(": %s %s" % (summit_path, fmtbpaths)) def select_branch_catalogs (branch_id, project, options): # Shortcuts. pbcats = project.catalogs[branch_id] # Select either all catalogs in this branch, # or those mentioned in the command line. 
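# Branch merging above requires an exact subdirectory match between branch
# catalog and template, since the same catalog name may exist in several
# subdirectories of one branch. The lookup rule in isolation (hypothetical
# name):

def _find_exact_template_sketch (template_catalogs, name, subdir):
    # template_catalogs: name -> list of (template_path, template_subdir)
    for tpath, tsubdir in template_catalogs.get(name, []):
        if tsubdir == subdir:
            return tpath
    return None  # caller then warns and skips the branch catalog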
if not options.partspecs: branch_catalogs = [] - for name, spec in pbcats.items(): + for name, spec in list(pbcats.items()): for path, subdir in spec: if options.selcatf(path): branch_catalogs.append((name, path, subdir)) else: # Select branch catalogs by command line specification. branch_catalogs = [] # Process direct specifications (branch->summit). if branch_id in options.partspecs: for part_spec in options.partspecs[branch_id]: # If the catalog specification has path separators, # then it selects a complete subdir in the branch. branch_catalogs_l = [] if part_spec.find(os.sep) >= 0: sel_subdir = os.path.normpath(part_spec) one_found = False - for name, spec in pbcats.items(): + for name, spec in list(pbcats.items()): for path, subdir in spec: if sel_subdir == subdir: one_found = True if options.selcatf(path): branch_catalogs_l.append( (name, path, subdir)) if not one_found: error(_("@info", "No catalogs in subdirectory '%(dir)s' " "of branch '%(branch)s'.", dir=sel_subdir, branch=branch_id)) else: # Otherwise, specific catalog is selected. sel_name = part_spec one_found = False - for name, spec in pbcats.items(): + for name, spec in list(pbcats.items()): if sel_name == name: for path, subdir in spec: one_found = True if options.selcatf(path): branch_catalogs_l.append( (name, path, subdir)) break if not one_found: error(_("@info", "No catalog named '%(name)s' " "in branch '%(branch)s'.", name=sel_name, branch=branch_id)) # Also select all branch catalogs which contribute to same # summit catalogs as the already selected ones. branch_catalogs_l2 = [] dmap = project.direct_map[branch_id] pimap = project.part_inverse_map[branch_id] for branch_name, d1, d2 in branch_catalogs_l: if branch_name in dmap: for summit_name in dmap[branch_name]: if summit_name in pimap: for name in pimap[summit_name]: for path, subdir in pbcats[name]: if options.selcatf(path): branch_catalogs_l2.append( (name, path, subdir)) branch_catalogs.extend(branch_catalogs_l) branch_catalogs.extend(branch_catalogs_l2) # Process inverse specifications (summit->branch). if SUMMIT_ID in options.partspecs: for part_spec in options.partspecs[SUMMIT_ID]: if part_spec.find(os.sep) >= 0: # Complete subdir. sel_subdir = os.path.normpath(part_spec) cats = [] - for name, spec in project.catalogs[SUMMIT_ID].items(): + for name, spec in list(project.catalogs[SUMMIT_ID].items()): path, subdir = spec[0] # all summit catalogs unique if sel_subdir == subdir: bnames = project.full_inverse_map[name][branch_id] for bname in bnames: if bname in pbcats: for bpath, bsubdir in pbcats[bname]: if options.selcatf(bpath): cats.append((bname, bpath, bsubdir)) branch_catalogs.extend(cats) else: # Specific catalog. sel_name = part_spec if not sel_name in project.catalogs[SUMMIT_ID]: error(_("@info", "No summit catalog named '%(name)s'.", name=sel_name)) bnames = project.full_inverse_map[sel_name][branch_id] for bname in bnames: if bname in pbcats: for bpath, bsubdir in pbcats[bname]: if options.selcatf(bpath): branch_catalogs.append( (bname, bpath, bsubdir)) # Same catalogs may have been selected multiple times, remove. branch_catalogs = list(set(branch_catalogs)) # Sort by path. branch_catalogs.sort(key=lambda x: x[1]) # ...sorting is not only for looks, but to establish priority of # supplying comments to summit messages. return branch_catalogs def select_summit_names (project, options): # Collect all summit catalogs selected explicitly or implicitly. 
summit_names = [] if options.partspecs is None: - for name, spec in project.catalogs[SUMMIT_ID].items(): + for name, spec in list(project.catalogs[SUMMIT_ID].items()): path, subdir = spec[0] # summit catalogs are unique if options.selcatf(path): summit_names.append(name) else: for branch_id in options.partspecs: for part_spec in options.partspecs[branch_id]: if branch_id == SUMMIT_ID: # explicit by summit reference if part_spec.find(os.sep) >= 0: # whole subdir sel_subdir = os.path.normpath(part_spec) one_found = False - for name, spec in project.catalogs[SUMMIT_ID].items(): + for name, spec in list(project.catalogs[SUMMIT_ID].items()): path, subdir = spec[0] # summit catalogs are unique if sel_subdir == subdir: one_found = True if options.selcatf(path): summit_names.append(name) if not one_found: error(_("@info", "No summit directory named '%(name)s'.", name=sel_subdir)) else: # single name sel_name = part_spec spec = project.catalogs[SUMMIT_ID].get(sel_name) if not spec: error(_("@info", "No summit catalog named '%(name)s'.", name=sel_name)) path, subdir = spec[0] # summit catalogs are unique if options.selcatf(path): summit_names.append(sel_name) else: # implicit by branch reference if part_spec.find(os.sep) >= 0: # whole subdir sel_subdir = os.path.normpath(part_spec) one_found = False - for name, spec in project.catalogs[branch_id].items(): + for name, spec in list(project.catalogs[branch_id].items()): for path, subdir in spec: if sel_subdir == subdir: one_found = True if options.selcatf(path): summit_names.extend( project.direct_map[branch_id][name]) break if not one_found: error(_("@info", "No directory named '%(name)s' " "in branch '%(branch)s'.", name=sel_subdir, branch=branch_id)) else: # single name sel_name = part_spec spec = project.catalogs[branch_id].get(sel_name) if not spec: error(_("@info", "No catalog named '%(name)s' " "in branch '%(branch)s'.", name=sel_name, branch=branch_id)) for path, subdir in spec: if options.selcatf(path): summit_names.extend( project.direct_map[branch_id][sel_name]) break # Make names unique and sort by path. summit_names = list(set(summit_names)) summit_names.sort(key=lambda x: project.catalogs[SUMMIT_ID].get(x, [[""]])[0][0]) # Additionally sort by subdirectory precedence. # This is necessary so that catalogs can be properly moved when gathering, # in case a higher precedence subdirectory was not created before. # Default "~" means that catalogs with no paths will be sorted at end. summit_names.sort(key=lambda x: project.calc_subdir_precedence( project.catalogs[SUMMIT_ID].get(x, [["", "~"]])[0][1])) return summit_names def summit_gather_single (summit_name, project, options, phony=False, pre_summit_names=(), memo_store=None, update_progress=(lambda: None)): if memo_store is not None: memo_key = (summit_name, tuple(sorted(pre_summit_names))) if memo_key in memo_store: # value can be None return memo_store.get(memo_key) update_progress() summit_path = project.catalogs[SUMMIT_ID][summit_name][0][0] summit_subdir = project.catalogs[SUMMIT_ID][summit_name][0][1] update_from_old = ( os.path.exists(summit_path) and not project.templates_dynamic) # Do not overwrite the old summit catalog here if it exists, # as it will be needed for comparison later. monitored = update_from_old summit_cat = Catalog("", monitored=monitored, wrapping=project.summit_wrapping, create=True) summit_cat.filename = summit_path # Collect branches in which this summit catalog has corresponding # branch catalogs, in order of branch priority.
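# The memo_store lookup above caches phony gathers under a composite key:
# the catalog name plus the sorted tuple of summit names already being
# gathered, since the result depends on that context; a cached None is
# itself a valid value, hence the `in` test instead of a .get() default.
# The memoization pattern in isolation (hypothetical names):

def _memoized_gather_sketch (name, pre_names=(), memo=None):
    if memo is not None:
        memo_key = (name, tuple(sorted(pre_names)))
        if memo_key in memo:  # stored value may be None
            return memo[memo_key]
    result = name.upper()  # stands in for the actual expensive gather
    if memo is not None:
        memo[memo_key] = result
    return result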
src_branch_ids = [] for branch_id in project.branch_ids: if project.full_inverse_map[summit_name][branch_id]: src_branch_ids.append(branch_id) # If there are no branch catalogs, # then the current summit catalog is to be removed. if not src_branch_ids: if phony: # cannot happen error(_("@info", "Phony gather on summit catalog which is to be removed.")) # Remove by version control, if any. if project.summit_vcs: if not project.summit_vcs.remove(summit_path): warning(_("@info", "Cannot remove '%(path)s' from version control.", path=summit_path)) # If not removed by version control, plainly delete. if os.path.isfile(summit_path): os.unlink(summit_path) if os.path.isfile(summit_path): warning(_("@info", "Cannot remove '%(path)s' from disk.", path=summit_path)) if not os.path.isfile(summit_path): if options.verbose: actype = _("@item:intext action performed on a catalog", "gathered-removed") report("- (%s) %s" % (actype, summit_path)) elif not options.quiet: report("- %s" % summit_path) # Skip the rest, nothing to gather. if memo_store is not None: memo_store[memo_key] = summit_cat return summit_cat # Open all corresponding branch catalogs. # For each branch catalog, also phony-gather any dependent summit # catalogs. Phony means not to take into account branch catalogs which # map to current summit catalog if it is higher in their queue than # the phony-gathered one, and not to sync phony-gathered catalog; # this is needed in order that any new messages get inserted # uniquely and deterministically in case of split-mappings. bcat_pscats = {} if phony or memo_store is not None: sub_memo_store = memo_store else: sub_memo_store = {} for branch_id in src_branch_ids: branch = project.bdict[branch_id] if isinstance(branch.insert_nosim, (list, tuple)): apply_insert_nosim = lambda sn, sd: ( any(re.search(rs, sn) for rs in branch.insert_nosim)) elif callable(branch.insert_nosim): apply_insert_nosim = lambda sn, sd: branch.insert_nosim(sn, sd) else: apply_insert_nosim = lambda sn, sd: bool(branch.insert_nosim) bcat_pscats[branch_id] = [] for branch_name in project.full_inverse_map[summit_name][branch_id]: # In phony-gather, do not use branch catalogs with split-mappings # which map to one of the summit catalogs among previous. phony_skip = False for dep_summit_name in project.direct_map[branch_id][branch_name]: if dep_summit_name in pre_summit_names: phony_skip = True break if phony_skip: continue # Gather and open dependent summit catalogs. dep_summit_cats = [] sub_pre_summit_names = list(pre_summit_names) for dep_summit_name in project.direct_map[branch_id][branch_name]: if dep_summit_name == summit_name: sub_pre_summit_names.append(summit_name) continue dep_summit_cat = summit_gather_single(dep_summit_name, project, options, True, sub_pre_summit_names, sub_memo_store, update_progress) if dep_summit_cat is not None: dep_summit_cats.append(dep_summit_cat) # Open all branch catalogs of this name, ordered by path, # link them to the same dependent summit catalogs. for path, subdir in project.catalogs[branch_id][branch_name]: update_progress() # Apply hooks to branch catalog file, creating temporaries. tmp_path = None if project.hook_on_gather_file_branch: # Temporary path should be such as to not modify the # catalog name (e.g. appending ".mod" could make ".po" # a part of the name). 
tmp_path = path + "~mod" shutil.copyfile(path, tmp_path) exec_hook_file(branch_id, branch_name, subdir, tmp_path, project.hook_on_gather_file_branch) branch_cat = Catalog(tmp_path or path, monitored=False) if tmp_path: # as soon as catalog is opened, no longer needed os.unlink(tmp_path) # Apply hooks to branch catalog. if project.hook_on_gather_cat_branch: exec_hook_cat(branch_id, branch_name, subdir, branch_cat, project.hook_on_gather_cat_branch) branch_cat.sync_map() # Apply hooks to all branch catalog messages here, # as they may modify message keys. if project.hook_on_gather_msg_branch: for msg in branch_cat: update_progress() exec_hook_msg(branch_id, branch_name, subdir, msg, branch_cat, project.hook_on_gather_msg_branch) branch_cat.sync_map() insert_nosim = apply_insert_nosim(branch_name, subdir) bcat_pscats[branch_id].append((branch_cat, dep_summit_cats, insert_nosim)) # On phony gather, in case of split mappings, # it may happen that there are no corresponding branch catalogs. if phony and not any(bcat_pscats.values()): if memo_store is not None: memo_store[memo_key] = None return None # Select primary branch catalog. prim_branch_cat = None for branch_id in src_branch_ids: if bcat_pscats[branch_id]: prim_branch_cat = bcat_pscats[branch_id][0][0] break assert prim_branch_cat is not None # Gather messages through branch catalogs. for branch_id in src_branch_ids: for branch_cat, dep_summit_cats, insert_nosim in bcat_pscats[branch_id]: is_primary = branch_cat is prim_branch_cat summit_gather_single_bcat(branch_id, branch_cat, is_primary, summit_cat, monitored, dep_summit_cats, insert_nosim, project, options, update_progress) # Gather the summit header according to primary branch. summit_gather_single_header(summit_cat, prim_branch_cat, project, options) # Apply hooks to the summit messages. if project.hook_on_gather_msg: for msg in summit_cat: exec_hook_msg(SUMMIT_ID, summit_cat.name, summit_subdir, msg, summit_cat, project.hook_on_gather_msg) # Apply hooks to the summit catalog. exec_hook_cat(SUMMIT_ID, summit_cat.name, summit_subdir, summit_cat, project.hook_on_gather_cat) # If phony-gather, stop here and return summit catalog for reference. if phony: if memo_store is not None: memo_store[memo_key] = summit_cat return summit_cat # If the old summit catalog exists, compare with the new. # If there were any modified entries, or their order changed, # replace the old with the new summit catalog. # Copy over unmodified entries from the old catalog, # to avoid line reformatting. if update_from_old: old_cat = Catalog(summit_path, monitored=monitored, wrapping=project.summit_wrapping) summit_created = False replace = False # Compare headers without some insignificant fields. if cmpnorm_hdr(summit_cat.header) == cmpnorm_hdr(old_cat.header): summit_cat.header = old_cat.header else: replace = True # Compare messages and their positions. for pos in range(len(summit_cat)): update_progress() old_pos = old_cat.find(summit_cat[pos]) if pos != old_pos: replace = True if old_pos >= 0: if summit_cat[pos] == old_cat[old_pos]: summit_cat[pos] = old_cat[old_pos] else: replace = True # Compare lengths. if len(summit_cat) != len(old_cat): replace = True else: summit_created = True replace = True # Check if the catalog needs to be moved to another subdirectory. 
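# The update_from_old comparison above keeps version-control noise down:
# the freshly gathered catalog replaces the file only when a header,
# message, position, or total length really differs, and unmodified
# messages are taken over from the old catalog so their existing line
# wrapping is preserved. The decision in isolation (a sketch over plain
# hashable messages, hypothetical name):

def _needs_replace_sketch (new_msgs, old_msgs):
    if len(new_msgs) != len(old_msgs):
        return True
    old_pos = dict((m, i) for i, m in enumerate(old_msgs))
    for pos, msg in enumerate(new_msgs):
        if old_pos.get(msg) != pos:
            return True  # new, moved, or modified message
    return False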
branch_subdirs = [] for branch_id in project.full_inverse_map[summit_name]: for branch_name in project.full_inverse_map[summit_name][branch_id]: branch_subdirs_1 = [] for bpath, bsubdir in project.catalogs[branch_id][branch_name]: bsubdir = project.subdir_map.get((branch_id, bsubdir), bsubdir) branch_subdirs_1.append(bsubdir) branch_subdirs_1.sort() branch_subdirs.extend(branch_subdirs_1) new_summit_path = summit_path if branch_subdirs: branch_subdirs = list(set(branch_subdirs)) - subdir_precs = map(project.calc_subdir_precedence, branch_subdirs) + subdir_precs = list(map(project.calc_subdir_precedence, branch_subdirs)) precs_subdirs = sorted(zip(subdir_precs, branch_subdirs)) branch_subdirs_sel = [sd for pr, sd in precs_subdirs if pr == precs_subdirs[0][0]] if summit_subdir not in branch_subdirs_sel: catext = summit_path[summit_path.rfind("."):] new_summit_path = join_ncwd(project.summit.topdir, branch_subdirs_sel[0], summit_name + catext) if replace or summit_cat.filename != new_summit_path: added = False moved = False if replace: # Set template creation date for the summit catalog # to the current date. # Do not try to trust branch template creation dates, # e.g. by copying the latest one. - summit_cat.header.set_field(u"POT-Creation-Date", format_datetime(), - before=u"PO-Revision-Date", + summit_cat.header.set_field("POT-Creation-Date", format_datetime(), + before="PO-Revision-Date", reorder=True) # Sync to disk. summit_cat.sync() # Apply hooks to summit catalog file. exec_hook_file(SUMMIT_ID, summit_cat.name, summit_subdir, summit_cat.filename, project.hook_on_gather_file) if summit_created: added = True # Add to version control. if ( project.summit_vcs and not project.summit_vcs.is_versioned(summit_cat.filename) ): if not project.summit_vcs.add(summit_cat.filename): warning(_("@info", "Cannot add '%(file)s' to version control.", file=summit_cat.filename)) else: added = True if summit_cat.filename != new_summit_path: if project.summit_vcs: if not project.summit_vcs.move(summit_cat.filename, new_summit_path): warning(_("@info", "Cannot move '%(srcfile)s' to '%(dstfile)s'.", srcfile=summit_cat.filename, dstfile=new_summit_path)) else: summit_cat.filename = new_summit_path moved = True branch_paths = [] for branch_id in src_branch_ids: for branch_cat, dep_summit_cats, insert_nosim in bcat_pscats[branch_id]: branch_paths.append(branch_cat.filename) paths_str = " ".join(branch_paths) if options.verbose: if added: actype = _("@item:intext action performed on a catalog", "gathered-added") report(">+ (%s) %s %s" % (actype, summit_cat.filename, paths_str)) elif moved: actype = _("@item:intext action performed on a catalog", "gathered-moved") report(">| (%s) %s %s" % (actype, summit_cat.filename, paths_str)) else: actype = _("@item:intext action performed on a catalog", "gathered") report("> (%s) %s %s" % (actype, summit_cat.filename, paths_str)) elif not options.quiet: if added: report(">+ %s %s" % (summit_cat.filename, paths_str)) elif moved: report(">| %s %s" % (summit_cat.filename, paths_str)) else: report("> %s %s" % (summit_cat.filename, paths_str)) if memo_store is not None: memo_store[memo_key] = summit_cat return summit_cat def cmpnorm_hdr (hdr): rhdr = Header(hdr) for field in ( "POT-Creation-Date", ): rhdr.remove_field(field) return rhdr def extkey_msg (msg): # NOTE: If computation of context pad is modified, # padded messages in existing summit catalogs will get fuzzy # on next merge with newly gathered templates. 
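# The context pad referred to in the NOTE makes messages that differ only
# in plurality distinguishable by key: a digest of msgid_plural is appended
# to msgctxt, while singular messages get a fixed string that merely looks
# like a digest (shorter than a real md5 hexdigest, so it can never collide
# with one). The key extension in isolation (hypothetical name):

import hashlib

def _ctxt_pad_sketch (msgctxt, msgid_plural):
    if msgid_plural is not None:
        ctxtpad = hashlib.md5(msgid_plural.encode("UTF-8")).hexdigest()
    else:
        ctxtpad = "abcd1234efgh5665hgfe4321dcba"  # 28 chars, md5 has 32
    return ctxtpad if msgctxt is None else "%s|%s" % (msgctxt, ctxtpad)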
msg = MessageUnsafe(msg) if msg.msgid_plural is not None: h = hashlib.md5() h.update(msg.msgid_plural.encode("UTF-8")) ctxtpad = h.hexdigest() else: # Something that looks like a hex digest but slightly shorter, # so that it does not match any real digest. ctxtpad = "abcd1234efgh5665hgfe4321dcba" - msg.auto_comment.append(u"%s msgctxt-pad %s" + msg.auto_comment.append("%s msgctxt-pad %s" % (_summit_tag_kwprop, ctxtpad)) if msg.msgctxt is None: - msg.msgctxt = u"%s" % ctxtpad + msg.msgctxt = "%s" % ctxtpad else: - msg.msgctxt = u"%s|%s" % (msg.msgctxt, ctxtpad) + msg.msgctxt = "%s|%s" % (msg.msgctxt, ctxtpad) return msg def summit_gather_single_bcat (branch_id, branch_cat, is_primary, summit_cat, monitored, dep_summit_cats, insert_nosim, project, options, update_progress): MessageType = (Message if monitored else MessageUnsafe) # Go through messages in the branch catalog, merging them with # existing summit messages, or collecting for later insertion. # Do not insert new messages immediately, as source references may be # updated by merging, which reflects on heuristic insertion. # Ignore messages present in dependent summit catalogs. msgs_to_merge = [] msgs_to_insert = [] xkpairs = [] for msg in branch_cat: update_progress() # Do not gather obsolete messages. if msg.obsolete: continue # Normalizations when gathering templates, # in case extraction tool needs to have its sanity checked, # or certain language files stand in for true templates. if project.lang == project.templates_lang: msg.manual_comment[:] = [] msg.unfuzzy() if msg.msgid_plural is None: - msg.msgstr[:] = [u""] + msg.msgstr[:] = [""] else: - msg.msgstr[:] = [u"", u""] + msg.msgstr[:] = ["", ""] # Construct branch message with extended key. xkmsg = extkey_msg(msg) # Do not gather messages belonging to depending summit catalogs. in_dep = False for dep_summit_cat in dep_summit_cats: if msg in dep_summit_cat or xkmsg in dep_summit_cat: in_dep = True break if in_dep: continue # If the summit message for the original branch message exists, # but their extended keys do not match, # switch to branch message with extended key. summit_msg = summit_cat.get(msg) if summit_msg and extkey_msg(summit_msg).key != xkmsg.key: xkpairs.append((msg, xkmsg)) msg = xkmsg summit_msg = summit_cat.get(msg) # Collect the branch message for merging or insertion. if summit_msg is not None: msgs_to_merge.append((msg, summit_msg)) else: msgs_to_insert.append(msg) # If some messages had to have extended keys, update branch catalog. if xkpairs: for msg, xkmsg in xkpairs: branch_cat.remove_on_sync(msg) branch_cat.add_last(xkmsg) branch_cat.sync_map() # Merge messages already in the summit catalog. if msgs_to_merge: for msg, summit_msg in msgs_to_merge: # Merge the message. gather_merge_msg(summit_msg, msg) # Update automatic comments. summit_override_auto(summit_msg, msg, branch_id, is_primary) # Equip any new summit tags to the merged message. summit_set_tags(summit_msg, branch_id, project) # Insert messages not already in the summit catalog. if msgs_to_insert: # Pair messages to insert from branch with summit messages # having common source files. # If summit is empty, this is primary branch catalog, so make # only one dummy pair to preserve original ordering of messages. summit_msgs_by_src_dict = dict(summit_cat.messages_by_source()) if summit_msgs_by_src_dict: msgs_by_src = branch_cat.messages_by_source() else: msgs_by_src = [("", branch_cat)] # Collect possible source file synonyms to those in the summit catalog. 
fnsyn = branch_cat.detect_renamed_sources(summit_cat) # Prepare messages for insertion into summit. summit_msg_by_msg = {} for msg in msgs_to_insert: update_progress() summit_msg = MessageType(msg) summit_set_tags(summit_msg, branch_id, project) summit_msg_by_msg[msg] = summit_msg # Insert branch messages into summit source by source. for src, msgs in msgs_by_src: # Assemble collection of summit messages from same source file. summit_msgs = [] for osrc in [src] + fnsyn.get(src, []): summit_msgs.extend(summit_msgs_by_src_dict.get(osrc, [])) # If existing summit messages from same source found, # insert branch messages around those summit messages. # Otherwise, just append them at the end. if summit_msgs: # Assemble groups of messages by same msgid and same msgctxt, # for insertion by similarity. if not insert_nosim: smsgs_by_msgid = {} smsgs_by_msgctxt = {} for smsg in summit_msgs: if smsg.msgid not in smsgs_by_msgid: smsgs_by_msgid[smsg.msgid] = [] smsgs_by_msgid[smsg.msgid].append(smsg) if smsg.msgctxt is not None: if smsg.msgctxt not in smsgs_by_msgctxt: smsgs_by_msgctxt[smsg.msgctxt] = [] smsgs_by_msgctxt[smsg.msgctxt].append(smsg) insertions = [] for msg in msgs: update_progress() new_summit_msg = summit_msg_by_msg.get(msg) if new_summit_msg is None: continue # Existing summit message to where (after or before) # current message is to be inserted. summit_msg_ref = None before = False # Try to insert message by similarity. # Similarity is checked by groups, # such that for each group there is a message part # which is compared for similarity. if not insert_nosim: for summit_msgs_group, matt, forceins in ( (smsgs_by_msgid.get(msg.msgid), "msgctxt", True), (smsgs_by_msgctxt.get(msg.msgctxt), "msgid", True), (summit_msgs, "key", False), ): if not summit_msgs_group: continue # Shortcut: if only one summit message in the group # and insertion forced, insert after it. if len(summit_msgs_group) == 1 and forceins: summit_msg_ref = summit_msgs_group[-1] break # Does the message have the part to be matched? mval = msg.get(matt) if mval is None: continue # Find existing message with the most similar # matching attribute. seqm = SequenceMatcher(None, mval, "") maxr = 0.0 for summit_msg in summit_msgs_group: smval = summit_msg.get(matt) if smval is None: continue seqm.set_seq2(smval) r = seqm.ratio() if maxr <= r: maxr = r maxr_summit_msg = summit_msg # If similar enough message has been found, # set insertion position after it. # Otherwise, insert after last summit message # in the group if insertion forced. if maxr > 0.6: summit_msg_ref = maxr_summit_msg break elif forceins: summit_msg_ref = summit_msgs_group[-1] break # If no similar existing message, set position before # the summit message with first greater source reference # line number, if any such. if summit_msg_ref is None and src: for summit_msg in summit_msgs: if msg.source[0][1] < summit_msg.source[0][1]: summit_msg_ref = summit_msg before = True break # If not insertion by source references, insert last. if summit_msg_ref is None: summit_msg_ref = summit_msgs[-1] # Record insertion. pos = summit_cat.find(summit_msg_ref) if not before: pos += 1 insertions.append((new_summit_msg, pos)) # Insert ordered messages into catalog. 
summit_cat.add_more(insertions) else: for msg in msgs: update_progress() new_summit_msg = summit_msg_by_msg.get(msg) if new_summit_msg is not None: summit_cat.add_last(new_summit_msg) def gather_merge_msg (summit_msg, msg): if summit_msg.key != msg.key: error(_("@info", "Cannot gather messages with different keys.")) if (summit_msg.msgid_plural is None) != (msg.msgid_plural is None): error(_("@info", "Cannot gather messages with different plurality.")) if ( (summit_msg.translated and msg.translated) or (summit_msg.fuzzy and msg.fuzzy) or (summit_msg.untranslated and msg.untranslated) ): if not summit_msg.manual_comment: summit_msg.manual_comment = Monlist(msg.manual_comment) if msg.msgid_plural is not None: summit_msg.msgid_plural = msg.msgid_plural summit_msg.msgstr = Monlist(msg.msgstr) elif summit_msg.fuzzy and msg.translated: summit_msg.manual_comment = Monlist(msg.manual_comment) if summit_msg.msgid_plural is None or msg.msgid_plural is not None: if msg.msgid_plural is not None: summit_msg.msgid_plural = msg.msgid_plural summit_msg.msgstr = Monlist(msg.msgstr) if summit_msg.msgid_plural == msg.msgid_plural: summit_msg.unfuzzy() elif summit_msg.untranslated and (msg.translated or msg.fuzzy): summit_msg.manual_comment = Monlist(msg.manual_comment) if summit_msg.msgid_plural is None or msg.msgid_plural is not None: if msg.fuzzy: summit_msg.msgctxt_previous = msg.msgctxt_previous summit_msg.msgid_previous = msg.msgid_previous summit_msg.msgid_plural_previous = msg.msgid_plural_previous if msg.msgid_plural is not None: summit_msg.msgid_plural = msg.msgid_plural summit_msg.msgstr = Monlist(msg.msgstr) summit_msg.fuzzy = msg.fuzzy def summit_gather_single_header (summit_cat, prim_branch_cat, project, options): # Copy over comments from the primary branch catalog. hdr = summit_cat.header bhdr = prim_branch_cat.header hdr.title = bhdr.title hdr.copyright = bhdr.copyright hdr.license = bhdr.license hdr.author = bhdr.author hdr.comment = bhdr.comment # Copy over standard fields from the primary branch catalog. for fname in [x[0] for x in Header().field]: fvalue = prim_branch_cat.header.get_field_value(fname) if fvalue is not None: summit_cat.header.set_field(fname, fvalue) else: summit_cat.header.remove_field(fname) # Copy over non-standard fields from the primary branch catalog on request. bfields = [] for fname in project.header_propagate_fields: bfields.extend(prim_branch_cat.header.select_fields(fname)) cfields = [] for fname in project.header_propagate_fields: cfields.extend(summit_cat.header.select_fields(fname)) # Replace old with new set if not equal. if bfields != cfields: for cfield in cfields: summit_cat.header.field.remove(cfield) for bfield in bfields: summit_cat.header.field.append(bfield) _asc_check_cache = {} def summit_scatter_single (branch_id, branch_name, branch_subdir, branch_path, summit_paths, project, options, update_progress): update_progress() # See if the branch catalog is to be newly created from the template. new_from_template = False branch_path_mod = branch_path if branch_path in project.add_on_scatter: new_from_template = True # Initialize new catalog with messages directly from the template. # Later the catalog file name will be switched to branch path, # if the catalog satisfies criteria to be created on scatter. branch_path_mod = project.add_on_scatter[branch_path] # Open the branch catalog and all summit catalogs. 
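# Opening catalogs below is defensive: a branch or summit catalog that
# fails to load is reported with a warning and the branch catalog is
# skipped, rather than aborting the whole scatter run. The guard in
# isolation, using the Python 3 `except ... as` form this port switches to
# (hypothetical names; the real code catches PologyError):

def _open_catalog_sketch (path, opener, warn):
    try:
        return opener(path)
    except Exception as e:
        warn("Cannot open catalog '%s': %s" % (path, e))
        return None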
try: branch_cat = Catalog(branch_path_mod, wrapping=project.branches_wrapping) - except PologyError, e: + except PologyError as e: warning(_("@info", "Cannot open the branch catalog '%(file)s' " "to scatter to. The error was:\n" "%(msg)s", file=branch_path_mod, msg=str_to_unicode(str(e)))) return summit_cats = [] for summit_path in summit_paths: try: # NOTE: Must be opened monitored to have compatible types # when copying message parts to branch message. summit_cat = Catalog(summit_path) - except PologyError, e: + except PologyError as e: warning(_("@info", "Cannot open the summit catalog '%(file)s' " "to scatter from. The error was:\n" "%(msg)s", file=summit_path, msg=str_to_unicode(str(e)))) return summit_cats.append(summit_cat) # Collect and link ascription catalogs to summit catalogs. # (Do not open them here, but only later when a check is not cached.) if project.ascription_filter: aconfs_acats = {} for summit_cat in summit_cats: aconf, acatpath = project.aconfs_acatpaths[summit_cat.name] aconfs_acats[summit_cat.name] = (aconf, None, acatpath) if acatpath not in _asc_check_cache: _asc_check_cache[acatpath] = {} # Pair branch messages with summit messages. msgs_total = 0 msgs_translated = 0 msg_links = [] asc_stopped = 0 for branch_msg in branch_cat: update_progress() # Skip obsolete messages. if branch_msg.obsolete: continue msgs_total += 1 # If there is a hook on branch messages on gather, # it must be used here to prepare branch message for lookup # in summit catalog, as the hook may modify the key. branch_msg_lkp = branch_msg if project.hook_on_gather_msg_branch: branch_msg_lkp = MessageUnsafe(branch_msg) exec_hook_msg(branch_id, branch_name, branch_subdir, branch_msg_lkp, branch_cat, project.hook_on_gather_msg_branch) # Construct branch message for lookup with extended key. branch_xkmsg_lkp = extkey_msg(branch_msg_lkp) # Find first summit catalog which has this message translated. summit_msg = None for summit_cat in summit_cats: # Branch message with extended key must be looked up first. for bmsg_lkp in [branch_xkmsg_lkp, branch_msg_lkp]: if bmsg_lkp in summit_cat: summit_msg = summit_cat[bmsg_lkp] if summit_msg.obsolete: summit_msg = None else: break if summit_msg is not None: break if summit_msg is None: report_on_msg(_("@info:progress", "Message not in the summit."), branch_msg, branch_cat) continue if ( project.ascription_filter and not options.force and do_scatter(summit_msg, branch_msg) ): aconf, acat, acatpath = aconfs_acats[summit_cat.name] if summit_msg.key not in _asc_check_cache[acatpath]: if acat is None: acat = Catalog(acatpath, monitored=False, create=True) aconfs_acats[summit_cat.name] = (aconf, acat, acatpath) hfilter = project.ascription_history_filter ahist = collect_ascription_history(summit_msg, acat, aconf, nomrg=True, hfilter=hfilter) afilter = project.ascription_filter res = afilter(summit_msg, summit_cat, ahist, aconf) _asc_check_cache[acatpath][summit_msg.key] = res if not _asc_check_cache[acatpath][summit_msg.key]: asc_stopped += 1 continue if summit_msg.translated: msgs_translated += 1 msg_links.append((branch_msg, summit_msg, summit_cat)) if asc_stopped > 0: warning(n_("@info:progress", "%(file)s: %(num)d message stopped by ascription filter.", "%(file)s: %(num)d messages stopped by ascription filter.", file=branch_path, num=asc_stopped)) # If completeness less than minimal acceptable, remove all translations. 
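# The completeness checks below guard the empty-catalog case and pad the
# float ratio with a small epsilon before comparing against the threshold,
# so a catalog sitting exactly on the limit is not rejected by floating
# point rounding. In isolation (hypothetical name):

def _complete_enough_sketch (ntranslated, ntotal, minimum):
    ratio = float(ntranslated) / ntotal if ntotal > 0 else 1.0
    return ratio + 1e-6 >= minimum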
if msgs_total > 0: completeness_ratio = float(msgs_translated) / msgs_total else: completeness_ratio = 1.0 if ( completeness_ratio < project.scatter_acc_completeness and not options.force ): for branch_msg in branch_cat: if branch_msg.obsolete: branch_cat.remove_on_sync(branch_msg) else: clear_msg(branch_msg) # If complete enough, scatter from summit to branch messages. else: scattered_branch_msgs = set() for branch_msg, summit_msg, summit_cat in msg_links: update_progress() if do_scatter(summit_msg, branch_msg): exec_hook_msg(branch_id, branch_name, branch_subdir, summit_msg, summit_cat, project.hook_on_scatter_msg) # NOTE: Same plurality and equal msgid_plural fields # between summit and branch message are enforced, # so only assert this for robustness. if summit_msg.msgid_plural != branch_msg.msgid_plural: error(_("@info", "Cannot scatter messages with " "different plurality.")) for i in range(len(summit_msg.msgstr)): piped_msgstr = exec_hook_msgstr( branch_id, branch_name, branch_subdir, summit_msg.msgstr[i], summit_msg, summit_cat, project.hook_on_scatter_msgstr) if i < len(branch_msg.msgstr): branch_msg.msgstr[i] = piped_msgstr else: branch_msg.msgstr.append(piped_msgstr) branch_msg.unfuzzy() branch_msg.manual_comment = summit_msg.manual_comment scattered_branch_msgs.add(branch_msg) # Fuzzy all active messages which were not scattered, # in order to avoid stale translations in branches. for branch_msg in branch_cat: if branch_msg.active and branch_msg not in scattered_branch_msgs: branch_msg.fuzzy = True # Update branch header based on primary summit catalog. # Copy over all header parts from summit to branch, # except for those copied from template on merging. hdr = branch_cat.header shdr = summit_cats[0].header # Fields to keep due to being copied over on merging. keep_fields = [ "Report-Msgid-Bugs-To", "POT-Creation-Date", ] # Fields to keep if no branch message was modified. if not branch_cat.modcount and branch_cat.header.initialized: keep_fields.extend([ "PO-Revision-Date", "Last-Translator", ]) # Fields to keep due to explicitly being told to. keep_fields.extend(project.header_skip_fields_on_scatter) # Update comments. hdr.title = shdr.title hdr.copyright = shdr.copyright hdr.license = shdr.license hdr.author = shdr.author hdr.comment = shdr.comment # Update fields only if normalized lists of fields do not match. if normhf(hdr.field, keep_fields) != normhf(shdr.field, keep_fields): # Collect branch fields to be preserved. preserved_fs = [] for fnam in keep_fields: selected_fs = branch_cat.header.select_fields(fnam) preserved_fs.append(selected_fs[0] if selected_fs else (fnam, None)) # Overwrite branch with summit header fields. hdr.field = shdr.field # Put back the preserved branch fields. for fnam, fval in preserved_fs: if fval is not None: hdr.set_field(fnam, fval) else: hdr.remove_field(fnam) # Apply hooks to the branch catalog. exec_hook_cat(branch_id, branch_name, branch_subdir, branch_cat, project.hook_on_scatter_cat) # If the branch catalog has been newly created, # see if it is translated enough to be really written out. skip_write = False if new_from_template and not options.force: ntrans = 0 for msg in branch_cat: if msg.translated: ntrans += 1 if len(branch_cat) > 0: skip_write = ( float(ntrans) / len(branch_cat) + 1e-6 < project.scatter_min_completeness) else: skip_write = False if new_from_template and not skip_write: # Create any needed subdirectories and set destination branch path. 
mkdirpath(os.path.dirname(branch_path)) branch_cat.filename = branch_path # Commit changes to the branch catalog. if not skip_write and (branch_cat.sync() or options.force): # Apply hooks to branch catalog file. exec_hook_file(branch_id, branch_name, branch_subdir, branch_cat.filename, project.hook_on_scatter_file) # Add to version control. if ( project.branches_vcs and not project.bdict[branch_id].skip_version_control ): if not project.branches_vcs.add(branch_cat.filename): warning(_("@info", "Cannot add '%(file)s' to version control.", file=branch_cat.filename)) paths_str = " ".join(summit_paths) if options.verbose: if new_from_template: actype = _("@item:intext action performed on a catalog", "scattered-added") report("<+ (%s) %s %s" % (actype, branch_cat.filename, paths_str)) else: actype = _("@item:intext action performed on a catalog", "scattered") report("< (%s) %s %s" % (actype, branch_cat.filename, paths_str)) elif not options.quiet: if new_from_template: report("<+ %s %s" % (branch_cat.filename, paths_str)) else: report("< %s %s" % (branch_cat.filename, paths_str)) def do_scatter (smsg, bmsg): return smsg.translated def hook_applicable (branch_check, branch_id, name_check, name, subdir): if branch_check is not None: if hasattr(branch_check, "__call__"): if not branch_check(branch_id): return False else: if not re.search(branch_check, branch_id): return False if name_check is not None: if hasattr(name_check, "__call__"): if not name_check(name, subdir): return False else: if not re.search(name_check, name): return False return True # Pipe msgstr through hook calls, # for which branch id and catalog name match hook specification. def exec_hook_msgstr (branch_id, branch_name, branch_subdir, msgstr, msg, cat, hooks): piped_msgstr = msgstr for call, branch_ch, name_ch in hooks: if hook_applicable(branch_ch, branch_id, name_ch, branch_name, branch_subdir): piped_msgstr_tmp = call(piped_msgstr, msg, cat) - if isinstance(piped_msgstr_tmp, basestring): + if isinstance(piped_msgstr_tmp, str): piped_msgstr = piped_msgstr_tmp return piped_msgstr # Pipe message through hook calls, # for which branch id and catalog name match hook specification. def exec_hook_msg (branch_id, branch_name, branch_subdir, msg, cat, hooks): # Apply all hooks to the message. for call, branch_ch, name_ch in hooks: if hook_applicable(branch_ch, branch_id, name_ch, branch_name, branch_subdir): call(msg, cat) # Pipe header through hook calls, # for which branch id and catalog name match hook specification. def exec_hook_head (branch_id, branch_name, branch_subdir, hdr, cat, hooks): # Apply all hooks to the header. for call, branch_ch, name_ch in hooks: if hook_applicable(branch_ch, branch_id, name_ch, branch_name, branch_subdir): call(hdr, cat) # Pipe catalog through hook calls, # for which branch id and catalog name match hook specification. def exec_hook_cat (branch_id, branch_name, branch_subdir, cat, hooks): # Apply all hooks to the catalog. for call, branch_ch, name_ch in hooks: if hook_applicable(branch_ch, branch_id, name_ch, branch_name, branch_subdir): call(cat) # Pipe catalog file through hook calls, # for which branch id and catalog name match hook specification. def exec_hook_file (branch_id, branch_name, branch_subdir, filepath, hooks): # Make temporary backup of the file. # FIXME: Portable construction of temporary file. bckppath = "/tmp/backup%s-%s" % (os.getpid(), os.path.basename(filepath)) shutil.copyfile(filepath, bckppath) # Apply all hooks to the file, but stop if one returns non-zero status. 
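# NOTE (editor's sketch): the FIXME above asks for portable temporary
# files; tempfile.mkstemp is the usual replacement for the hard-coded
# "/tmp/backup<pid>-..." path, avoiding both the fixed directory and
# name collisions:

import os
import tempfile

fd, bckppath = tempfile.mkstemp(prefix="pology-backup-", suffix=".po")
os.close(fd)           # shutil.copyfile(filepath, bckppath) would follow
os.unlink(bckppath)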
failed = False for call, branch_ch, name_ch in hooks: if hook_applicable(branch_ch, branch_id, name_ch, branch_name, branch_subdir): if call(filepath) != 0: failed = True break # If any hook failed, retrieve the temporary copy. if failed: shutil.move(bckppath, filepath) else: os.unlink(bckppath) # Pipe branch through hook calls, # for which branch id and matches hook specification. def exec_hook_branch (branch_id, hooks): # Apply all hooks to the branch, but stop if one returns non-zero status. failed = False for call, branch_ch, d1 in hooks: if hook_applicable(branch_ch, branch_id, None, None, None): if call(branch_id) != 0: failed = True break def find_summit_comment (msg, summit_tag): i = 0 for c in msg.auto_comment: if c.startswith(summit_tag): return i i += 1 return -1 -def get_summit_comment (msg, summit_tag, default=u""): +def get_summit_comment (msg, summit_tag, default=""): p = find_summit_comment(msg, summit_tag) if p >= 0: return msg.auto_comment[p][len(summit_tag):].strip() else: return default def set_summit_comment (msg, summit_tag, text): - ctext = unicode(summit_tag + " " + text.strip()) + ctext = str(summit_tag + " " + text.strip()) p = find_summit_comment(msg, summit_tag) if p >= 0: msg.auto_comment[p] = ctext else: msg.auto_comment.append(ctext) _summit_tag_branchid = "+>" _summit_tag_kwprop = "+:" _summit_tags = ( _summit_tag_branchid, _summit_tag_kwprop, ) def summit_set_tags (msg, branch_id, project): # Add branch ID. branch_ids = get_summit_comment(msg, _summit_tag_branchid, "").split() if branch_id not in branch_ids: branch_ids.append(branch_id) set_summit_comment(msg, _summit_tag_branchid, " ".join(branch_ids)) def summit_override_auto (summit_msg, branch_msg, branch_id, is_primary): # Copy auto/source/flag comments only if this is the primary branch # for the current message. if is_primary: # Equalize flags, except the fuzzy. for fl in branch_msg.flag: if fl != "fuzzy": summit_msg.flag.add(fl) for fl in summit_msg.flag: if fl != "fuzzy" and fl not in branch_msg.flag: summit_msg.flag.remove(fl) # Equalize source references. # FIXME: Once there is a way to reliably tell the root directory # of source references, add missing and remove obsolete source # references instead. - summit_msg.source = Monlist(map(Monpair, branch_msg.source)) + summit_msg.source = Monlist(list(map(Monpair, branch_msg.source))) # Split auto comments of the current summit message into # summit and non-summit tagged comments. # Also of the branch message, in case it has summit-alike comments. summit_nscmnts, summit_scmnts = split_summit_comments(summit_msg) branch_nscmnts, branch_scmnts = split_summit_comments(branch_msg) # Override auto comments only if different overally # (which needs not be, due to double fresh/old insertion) # and non-summit auto comments of the current summit message # are different to the branch message auto comments. 
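# NOTE (editor's sketch): a standalone round-trip of the "+>" branch-ID
# comments that get_summit_comment()/set_summit_comment() above maintain
# (a plain list stands in for the message's Monlist-backed auto_comment):

tag = "+>"
auto_comment = []
auto_comment.append("%s %s" % (tag, " ".join(["stable", "trunk"])))
assert auto_comment[0][len(tag):].split() == ["stable", "trunk"]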
if ( summit_msg.auto_comment != branch_msg.auto_comment and summit_nscmnts != branch_nscmnts ): summit_msg.auto_comment = Monlist(branch_msg.auto_comment) summit_msg.auto_comment.extend(summit_scmnts) def split_summit_comments (msg): non_summit_comments = [] summit_comments = [] for comment in msg.auto_comment: wlst = comment.split() if wlst and wlst[0] in _summit_tags: summit_comments.append(comment) else: non_summit_comments.append(comment) return non_summit_comments, summit_comments def summit_merge_single (branch_id, catalog_name, catalog_subdir, catalog_path, template_path, wrapping, fuzzy_merging, project, options, update_progress): update_progress() # Gather the summit template in summit-over-dynamic-templates mode. if project.templates_dynamic and branch_id == SUMMIT_ID: summit_gather_single(catalog_name, project.tproject, project.toptions, update_progress=update_progress) # FIXME: Portable construction of temporary file. tmp_path = os.path.join("/tmp", ( os.path.basename(catalog_path) + "~merged-%d" % os.getpid())) # Whether to create pristine catalog from template. vivified = catalog_path in project.add_on_merge # Skip calling msgmerge if template creation dates exist and are equal. do_msgmerge = True if not vivified and not project.templates_dynamic and not options.force: hdr = Catalog(catalog_path, monitored=False, headonly=True).header thdr = Catalog(template_path, monitored=False, headonly=True).header pcd = hdr.get_field_value("POT-Creation-Date") tpcd = thdr.get_field_value("POT-Creation-Date") do_msgmerge = (not pcd or not tpcd or pcd != tpcd) header_prop_fields = project.header_propagate_fields # Should merged catalog be opened, and in what mode? do_open = False headonly = False monitored = False otherwrap = set(wrapping).difference(["basic"]) if otherwrap or project.hook_on_merge_msg or project.hook_on_merge_cat: do_open = True elif header_prop_fields or project.hook_on_merge_head or vivified: do_open = True headonly = True if ( header_prop_fields or vivified or project.hook_on_merge_head or project.hook_on_merge_msg or project.hook_on_merge_cat ): monitored = True # Should template catalog be opened too? do_open_template = False if header_prop_fields or vivified: do_open_template = True cat = None if do_msgmerge: # Create the temporary merged catalog. minasfz, refuzzy = 0.0, False cmppaths, fuzzex, minwnex = [], False, 0 if branch_id == SUMMIT_ID: minasfz = project.merge_min_adjsim_fuzzy refuzzy = project.merge_rebase_fuzzy if project.compendium_on_merge: cmppaths.append(project.compendium_on_merge) fuzzex = project.compendium_fuzzy_exact minwnex = project.compendium_min_words_exact catalog_path_mod = catalog_path if vivified: if cmppaths: catalog_path_mod = "/dev/null" else: catalog_path_mod = tmp_path shutil.copyfile(template_path, tmp_path) getcat = do_open and not headonly ignpotdate = project.templates_dynamic cat = merge_pofile(catalog_path_mod, template_path, outpath=tmp_path, wrapping=wrapping, fuzzymatch=fuzzy_merging, minasfz=minasfz, refuzzy=refuzzy, cmppaths=cmppaths, fuzzex=fuzzex, minwnex=minwnex, getcat=getcat, monitored=monitored, ignpotdate=ignpotdate, quiet=True, abort=False) if cat is None: warning(_("@info", "Catalog '%(file1)s' not merged with " "template '%(file2)s' due to errors on merging.", file1=catalog_path_mod, file2=template_path)) return elif not getcat: # Catalog not requested, so the return value is True # indicating that the merge succedded. cat = None else: # Copy current to temporary catalog, to be processed by hooks, etc. 
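# NOTE (editor's sketch): condensed form of the msgmerge short-circuit
# above; merging is skipped only when both catalog and template carry
# the same POT-Creation-Date (helper name is the editor's):

def needs_msgmerge(pcd, tpcd):
    return not pcd or not tpcd or pcd != tpcd

assert not needs_msgmerge("2012-01-01 10:00+0000", "2012-01-01 10:00+0000")
assert needs_msgmerge(None, "2012-01-01 10:00+0000")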
shutil.copyfile(catalog_path, tmp_path) # Save good time by opening the merged catalog only if necessary, # and only as much as necessary. # Open catalogs as necessary. if do_open: update_progress() if cat is None: cat = Catalog(tmp_path, monitored=monitored, wrapping=wrapping, headonly=headonly) if do_open_template: tcat = Catalog(template_path, monitored=False, headonly=True) # Initialize header if the catalog has been vivified from template. if vivified: hdr = cat.header hdr.title = Monlist() - hdr.copyright = u"" - hdr.license = u"" + hdr.copyright = "" + hdr.license = "" hdr.author = Monlist() hdr.comment = Monlist() # Get the project ID from template; # if it gives default value, use catalog name instead. projid = tcat.header.get_field_value("Project-Id-Version") if not projid or "PACKAGE" in projid: projid = catalog_name - hdr.set_field(u"Project-Id-Version", unicode(projid)) + hdr.set_field("Project-Id-Version", str(projid)) rdate = time.strftime("%Y-%m-%d %H:%M%z") - hdr.set_field(u"PO-Revision-Date", unicode(rdate)) - hdr.set_field(u"Last-Translator", unicode(project.vivify_w_translator)) - hdr.set_field(u"Language-Team", unicode(project.vivify_w_langteam)) + hdr.set_field("PO-Revision-Date", str(rdate)) + hdr.set_field("Last-Translator", str(project.vivify_w_translator)) + hdr.set_field("Language-Team", str(project.vivify_w_langteam)) if project.vivify_w_language: - hdr.set_field(u"Language", unicode(project.vivify_w_language), + hdr.set_field("Language", str(project.vivify_w_language), after="Language-Team", reorder=True) - hdr.set_field(u"Content-Type", - u"text/plain; charset=%s" % project.vivify_w_charset) - hdr.set_field(u"Content-Transfer-Encoding", u"8bit") + hdr.set_field("Content-Type", + "text/plain; charset=%s" % project.vivify_w_charset) + hdr.set_field("Content-Transfer-Encoding", "8bit") if project.vivify_w_plurals: - hdr.set_field(u"Plural-Forms", unicode(project.vivify_w_plurals)) + hdr.set_field("Plural-Forms", str(project.vivify_w_plurals)) else: - hdr.remove_field(u"Plural-Forms") + hdr.remove_field("Plural-Forms") # Propagate requested header fields. if header_prop_fields: # Preserve order of the fields when collecting. fields = [] for field in cat.header.field: if field[0] in header_prop_fields: fields.append(field) tfields = [] for tfield in tcat.header.field: if tfield[0] in header_prop_fields: tfields.append(tfield) # Replace the field sequence if not equal to that of the template. if fields != tfields: for field in fields: cat.header.field.remove(field) for tfield in tfields: cat.header.field.append(tfield) # Set original instead of temporary file path -- hooks may expect it. if cat is not None: cat.filename = catalog_path # Execute header hooks. if project.hook_on_merge_head: exec_hook_head(branch_id, catalog_name, catalog_subdir, cat.header, cat, project.hook_on_merge_head) # Execute message hooks. if project.hook_on_merge_msg: for msg in cat: exec_hook_msg(branch_id, catalog_name, catalog_subdir, msg, cat, project.hook_on_merge_msg) # Execute catalog hooks. if project.hook_on_merge_cat: exec_hook_cat(branch_id, catalog_name, catalog_subdir, cat, project.hook_on_merge_cat) # Synchronize merged catalog if it has been opened. if cat is not None: cat.filename = tmp_path # not to overwrite original file cat.sync(force=otherwrap) # Execute file hooks. 
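# NOTE (editor's sketch): the header hunks above drop u"..." literals
# and unicode() calls because every Python 3 str is already Unicode.
# Standalone illustration with stand-in values:

import time

projid = "PACKAGE VERSION"
if not projid or "PACKAGE" in projid:
    projid = "demo-catalog"   # fall back to a catalog name
rdate = time.strftime("%Y-%m-%d %H:%M%z")
assert isinstance(projid, str) and isinstance(rdate, str)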
if project.hook_on_merge_file: cat_name = os.path.basename(tmp_path) cat_name = cat_name[:cat_name.rfind(".po")] exec_hook_file(branch_id, cat_name, catalog_subdir, tmp_path, project.hook_on_merge_file) # If there is any difference between merged and old catalog. if vivified or not filecmp.cmp(catalog_path, tmp_path): # Assert correctness of the merged catalog and move over the old. assert_system("msgfmt -c -o/dev/null %s " % tmp_path) added = False if vivified: added = True mkdirpath(os.path.dirname(catalog_path)) shutil.move(tmp_path, catalog_path) # Add to version control if not already added. vcs = project.summit_vcs if SUMMIT_ID else project.branches_vcs if ( vcs and ( branch_id == SUMMIT_ID or not project.bdict[branch_id].skip_version_control) and not vcs.is_versioned(catalog_path) ): if not vcs.add(catalog_path): warning(_("@info", "Cannot add '%(file)s' to version control.", file=catalog_path)) if options.verbose: if added: actype = _("@item:intext action performed on a catalog", "merged-added") report(".+ (%s) %s" % (actype, catalog_path)) else: actype = _("@item:intext action performed on a catalog", "merged") report(". (%s) %s" % (actype, catalog_path)) elif not options.quiet: if added: report(".+ %s" % catalog_path) else: report(". %s" % catalog_path) # Remove the temporary merged catalog. if os.path.exists(tmp_path): os.remove(tmp_path) # Put header fields in canonical form, for equality checking. # Returns ordered list of (field name, field value). def normhf (fields, excluded=[]): nfs = [] for fnam, fval in fields: if fnam not in excluded: nfs.append((fnam, fval)) nfs.sort() return nfs # Remove all translator-related elements from the message. def clear_msg (msg): msg.unfuzzy() - msg.msgstr[:] = [u""] * len(msg.msgstr) + msg.msgstr[:] = [""] * len(msg.msgstr) msg.manual_comment[:] = [] return msg if __name__ == '__main__': exit_on_exception(main) diff --git a/sieve/apply_filter.py b/sieve/apply_filter.py index cb88f752..dc86a1e8 100644 --- a/sieve/apply_filter.py +++ b/sieve/apply_filter.py @@ -1,97 +1,97 @@ # -*- coding: UTF-8 -*- """ Apply hooks to translation. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ from pology import _, n_ from pology.getfunc import get_hook_ireq from pology.msgreport import report_msg_content from pology.report import report, warning, error from pology.sieve import add_param_filter from pology.sieve import SieveError def setup_sieve (p): p.set_desc(_("@info sieve discription", "Apply hooks to translation." "\n\n" "Message msgstr fields are passed through one or more of " "F1A, F3A/C, V1A, V3A/C, S1A, S3A/C hooks." )) add_param_filter(p, _("@info sieve parameter discription", "Specification of the hook through which msgstr fields are passed." )) p.add_param("showmsg", bool, defval=False, desc=_("@info sieve parameter discription", "Report message to standard output if it got modified." )) class Sieve (object): def __init__ (self, params): self.p = params self.tfilters = [[get_hook_ireq(x, abort=True), x] for x in (params.filter or [])] # Number of modified messages. self.nmod = 0 def process (self, msg, cat): mcount = msg.modcount for i in range(len(msg.msgstr)): for tfilter, tfname in self.tfilters: try: # try as type *1A hook res = tfilter(msg.msgstr[i]) except TypeError: try: # try as type *3* hook res = tfilter(msg.msgstr[i], msg, cat) except TypeError: raise SieveError( _("@info", "Cannot execute filter '%(filt)s'.", filt=tfname)) # Process result based on hook type. 
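# NOTE (editor's sketch): the filter above is tried first as a *1A hook
# (text only) and, on TypeError, as a *3* hook (text, msg, cat); the
# result type, handled just below, decides the action, with
# isinstance(res, str) replacing Python 2's basestring test:

def run_hook(hook, text, msg=None, cat=None):
    try:
        return hook(text)
    except TypeError:
        return hook(text, msg, cat)

assert run_hook(lambda t: t.upper(), "abc") == "ABC"          # modification
assert run_hook(lambda t, m, c: [(0, 1)], "abc") == [(0, 1)]  # validation spans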
- if isinstance(res, basestring): + if isinstance(res, str): # Modification hook. msg.msgstr[i] = res elif isinstance(res, list): # Validation hook. if res: report_msg_content(msg, cat, highlight=[("msgstr", i, res)], delim=("-" * 20)) else: # Side-effect hook, nothing to do. # TODO: Perhaps report returned number? pass if mcount < msg.modcount: self.nmod += 1 if self.p.showmsg: report_msg_content(msg, cat, delim=("-" * 20)) def finalize (self): if self.nmod: msg = n_("@info:progress", "Modified %(num)d message by filtering.", "Modified %(num)d messages by filtering.", num=self.nmod) report("===== " + msg) diff --git a/sieve/bad_patterns.py b/sieve/bad_patterns.py index 292d5608..efc2a165 100644 --- a/sieve/bad_patterns.py +++ b/sieve/bad_patterns.py @@ -1,88 +1,88 @@ # -*- coding: UTF-8 -*- """ Check for presence of bad patterns in translation. Documented in C{doc/user/sieving.docbook}. @note: Deprecated. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ from pology import _, n_ from pology.bpatterns import bad_patterns_msg from pology.report import report def setup_sieve (p): p.set_desc(_("@info sieve discription", "Check for presence of bad patterns in translation." )) - p.add_param("pattern", unicode, multival=True, + p.add_param("pattern", str, multival=True, metavar=_("@info sieve parameter value placeholder", "STRING"), desc=_("@info sieve parameter discription", "A pattern to check against. " "The pattern can be a substring or regular expression, " "depending on the '%s' parameter. " "This parameter can be repeated to add several patterns." )) - p.add_param("fromfile", unicode, multival=True, + p.add_param("fromfile", str, multival=True, metavar=_("@info sieve parameter value placeholder", "PATH"), desc=_("@info sieve parameter discription", "Read patterns to check against from a file. " "The file format is as follows: " "each line contains one pattern, " "leading and trailing whitespace is removed, " "empty lines are ignored; " "# denotes start of comment, which extends to end of line." "This parameter can be repeated to add several files." )) p.add_param("rxmatch", bool, defval=False, desc=_("@info sieve parameter discription", "Treat patterns as regular expressions; default is substring matching." )) p.add_param("casesens", bool, defval=False, desc=_("@info sieve parameter discription", "Set case-sensitive matching; default is case-insensitive." )) class Sieve (object): def __init__ (self, params): # Indicators to the caller: self.caller_sync = False # no need to sync catalogs to the caller self.caller_monitored = False # no need for monitored messages # Create checker hook. self.check = bad_patterns_msg(rxmatch=params.rxmatch, casesens=params.casesens, patterns=params.pattern, fromfiles=params.fromfile) self.nbad = 0 def process (self, msg, cat): # Check only translated messages. if not msg.translated: return self.nbad += self.check(msg, cat) def finalize (self): if self.nbad > 0: msg = n_("@info:progress", "Detected %(num)d bad pattern in translation.", "Detected %(num)d bad patterns in translation.", num=self.nbad) report("===== " + msg) diff --git a/sieve/check_grammar.py b/sieve/check_grammar.py index 9fc23e68..36c6f238 100644 --- a/sieve/check_grammar.py +++ b/sieve/check_grammar.py @@ -1,183 +1,183 @@ # -*- coding: UTF-8 -*- """ Check language of translation using LanguageTool. Documented in C{doc/user/sieving.docbook}. 
@author: Sébastien Renard @license: GPLv3 """ -from httplib import HTTPConnection +from http.client import HTTPConnection import socket import sys -from urllib import urlencode +from urllib.parse import urlencode from xml.dom.minidom import parseString from pology import _, n_ from pology.colors import cjoin from pology.msgreport import report_msg_to_lokalize, warning_on_msg from pology.report import report, warning from pology.sieve import SieveError, SieveCatalogError from pology.sieve import add_param_lang, add_param_accel, add_param_markup from pology.sieve import add_param_filter from pology.getfunc import get_hook_ireq from pology.sieve import add_param_poeditors _REQUEST="/?language=%s&disabled=HUNSPELL_RULE&%s" def setup_sieve (p): p.set_desc(_("@info sieve discription", "Check language of translation using LanguageTool." "\n\n" "LanguageTool (http://www.languagetool.org) is an open source " "language checker, which may be used as a standalone application, " "or in server-client mode. " "This sieve runs in client-server mode, so make sure Language Tool " "is running before this sieve is run." )) add_param_lang(p) add_param_accel(p) add_param_markup(p) add_param_filter(p, intro=_("@info sieve parameter discription", "The F1A or F3A/C hook through which to filter the translation " "before passing it to grammar checking." )) p.add_param("host", str, defval="localhost", metavar=_("@info sieve parameter value placeholder", "NAME"), desc=_("@info sieve parameter discription", "Name of the host where the server is running." )) p.add_param("port", str, defval="8081", metavar=_("@info sieve parameter value placeholder", "NUMBER"), desc=_("@info sieve parameter discription", "TCP port on the host which server uses to listen for queries." )) add_param_poeditors(p) class Sieve (object): def __init__ (self, params): self.nmatch = 0 # Number of match for finalize self.connection=None # Connection to LanguageTool server self.setLang=params.lang self.setAccel=params.accel self.setMarkup=params.markup self.lokalize = params.lokalize # LanguageTool server parameters. host=params.host port=params.port #TODO: autodetect tcp port by reading LanguageTool config file if host is localhost # As LT server does not seem to read disabled rules from his config file, we manage exception here #TODO: investigate deeper this problem and make a proper bug report to LT devs. self.disabledRules=["UPPERCASE_SENTENCE_START","COMMA_PARENTHESIS_WHITESPACE"] # Create connection to the LanguageTool server self.connection=HTTPConnection(host, port) self.pfilters = [[get_hook_ireq(x, abort=True), x] for x in (params.filter or [])] def process_header (self, hdr, cat): self.lang=(self.setLang or cat.language()) if not self.lang: raise SieveCatalogError( _("@info", "Cannot determine language for catalog '%(file)s'.", file=cat.filename)) # Force explicitly given accelerators and markup. if self.setAccel is not None: cat.set_accelerator(self.setAccel) if self.setMarkup is not None: cat.set_markup(self.setMarkup) def process (self, msg, cat): if msg.obsolete: return try: for msgstr in msg.msgstr: # Apply precheck filters. 
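# NOTE (editor's sketch): standalone form of the Python 3 imports that
# replace httplib/urllib at the top of this file; the request line is
# left commented since it needs a running LanguageTool server:

from http.client import HTTPConnection
from urllib.parse import urlencode

conn = HTTPConnection("localhost", 8081)   # lazy, nothing is sent yet
query = urlencode({"text": "Bonjour tout le monde"})
# conn.request("GET", "/?language=fr&disabled=HUNSPELL_RULE&%s" % query)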
for pfilter, pfname in self.pfilters: try: # try as type F1A hook msgstr = pfilter(msgstr) except TypeError: try: # try as type F3* hook msgstr = pfilter(msgstr, msg, cat) except TypeError: raise SieveError( _("@info", "Cannot execute filter '%(filt)s'.", filt=pfname)) self.connection.request("GET", _REQUEST % (self.lang, urlencode({"text":msgstr.encode("UTF-8")}))) response=self.connection.getresponse() if response: responseData=response.read() if "error" in responseData: dom=parseString(responseData) for error in dom.getElementsByTagName("error"): if error.getAttribute("ruleId") in self.disabledRules: continue self.nmatch+=1 report("-"*(len(msgstr)+8)) report(_("@info", "%(file)s:%(line)d(#%(entry)d)", file=cat.filename, line=msg.refline, entry=msg.refentry)) #TODO: create a report function in the right place #TODO: color in red part of context that make the mistake report(_("@info", "Context: %(snippet)s", snippet=error.getAttribute("context"))) report(_("@info", "(%(rule)s) ==> %(note)s", rule=error.getAttribute("ruleId"), note=error.getAttribute("msg"))) report("") if self.lokalize: repls = [_("@label", "Grammar errors:")] repls.append(_( "@info", "%(file)s:%(line)d(#%(entry)d)", file=cat.filename, line=msg.refline, entry=msg.refentry )) repls.append(_( "@info", "(%(rule)s) ==> %(note)s", rule=error.getAttribute("ruleId"), note=error.getAttribute("msg") )) report_msg_to_lokalize(msg, cat, cjoin(repls, "\n")) except socket.error: raise SieveError(_("@info", "Cannot connect to LanguageTool server. " "Did you start it?")) def finalize (self): if self.nmatch: msg = n_("@info:progress", "Detected %(num)d problem in grammar and style.", "Detected %(num)d problems in grammar and style.", num=self.nmatch) report("===== " + msg) diff --git a/sieve/check_kde4.py b/sieve/check_kde4.py index 104eb7e2..d1b6785b 100644 --- a/sieve/check_kde4.py +++ b/sieve/check_kde4.py @@ -1,98 +1,98 @@ # -*- coding: utf-8 -*- """ Check native KDE4 PO files for various problems. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ from pology import _, n_ from pology.markup import flag_no_check_markup from pology.markup import validate_kde4_l1 from pology.msgreport import report_on_msg, report_on_msg_hl from pology.msgreport import report_msg_to_lokalize from pology.report import report from pology.sieve import add_param_poeditors from pology.sieve import parse_sieve_flags def setup_sieve (p): p.set_desc(_("@info sieve discription", "Check native KDE4 PO files for various problems." )) p.add_param("strict", bool, defval=False, desc=_("@info sieve parameter discription", "Check for problems in translation regardless of whether the original " "itself is free of problems (default is to check translation only if " "the original has no problems)." )) add_param_poeditors(p) class Sieve (object): def __init__ (self, params): self.strict = params.strict self.lokalize = params.lokalize # Indicators to the caller: self.caller_sync = False # no need to sync catalogs to the caller self.caller_monitored = False # no need for monitored messages self.nproblems = 0 def process (self, msg, cat): # Check only translated messages. if not msg.translated: return # Do not check messages when told so. if flag_no_check_markup in parse_sieve_flags(msg): return # In in non-strict mode, check XML of translation only if the # original itself is valid XML. 
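# NOTE on sieve/check_grammar.py above (editor's sketch): in Python 3,
# HTTPResponse.read() returns bytes, so the test
# 'if "error" in responseData' raises TypeError; decoding first is the
# likely follow-up fix, shown here with canned data:

from xml.dom.minidom import parseString

raw = b'<matches><error ruleId="DEMO" msg="demo" context="x"/></matches>'
responseData = raw.decode("UTF-8")
if "error" in responseData:
    dom = parseString(responseData)
    assert len(dom.getElementsByTagName("error")) == 1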
if not self.strict: if ( validate_kde4_l1(msg.msgid, ents={}) - or validate_kde4_l1(msg.msgid_plural or u"", ents={}) + or validate_kde4_l1(msg.msgid_plural or "", ents={}) ): return highlight = [] for i in range(len(msg.msgstr)): spans = validate_kde4_l1(msg.msgstr[i], ents={}) if spans: self.nproblems += 1 highlight.append(("msgstr", i, spans, msg.msgstr[i])) if highlight: report_on_msg_hl(highlight, msg, cat) if self.lokalize: report_msg_to_lokalize(msg, cat, highlight) def finalize (self): if self.nproblems > 0: if not self.strict: msg = n_("@info:progress", "Found %(num)d problem in KDE4 translations.", "Found %(num)d problems in KDE4 translations.", num=self.nproblems) else: msg = n_("@info:progress", "Found %(num)d problem in KDE4 translations " "(strict mode).", "Found %(num)d problems in KDE4 translations " "(strict mode).", num=self.nproblems) report("===== " + msg) diff --git a/sieve/check_rules.py b/sieve/check_rules.py index 8795ff25..648d75e7 100644 --- a/sieve/check_rules.py +++ b/sieve/check_rules.py @@ -1,611 +1,612 @@ # -*- coding: UTF-8 -*- """ Apply language- and project-dependent validation rules to messages. Documented in C{doc/user/sieving.docbook}. @author: Sébastien Renard @author: Chusslove Illich @license: GPLv3 """ from codecs import open from locale import getpreferredencoding import os from os.path import abspath, basename, dirname, exists, expandvars, join import re import sys from time import strftime, strptime, mktime from pology import _, n_ from pology.colors import cjoin from pology.comments import manc_parse_list, parse_summit_branches from pology.fsops import collect_files_by_ext from pology.message import MessageUnsafe from pology.msgreport import multi_rule_error, rule_xml_error from pology.msgreport import report_msg_to_lokalize from pology.report import report, warning, format_item_list from pology.rules import loadRules, printStat from pology.sieve import add_param_lang, add_param_env, add_param_poeditors from pology.timeout import TimedOutException from pology.sieve import SieveError, SieveCatalogError, SieveMessageError +from functools import reduce # Pattern used to marshall path of cached files _MARSHALL = "+++" # Cache directory (for xml processing only) # FIXME: More portable location of cache. _CACHEDIR = expandvars("$HOME/.pology-check_rules-cache/") # Flag to add to failed messages, if requested. -_flag_mark = u"failed-rule" +_flag_mark = "failed-rule" def setup_sieve (p): p.set_desc(_("@info sieve discription", "Apply rules to messages and report those that do not pass." )) add_param_lang(p, appx=_("@info sieve parameter discription", "If the language is left undefined for a given catalog, " "it will be skipped and a warning may be output." )) add_param_env(p, appx=_("@info sieve parameter discription", "If the environment is left undefined for a given catalog, " "only environment-agnostic rules will be applied." )) p.add_param("stat", bool, defval=False, desc=_("@info sieve parameter discription", "Output statistics on application of rules." )) p.add_param("envonly", bool, defval=False, desc=_("@info sieve parameter discription", "Load only rules explicitly belonging to environment given by '%(par)s'.", par="env" )) - p.add_param("accel", unicode, multival=True, + p.add_param("accel", str, multival=True, metavar=_("@info sieve parameter value placeholder", "CHAR"), desc=_("@info sieve parameter discription", "Character which is used as UI accelerator marker in text fields. 
" "If a catalog defines accelerator marker in the header, " "this value overrides it." )) - p.add_param("markup", unicode, seplist=True, + p.add_param("markup", str, seplist=True, metavar=_("@info sieve parameter value placeholder", "KEYWORD"), desc=_("@info sieve parameter discription", "Markup that can be expected in text fields, as special keyword " "(see documentation to pology.catalog, Catalog.set_markup(), " "for markup keywords currently known to Pology). " "If a catalog defines markup type in the header, " "this value overrides it." "Several markups can be given as comma-separated list." )) - p.add_param("rfile", unicode, multival=True, + p.add_param("rfile", str, multival=True, metavar=_("@info sieve parameter value placeholder", "PATH"), desc=_("@info sieve parameter discription", "Load rules from a file, rather than internal Pology rules. " "Several rule files can be given by repeating the parameter." )) - p.add_param("rdir", unicode, multival=True, + p.add_param("rdir", str, multival=True, metavar=_("@info sieve parameter value placeholder", "DIRPATH"), desc=_("@info sieve parameter discription", "Load rules from a directory, rather than internal Pology rules." "Several rule directories can be given by repeating the parameter." )) p.add_param("showfmsg", bool, defval=False, desc=_("@info sieve parameter discription", "Show filtered message too when reporting message failed by a rule." )) p.add_param("nomsg", bool, attrname="showmsg", defval=True, desc=_("@info sieve parameter discription", "Do not show message content at all when reporting failures." )) - p.add_param("rule", unicode, seplist=True, + p.add_param("rule", str, seplist=True, metavar=_("@info sieve parameter value placeholder", "RULEID"), desc=_("@info sieve parameter discription", "Apply only the rule given by this identifier. " "Several identifiers can be given as comma-separated list." )) - p.add_param("rulerx", unicode, multival=True, + p.add_param("rulerx", str, multival=True, metavar=_("@info sieve parameter value placeholder", "REGEX"), desc=_("@info sieve parameter discription", "Apply only the rules with identifiers matching this regular expression. " "Several patterns can be given by repeating the parameter." )) - p.add_param("norule", unicode, seplist=True, + p.add_param("norule", str, seplist=True, metavar=_("@info sieve parameter value placeholder", "RULEID"), desc=_("@info sieve parameter discription", "Do not apply rule given by this identifier. " "Several identifiers can be given as comma-separated list." )) - p.add_param("norulerx", unicode, multival=True, + p.add_param("norulerx", str, multival=True, metavar=_("@info sieve parameter value placeholder", "REGEX"), desc=_("@info sieve parameter discription", "Do not apply the rules with identifiers matching this regular expression. " "Several patterns can be given by repeating the parameter." )) - p.add_param("branch", unicode, seplist=True, + p.add_param("branch", str, seplist=True, metavar=_("@info sieve parameter value placeholder", "BRANCH"), desc=_("@info sieve parameter discription", "In summit catalogs, consider only messages belonging to given branch. " "Several branches can be given as comma-separated list." )) - p.add_param("xml", unicode, + p.add_param("xml", str, metavar=_("@info sieve parameter value placeholder", "PATH"), desc=_("@info sieve parameter discription", "Write rule failures into an XML file instead of stdout." 
)) p.add_param("mark", bool, defval=False, desc=_("@info sieve parameter discription", "Add '%(flag)s' flag to each message failed by a rule.", flag=_flag_mark )) p.add_param("byrule", bool, defval=False, desc=_("@info sieve parameter discription", "Output failed messages ordered by sorted rule identifiers." )) p.add_param("ruleinfo", bool, defval=False, desc=_("@info sieve parameter discription", "Show information on loading of rules during sieving." )) add_param_poeditors(p) class Sieve (object): """Find messages matching given rules.""" def __init__ (self, params): self.nmatch = 0 # Number of match for finalize self.rules = [] # List of rules objects loaded in memory self.xmlFile = None # File handle to write XML output self.cacheFile = None # File handle to write XML cache self.cachePath = None # Path to cache file self.filename = "" # File name we are processing self.cached = False # Flag to indicate if process result is already is cache self.globalLang = params.lang self.globalEnvs = params.env self.envOnly = params.envonly self._rulesCache = {} self.accels = params.accel self.markup = params.markup self.ruleChoice = params.rule self.ruleChoiceRx = params.rulerx self.ruleChoiceInv = params.norule self.ruleChoiceInvRx = params.norulerx self.stat = params.stat self.showfmsg = params.showfmsg self.showmsg = params.showmsg self.lokalize = params.lokalize self.mark = params.mark self.byrule = params.byrule self.ruleinfo = params.ruleinfo self.branches = params.branch and set(params.branch) or None # Collect non-internal rule files. self.customRuleFiles = None if params.rfile or params.rdir: self.customRuleFiles = [] if params.rfile: self.customRuleFiles.extend(params.rfile) if params.rdir: for rdir in params.rdir: rfiles = collect_files_by_ext(rdir, "rules") self.customRuleFiles.extend(rfiles) # Also output in XML file ? if params.xml: xmlPath = params.xml if os.access(dirname(abspath(xmlPath)), os.W_OK): #TODO: create nice api to manage xml file and move it to rules.py self.xmlFile = open(xmlPath, "w", "utf-8") self.xmlFile.write('\n') self.xmlFile.write('\n' % strftime('%c').decode(getpreferredencoding())) else: warning(_("@info", "Cannot open file '%(file)s'. XML output disabled.", file=xmlPath)) if not exists(_CACHEDIR) and self.xmlFile: #Create cache dir (only if we want wml output) try: os.mkdir(_CACHEDIR) - except IOError, e: + except IOError as e: raise SieveError(_("@info", "Cannot create cache directory '%(dir)s':\n" "%(msg)s", dir=_CACHEDIR, msg=e)) if self.byrule: self.postFailedMessages = {} self._first_error = True # Unless marking requested, no need to monitor/sync. if not self.mark: self.caller_sync = False self.caller_monitored = False def process_header (self, hdr, cat): # Force explicitly given accelerators. if self.accels is not None: cat.set_accelerator(self.accels) # Force explicitly given markup. if self.markup is not None: cat.set_markup(self.markup) # Choose (possibly loading) appropriate rules for this catalog. self.lang = self.globalLang or cat.language() if not self.lang: raise SieveCatalogError( _("@info", "Cannot determine language for catalog '%(file)s'.", file=cat.filename)) self.envs = self.globalEnvs or cat.environment() or [] rkey = (self.lang, tuple(self.envs)) if rkey not in self._rulesCache: self._rulesCache[rkey] = self._loadRules(self.lang, self.envs) self.rules, self.ruleFilters = self._rulesCache[rkey] def process (self, msg, cat): # Apply rules only on translated messages. 
if not msg.translated: return # Apply rules only to messages from selected branches. if self.branches: msg_branches = parse_summit_branches(msg) if not set.intersection(self.branches, msg_branches): return filename = basename(cat.filename) # New file handling if self.xmlFile and self.filename != filename: newFile = True self.cached = False # Reset flag self.cachePath = join(_CACHEDIR, abspath(cat.filename).replace("/", _MARSHALL)) if self.cacheFile: self.cacheFile.close() if self.filename != "": # close previous self.xmlFile.write("\n") self.filename = filename else: newFile = False # Current file loaded from cache on previous message. Close and return if self.cached: # No need to analyze message, return immediately if self.cacheFile: self.cacheFile = None # Indicate cache has been used and flushed into xmlFile return # Does cache exist for this file ? if self.xmlFile and newFile and exists(self.cachePath): poDate = None for headerName, headerValue in cat.header.field: if headerName == "PO-Revision-Date": poDate = headerValue break if poDate: #Truncate daylight information poDate = poDate.rstrip("GMT") poDate = poDate[0:poDate.find("+")] #Convert in sec since epoch time format poDate = mktime(strptime(poDate, '%Y-%m-%d %H:%M')) if os.stat(self.cachePath)[8] > poDate: if self.ruleinfo: report(_("@info:progress", "Using cache.")) self.xmlFile.writelines(open(self.cachePath, "r", "utf-8").readlines()) self.cached = True # No cache available, create it for next time if self.xmlFile and newFile and not self.cached: if self.ruleinfo: report(_("@info", "No cache available, processing file.")) self.cacheFile = open(self.cachePath, "w", "utf-8") # Handle start/end of files for XML output (not needed for text output) if self.xmlFile and newFile: # open new po if self.cached: # We can return now, cache is used, no need to process catalog return else: poTag = '\n' % filename self.xmlFile.write(poTag) # Write to result self.cacheFile.write(poTag) # Write to cache # Collect explicitly ignored rules by ID for this message. locally_ignored = manc_parse_list(msg, "skip-rule:", ",") # Collect explicitly applied rules by ID for this message. locally_applied = manc_parse_list(msg, "apply-rule:", ",") # Collect ignored/applied rules by switching comment. swprefix = "switch-rule:" swsep = ">" for cmnt in msg.manual_comment: if cmnt.strip().startswith(swprefix): p1 = cmnt.find(swprefix) + len(swprefix) p2 = cmnt.find(swsep, p1) if p2 < 0: raise SieveMessageError( _("@info", "Separator character '%(sep)s' missing in " "'%(prefix)s' comment.", sep=swsep, prefix=swprefix)) els1 = [x.strip() for x in cmnt[p1:p2].split(",")] els2 = [x.strip() for x in cmnt[p2 + len(swsep):].split(",")] locally_ignored.extend(x for x in els1 if x) locally_applied.extend(x for x in els2 if x) # NOTE: It would be nice to warn if an explicitly applied rule # is not defined, but this is not generally possible because # different rule files may be loaded for different runs. # Prepare filtered messages for checking. envSet = set(self.envs) msgByFilter = {} for mfilter in self.ruleFilters: if mfilter is not None: msgf = MessageUnsafe(msg) mfilter(msgf, cat, envSet) else: msgf = msg msgByFilter[mfilter] = msgf # Now the sieve itself. 
Check message with every rules failedRules = [] for rule in self.rules: if rule.disabled: continue if rule.environ and rule.environ not in envSet: continue if rule.ident in locally_ignored: continue if rule.manual and not rule.ident in locally_applied: continue msgf = msgByFilter[rule.mfilter] try: spans = rule.process(msgf, cat, envs=envSet, nofilter=True) except TimedOutException: warning(_("@info:progress", "Rule '%(rule)s' timed out, skipping it.", rule=rule.rawPattern)) continue if spans: self.nmatch += 1 if self.xmlFile: # FIXME: rule_xml_error is actually broken, # as it considers matching to always be on msgstr # Multiple span are now supported as well as msgstr index # Now, write to XML file if defined rspans = [x[:2] for x in spans[0][2]] pluid = spans[0][1] xmlError = rule_xml_error(msg, cat, rule, rspans, pluid) self.xmlFile.writelines(xmlError) if not self.cached: # Write result in cache self.cacheFile.writelines(xmlError) if not self.showfmsg: msgf = None failedRules.append((rule, spans, msgf)) if failedRules: if not self.byrule: multi_rule_error(msg, cat, failedRules, self.showmsg, predelim=self._first_error) self._first_error = False else: for rule, spans, msgf in failedRules: if rule.ident not in self.postFailedMessages: self.postFailedMessages[rule.ident] = [] self.postFailedMessages[rule.ident].append( (msg, cat, ((rule, spans, msgf)))) if self.mark: msg.flag.add(_flag_mark) if self.lokalize: repls = [_("@label", "Failed rules:")] for rule, hl, msgf in failedRules: repls.append(_("@item", "rule %(rule)s ==> %(msg)s", rule=rule.displayName, msg=rule.hint)) for part, item, spans, fval in hl: - repls.extend([u"↳ %s" % x[2] + repls.extend(["↳ %s" % x[2] for x in spans if len(x) > 2]) report_msg_to_lokalize(msg, cat, cjoin(repls, "\n")) def finalize (self): if self.byrule: ruleIdents = sorted(self.postFailedMessages.keys()) for ruleIdent in ruleIdents: for msg, cat, failedRule in self.postFailedMessages[ruleIdent]: multi_rule_error(msg, cat, [failedRule], self.showmsg, predelim=self._first_error) self._first_error = False if self.xmlFile: # Close last po tag and xml file if self.cached and self.cacheFile: self.cacheFile.write("\n") self.cacheFile.close() self.cacheFile = None else: self.xmlFile.write("\n") self.xmlFile.write("\n") self.xmlFile.close() if self.nmatch > 0: msg = n_("@info:progress", "Rules detected %(num)d problem.", "Rules detected %(num)d problems.", num=self.nmatch) report("===== " + msg) printStat(self.rules) def _loadRules (self, lang, envs): # Load rules. rules = loadRules(lang, envs, self.envOnly, self.customRuleFiles, self.stat, self.ruleinfo) # Perhaps retain only those rules explicitly requested # in the command line, by their identifiers. 
selectedRules = set() srules = set() if self.ruleChoice: requestedRules = set([x.strip() for x in self.ruleChoice]) foundRules = set() for rule in rules: if rule.ident in requestedRules: srules.add(rule) foundRules.add(rule.ident) rule.disabled = False if foundRules != requestedRules: missingRules = list(requestedRules - foundRules) fmtMissingRules = format_item_list(sorted(missingRules)) raise SieveError(_("@info", "Some explicitly selected rules " "are missing: %(rulelist)s.", rulelist=fmtMissingRules)) selectedRules.update(foundRules) if self.ruleChoiceRx: identRxs = [re.compile(x, re.U) for x in self.ruleChoiceRx] for rule in rules: if (rule.ident and reduce(lambda s, x: s or x.search(rule.ident), identRxs, False) ): srules.add(rule) selectedRules.add(rule.ident) if self.ruleChoice or self.ruleChoiceRx: rules = list(srules) selectedRulesInv = set() srules = set(rules) if self.ruleChoiceInv: requestedRules = set([x.strip() for x in self.ruleChoiceInv]) foundRules = set() for rule in rules: if rule.ident in requestedRules: if rule in srules: srules.remove(rule) foundRules.add(rule.ident) if foundRules != requestedRules: missingRules = list(requestedRules - foundRules) fmtMissingRules = format_item_list(sorted(missingRules)) raise SieveError(_("@info", "Some explicitly excluded rules " "are missing: %(rulelist)s.", rulelist=fmtMissingRules)) selectedRulesInv.update(foundRules) if self.ruleChoiceInvRx: identRxs = [re.compile(x, re.U) for x in self.ruleChoiceInvRx] for rule in rules: if (rule.ident and reduce(lambda s, x: s or x.search(rule.ident), identRxs, False) ): if rule in srules: srules.remove(rule) selectedRulesInv.add(rule.ident) if self.ruleChoiceInv or self.ruleChoiceInvRx: rules = list(srules) if self.ruleinfo: ntot = len(rules) ndis = len([x for x in rules if x.disabled]) nact = ntot - ndis totfmt = n_("@item:intext inserted below as %(tot)s", "Loaded %(num)d rule", "Loaded %(num)d rules", num=ntot) if self.envOnly: envfmt = _("@item:intext inserted below as %(env)s", "[only: %(envlist)s]", envlist=format_item_list(envs)) else: envfmt = _("@item:intext inserted below as %(env)s", "[%(envlist)s]", envlist=format_item_list(envs)) actfmt = n_("@item:intext inserted below as %(act)s", "%(num)d active", "%(num)d active", num=nact) disfmt = n_("@item:intext inserted below as %(dis)s", "%(num)d disabled", "%(num)d disabled", num=ndis) subs = dict(tot=totfmt, env=envfmt, act=actfmt, dis=disfmt) if ndis and envs: report(_("@info:progress insertions from above", "%(tot)s %(env)s (%(act)s, %(dis)s).", **subs)) elif ndis: report(_("@info:progress insertions from above", "%(tot)s (%(act)s, %(dis)s).", **subs)) elif envs: report(_("@info:progress insertions from above", "%(tot)s %(env)s.", **subs)) else: report(_("@info:progress insertions from above", "%(tot)s.", **subs)) if selectedRules: selectedRules = selectedRules.difference(selectedRulesInv) n = len(selectedRules) if n <= 10: rlst = list(selectedRules) report(_("@info:progress", "Selected rules: %(rulelist)s.", rulelist=format_item_list(sorted(rlst)))) else: report(n_("@info:progress", "Selected %(num)d rule.", "Selected %(num)d rules.", num=n)) elif selectedRulesInv: n = len(selectedRulesInv) if n <= 10: rlst = list(selectedRulesInv) report(_("@info:progress", "Excluded rules: %(rulelist)s.", rulelist=format_item_list(sorted(rlst)))) else: report(n_("@info:progress", "Excluded %(num)d rule.", "Excluded %(num)d rules.", num=n)) # Collect all distinct filters from rules. 
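# NOTE (editor's sketch): standalone form of the reduce()-based regex
# selection above; reduce moved to functools in Python 3 (hence the new
# import at the top of this file), and any() expresses the same test:

import re
from functools import reduce

ident_rxs = [re.compile(p, re.U) for p in ("^kde-", "spell")]
ident = "kde-markup"
matched = reduce(lambda s, rx: s or bool(rx.search(ident)), ident_rxs, False)
assert matched == any(bool(rx.search(ident)) for rx in ident_rxs)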
ruleFilters = set() for rule in rules: if not rule.disabled: ruleFilters.add(rule.mfilter) if self.ruleinfo: nflt = len([x for x in ruleFilters if x is not None]) if nflt: report(n_("@info:progress", "Active rules define %(num)d distinct filter set.", "Active rules define %(num)d distinct filter sets.", num=nflt)) return rules, ruleFilters diff --git a/sieve/check_spell.py b/sieve/check_spell.py index d0b3bc56..88341daf 100644 --- a/sieve/check_spell.py +++ b/sieve/check_spell.py @@ -1,470 +1,470 @@ # -*- coding: UTF-8 -*- """ Spell-check translation using GNU Aspell (U{http://aspell.net/}). Documented in C{doc/user/sieving.docbook}. @author: Sébastien Renard @license: GPLv3 """ from codecs import open import locale import os from os.path import abspath, basename, dirname, isfile, isdir, join import re import sys from time import strftime from pology import datadir, _, n_ from pology.spell import flag_no_check_spell, elist_well_spelled from pology.colors import cjoin from pology.comments import manc_parse_list, manc_parse_flag_list import pology.config as cfg from pology.getfunc import get_hook_ireq from pology.msgreport import spell_error, spell_xml_error from pology.msgreport import report_msg_to_lokalize from pology.report import report, warning, format_item_list from pology.sieve import SieveError, SieveCatalogError from pology.split import proper_words from pology.sieve import add_param_spellcheck def setup_sieve (p): p.set_desc(_("@info sieve discription", "Spell-check translation using Aspell." )) add_param_spellcheck(p) - p.add_param("enc", unicode, + p.add_param("enc", str, metavar=_("@info sieve parameter value placeholder", "ENCODING"), desc=_("@info sieve parameter discription", "Encoding for text sent to Aspell." )) - p.add_param("var", unicode, + p.add_param("var", str, metavar=_("@info sieve parameter value placeholder", "VARIETY"), desc=_("@info sieve parameter discription", "Variety of the Aspell dictionary." )) - p.add_param("xml", unicode, + p.add_param("xml", str, metavar=_("@info sieve parameter value placeholder", "FILE"), desc=_("@info sieve parameter discription", "Build XML report file at given path." )) p.add_param("simsp", bool, defval=False, desc=_("@info sieve parameter discription", "Split text into words in a simpler way (deprecated,)." )) class Sieve (object): """Process messages through the Aspell spell checker""" def __init__ (self, params): self.nmatch = 0 # Number of match for finalize self.unknownWords=None # If not None, only list of faulty word is display (to ease copy/paste into personal dictionary) self.filename="" # File name we are processing self.xmlFile=None # File handle to write XML output # Build Aspell options. self.aspellOptions = {} # - assume markup in messages (provide option to disable?) self.aspellOptions["mode"] = "sgml" # FIXME: In fact not needed? The words are sent parsed to checker. 
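# NOTE (editor's sketch): "from codecs import open" above shadows the
# builtin open() with codecs.open(), whose third positional argument is
# the encoding; that is why calls in this module read open(path, "w",
# "utf-8"). Standalone illustration:

import codecs
import os
import tempfile

path = os.path.join(tempfile.mkdtemp(), "demo.aspell")
with codecs.open(path, "w", "utf-8") as f:
    f.write("personal_ws-1.1 fr 0 UTF-8\n")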
self.lang = params.lang self.encoding = params.enc self.variety = params.var cfgs = cfg.section("aspell") if not self.lang: self.lang = cfgs.string("language") if not self.encoding: self.encoding = cfgs.string("encoding") if not self.variety: self.variety = cfgs.string("variety") self.loc_encoding = locale.getlocale()[1] if not self.encoding: self.encoding = self.loc_encoding if not self.encoding: self.encoding = "UTF-8" self.encoding = self._encoding_for_aspell(self.loc_encoding) self.aspellOptions["lang"] = self.lang.encode(self.loc_encoding) if self.lang else None self.aspellOptions["encoding"] = self.encoding.encode(self.loc_encoding) if self.variety: self.aspellOptions["variety"] = self.variety.encode(self.loc_encoding) if self.variety else None self.unknownWords = None if params.list: self.unknownWords = set() if params.xml: xmlPath=params.xml if os.access(dirname(abspath(xmlPath)), os.W_OK): #TODO: create nice api to manage xml file and move it to rules.py self.xmlFile=open(xmlPath, "w", "utf-8") self.xmlFile.write('\n') self.xmlFile.write('\n' % strftime('%c').decode(locale.getpreferredencoding())) else: warning(_("@info", "Cannot open file '%(file)s'. XML output disabled.", file=xmlPath)) self.accel = params.accel self.markup = params.markup self.skipRx = None if params.skip: flags = re.U if not params.case: flags |= re.I self.skipRx = re.compile(params.skip, flags) self.pfilters = [[get_hook_ireq(x, abort=True), x] for x in (params.filter or [])] self.envs = None if self.envs is None and params.env is not None: self.envs = params.env if self.envs is None and cfgs.string("environment") is not None: self.envs = cfgs.string("environment").split(",") if self.envs is None: self.envs = [] self.envs = [x.strip() for x in self.envs] self.suponly = params.suponly if not self.suponly: self.suponly = cfgs.boolean("supplements-only", False) # NOTE: Temporary hack, remove when word splitting becomes smarter. self.simsp = params.simsp if not self.simsp: self.simsp = cfgs.boolean("simple-split", False) self.lokalize = params.lokalize # Language-dependent elements built along the way. self.aspells = {} self.ignoredContexts = {} self.personalDicts = {} self.tmpDictFiles = {} # Indicators to the caller: self.caller_sync = False # no need to sync catalogs self.caller_monitored = False # no need for monitored messages def process_header (self, hdr, cat): # Check if the catalog itself states the language, and if yes, # create the language-dependent stuff if not already created # for this language. clang = self.lang or cat.language() if not clang: raise SieveCatalogError( _("@info", "Cannot determine language for catalog '%(file)s'.", file=cat.filename)) cenvs = self.envs or cat.environment() or [] ckey = (clang, tuple(cenvs)) if ckey not in self.aspells: # New language. self.aspellOptions["lang"] = clang.encode(self.loc_encoding) # Get Pology's internal personal dictonary for this langenv. if ckey not in self.personalDicts: # may be in but None self.personalDicts[ckey] = self._get_personal_dict(clang, cenvs) if self.personalDicts[ckey]: self.aspellOptions["personal-path"] = self.personalDicts[ckey].encode(self.loc_encoding) else: self.aspellOptions.pop("personal-path", None) # remove previous if not self.suponly: # Create Aspell object. 
import pology.external.pyaspell as A try: - self.aspells[ckey] = A.Aspell(self.aspellOptions.items()) - except A.AspellConfigError, e: + self.aspells[ckey] = A.Aspell(list(self.aspellOptions.items())) + except A.AspellConfigError as e: raise SieveError( _("@info", "Aspell configuration error:\n%(msg)s", msg=e)) - except A.AspellError, e: + except A.AspellError as e: raise SieveError( _("@info", "Cannot initialize Aspell:\n%(msg)s", msg=e)) else: # Create simple internal checker that only checks against # internal supplemental dictionaries. personalDict=self.personalDicts[ckey] if not personalDict: raise SieveError(_("@info", "No supplemental dictionaries found.")) self.aspells[ckey]=_QuasiSpell(personalDict, self.encoding) # Load list of contexts by which to ignore messages. self.ignoredContexts[ckey] = [] ignoredContextFile=join(datadir(), "lang", clang, "spell", "ignoredContext") if isfile(ignoredContextFile): for line in open(ignoredContextFile, "r", "utf-8"): line=line.strip() if line.startswith("#") or line=="": continue else: self.ignoredContexts[ckey].append(line.lower()) # Get language-dependent stuff. self.aspell = self.aspells[ckey] self.ignoredContext = self.ignoredContexts[ckey] # Force explicitly given accelerators and markup. if self.accel is not None: cat.set_accelerator(self.accel) if self.markup is not None: cat.set_markup(self.markup) # Close previous/open new XML section. if self.xmlFile: filename = os.path.basename(cat.filename) # Close previous PO. if self.filename != "": self.xmlFile.write("\n") self.filename = filename # Open new PO. poTag='\n' % filename self.xmlFile.write(poTag) # Write to result def process (self, msg, cat): if not msg.translated: return id=0 # Count msgstr plural forms failedSuggs=[] # pairs of wrong words and suggestions for msgstr in msg.msgstr: # Skip message with context in the ignoredContext list skip=False for context in self.ignoredContext: - if context in (msg.msgctxt or u"").lower(): + if context in (msg.msgctxt or "").lower(): skip=True break for comment in msg.auto_comment: if context in comment.lower(): skip=True break if skip: break if skip: break # Skip message if explicitly requested. if flag_no_check_spell in manc_parse_flag_list(msg, "|"): continue # Apply precheck filters. for pfilter, pfname in self.pfilters: try: # try as type F1A hook msgstr = pfilter(msgstr) except TypeError: try: # try as type F3* hook msgstr = pfilter(msgstr, msg, cat) except TypeError: raise SieveError( _("@info", "Cannot execute filter '%(filt)s'.", filt=pfname)) # Split text into words. if not self.simsp: words=proper_words(msgstr, True, cat.accelerator(), msg.format) else: # NOTE: Temporary, remove when proper_words becomes smarter. words=msgstr.split() # Eliminate from checking words matching the skip regex. if self.skipRx: words = [x for x in words if not self.skipRx.search(x)] # Eliminate from checking words explicitly listed as good. locally_ignored = manc_parse_list(msg, elist_well_spelled, ",") words = [x for x in words if x not in locally_ignored] for word in words: # Encode word for Aspell. 
                encodedWord=word.encode(self.encoding)
                spell=self.aspell.check(encodedWord)
                if spell is False:
                    try:
                        self.nmatch+=1
                        if self.unknownWords is not None:
                            self.unknownWords.add(word)
                        else:
                            encodedSuggestions=self.aspell.suggest(encodedWord)
                            suggestions=[i.decode(self.encoding)
                                         for i in encodedSuggestions]
                            failedSuggs.append((word, suggestions))
                            if self.xmlFile:
                                xmlError=spell_xml_error(msg, cat, word,
                                                         suggestions, id)
                                self.xmlFile.writelines(xmlError)
                            else:
                                spell_error(msg, cat, word, suggestions)
                    except UnicodeEncodeError:
                        warning(_("@info",
                                  "Cannot encode word '%(word)s' in "
                                  "selected encoding '%(enc)s'.",
                                  word=word, enc=self.encoding))
            id+=1 # Increase msgstr id count

        if failedSuggs and self.lokalize:
            repls=[_("@label", "Spelling errors:")]
            for word, suggs in failedSuggs:
                if suggs:
                    fmtsuggs=format_item_list(suggs)
                    repls.append(_("@item",
                                   "%(word)s (suggestions: %(wordlist)s)",
                                   word=word, wordlist=fmtsuggs))
                else:
                    repls.append("%s" % (word))
            report_msg_to_lokalize(msg, cat, cjoin(repls, "\n"))

    def finalize (self):

        # Remove composited personal dictionaries.
-        for tmpDictFile in self.tmpDictFiles.values():
+        for tmpDictFile in list(self.tmpDictFiles.values()):
            if isfile(tmpDictFile):
                os.unlink(tmpDictFile)

        if self.unknownWords is not None:
            slist = list(self.unknownWords)
            if slist:
-                slist.sort(lambda x, y: locale.strcoll(x.lower(), y.lower()))
+                slist.sort(key=lambda x: locale.strxfrm(x.lower()))
                report("\n".join(slist))
        else:
            if self.nmatch:
                msg = n_("@info:progress",
                         "Encountered %(num)d unknown word.",
                         "Encountered %(num)d unknown words.",
                         num=self.nmatch)
                report("===== " + msg)

        if self.xmlFile:
            self.xmlFile.write("</po>\n")
            self.xmlFile.write("</pos>\n")
            self.xmlFile.close()

    def _encoding_for_aspell (self, enc):

        if re.search(r"utf.*8", enc, re.I):
            return "UTF-8"
        return enc

    def _get_personal_dict (self, lang, envs):
        # Collect all personal dictionaries found for given
        # language/environment and composite them into one to pass to Aspell.

        dictFiles=set()
        for env in (envs or [""]):
            dictFiles.update(self._get_word_list_files(lang, env))
        dictFiles=list(dictFiles)
        dictFiles.sort()

        if not dictFiles:
            return None

        # If only one, Aspell can just use it.
        if len(dictFiles)<2:
            return dictFiles[0]

        # Composite all dictionary files into one temporary.
        words=[]
        for dictFile in dictFiles:
            words.extend(_read_dict_file(dictFile))
        tmpDictFile=("compdict-%d.aspell" % os.getpid())
        self.tmpDictFiles[lang]=tmpDictFile
        file=open(tmpDictFile, "w", "UTF-8")
        file.write("personal_ws-1.1 %s %d UTF-8\n" % (lang, len(words)))
        file.writelines([x+"\n" for x in words])
        file.close()
        return tmpDictFile

    def _get_word_list_files (self, lang, env):
        # Collect all applicable dictionaries.

        dictFiles=set()
        spellRoot=join(datadir(), "lang", lang, "spell")
        spellSub=join(".", (env or ""))
        while spellSub:
            spellDir=join(spellRoot, spellSub)
            if isdir(spellDir):
                for item in os.listdir(spellDir):
                    if item.endswith(".aspell"):
                        dictFiles.add(join(spellDir, item))
            spellSub=dirname(spellSub)
        return dictFiles


# Read words from an Aspell personal dictionary.
def _read_dict_file (fname):

    # Parse the header for encoding.
    encDefault="UTF-8"
    file=open(fname, "r", encDefault)
    header=file.readline()
    m=re.search(r"^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s*", header)
    if not m:
        warning(_("@info",
                  "Malformed header in dictionary file '%(file)s'.",
-                  file=filepath))
+                  file=fname))
        return []
    enc=m.group(4)
    # Reopen in correct encoding if not the default.
    if enc.lower() != encDefault.lower():
        file.close()
        file=open(fname, "r", enc)

    # Read words.
words=[] for line in file: word=line.strip() if word: words.append(word) return words # Simple spell checker which reads Aspell's personal dictionary file. class _QuasiSpell (object): def __init__ (self, dictfile, encoding="UTF-8"): self.validWords = _read_dict_file(dictfile) self.encoding = encoding # of the raw text sent in for checking def check (self, encWord): - word=str.decode(encWord, self.encoding) + word=encWord.decode(self.encoding) if ( word not in self.validWords and word.lower() not in self.validWords ): return False return True def suggest (self, encWord): return [] diff --git a/sieve/check_spell_ec.py b/sieve/check_spell_ec.py index 42f30854..48cc43da 100644 --- a/sieve/check_spell_ec.py +++ b/sieve/check_spell_ec.py @@ -1,332 +1,332 @@ # -*- coding: utf-8 -*- """ Spell-check translation using Enchant (U{http://www.abisource.com/projects/enchant/}). Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import codecs import locale import os import re import tempfile from pology import PologyError, datadir, _, n_ from pology.spell import flag_no_check_spell, elist_well_spelled from pology.colors import cjoin from pology.comments import manc_parse_list, manc_parse_flag_list import pology.config as cfg from pology.getfunc import get_hook_ireq from pology.msgreport import report_on_msg from pology.msgreport import report_msg_to_lokalize from pology.report import report, warning, format_item_list from pology.sieve import SieveError, SieveCatalogError from pology.split import proper_words from pology.sieve import add_param_spellcheck, add_param_poeditors def setup_sieve (p): p.set_desc(_("@info sieve discription", "Spell-check translation using Enchant." )) - p.add_param("provider", unicode, seplist=True, + p.add_param("provider", str, seplist=True, metavar=_("@info sieve parameter value placeholder", "NAME"), desc=_("@info sieve parameter discription", "The spell-checking provider to use. " "Several providers can be given as a comma-separated list." )) add_param_spellcheck(p) class Sieve (object): def __init__ (self, params): cfgs = cfg.section("enchant") self.providers = ( ",".join(params.provider or "") or cfgs.string("provider") or None) self.lang = ( params.lang or cfgs.string("language") or None) self.envs = params.env if self.envs is None and cfgs.string("environment") is not None: self.envs = cfgs.string("environment").split(",") if self.envs is None: self.envs = [] self.envs = [x.strip() for x in self.envs] self.accel = params.accel self.markup = params.markup self.skip_rx = None if params.skip is not None: flags = re.U if not params.case: flags |= re.I self.skip_rx = re.compile(params.skip, flags) self.pfilters = [[get_hook_ireq(x, abort=True), x] for x in (params.filter or [])] self.suponly = params.suponly self.words_only = params.list self.lokalize = params.lokalize # Langenv-dependent elements built along the way. self.checkers = {} self.word_lists = {} # Tracking of unknown words. self.unknown_words = set() # Indicators to the caller: self.caller_sync = False # no need to sync catalogs self.caller_monitored = False # no need for monitored messages def process_header (self, hdr, cat): # Check if the catalog itself states the language, and if yes, # create the language-dependent stuff if not already created # for this langenv.
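# How a checker like the one built below is obtained from python-enchant
# (sketch; assumes an installed en_US dictionary and the named providers):
import enchant
broker = enchant.Broker()
broker.set_ordering("en_US", "aspell,myspell")  # provider preference
checker = broker.request_dict("en_US")
print(checker.check("word"))        # -> True
print(checker.suggest("wrod")[:3])  # e.g. ['word', 'rod', 'wood']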
clang = self.lang or cat.language() if not clang: raise SieveCatalogError( _("@info", "Cannot determine language for catalog '%(file)s'.", file=cat.filename)) cenvs = self.envs or cat.environment() or [] ckey = (clang, tuple(cenvs)) if ckey not in self.checkers: # Get Pology's internal word list for this langenv. if clang not in self.word_lists: # may be in but None self.word_lists[ckey] = _compose_word_list(clang, cenvs) # Create spell-checker object. clang_mod = (self.suponly and [None] or [clang])[0] checker = _create_checker(self.providers, clang_mod, self.word_lists[ckey]) if not checker: raise SieveError( _("@info", "No spelling dictionary for language '%(lang)s' and " "provider '%(prov)s'.", lang=clang, prov=self.providers)) self.checkers[ckey] = checker # Get language-dependent stuff. self.checker = self.checkers[ckey] # Force explicitly given accelerators and markup. if self.accel is not None: cat.set_accelerator(self.accel) if self.markup is not None: cat.set_markup(self.markup) def process (self, msg, cat): if not msg.translated: return failed_w_suggs = [] for msgstr in msg.msgstr: # Skip message if explicitly requested. if flag_no_check_spell in manc_parse_flag_list(msg, "|"): continue # Apply precheck filters. for pfilter, pfname in self.pfilters: try: # try as type F1A hook msgstr = pfilter(msgstr) except TypeError: try: # try as type F3* hook msgstr = pfilter(msgstr, msg, cat) except TypeError: raise SieveError( _("@info", "Cannot execute filter '%(filt)s'.", filt=pfname)) # Split text into words. # TODO: See to use markup types somehow. words = proper_words(msgstr, True, cat.accelerator(), msg.format) # Eliminate from checking words matching the skip regex. if self.skip_rx: words = [x for x in words if not self.skip_rx.search(x)] # Eliminate from checking words explicitly listed as good. locally_ignored = manc_parse_list(msg, elist_well_spelled, ",") words = [x for x in words if x not in locally_ignored] for word in words: if not self.checker.check(word): failed = True self.unknown_words.add(word) if not self.words_only or self.lokalize: suggs = self.checker.suggest(word) incmp = False if len(suggs) > 5: # do not put out too many words suggs = suggs[:5] incmp = True failed_w_suggs.append((word, suggs)) if not self.words_only: if suggs: fsuggs = format_item_list(suggs, incmp=incmp) report_on_msg(_("@info", "Unknown word '%(word)s' " "(suggestions: %(wordlist)s).", word=word, wordlist=fsuggs), msg, cat) else: report_on_msg(_("@info", "Unknown word '%(word)s'.", word=word), msg, cat) if self.lokalize and failed_w_suggs: repls = [_("@label", "Spelling errors:")] for word, suggs in failed_w_suggs: if suggs: fmtsuggs=format_item_list(suggs, incmp=incmp) repls.append(_("@item", "%(word)s (suggestions: %(wordlist)s)", word=word, wordlist=fmtsuggs)) else: repls.append("%s" % (word)) report_msg_to_lokalize(msg, cat, cjoin(repls, "\n")) def finalize (self): if self.unknown_words: if not self.words_only: nwords = len(self.unknown_words) msg = n_("@info:progress", "Encountered %(num)d unknown word.", "Encountered %(num)d unknown words.", num=nwords) report("===== " + msg) else: wlist = list(self.unknown_words) - wlist.sort(lambda x, y: locale.strcoll(x.lower(), y.lower())) + wlist.sort(key=lambda x: locale.strxfrm(x.lower())) report("\n".join(wlist)) # Get checker object from Enchant.
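# In supplement-only mode (suponly) there is no system dictionary at all:
# _create_checker below requests a personal-word-list dictionary over an
# empty temporary file, then feeds internal word lists into the session.
# The same trick in isolation (sketch):
import os, tempfile
import enchant
tmpf = tempfile.NamedTemporaryFile()
tmpf.close()  # only the file name is needed
checker = enchant.request_pwl_dict(tmpf.name)
os.unlink(tmpf.name)
checker.add_to_session("Pology")
print(checker.check("Pology"), checker.check("Plogoy"))  # -> True False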
def _create_checker (providers, langtag, words): try: import enchant except ImportError: pkgs = ["python-enchant"] raise PologyError(_("@info", "Python wrapper for Enchant not found, " "please install it (possible package names: " "%(pkglist)s).", pkglist=format_item_list(pkgs))) if langtag is not None: try: broker = enchant.Broker() if providers is not None: broker.set_ordering(langtag, providers) checker = broker.request_dict(langtag) checker.check(".") except: checker = None else: tmpf = tempfile.NamedTemporaryFile() tmpf.close() checker = enchant.request_pwl_dict(tmpf.name) os.unlink(tmpf.name) if checker: pname = checker.provider.name.split()[0].lower() need_upcasing = (pname in ("personal", "myspell")) for word in words or []: checker.add_to_session(word) if need_upcasing: checker.add_to_session(word[0].upper() + word[1:]) checker.add_to_session(word.upper()) return checker # Collect words from all internal word lists # available for given language+environment. def _compose_word_list (lang, envs): # Collect all applicable word list files. wlist_files = set() for env in (envs or [""]): wlist_files.update(_get_word_list_files(lang, env)) wlist_files = list(wlist_files) wlist_files.sort() # Read words. words = [] for wlist_file in wlist_files: words.extend(_read_wlist_aspell(wlist_file)) return words def _get_word_list_files (lang, env): # Collect word list paths. wlist_files = set() spell_root = os.path.join(datadir(), "lang", lang, "spell") spell_subdir = os.path.join(".", (env or "")) while spell_subdir: spell_dir = os.path.join(spell_root, spell_subdir) if os.path.isdir(spell_dir): for item in os.listdir(spell_dir): if item.endswith(".aspell"): wlist_files.add(os.path.join(spell_dir, item)) spell_subdir = os.path.dirname(spell_subdir) return wlist_files # Read words from an Aspell word list. def _read_wlist_aspell (fname): # Parse the header for encoding. defenc = "UTF-8" fl = codecs.open(fname, "r", defenc) header = fl.readline() m = re.search(r"^(\S+)\s+(\S+)\s+(\d+)\s+(\S+)\s*", header) if not m: warning(_("@info", "Malformed header in dictionary file '%(file)s'.", file=fname)) return [] enc = m.group(4) # Reopen in correct encoding if not the default. if enc.lower() != defenc.lower(): fl.close() fl = codecs.open(fname, "r", enc) # Read words. words = [] for line in fl: word = line.strip() if word: words.append(word) return words diff --git a/sieve/check_tp_kde.py b/sieve/check_tp_kde.py index 82a0f509..c8d9784d 100644 --- a/sieve/check_tp_kde.py +++ b/sieve/check_tp_kde.py @@ -1,655 +1,655 @@ # -*- coding: UTF-8 -*- """ Check validity of translation in catalogs within KDE Translation Project. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import os import re from pology import _, n_ from pology.markup import flag_no_check_markup from pology.escape import escape_c from pology.msgreport import report_on_msg_hl, report_msg_content from pology.msgreport import report_msg_to_lokalize from pology.normalize import identify from pology.report import report, format_item_list from pology.sieve import add_param_poeditors from pology.sieve import SieveError, SieveCatalogError, parse_sieve_flags from pology.proj.kde.cattype import get_project_subdir from pology.proj.kde.cattype import is_txt_cat, is_qt_cat, is_docbook_cat from pology.proj.kde.cattype import is_html_cat, is_unknown_cat def setup_sieve (p): p.set_desc(_("@info sieve discription", "Check validity of messages in catalogs within KDE Translation Project." 
)) p.add_param("strict", bool, defval=False, desc=_("@info sieve parameter discription", "Check translations strictly: report problems in translation regardless " "of whether original itself is valid (default is to check translation " "only if original passes checks)." )) - chnames = _known_checks.keys() + chnames = list(_known_checks.keys()) chnames.sort() - p.add_param("check", unicode, seplist=True, + p.add_param("check", str, seplist=True, metavar=_("@info sieve parameter value placeholder", "KEYWORD,..."), desc=_("@info sieve parameter discription", "Run only this check instead of all (currently available: %(chklist)s). " "Several checks can be specified as a comma-separated list.", chklist=format_item_list(chnames) )) p.add_param("showmsg", bool, defval=False, desc=_("@info sieve parameter discription", "Also show the full message that had some problems." )) add_param_poeditors(p) class Sieve (object): def __init__ (self, params): self.strict = params.strict self.showmsg = params.showmsg self.lokalize = params.lokalize self.selected_checks = None if params.check is not None: unknown_checks = [] for chname in params.check: if chname not in _known_checks: unknown_checks.append(chname) if unknown_checks: fmtchecks = format_item_list(unknown_checks) raise SieveError( _("@info", "Unknown checks selected: %(chklist)s.", chklist=fmtchecks)) self.selected_checks = set(params.check) # Indicators to the caller: self.caller_sync = False # no need to sync catalogs to the caller self.caller_monitored = False # no need for monitored messages self.nproblems = 0 def process_header (self, hdr, cat): # Collect catalog data for determining type. cname = cat.name csubdir = get_project_subdir(cat.filename) if not csubdir: raise SieveCatalogError( _("@info", "Cannot determine project subdirectory " "of the catalog '%(file)s'.", file=cat.filename)) # Select checks applicable to current catalog. self.current_checks = [] def add_checks (names): if self.selected_checks is not None: names = set(names).intersection(self.selected_checks) for name in names: self.current_checks.append(_known_checks[name]) if is_txt_cat(cname, csubdir): add_checks(["nots", "keywlist"]) elif is_qt_cat(cname, csubdir): add_checks(["qtmarkup", "qtdt", "nots"]) elif is_docbook_cat(cname, csubdir): add_checks(["dbmarkup", "nots"]) elif is_html_cat(cname, csubdir): add_checks(["htmlmarkup", "nots"]) elif is_unknown_cat(cname, csubdir): add_checks([]) else: # default to native KDE4 catalog add_checks(["kde4markup", "qtdt", "trcredits", "plrunq"]) add_checks(["catspec"]) # to all catalogs, will select internally # Reset catalog progress cache, available to checks. 
self.pcache = { "strict": self.strict, } def process (self, msg, cat): if not msg.translated: return highlight = [] for check in self.current_checks: self.nproblems += check(msg, cat, self.pcache, highlight) if highlight: if self.showmsg: report_msg_content(msg, cat, highlight=highlight, delim=("-" * 20)) else: report_on_msg_hl(highlight, msg, cat) if self.lokalize: report_msg_to_lokalize(msg, cat, highlight) def finalize (self): if self.nproblems > 0: if not self.strict: msg = n_("@info:progress TP stands for Translation Project", "Found %(num)d problem in KDE TP translations.", "Found %(num)d problems in KDE TP translations.", num=self.nproblems) else: msg = n_("@info:progress", "Found %(num)d problem in " "KDE TP translations (strict mode).", "Found %(num)d problems in " "KDE TP translations (strict mode).", num=self.nproblems) report("===== " + msg) # -------------------------------------- # Helpers for checks. # Memoizer for hook factories. class _FuncallMemoizer (object): def __init__ (self): self._cache = {} def __call__ (self, func, *args, **kwargs): ckey = args + tuple(sorted(kwargs.items())) if ckey in self._cache: value = self._cache[ckey] else: value = func(*args, **kwargs) self._cache[ckey] = value return value # Map of checks by name, # updated at point of definition of the check. _known_checks = {} # -------------------------------------- # Check for KDE4 markup. from pology.markup import validate_kde4_l1 _tsfence = "|/|" def _check_kde4markup (msg, cat, pcache, hl): strict = pcache.get("strict", False) # Do not check markup if: # - the check is explicitly skipped for this message # - the original is bad and not running in strict mode if flag_no_check_markup in parse_sieve_flags(msg): return 0 if not strict: if ( validate_kde4_l1(msg.msgid, ents=[]) - or validate_kde4_l1(msg.msgid_plural or u"", ents=[]) + or validate_kde4_l1(msg.msgid_plural or "", ents=[]) ): return 0 nproblems = 0 for i in range(len(msg.msgstr)): msgstr = msg.msgstr[i] lst = msgstr.split(_tsfence, 1) msgstr = lst[0] msgscript = "" if len(lst) == 2: # FIXME: No point in checking the scripted part as it is, # since calls may be used to modify markup in special ways. # Perhaps it would work to remove calls and check what's left? #msgscript = lst[1] pass for text in (msgstr, msgscript): spans = validate_kde4_l1(text, ents=[]) if spans: nproblems += len(spans) hl.append(("msgstr", i, spans)) return nproblems _known_checks["kde4markup"] = _check_kde4markup # -------------------------------------- # Check for Qt markup. from pology.markup import validate_qtrich_l1 def _check_qtmarkup (msg, cat, pcache, hl): strict = pcache.get("strict", False) if flag_no_check_markup in parse_sieve_flags(msg): return 0 if not strict: if ( validate_qtrich_l1(msg.msgid, ents=[]) - or validate_qtrich_l1(msg.msgid_plural or u"", ents=[]) + or validate_qtrich_l1(msg.msgid_plural or "", ents=[]) ): return 0 nproblems = 0 for i in range(len(msg.msgstr)): spans = validate_qtrich_l1(msg.msgstr[i], ents=[]) if spans: nproblems += len(spans) hl.append(("msgstr", i, spans)) return nproblems _known_checks["qtmarkup"] = _check_qtmarkup # -------------------------------------- # Check for Docbook markup. 
from pology.markup import check_docbook4_msg def _check_dbmarkup (msg, cat, pcache, hl): check1 = pcache.get("check_dbmarkup_hook") if not check1: strict = pcache.get("strict", False) check1 = check_docbook4_msg(strict=strict, entities=None) pcache["check_dbmarkup_hook"] = check1 hl1 = check1(msg, cat) hl.extend(hl1) nproblems = sum(len(x[2]) for x in hl1) return nproblems _known_checks["dbmarkup"] = _check_dbmarkup # -------------------------------------- # Check for HTML markup. from pology.markup import validate_html_l1 def _check_htmlmarkup (msg, cat, pcache, hl): strict = pcache.get("strict", False) if flag_no_check_markup in parse_sieve_flags(msg): return 0 if not strict: if ( validate_html_l1(msg.msgid, ents=[]) - or validate_html_l1(msg.msgid_plural or u"", ents=[]) + or validate_html_l1(msg.msgid_plural or "", ents=[]) ): return 0 nproblems = 0 for i in range(len(msg.msgstr)): spans = validate_html_l1(msg.msgstr[i], ents=[]) if spans: nproblems += len(spans) hl.append(("msgstr", i, spans)) return nproblems _known_checks["htmlmarkup"] = _check_htmlmarkup # -------------------------------------- # Check for no scripting in dumb messages. def _check_nots (msg, cat, pcache, hl): nproblems = 0 for i in range(len(msg.msgstr)): msgstr = msg.msgstr[i] p = msgstr.find(_tsfence) if p >= 0: nproblems += 1 hl.append(("msgstr", i, [(p, p + len(_tsfence), _("@info", "Dumb message, translation cannot be scripted."))])) return nproblems _known_checks["nots"] = _check_nots # -------------------------------------- # Qt datetime format messages. _qtdt_flag = "qtdt-format" _qtdt_clean_rx = re.compile(r"'.*?'") _qtdt_split_rx = re.compile(r"\W+", re.U) def _qtdt_parse (text): text = _qtdt_clean_rx.sub("", text) fields = [x for x in _qtdt_split_rx.split(text) if x] return fields def _is_qtdt_msg (msg): - return ( (_qtdt_flag in (msg.msgctxt or u"").lower()) + return ( (_qtdt_flag in (msg.msgctxt or "").lower()) or (_qtdt_flag in msg.flag)) # Worker for check_qtdt* hooks. def _check_qtdt_w (msgstr, msg, cat): if not _is_qtdt_msg(msg): return [] # Get format fields from the msgid. msgid_fmts = _qtdt_parse(msg.msgid) # Expect the same format fields in msgstr. msgstr_fmts = _qtdt_parse(msgstr) spans = [] if set(msgid_fmts) != set(msgstr_fmts): errmsg = _("@info", "Qt date-format mismatch: " "original contains fields {%(fieldlist1)s} " "while translation contains {%(fieldlist2)s}.", fieldlist1=format_item_list(sorted(msgid_fmts)), fieldlist2=format_item_list(sorted(msgstr_fmts))) spans.append((None, None, errmsg)) return spans # Pass-through test hook (for external use). def check_qtdt (msgstr, msg, cat): """ Check validity of translation if the message is a Qt date-time format [type S3C hook]. TODO: Document further. """ spans = _check_qtdt_w(msgstr, msg, cat) if spans: report_on_msg(spans[0][-1], msg, cat) return False else: return True # Span-reporting test hook (for external use). def check_qtdt_sp (msgstr, msg, cat): """ Check validity of translation if the message is a Qt date-time format [type V3C hook]. Span reporting version of L{check_qtdt}. """ return _check_qtdt_w(msgstr, msg, cat) # Internal check for this sieve's use. def _check_qtdt (msg, cat, pcache, hl): if not _is_qtdt_msg(msg): return 0 nproblems = 0 for i in range(len(msg.msgstr)): msgstr = msg.msgstr[i] spans = _check_qtdt_w(msgstr, msg, cat) if spans: nproblems += 1 hl.append(("msgstr", i, spans)) return nproblems _known_checks["qtdt"] = _check_qtdt # -------------------------------------- # Check for runtime translator data. 
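# The two credit messages checked below pair names and addresses up by
# position, and addresses only need to look minimally like emails.
# Sketch with made-up data:
import re
_email_rx = re.compile(r"^\S+@\S+\.\S+$", re.U)
names = [x.strip() for x in "Jane Doe, John Roe".split(",")]
emails = [x.strip() for x in "jane@example.org, john at example".split(",")]
print(len(names) == len(emails))                   # -> True (congruent)
print([bool(_email_rx.match(e)) for e in emails])  # -> [True, False]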
_trcredit_name_ctxt = "NAME OF TRANSLATORS" _trcredit_email_ctxt = "EMAIL OF TRANSLATORS" _trcredit_ctxts = set(( _trcredit_name_ctxt, _trcredit_email_ctxt, )) _valid_email_rx = re.compile(r"^\S+@\S+\.\S+$", re.U) def _check_trcredits (msg, cat, pcache, hl): if not msg.active: return 0 if msg.msgctxt not in _trcredit_ctxts: return 0 errors = [] if msg.msgctxt == _trcredit_name_ctxt: names = [x.strip() for x in msg.msgstr[0].split(",")] pcache["trnames"] = names elif msg.msgctxt == _trcredit_email_ctxt: emails = [x.strip() for x in msg.msgstr[0].split(",")] pcache["tremails"] = emails for email in emails: # Check minimal validity of address. if email and not _valid_email_rx.match(email): emsg = _("@info", "Invalid email address '%(email)s'.", email=escape_c(email)) errors.append(emsg) # Check congruence between names and emails. names = pcache.get("trnames") emails = pcache.get("tremails") if emails and names: if len(names) != len(emails): emsg = _("@info", "Different number of translator names (%(num1)d) " "and email addresses (%(num2)d).", num1=len(names), num2=len(emails)) errors.append(emsg) else: - for name, email, i in zip(names, emails, range(1, len(names) + 1)): + for name, email, i in zip(names, emails, list(range(1, len(names) + 1))): if not name and not email: emsg = _("@info", "Both name and email address " "of translator no. %(ord)d are empty.", ord=i) errors.append(emsg) if errors: hl.append(("msgstr", 0, [(None, None, x) for x in errors])) return len(errors) _known_checks["trcredits"] = _check_trcredits # -------------------------------------- # Check for query placeholders in Plasma runners. def _check_plrunq (msg, cat, pcache, hl): if not msg.active: return 0 nerrors = 0 if ":q:" in msg.msgid and ":q:" not in msg.msgstr[0]: errmsg = _("@info", "Plasma runner query placeholder '%(plhold)s' " "is missing in translation.", plhold=":q:") hl.append(("msgstr", 0, [(None, None, errmsg)])) nerrors += 1 return nerrors _known_checks["plrunq"] = _check_plrunq # -------------------------------------- # Check for proper format of keyword lists in .dekstop files. from pology.checks import check_keyword_list _check_keywlist_hook = _FuncallMemoizer() def _check_keywlist (msg, cat, pcache, hl): if not msg.active: return 0 strict = pcache.get("strict", False) checkf = _check_keywlist_hook(check_keyword_list, strict) spans = checkf(msg.msgstr[0], msg, cat) if spans: nerrors = 1 hl.append(("msgstr", 0, spans)) else: nerrors = 0 return nerrors _known_checks["keywlist"] = _check_keywlist # -------------------------------------- # Helpers for catalog-specific checks. # Add a catalog-specific checks to one or more catalogs, selected by name. # For example: # _add_cat_check(_check_cat_xyz, ["catfoo", "catbar"]) _known_checks_by_cat = {} def _add_cat_check_hl (check, catspecs): for catspec in catspecs: if catspec not in _known_checks_by_cat: _known_checks_by_cat[catspec] = [] if check not in _known_checks_by_cat[catspec]: _known_checks_by_cat[catspec].append(check) def _on_cat_hl (catspecs): # as decorator def dec (check): _add_cat_check_hl(check, catspecs) return dec # Like _add_cat_check_hl, except that instead of updating the highlight, # check function returns a single error message or a list of error messages. 
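# Usage sketch for the decorator defined just below, with a hypothetical
# catalog name and token (the real catalog-specific checks follow at the end):
@_on_cat("catfoo")
def _check_cat_catfoo (msg, cat, pcache):
    if "%token" in msg.msgid and "%token" not in msg.msgstr[0]:
        return _("@info",
                 "Translation must contain '%(token)s'.", token="%token")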
def _add_cat_check (check, catspecs): - if isinstance(catspecs, basestring): + if isinstance(catspecs, str): catspecs = [catspecs] def check_mod (msg, cat, pcache, hl): errors = check(msg, cat, pcache) if errors: - if isinstance(errors, basestring): + if isinstance(errors, str): errors = [errors] hl.append(("msgstr", 0, [(None, None, x) for x in errors])) return len(errors) else: return 0 _add_cat_check_hl(check_mod, catspecs) def _on_cat (catspecs): # as decorator def dec (check): _add_cat_check(check, catspecs) return dec # Global check to apply appropriate catalog-specific checks. def _check_catspec (msg, cat, pcache, hl): nproblems = 0 for check in _known_checks_by_cat.get(cat.name, []): nproblems += check(msg, cat, pcache, hl) return nproblems _known_checks["catspec"] = _check_catspec # Checks that functional tokens are preserved in translation. def _check_cat_match_tokens (msg, cat, pcache, tokens): for token in tokens: if token in msg.msgid: for msgstr in msg.msgstr: if token not in msgstr: return _("@info", "Translation must contain '%(token)s'.", token=token) # Checks that translation is an ASCII identifier-like string. def _check_cat_ascii_identifier (msg, cat, pcache): for msgstr in msg.msgstr: if msgstr.lower() != identify(msgstr): return _("@info", "Translation must be composed only of ASCII letters, " "numbers, and underscores, " "and must not start with a number.") # -------------------------------------- # Catalog-specific checks. @_on_cat("kdeqt") def _check_cat_kdeqt (msg, cat, pcache): if msg.msgid == "QT_LAYOUT_DIRECTION": if msg.msgstr[0] not in ("LTR", "RTL"): return _("@info", "Translation must be exactly '%(text1)s' or '%(text2)s'.", text1="LTR", text2="RTL") @_on_cat("kiosktool") def _check_cat_kiosktool (msg, cat, pcache): return _check_cat_match_tokens(msg, cat, pcache, ["%action"]) @_on_cat("kplatolibs") def _check_cat_kplatolibs (msg, cat, pcache): if "Letter(s) only" in (msg.msgctxt or ""): if not msg.msgstr[0].isalpha(): return _("@info", "Translation must contain only letters.") @_on_cat("libkleopatra") def _check_cat_libkleopatra (msg, cat, pcache): if "'yes' or 'no'" in (msg.msgctxt or ""): if msg.msgstr[0] not in ("yes", "no"): return _("@info", "Translation must be exactly '%(text1)s' or '%(text2)s'.", text1="yes", text2="no") @_on_cat("libknetworkmanager") def _check_cat_libknetworkmanager (msg, cat, pcache): if "ASCII letters and underscore" in (msg.msgctxt or ""): return _check_cat_ascii_identifier(msg, cat, pcache) diff --git a/sieve/check_tp_wesnoth.py b/sieve/check_tp_wesnoth.py index ba40d5e3..b6e7b225 100644 --- a/sieve/check_tp_wesnoth.py +++ b/sieve/check_tp_wesnoth.py @@ -1,576 +1,577 @@ # -*- coding: UTF-8 -*- """ Check validity of translation in catalogs of The Battle for Wesnoth. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import os import re from pology import _, n_ from pology.report import report, format_item_list from pology.msgreport import report_on_msg_hl, report_msg_content from pology.msgreport import report_msg_to_lokalize from pology.sieve import add_param_poeditors from pology.sieve import SieveError from pology.message import MessageUnsafe +from functools import reduce _ctxtsep = "^" def setup_sieve (p): p.set_desc(_("@info sieve discription", "Check validity of messages in catalogs of The Battle for Wesnoth." 
)) - chnames = _known_checks.keys() + chnames = list(_known_checks.keys()) chnames.sort() - p.add_param("check", unicode, seplist=True, + p.add_param("check", str, seplist=True, metavar=_("@info sieve parameter value placeholder", "KEYWORD,..."), desc=_("@info sieve parameter discription", "Run only this check instead of all (currently available: %(chklist)s). " "Several checks can be specified as a comma-separated list.", chklist=format_item_list(chnames) )) p.add_param("showmsg", bool, defval=False, desc=_("@info sieve parameter discription", "Also show the full message that had some problems." )) add_param_poeditors(p) class Sieve (object): def __init__ (self, params): self.selected_checks = None if params.check is not None: unknown_checks = [] for chname in params.check: if chname not in _known_checks: unknown_checks.append(chname) if unknown_checks: fmtchecks = format_item_list(unknown_checks) raise SieveError( _("@info", "Unknown checks selected: %(chklist)s.", chklist=fmtchecks)) self.selected_checks = set(params.check) self.showmsg = params.showmsg self.lokalize = params.lokalize # Indicators to the caller: self.caller_sync = False # no need to sync catalogs to the caller self.caller_monitored = False # no need for monitored messages self.nproblems = 0 def process_header (self, hdr, cat): def set_checks (names): self.current_checks = [] if self.selected_checks is not None: names = set(names).intersection(self.selected_checks) for name in names: self.current_checks.append(_known_checks[name]) # Determine applicable checks by characteristic message. # Ugly, but no catalog name and nothing in header. if cat.select_by_key(None, "en"): set_checks(["docbook"]) elif cat.select_by_key(None, "wesnothd"): set_checks(["man"]) else: set_checks(["ctxtsep", "interp", "wml", "pango", "space"]) def process (self, msg, cat): if not msg.translated: return highlight = [] # Convert embedded to proper context. if _ctxtsep in msg.msgid: p = msg.msgid.find(_ctxtsep) msg = MessageUnsafe(msg) # should not modify original message msg.msgctxt = msg.msgid[:p] msg.msgid = msg.msgid[p + len(_ctxtsep):] for check in self.current_checks: self.nproblems += check(msg, cat, False, highlight) if highlight: if self.showmsg: report_msg_content(msg, cat, highlight=highlight, delim=("-" * 20)) else: report_on_msg_hl(highlight, msg, cat) if self.lokalize: report_msg_to_lokalize(msg, cat, highlight) def finalize (self): if self.nproblems > 0: msg = n_("@info:progress BfW stands for \"Battle for Wesnoth\"", "Found %(num)d problem in BfW translations.", "Found %(num)d problems in BfW translations.", num=self.nproblems) report("===== " + msg) # -------------------------------------- # Check for mistranslated contexts. def _check_ctxtsep (msg, cat, strict, hl): nproblems = 0 for i in range(len(msg.msgstr)): p = msg.msgstr[i].find(_ctxtsep) if p >= 0: hl.append(("msgstr", i, [(p, p + len(_ctxtsep), _("@info", "Stray context separator."))])) nproblems += 1 return nproblems # -------------------------------------- # Check for congruence of interpolations. def _check_interp (msg, cat, strict, hl): def match_for_index (index, interps_orig, n_can_miss=0): nproblems = 0 interps_trans = _collect_interps(msg.msgstr[index]) if interps_orig != interps_trans: interps_missing = interps_orig.difference(interps_trans) # Eliminate from check interpolations explicitly ignored. 
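# Such an override is written as "# ignore-interpolations: weapon.name"
# in a manual comment; the interpolations themselves are collected with
# the same pattern as _collect_interps below (sketch):
import re
_rx = re.compile(r"\$\w+(?:\.\w+)*")
print(sorted(_rx.findall("Attack with $weapon.name ($damage damage)")))
# -> ['$damage', '$weapon.name']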
for cmnt in [x.strip() for x in msg.manual_comment]: if cmnt.startswith("ignore-interpolations:"): interps = cmnt[cmnt.find(":") + 1:].split() for interp in interps: interp = interp.strip() if not interp.startswith("$"): interp = "$%s" % interp if interp in interps_missing: interps_missing.remove(interp) interps_unknown = interps_trans.difference(interps_orig) if interps_missing and len(interps_missing) > n_can_miss: vfmt = format_item_list(interps_missing) hl.append(("msgstr", index, [(None, None, _("@info", "Missing interpolations: %(interplist)s.", interplist=vfmt))])) nproblems += 1 elif interps_unknown: vfmt = format_item_list(interps_unknown) hl.append(("msgstr", index, [(None, None, _("@info", "Unknown interpolations: %(interplist)s.", interplist=vfmt))])) nproblems += 1 return nproblems nproblems = 0 if msg.msgid_plural is None: interps_orig = _collect_interps(msg.msgid) nproblems += match_for_index(0, interps_orig) else: interps_orig = _collect_interps(msg.msgid_plural) indices_single = cat.plural_indices_single() for i in range(len(msg.msgstr)): nproblems += match_for_index(i, interps_orig, i in indices_single and 1 or 0) return nproblems _interp_rx = re.compile(r"\$\w+(?:\.\w+)*") # intentionally no re.U flag def _collect_interps (text): return set(_interp_rx.findall(text)) # -------------------------------------- # Check for WML validity. def _check_wml (msg, cat, strict, hl): if _detect_markup(msg, cat) != "wml": return 0 # Validate WML in original and collect links. # If the original is not valid, do not check translation. spans_orig, links_orig = _check_wml_text(msg.msgid) if spans_orig: return 0 nproblems = 0 links_trans = set() for i in range(len(msg.msgstr)): spans, links = _check_wml_text(msg.msgstr[i]) if spans: hl.append(("msgstr", i, spans)) nproblems += len(spans) elif links != links_orig: links_missing = links_orig.difference(links) links_unknown = links.difference(links_orig) if links_missing: vfmt = format_item_list(links_missing) hl.append(("msgstr", i, [(None, None, _("@info", "Missing links: %(linklist)s.", linklist=vfmt))])) nproblems += 1 elif links_unknown: vfmt = format_item_list(links_unknown) hl.append(("msgstr", i, [(None, None, _("@info", "Unknown links: %(linklist)s.", linklist=vfmt))])) nproblems += 1 return nproblems _any_ws = re.compile(r"\s") def _is_tag (tag): return not _any_ws.search(tag) _known_tags = { "bold": {"text": True}, "format": {"bold": False, "color": False, "font_size": False, "italic": False, "text": True}, "header": {"text": True}, "img": {"align": False, "float": False, "src": True}, "italic": {"text": True}, "jump": {"amount": False, "to": False}, "ref": {"dst": True, "force": False, "text": True}, } _bool_vals = set(["no", "yes"]) _att_val_check = { "align" : lambda x: x in ["here", "left", "middle", "right"], "amount" : lambda x: x.isdigit(), "bold" : lambda x: x in _bool_vals, "color" : lambda x: x in ["black", "green", "red", "white", "yellow"], "dst" : lambda x: len(x) > 0, "float" : lambda x: x in _bool_vals, "font_size" : lambda x: x.isdigit(), "force" : lambda x: x in _bool_vals, "italic" : lambda x: x in _bool_vals, "src" : lambda x: len(x) > 0, "text" : lambda x: True, "to" : lambda x: bool(re.match(r"^[+-]\d+$", x)), } _link_atts = set(["dst", "src"]) def _check_wml_text (text): spans = [] links = set() p = 0 while True: p = text.find("<", p) if p < 0: break p2 = text.find(">", p) if p2 < 0: spans.append((p, len(text), _("@info", "End of string within tag."))) break tag = text[p + 1:p2] if not _is_tag(tag): 
spans.append((p, p2, _("@info", "Invalid tag syntax."))) break if tag not in _known_tags: spans.append((p, p2, _("@info", "Unknown tag."))) break p3 = text.find("</", p2 + 1) if p3 < 0: spans.append((p2 + 1, len(text), _("@info", "Unclosed tag."))) break p4 = text.find(">", p3) if p4 < 0: spans.append((p3, len(text), _("@info", "Unterminated closing tag."))) break tag2 = text[p3 + 2:p4] # Any further errors do not terminate checking. p = p4 + 1 # start position for next loop if tag2 != tag: spans.append((p3, p4, _("@info", "Mismatched opening and closing tags."))) continue spans_att, links_att = _check_wml_att(tag, text[p2 + 1:p3]) spans.extend([(p2 + 1 + pi1, p2 + 1 + pi2, note) for pi1, pi2, note in spans_att]) links.update(links_att) return spans, links def _check_wml_att (tag, content): spans = [] links = set() have_atts = set() lenc = len(content) p = 0 while True: while p < lenc and content[p].isspace(): p += 1 if p >= lenc: break # Parse attribute. p2 = p while p2 < lenc and content[p2].isalpha(): p2 += 1 if p2 >= lenc: spans.append((p, lenc, _("@info", "End of tag content within attribute."))) break att = content[p:p2] if att not in _known_tags[tag]: spans.append((p, p2 + 1, _("@info", "'%(attr)s' is not an attribute of " "tag '%(tag)s'.", attr=att, tag=tag))) break if content[p2] != "=": spans.append((p, p2 + 1, _("@info", "No equal sign after attribute."))) break if att in have_atts: spans.append((p, p2 + 1, _("@info", "Attribute '%(attr)s' repeated.", attr=att))) break have_atts.add(att) # Parse value. p3 = p2 + 1 if content[p3:p3 + 1] == "'": terminator = "'" p3 += 1 else: terminator = " " p4 = p3 while p4 < lenc and content[p4] != terminator: if content[p4] == "\\": # an escape p4 += 1 p4 += 1 val = content[p3:p4] if not _att_val_check[att](val): spans.append((p3, p4, _("@info", "Invalid value to attribute '%(attr)s'.", attr=att))) if att in _link_atts: links.add(val) # Prepare next loop. p = p4 + 1 if not spans: - for att, mandatory in _known_tags[tag].items(): + for att, mandatory in list(_known_tags[tag].items()): if mandatory and att not in have_atts: spans.append((0, 0, _("@info", "Missing mandatory attribute '%(attr)s'.", attr=att))) return spans, links # -------------------------------------- # Check for Pango markup. from pology.markup import validate_pango_l1 def _check_pango (msg, cat, strict, hl): if _detect_markup(msg, cat) != "pango": return 0 # If the original is not valid, do not check translation. spans_orig = validate_pango_l1(msg.msgid) if spans_orig: return 0 nproblems = 0 for i in range(len(msg.msgstr)): spans = validate_pango_l1(msg.msgstr[i]) if spans: hl.append(("msgstr", i, spans)) nproblems += len(spans) return nproblems # -------------------------------------- # Check for congruence of spaces. _langs_w_outspc = ( "sr", "sr@latin", "de", "lt", "fr", "ru", "sk", "is", ) def _check_space (msg, cat, strict, hl): # Check only for explicitly listed languages. if (cat.language() or cat.name) not in _langs_w_outspc: return 0 # Check if explicitly stated in extracted comment # that outer space in original is significant. kw_outspcsig = "outer-space-significant" outspcsig = reduce(lambda s, x: s or kw_outspcsig in x.lower(), msg.auto_comment, False) nproblems = 0 haslead_o = msg.msgid.startswith(" ") hastail_o = msg.msgid.endswith(" ") tailnspc_o = msg.msgid.strip()[-1:] for i in range(len(msg.msgstr)): haslead_t = msg.msgstr[i].startswith(" ") hastail_t = msg.msgstr[i].endswith(" ") # Consider trailing space in original significant # if explicitly stated so, if it is preceded by colon, # or there was a leading space.
if ( hastail_o and not hastail_t and (outspcsig or haslead_o or tailnspc_o in ":") ): hl.append(("msgstr", i, [(-1, -1, _("@info", "Missing trailing space."))])) nproblems += 1 # Consider leading space always significant. if haslead_o and not haslead_t: hl.append(("msgstr", i, [(0, 0, _("@info", "Missing leading space."))])) nproblems += 1 """ Nah, usually invisible and yet frequent. # If original has no trailing space, # translation should also have none. if not hastail_o and hastail_t: hl.append(("msgstr", i, [(-1, -1, "extra trailing space")])) nproblems += 1 """ # If original has no leading space, # translation should also have none. if not haslead_o and haslead_t: hl.append(("msgstr", i, [(0, 0, _("@info", "Extra leading space."))])) nproblems += 1 return nproblems # -------------------------------------- # Check for Docbook markup. from pology.markup import check_docbook4_msg _check_dbmarkup_pt = [None] def _check_dbmarkup (msg, cat, strict, hl): if not _check_dbmarkup_pt[0]: _check_dbmarkup_pt[0] = check_docbook4_msg(strict=strict, entities=None) hl1 = _check_dbmarkup_pt[0](msg, cat) hl.extend(hl1) nproblems = sum(len(x[2]) for x in hl1) return nproblems # -------------------------------------- # Check for man markup. def _check_man (msg, cat, strict, hl): # TODO. return 0 # -------------------------------------- # Map of all existing checks. _known_checks = { "ctxtsep": _check_ctxtsep, "interp": _check_interp, "wml": _check_wml, "pango": _check_pango, "space": _check_space, "docbook": _check_dbmarkup, "man": _check_man, } # -------------------------------------- # Utilities. # Try to heuristically detect which type of markup is used in the message. # Detection is conservative: better report no markup, than wrong markup. from pology.markup import collect_xml_spec_l1 from pology import datadir _tags_wml = _known_tags _specpath = os.path.join(datadir(), "spec", "pango.l1") -_tags_pango = collect_xml_spec_l1(_specpath).keys() +_tags_pango = list(collect_xml_spec_l1(_specpath).keys()) _first_tag_rx = re.compile(r"<\s*(\w+)[^>]*>", re.U) # Return keyword of markup detected in the text. def _detect_markup_in_text (text): m = _first_tag_rx.search(text) if m: tag = m.group(1) if tag in _tags_wml: return "wml" elif tag in _tags_pango: return "pango" else: return "unknown" else: return None # Return keyword of markup detected in the message. def _detect_markup (msg, cat): # First look into original text. # If no markup determined from there, look into translation. markup_type = _detect_markup_in_text(msg.msgid) if markup_type is None: markup_type = _detect_markup_in_text(msg.msgstr[0]) return markup_type diff --git a/sieve/collect_pmap.py b/sieve/collect_pmap.py index a2cd45a3..dea822a8 100644 --- a/sieve/collect_pmap.py +++ b/sieve/collect_pmap.py @@ -1,422 +1,422 @@ # -*- coding: UTF-8 -*- """ Assemble a property map from entries in manual comments. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import os import re from pology import _, n_ from pology.colors import cjoin from pology.fsops import str_to_unicode from pology.msgreport import warning_on_msg from pology.report import report, format_item_list from pology.sieve import SieveError from pology.synder import Synder def setup_sieve (p): p.set_desc(_("@info sieve discription", "Assemble a property map from entries in manual comments." 
)) - p.add_param("outfile", unicode, + p.add_param("outfile", str, metavar=_("@info sieve parameter value placeholder", "FILE"), desc=_("@info sieve parameter discription", "File to output the property map into. " "If not given, nothing is output (useful for validation runs)." )) - p.add_param("propcons", unicode, + p.add_param("propcons", str, metavar=_("@info sieve parameter value placeholder", "FILE"), desc=_("@info sieve parameter discription", "File defining the constraints on property keys and values." )) p.add_param("extrakeys", bool, defval=False, desc=_("@info sieve parameter discription", "Allow defining additional entry keys." )) - p.add_param("derivs", unicode, + p.add_param("derivs", str, metavar=_("@info sieve parameter value placeholder", "FILE"), desc=_("@info sieve parameter discription", "File defining the derivators used in derived entries." )) - p.add_param("pmhead", unicode, defval=u"pmap:", + p.add_param("pmhead", str, defval="pmap:", metavar=_("@info sieve parameter value placeholder", "STRING"), desc=_("@info sieve parameter discription", "Prefix which starts property map entries in comments." )) - p.add_param("sdhead", unicode, defval=u"synder:", + p.add_param("sdhead", str, defval="synder:", metavar=_("@info sieve parameter value placeholder", "STRING"), desc=_("@info sieve parameter discription", "Prefix which starts syntagma derivator entries in comments." )) class Sieve (object): def __init__ (self, params): self.caller_sync = False self.caller_monitored = False self.propcons = None if params.propcons: self.propcons = self._read_propcons(params.propcons) self.p = params if not params.pmhead: raise SieveError(_("@info", "Prefix which starts property map entries " "in comments cannot be empty.")) if not params.sdhead: raise SieveError(_("@info", "Prefix which starts syntagma derivator entries " "in comments cannot be empty.")) # Collected entries. # Each element is a tuple of the form: # (ekeys, props, psep, kvsep, msg, cat) self.entries = [] # Syntagma derivator, for synder entries. self.synder = Synder() self.sdord = 0 def process (self, msg, cat): if not msg.translated or msg.obsolete: return if msg.msgid_plural is not None: return # Parse property map entries from the message. psep, kvsep = None, None ekeys = set() props = {} for i in range(len(msg.manual_comment)): ind = i + 1 manc = (msg.manual_comment[i]).strip() if manc.startswith(self.p.pmhead): # Parse and check consistency of separators. espec = manc[len(self.p.pmhead):].lstrip() lkvsep, lpsep = espec[:2] if lkvsep.isalnum() or lpsep.isalnum(): warning_on_msg(_("@info", "An alphanumeric separator is used for " "property map entry in comment " "no. %(ord)d.", ord=ind), msg, cat) return if not psep: psep, kvsep = lpsep, lkvsep elif (psep, kvsep) != (lpsep, lkvsep): warning_on_msg(_("@info", "Inconsistent separators for " "continued property map entry in comment " "no. %(ord)d.", ord=ind), msg, cat) return # Remove leading and trailing separators. respec = espec[2:] if respec.endswith(psep + psep): respec = respec[:-2] elif respec.endswith(psep): respec = respec[:-1] else: warning_on_msg(_("@info", "Missing terminating separator for " "property map entry in comment " "no. %(ord)d.", ord=ind), msg, cat) return # Parse entry keys and key-value pairs. 
for elspec in respec.split(psep): if kvsep in elspec: pkey, pval = elspec.split(kvsep, 1) props[pkey] = pval else: ekey = elspec if not self.p.extrakeys: warning_on_msg(_("@info", "Additional entry key '%(key)s' " "is defined but not allowed for " "property map entry in comment " "no. %(ord)d.", key=ekey, ord=ind), msg, cat) return ekeys.add(ekey) elif manc.startswith(self.p.sdhead): sddef = manc[len(self.p.sdhead):].lstrip() sdkey = str(self.sdord) sdexpr = sdkey + ":" + sddef if self.p.derivs: sdexpr = ">" + self.p.derivs + "\n" + sdexpr try: self.synder.import_string(sdexpr) cprops = self.synder.props(sdkey) - except Exception, e: + except Exception as e: errmsg = str_to_unicode(str(e)) warning_on_msg(_("@info", "Invalid derivation '%(deriv)s':\n" "%(msg)s", deriv=sddef, msg=errmsg), msg, cat) return - jumble = "".join(["".join(x) for x in cprops.items()]) + jumble = "".join(["".join(x) for x in list(cprops.items())]) if not psep: - psep = self._pick_sep(jumble, u"/|¦") - kvsep = self._pick_sep(jumble, u"=:→") + psep = self._pick_sep(jumble, "/|¦") + kvsep = self._pick_sep(jumble, "=:→") if not psep or not kvsep: warning_on_msg(_("@info", "No known separator are applicable " "to keys and values derived from " "'%(deriv)s'.", deriv=sddef), msg, cat) return else: if psep in jumble or kvsep in jumble: warning_on_msg(_("@info", "Previously selected separators " "are not applicable to " "keys and values derived from " "'%(deriv)s'.", deriv=sddef), msg, cat) return props.update(cprops) if not props: if ekeys: warning_on_msg(_("@info", "Some additional entry keys " "are defined for property map entry, " "but there are no properties."), msg, cat) return props = sorted(props.items()) # no need for dictionary any more # Add default keys. ekeys.add(msg.msgid) ekeys.add(msg.msgstr[0]) # Validate entry if requested. if self.propcons: errs = self._validate_props(props, msg, cat, self.propcons) if errs: problems = cjoin([" " + x for x in errs], "\n") warning_on_msg(_("@info", "Property map entry fails validation:\n" "%(msgs)s", msgs=problems), msg, cat) return # Entry parsed. ekeys = sorted(ekeys) props = sorted(props) self.entries.append((ekeys, props, psep, kvsep, msg, cat)) def finalize (self): # Check cross-entry validity, select valid. msgs_by_seen_msgstr = {} unique_entries = [] for entry in self.entries: d1, props, d3, d4, msg, cat = entry msgstr = msg.msgstr[0] if msgstr not in msgs_by_seen_msgstr: msgs_by_seen_msgstr[msgstr] = [] else: for d1, d2, oprops in msgs_by_seen_msgstr[msgstr]: if props == oprops: props = None break if props: unique_entries.append(entry) msgs_by_seen_msgstr[msgstr].append((msg, cat, props)) good_entries = [] for ekeys, props, psep, kvsep, msg, cat in unique_entries: eq_msgstr_set = msgs_by_seen_msgstr.get(msg.msgstr[0]) if eq_msgstr_set is not None: if len(eq_msgstr_set) > 1: cmsgcats = msgs_by_seen_msgstr.pop(msg.msgstr[0]) msg0, cat0, d3 = cmsgcats[0] warning_on_msg(_("@info split to link below", "Property map entries removed due " "to translation conflict with..."), msg0, cat0) for msg, cat, d3 in cmsgcats[1:]: warning_on_msg(_("@info continuation from above", "...this message."), msg, cat) else: good_entries.append((ekeys, props, psep, kvsep)) # If output file has not been given, only validation was expected. if not self.p.outfile: return # Serialize entries. 
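# One serialized entry as produced below: leading kvsep+psep, the sorted
# entry keys, the sorted properties, and a double trailing psep (sketch):
kvsep, psep = "=", "/"
ekeys = ["Athens", "Атина"]
props = [("gender", "female"), ("number", "sg")]
ekeystr = psep.join(sorted(ekeys))
propstr = psep.join(kvsep.join(x) for x in sorted(props))
print(kvsep + psep + ekeystr + psep + propstr + psep + psep)
# -> =/Athens/Атина/gender=female/number=sg//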
good_entries.sort(key=lambda x: x[0]) lines = [] for ekeys, props, psep, kvsep in good_entries: # Do Unicode, locale-unaware sorting, # for equal results over different systems; # they are not to be read by humans anyway. propstr = psep.join([kvsep.join(x) for x in sorted(props)]) ekeystr = psep.join(sorted(ekeys)) estr = kvsep + psep + ekeystr + psep + propstr + psep + psep lines.append(estr) # Write out the property map. lines.append("") fstr = "\n".join(lines) fstr = fstr.encode("UTF-8") fh = open(self.p.outfile, "w") fh.write(fstr) fh.close() msg = n_("@info:progress", "Collected %(num)d entry for the property map.", "Collected %(num)d entries for the property map.", num=len(good_entries)) report("===== " + msg) def _pick_sep (self, teststr, seps): good = False for sep in seps: if sep not in teststr: good = True break return sep if good else None def _read_propcons (self, fpath): if not os.path.isfile(fpath): raise SieveError(_("@info", "Property constraint file '%(file)s' " "does not exist.", file=fpath)) lines = open(fpath).read().decode("UTF-8").split("\n") if not lines[-1]: lines.pop() cmrx = re.compile(r"#.*") # Constraints collected as list of tuples: # (compiled key regex, string key regex, # compiled value regex, string value regex, # string of flags) propcons = [] lno = 0 def mkerr (problem): return _("@info", "Invalid property map constraint " "at %(file)s:%(line)d: %(snippet)s.", file=fpath, line=lno, snippet=problem) known_flags = set(("i", "I", "t", "r")) for line in lines: lno += 1 line = cmrx.sub("", line).strip() if not line: continue sep = line[0] if sep.isalnum(): raise SieveError(mkerr(_("@item:intext", "alphanumeric separators " "not allowed"))) lst = line.split(sep) if len(lst) < 4: raise SieveError(mkerr(_("@item:intext", "too few separators"))) elif len(lst) > 4: raise SieveError(mkerr(_("@item:intext", "too many separators"))) d1, keyrxstr, valrxstr, flags = lst unknown_flags = set(flags).difference(known_flags) if unknown_flags: fmtflags = format_item_list(sorted(unknown_flags), quoted=True) raise SieveError(mkerr(_("@item:intext", "unknown flags %(flaglist)s", flaglist=fmtflags))) rxs = [] for rxstr, iflag in ((keyrxstr, "I"), (valrxstr, "i")): rxfls = re.U if iflag in flags: rxfls |= re.I wrxstr = r"^(?:%s)$" % rxstr try: rx = re.compile(wrxstr, rxfls) except: raise SieveError(mkerr(_("@item:intext", "invalid regular expression " "'%(regex)s'", regex=rxstr))) rxs.append(rx) keyrx, valrx = rxs propcons.append((keyrx, keyrxstr, valrx, valrxstr, flags)) return propcons def _validate_props (self, props, msg, cat, propcons): matched_cons = set() errs = [] adderr = lambda err: errs.append(err) - for prop, ip in zip(props, range(len(props))): + for prop, ip in zip(props, list(range(len(props)))): key, val = prop key_matched = False - for propcon, ic in zip(propcons, range(len(propcons))): + for propcon, ic in zip(propcons, list(range(len(propcons)))): keyrx, keyrxstr, valrx, valrxstr, flags = propcon if keyrx.search(key): key_matched = True matched_cons.add(ic) if not valrx.search(val): pattern = valrx adderr(_("@info", "Value '%(val)s' to key '%(key)s' " "does not match '%(pattern)s'.", val=val, key=key, pattern=pattern)) if "t" in flags: if "i" in flags: eq = (val.lower() == msg.msgstr[0].lower()) else: eq = (val == msg.msgstr[0]) if not eq: adderr(_("@info", "Value '%(val)s' to key '%(key)s' " "does not match translation " "of the message.", val=val, key=key)) if not key_matched: adderr(_("@info", "Key '%(key)s' does not match any constraint.", key=key)) - for 
propcon, ic in zip(propcons, range(len(propcons))): + for propcon, ic in zip(propcons, list(range(len(propcons)))): pattern, rlags = propcon[1], propcon[-1] if "r" in flags and ic not in matched_cons: adderr(_("@info", "No key matched required constraint '%(pattern)s'.", pattern=pattern)) return errs diff --git a/sieve/diff_previous.py b/sieve/diff_previous.py index bcc9eaeb..2582920d 100644 --- a/sieve/diff_previous.py +++ b/sieve/diff_previous.py @@ -1,110 +1,110 @@ # -*- coding: UTF-8 -*- """ Embed differences in original text in fuzzy messages into previous fields. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import re from pology import _, n_ from pology.comments import parse_summit_branches from pology.diff import word_ediff, word_ediff_to_old from pology.report import report def setup_sieve (p): p.set_desc(_("@info sieve discription", "Diff previous to current fields in fuzzy messages." )) p.add_param("strip", bool, desc=_("@info sieve parameter discription", "Remove embedded differences from previous fields." )) - p.add_param("branch", unicode, seplist=True, + p.add_param("branch", str, seplist=True, metavar=_("@info sieve parameter value placeholder", "BRANCH"), desc=_("@info sieve parameter discription", "In summit catalogs, process only messages belonging to given branch. " "Several branches can be given as comma-separated list." )) class Sieve (object): def __init__ (self, params): self.nmod = 0 self.strip = params.strip self.branches = set(params.branch or []) def _diff (self, msgold, msgnew, format): # Remove any previous diff. msgold_clean = word_ediff_to_old(msgold) # Create the diff or only put back the clean text. if not self.strip: return word_ediff(msgold_clean, msgnew, markup=True, format=format) else: return msgold_clean def process (self, msg, cat): # Summit: if branches were given, skip the message if it does not # belong to any of the given branches. if self.branches: msg_branches = parse_summit_branches(msg) if not set.intersection(self.branches, msg_branches): return # Skip if message is not fuzzy or does not have previous fields. if not msg.fuzzy or msg.msgid_previous is None: # Remove any stray previous fields. msg.msgctxt_previous = None msg.msgid_previous = None msg.msgid_plural_previous = None return # Skip message if obsolete fuzzy. if msg.obsolete: return oldcount = msg.modcount msg.msgctxt_previous = self._diff(msg.msgctxt_previous, msg.msgctxt, msg.format) msg.msgid_previous = self._diff(msg.msgid_previous, msg.msgid, msg.format) msg.msgid_plural_previous = self._diff(msg.msgid_plural_previous, msg.msgid_plural, msg.format) if msg.modcount > oldcount: self.nmod += 1 def finalize (self): if self.nmod > 0: if not self.strip: msg = n_("@info:progress", "Added differences to %(num)d fuzzy message.", "Added differences to %(num)d fuzzy messages.", num=self.nmod) else: msg = n_("@info:progress", "Stripped differences from %(num)d fuzzy message.", "Stripped differences from %(num)d fuzzy messages.", num=self.nmod) report("===== " + msg) diff --git a/sieve/fancy_quote.py b/sieve/fancy_quote.py index eb0224cf..dba7ea05 100644 --- a/sieve/fancy_quote.py +++ b/sieve/fancy_quote.py @@ -1,302 +1,302 @@ # -*- coding: UTF-8 -*- """ Transform ASCII single and double quotes into fancy counterparts. Documented in C{doc/user/sieving.docbook}. 
@author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import os import re from pology import _, n_ from pology.comments import manc_parse_flag_list from pology.escape import split_escaped from pology.report import report from pology.sieve import SieveError def setup_sieve (p): p.set_desc(_("@info sieve discription", "Transform ASCII single and double quotes into fancy counterparts." )) - p.add_param("single", unicode, + p.add_param("single", str, metavar=_("@info sieve parameter value placeholder", "QUOTES"), desc=_("@info sieve parameter discription", "Opening and closing single quote (two characters)." )) - p.add_param("double", unicode, + p.add_param("double", str, metavar=_("@info sieve parameter value placeholder", "QUOTES"), desc=_("@info sieve parameter discription", "Opening and closing double quote (two characters)." )) - p.add_param("longsingle", unicode, + p.add_param("longsingle", str, metavar=_("@info sieve parameter value placeholder", "OPEN,CLOSED"), desc=_("@info sieve parameter discription", "Opening and closing single quote longer than single character." )) - p.add_param("longdouble", unicode, + p.add_param("longdouble", str, metavar=_("@info sieve parameter value placeholder", "OPEN,CLOSED"), desc=_("@info sieve parameter discription", "Opening and closing double quote longer than single character." )) # Pipe flag used to manually prevent transformation into fancy quotes. _flag_no_fancy_quote = "no-fancy-quote" class Sieve (object): def __init__ (self, params): self.nrepl_single = 0 self.nrepl_double = 0 # Pair of single quotes. self.singles = () if params.single is not None and params.longsingle is not None: raise SieveError( _("@info", "Both single- and multi-character replacement of " "single quotes issued.")) if params.single is not None: quotes = params.single if len(quotes) != 2: raise SieveError( _("@info", "Invalid specification of single quotes (%(quotes)s), " "expected two characters.", quotes=quotes)) self.singles = (quotes[0], quotes[1]) elif params.longsingle is not None: quotes = split_escaped(params.longsingle, ",") if len(quotes) != 2: raise SieveError( _("@info", "Invalid specification of single quotes (%(quotes)s), " "expected two strings.", quotes=quotes)) self.singles = (quotes[0], quotes[1]) # Pair of double quotes. self.doubles = () if params.double is not None and params.longdouble is not None: raise SieveError( _("@info", "Both single- and multi-character replacement of " "double quotes issued.")) if params.double is not None: quotes = params.double if len(quotes) != 2: raise SieveError( _("@info", "Invalid specification of double quotes (%(quotes)s), " "expected two characters.", quotes=quotes)) self.doubles = (quotes[0], quotes[1]) elif params.longdouble is not None: quotes = split_escaped(params.longdouble, ",") if len(quotes) != 2: raise SieveError( _("@info", "Invalid specification of double quotes '%(quotes)s', " "expected two strings.", quotes=quotes)) self.doubles = (quotes[0], quotes[1]) def process (self, msg, cat): # Skip the message when told so. if _flag_no_fancy_quote in manc_parse_flag_list(msg, "|"): return # Skip the message if special by context (one of meta-messages). if _spec_msgctxt_rx.search(msg.msgctxt or ""): return # Skip the message if auto comments identify it as literal user input. for cmnt in msg.auto_comment: cmnt = cmnt.lower() # - extracted by KDE's xml2pot if "tag:" in cmnt: tag = cmnt[cmnt.find(":")+1:].strip() if tag in _xml_literal_tags: return # Modify quotes in all translations. 
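# Expected behaviour of equip_fancy_quotes, defined at the end of this
# file (sketch):
text, nrepl = equip_fancy_quotes('Select "File" to continue.', '"', ("“", "”"))
print(text)   # -> Select “File” to continue.
print(nrepl)  # -> 1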
for i in range(len(msg.msgstr)): text = msg.msgstr[i] if self.singles: text, nrepl = equip_fancy_quotes(text, "'", self.singles) self.nrepl_single += nrepl if self.doubles: text, nrepl = equip_fancy_quotes(text, '"', self.doubles) self.nrepl_double += nrepl msg.msgstr[i] = text def finalize (self): nrepl_both = self.nrepl_single + self.nrepl_double if nrepl_both > 0: msg = n_("@info:progress", "Replaced %(num)d pair of quotes in translation " "(single+double: %(nums)d+%(numd)d).", "Replaced %(num)d pairs of quotes in translation " "(single+double: %(nums)d+%(numd)d).", num=nrepl_both, nums=self.nrepl_single, numd=self.nrepl_double) report("===== " + msg) # Regular expression for matching special messages by context. _spec_msgctxt = ( "qtdt-format", ) _spec_msgctxt_rx = re.compile("|".join(_spec_msgctxt)) # Regular expression for matching no-modify nodes in XML markup. _xml_literal_tags = ( # HTML "tt", "code", # KUIT "icode", "bcode", # Docbook "screen", "screenco", "userinput", "code", "literal", "markup", "programlisting", "programlistingco", "returnvalue", "command", "synopsis", "cmdsynopsis", "synopfragment", "synopfragmentref", "guilabel", "guimenuitem", "action", "errorname", ) _xml_literal_rx = re.compile(r"< *(%s)\b" % "|".join(_xml_literal_tags)) def equip_fancy_quotes (text, squote, fquotes): """ Heuristically replace simple with fancy quotes (eg. "foo" with “foo”). The replacement tries to avoid quotes in markup (e.g. XML attributes), and other situations where the original quoting should not be touched. @param text: the text to equip with fancy quotes @type text: string @param squote: the simple quote, used for both opening and closing @type squote: string @param fquotes: the opening and closing fancy quote @type fquotes: two-tuple of strings @returns: the modified text and number of fancy pairs replaced @rtype: string, int """ # Quick check: simple quote valid, any simple quotes at all? if not squote or squote not in text: return text, 0 nrepl = 0 no_mod_end = "" i_after_close = 0 i_open = -1 i = 0 ntext = "" lensq = len(squote) while i < len(text): # Calculate the length of no-modify segment if it starts here. no_mod_len = 0 # - known XML nodes which are literal user input to computer m = _xml_literal_rx.match(text, i) if m: tag = m.group(1) end_rx = re.compile(r"\b%s *>" % tag) m = end_rx.search(text, i + len(tag)) if m: # skip only if closed, otherwise stay put no_mod_len = m.span()[1] - i # - within XML tags elif text[i] == "<": ic = text.find(">", i + 1) if ic >= 0: # markup only if closed, otherwise stay put no_mod_len = ic - i + 1 # - text in special parenthesis elif text[i] in ("{", "["): qopen = text[i] if qopen == "{": qclose = "}" else: qclose = "]" # Look for balanced pair. 
nopen = 1 ic = i + 1 while ic < len(text) and nopen > 0: if text[ic] == qopen: nopen += 1 elif text[ic] == qclose: nopen -= 1 ic += 1 if nopen == 0: # special only if closed, otherwise stay put no_mod_len = ic - i # - simple quotes with no text in between elif text[i:i + 2 * lensq] == squote + squote: no_mod_len = 2 * lensq # - ASCII quote just after a number, and no opening quote so far # (may be a unit: inch, foot, minute, second) elif i_open < 0 and text[i:i + 1].isdigit(): if text[i + 1:i + 1 + lensq] == squote: no_mod_len = 1 + lensq # - simple quote in between two letters, may be a contraction elif ( text[i:i + 1].isalpha() and text[i + 1:i + 1 + lensq] == squote and text[i + 1 + lensq:i + 1 + lensq + 1].isalpha() ): no_mod_len = 1 + lensq + 1 # Advance past the end of no-modify segment if found. if no_mod_len > 0: i += no_mod_len # If at simple quote. elif text[i:i+len(squote)] == squote: if i_open < 0: # No quote opened, this is opening quote. i_open = i # record opening position ntext += text[i_after_close:i_open] # append text so far else: # Quote opened beforehand, this is closing quote. tseg = text[i_open + len(squote) : i] # quoted segment ntext += fquotes[0] + tseg + fquotes[1] # append fancy-quoted nrepl += 1 # count added fancy pair i_open = -1 # cancel opened state i_after_close = i + len(squote) # record position after closing # Advance past the simple quote i += len(squote) else: # Nothing special, advance to next char. i += 1 # Append the remaining text. if i_open >= 0: # Unpaired opening quote. ntext += text[i_open:] else: # All quotes paired. ntext += text[i_after_close:] return ntext, nrepl diff --git a/sieve/find_messages.py b/sieve/find_messages.py index 9d83c2c0..48d95625 100644 --- a/sieve/find_messages.py +++ b/sieve/find_messages.py @@ -1,418 +1,419 @@ # -*- coding: UTF-8 -*- """ Find messages in catalogs. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import locale import os import re import sys from pology import _, n_ from pology.message import MessageUnsafe from pology.remove import remove_accel_msg from pology.fsops import str_to_unicode from pology.getfunc import get_hook_ireq from pology.match import make_msg_matcher, make_matcher, make_filtered_msg from pology.match import ExprError from pology.msgreport import report_msg_content from pology.msgreport import report_msg_to_lokalize from pology.report import report, error, warning, format_item_list from pology.sieve import SieveError from pology.sieve import add_param_poeditors +from functools import reduce def setup_sieve (p): p.set_desc(_("@info sieve discription", "Find messages in catalogs." "\n\n" "Each message is matched according to one or several criteria, " "and if it matches as whole, it is displayed to standard output, " "along with the catalog path and referent line and entry number." "\n\n" "When several matching parameters are given, by default a message " "is matched if all of them match (AND-relation). " "This can be changed to OR-relation for matching in text fields " "(%(fieldlist)s) using the '%(par)s' parameter. " "Any matching parameter can be repeated when it makes sense " "(e.g. two matches on msgid).", fieldlist=format_item_list(["msgctxt", "msgid", "msgstr", "comment"]), par="or" )) # NOTE: Do not add default values for matchers, # we need None to see if they were issued or not. 
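# Illustrative invocations (assuming the usual posieve syntax; the
# parameters themselves are defined just below):
#
#   posieve find-messages -s msgid:'\bfile\b' -s transl ui/
#     reports translated messages whose msgid contains the word "file";
#   posieve find-messages -s msgid:foo -s msgstr:bar -s or ui/
#     with 'or', matches if either of the two text fields matches.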
- p.add_param("msgid", unicode, multival=True, + p.add_param("msgid", str, multival=True, metavar=_("@info sieve parameter value placeholder", "REGEX"), desc=_("@info sieve parameter discription", "Matches if the '%(field)s' field matches the regular expression.", field="msgid" )) - p.add_param("nmsgid", unicode, multival=True, + p.add_param("nmsgid", str, multival=True, metavar=_("@info sieve parameter value placeholder", "REGEX"), desc=_("@info sieve parameter discription", "Matches if the '%(field)s' field does not match the regular expression.", field="msgid" )) - p.add_param("msgstr", unicode, multival=True, + p.add_param("msgstr", str, multival=True, metavar=_("@info sieve parameter value placeholder", "REGEX"), desc=_("@info sieve parameter discription", "Matches if the '%(field)s' field matches the regular expression.", field="msgstr" )) - p.add_param("nmsgstr", unicode, multival=True, + p.add_param("nmsgstr", str, multival=True, metavar=_("@info sieve parameter value placeholder", "REGEX"), desc=_("@info sieve parameter discription", "Matches if the '%(field)s' field does not match the regular expression.", field="msgstr" )) - p.add_param("msgctxt", unicode, multival=True, + p.add_param("msgctxt", str, multival=True, metavar=_("@info sieve parameter value placeholder", "REGEX"), desc=_("@info sieve parameter discription", "Matches if the '%(field)s' field matches the regular expression.", field="msgctxt" )) - p.add_param("nmsgctxt", unicode, multival=True, + p.add_param("nmsgctxt", str, multival=True, metavar=_("@info sieve parameter value placeholder", "REGEX"), desc=_("@info sieve parameter discription", "Matches if the '%(field)s' field does not match the regular expression.", field="msgctxt" )) - p.add_param("comment", unicode, multival=True, + p.add_param("comment", str, multival=True, metavar=_("@info sieve parameter value placeholder", "REGEX"), desc=_("@info sieve parameter discription", "Matches if a comment line (extracted or translator) " "matches the regular expression." )) - p.add_param("ncomment", unicode, multival=True, + p.add_param("ncomment", str, multival=True, metavar=_("@info sieve parameter value placeholder", "REGEX"), desc=_("@info sieve parameter discription", "Matches if a comment line (extracted or translator) " "does not match the regular expression." )) p.add_param("transl", bool, desc=_("@info sieve parameter discription", "Matches if the message is translated." )) p.add_param("ntransl", bool, desc=_("@info sieve parameter discription", "Matches if the message is not translated." )) p.add_param("obsol", bool, desc=_("@info sieve parameter discription", "Matches if the message is obsolete." )) p.add_param("nobsol", bool, desc=_("@info sieve parameter discription", "Matches if the message is not obsolete." )) p.add_param("active", bool, desc=_("@info sieve parameter discription", "Matches if the message is active (translated and not obsolete)." )) p.add_param("nactive", bool, desc=_("@info sieve parameter discription", "Matches if the message is not active (not translated or obsolete)." )) - p.add_param("flag", unicode, multival=True, + p.add_param("flag", str, multival=True, metavar=_("@info sieve parameter value placeholder", "REGEX"), desc=_("@info sieve parameter discription", "Matches if one of the flags matches the regular expression." 
)) - p.add_param("nflag", unicode, multival=True, + p.add_param("nflag", str, multival=True, metavar=_("@info sieve parameter value placeholder", "REGEX"), desc=_("@info sieve parameter discription", "Matches if none of the flags matches the regular expression." )) p.add_param("plural", bool, desc=_("@info sieve parameter discription", "Matches if the message is plural." )) p.add_param("nplural", bool, desc=_("@info sieve parameter discription", "Matches if the message is not plural." )) p.add_param("maxchar", int, metavar=_("@info sieve parameter value placeholder", "NUM"), desc=_("@info sieve parameter discription", "Matches if both the '%(field1)s' and '%(field2)s' field " "have at most this many characters " "(0 or less means any number of characters).", field1="msgid", field2="msgstr" )) p.add_param("nmaxchar", int, metavar=_("@info sieve parameter value placeholder", "NUM"), desc=_("@info sieve parameter discription", "Matches if either the '%(field1)s' or '%(field2)s' field " "have more than this many characters " "(0 or less means any number of characters).", field1="msgid", field2="msgstr" )) - p.add_param("lspan", unicode, + p.add_param("lspan", str, metavar=_("@info sieve parameter value placeholder", "START:END"), desc=_("@info sieve parameter discription", "Matches if the message line number is in the given range " "(including starting line, excluding ending line)." )) - p.add_param("nlspan", unicode, + p.add_param("nlspan", str, metavar=_("@info sieve parameter value placeholder", "START:END"), desc=_("@info sieve parameter discription", "Matches if the message line number is not in the given range " "(including starting line, excluding ending line)." )) - p.add_param("espan", unicode, + p.add_param("espan", str, metavar=_("@info sieve parameter value placeholder", "START:END"), desc=_("@info sieve parameter discription", "Matches if the message entry number is in the given range " "(including starting entry, excluding ending entry)." )) - p.add_param("nespan", unicode, + p.add_param("nespan", str, metavar=_("@info sieve parameter value placeholder", "START:END"), desc=_("@info sieve parameter discription", "Matches if the message entry number is not in the given range " "(including starting entry, excluding ending entry)." )) - p.add_param("branch", unicode, seplist=True, + p.add_param("branch", str, seplist=True, metavar=_("@info sieve parameter value placeholder", "BRANCH"), desc=_("@info sieve parameter discription", "In summit catalogs, match only messages belonging to given branch. " "Several branches can be given as comma-separated list." )) - p.add_param("nbranch", unicode, seplist=True, + p.add_param("nbranch", str, seplist=True, metavar=_("@info sieve parameter value placeholder", "BRANCH"), desc=_("@info sieve parameter discription", "Match only messages not belonging to given branch." )) - p.add_param("fexpr", unicode, + p.add_param("fexpr", str, metavar=_("@info sieve parameter value placeholder", "EXPRESSION"), desc=_("@info sieve parameter discription", "Matches if the logical expression matches. " "The expression is composed of direct matchers (not starting with n*), " "explicitly linked with AND, OR, and NOT operators, and parenthesis. " "Base matchers taking parameters are given as MATCHER/VALUE/, " "where slash can be replaced consistently with any other character. " "Global matching modifiers can be overriden using MATCHER/VALUE/MODS, or " "MATCHER/MODS for parameterless matchers " "(currently available: c/i for case-sensitive/insensitive). 
" "Examples:" "\n\n" "fexpr:'(msgctxt/foo/ or comment/foo/) and msgid/bar/'" "\n\n" "fexpr:'msgid/quuk/ and msgstr/Qaak/c'" )) - p.add_param("nfexpr", unicode, + p.add_param("nfexpr", str, metavar=_("@info sieve parameter value placeholder", "EXPRESSION"), desc=_("@info sieve parameter discription", "Matches if the logical expression does not match." )) p.add_param("or", bool, defval=False, attrname="or_match", desc=_("@info sieve parameter discription", "Use OR-relation for matching text fields: if any of " "the patterns matches, the message is matched as whole." )) p.add_param("invert", bool, defval=False, desc=_("@info sieve parameter discription", "Invert the condition: report messages which do not match." )) p.add_param("case", bool, defval=False, desc=_("@info sieve parameter discription", "Case-sensitive text matching." )) - p.add_param("accel", unicode, multival=True, + p.add_param("accel", str, multival=True, metavar=_("@info sieve parameter value placeholder", "CHAR"), desc=_("@info sieve parameter discription", "Character which is used as UI accelerator marker in text fields, " "to ignore it on matching. " "If a catalog defines accelerator marker in the header, " "this value overrides it." )) p.add_param("mark", bool, defval=False, desc=_("@info sieve parameter discription", "Add '%(flag)s' flag to each matched message.", flag=_flag_mark )) - p.add_param("filter", unicode, multival=True, + p.add_param("filter", str, multival=True, metavar=_("@info sieve parameter value placeholder", "HOOK"), desc=_("@info sieve parameter discription", "F1A hook specification, to filter the msgstr fields through " "before matching them. " "Several hooks can be specified by repeating the parameter." )) - p.add_param("replace", unicode, + p.add_param("replace", str, metavar=_("@info sieve parameter value placeholder", "REPLSTR"), desc=_("@info sieve parameter discription", "Replace all substrings matched by msgstr pattern with REPLSTR. " "It can include back-references to matched groups (\\1, \\2, etc.)" )) p.add_param("nomsg", bool, defval=False, desc=_("@info sieve parameter discription", "Do not report message to standard output " "(when only the number of matches is wanted)." )) add_param_poeditors(p) -_flag_mark = u"match" +_flag_mark = "match" class Sieve (object): def __init__ (self, params): self.nmatch = 0 self.p = params # Build matching function. # It takes as arguments: filtered message, message, catalog, # and highlight specification (which is filled on matches). 
def make_match_group (names, negatable=False, orlinked=False): names_negs = [(x, False) for x in names] if negatable: names_negs.extend([(x, True) for x in names]) matchers = [] for name, neg in names_negs: nname = name if neg: nname = "n" + name values = getattr(params, nname) if values is None: # parameter not given continue if not isinstance(values, list): values = [values] for value in values: try: if name == "fexpr": m = make_msg_matcher(value, params) else: m = make_matcher(name, value, [], params, neg) - except ExprError, e: + except ExprError as e: raise SieveError(str_to_unicode(str(e))) matchers.append(m) if orlinked: expr = lambda *a: reduce(lambda s, m: s or m(*a), matchers, False) else: expr = lambda *a: reduce(lambda s, m: s and m(*a), matchers, True) return expr # - first matchers which are always AND expr_and = make_match_group([ "transl", "obsol", "active", "plural", "maxchar", "lspan", "espan", "flag", "branch", ], negatable=True, orlinked=False) # - then matchers which can be AND or OR expr_andor = make_match_group([ "msgctxt", "msgid", "msgstr", "comment", "fexpr", ], negatable=True, orlinked=self.p.or_match) # - all together self.matcher = lambda *a: expr_and(*a) and expr_andor(*a) # Prepare replacement. self.replrxs = [] if self.p.replace is not None: if not self.p.msgstr: raise SieveError( _("@info", "Cannot perform replacement if match " "on '%(field)s' is not given.", field="msgstr")) rxflags = re.U if not self.p.case: rxflags |= re.I for rxstr in self.p.msgstr: self.replrxs.append(re.compile(rxstr, rxflags)) # Resolve filtering hooks. self.pfilters = [] for hreq in self.p.filter or []: self.pfilters.append(get_hook_ireq(hreq, abort=True)) # Unless replacement or marking requested, no need to monitor/sync. if self.p.replace is None and not self.p.mark: self.caller_sync = False self.caller_monitored = False def process_header (self, hdr, cat): # Force explicitly given accelerators. if self.p.accel is not None: cat.set_accelerator(self.p.accel) def process (self, msg, cat): """ Returns 0 if the message is matched, 1 otherwise. """ # Prepare filtered message for matching. msgf = make_filtered_msg(msg, cat, filters=self.pfilters) # Match the message. hl_spec = [] match = self.matcher(msgf, msg, cat, hl_spec) if self.p.invert: match = not match if match: self.nmatch += 1 # Do the replacement in translation if requested. # NOTE: Use the real, not the filtered message. for regex in self.replrxs: for i in range(len(msg.msgstr)): msg.msgstr[i] = regex.sub(self.p.replace, msg.msgstr[i]) if not self.p.nomsg: delim = "-" * 20 if self.nmatch == 1: report(delim) report_msg_content(msg, cat, wrapf=cat.wrapf(), force=True, delim=delim, highlight=hl_spec) if self.p.mark: msg.flag.add(_flag_mark) if self.p.lokalize: report_msg_to_lokalize(msg, cat) elif self.p.mark and _flag_mark in msg.flag: # Remove the flag if present but the message does not match. msg.flag.remove(_flag_mark) return 0 if match else 1 def finalize (self): if self.nmatch: msg = n_("@info:progress", "Found %(num)d message satisfying the conditions.", "Found %(num)d messages satisfying the conditions.", num=self.nmatch) report("===== " + msg) diff --git a/sieve/generate_xml.py b/sieve/generate_xml.py index 77a4ac26..5c70f911 100644 --- a/sieve/generate_xml.py +++ b/sieve/generate_xml.py @@ -1,181 +1,181 @@ # -*- coding: UTF-8 -*- """ Generate an XML tree from the input PO files. Documented in C{doc/user/sieving.docbook}. 
@author: Nicolas Ternisien @license: GPLv3 """ from codecs import open import locale import os from os.path import abspath, basename, dirname, isdir, isfile, join import sys from pology import _, n_ from pology.report import report from pology.rules import loadRules, Rule from pology.timeout import TimedOutException def setup_sieve (p): p.set_desc(_("@info sieve discription", "Generate an XML tree from the input PO files." "\n\n" "See documentation for the description of the XML format used." )) - p.add_param("xml", unicode, + p.add_param("xml", str, metavar=_("@info sieve parameter value placeholder", "FILE"), desc=_("@info sieve parameter discription", "Write the XML tree into a file instead to standard output." )) # FIXME: Parameter name out of style. p.add_param("translatedOnly", bool, defval=False, desc=_("@info sieve parameter discription", "Consider only translated messages." )) class Sieve (object): def __init__ (self, params): self.xmlFile = None # File handle to write XML output self.filename = "" # File name we are processing self.translatedOnly = False # Also output in XML file ? if params.xml: xmlPath = params.xml if os.access(dirname(abspath(xmlPath)), os.W_OK): self.xmlFile=open(xmlPath, "w", "utf-8") else: warning(_("@info", "Cannot open file '%(file)s'. XML output disabled.", file=xmlPath)) self.translatedOnly = params.translatedOnly self.output('\n') self.output('\n') self.count = {} self.count["obs"] = 0 self.count["tot"] = 0 self.count["trn"] = 0 self.count["fuz"] = 0 self.count["unt"] = 0 # Indicators to the caller: self.caller_sync = False # no need to sync catalogs self.caller_monitored = False # no need for monitored messages def process(self, msg, cat): filename=basename(cat.filename) # Handle start/end of files for XML output (not needed for text output) if self.filename!=filename: if self.filename != "": self.write_stats() # close previous self.output("\n") self.filename=filename # open new po self.output('\n' % filename) # Test the add or not of this message if self.add_message(msg) is False: return # Statistics updating if msg.obsolete: self.count["obs"] += 1 status = "obsolete" else: self.count["tot"] += 1 if msg.translated: self.count["trn"] += 1 status = "translated" elif msg.fuzzy: self.count["fuz"] += 1 status = "fuzzy" elif msg.untranslated: self.count["unt"] += 1 status = "untranslated" # Output writing self.output("\t\n") self.output("\t\t%s\n" % msg.refline) self.output("\t\t%s\n" % msg.refentry) self.output("\t\t%s\n" % status) self.output("\t\t\n" % self.replace_cdata(msg.msgid) ) self.output("\t\t%s\n" % self.join_plural_form(msg.msgstr) ) if not msg.msgctxt: self.output("\t\t\n") else: self.output("\t\t\n" % self.replace_cdata(msg.msgctxt) ) self.output("\t\n") def join_plural_form(self, message_list): if len(message_list) == 1: return "" % self.replace_cdata(message_list[0]) message_str = "" for msgstr in message_list: message_str += "" % self.replace_cdata(msgstr) return message_str def add_message(self, msg): if self.translatedOnly is False: return True if self.translatedOnly is True and msg.translated is True: return True return False def replace_cdata(self, msg): return msg.replace("", "]]>") def output(self, content): if self.xmlFile: self.xmlFile.write(content) else: report(content.rstrip("\n")) def write_stats(self): self.output("\t\n") self.output("\t\t%s\n" % self.count["obs"]) self.output("\t\t%s\n" % self.count["tot"]) self.output("\t\t%s\n" % self.count["trn"]) self.output("\t\t%s\n" % self.count["fuz"]) self.output("\t\t%s\n" % 
self.count["unt"]) self.output("\t\n") self.count["obs"] = 0 self.count["tot"] = 0 self.count["trn"] = 0 self.count["fuz"] = 0 self.count["unt"] = 0 def finalize (self): self.write_stats() self.output("\n") self.output('\n') if self.xmlFile: # Close last po tag and xml file self.xmlFile.close() diff --git a/sieve/merge_corr_tree.py b/sieve/merge_corr_tree.py index 4778af26..1c3ee086 100644 --- a/sieve/merge_corr_tree.py +++ b/sieve/merge_corr_tree.py @@ -1,106 +1,106 @@ # -*- coding: UTF-8 -*- """ Merge translation corrections from (partial) PO files tree into main tree. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @author: Goran Rakic (Горан Ракић) @license: GPLv3 """ import os from pology import _, n_ from pology.catalog import Catalog from pology.header import Header from pology.report import report from pology.sieve import SieveError def setup_sieve (p): p.set_desc(_("@info sieve discription", "Merge translation corrections from partial PO files tree into main tree." "\n\n" "Give main PO files tree as input and provide the path difference to " "where the partial correction tree is available." )) - p.add_param("pathdelta", unicode, mandatory=True, + p.add_param("pathdelta", str, mandatory=True, metavar=_("@info sieve parameter value placeholder", "FIND[:REPLACE]"), desc=_("@info sieve parameter discription", "Specify that partial tree is available at path obtained when " "first FIND in the input path is replaced with REPLACE. " "If REPLACE is not given, FIND is just removed. " "Example:" "\n\n" "pathdelta:ui:ui-check" )) class Sieve (object): def __init__ (self, params): self.ncorr = 0 pathdelta = params.pathdelta if ":" not in pathdelta: self.pd_srch = pathdelta self.pd_repl = "" else: self.pd_srch, self.pd_repl = pathdelta.split(":", 1) def process_header (self, hdr, cat): # Cancel prior correction catalog. self.corr_cat = None # Construct expected path to correction catalog. corr_path = cat.filename.replace(self.pd_srch, self.pd_repl, 1) # Open the catalog if it exists. if os.path.isfile(corr_path): self.corr_cat = Catalog(corr_path) def process (self, msg, cat): if not self.corr_cat: # No correction catalog for this one, skip return if msg in self.corr_cat: corr_msg = self.corr_cat[msg] oldcount = msg.modcount # Need to take over manual comments too (the translator may have # made some upon correction), but without those added by pofilter. corr_manual_comment = [] for cmnt in corr_msg.manual_comment: if "(pofilter)" not in cmnt: corr_manual_comment.append(cmnt) # Take over all extraction-invariant parts # and set cleaned up comments. msg.set_inv(corr_msg) msg.manual_comment[:] = corr_manual_comment if msg.modcount > oldcount: self.ncorr += 1 def finalize (self): if self.ncorr > 0: msg = n_("@info:progress", "Merged %(num)d corrected message.", "Merged %(num)d corrected messages.", num=self.ncorr) report("===== " + msg) diff --git a/sieve/normctxt_delim.py b/sieve/normctxt_delim.py index 4e5e4462..89bbd0dc 100644 --- a/sieve/normctxt_delim.py +++ b/sieve/normctxt_delim.py @@ -1,103 +1,103 @@ # -*- coding: UTF-8 -*- """ Convert delimitor-embedded context to Gettext context. Documented in C{doc/user/sieving.docbook}. 
@author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ from pology import _, n_ from pology.escape import unescape_c as unescape from pology.msgreport import warning_on_msg from pology.report import report from pology.sieve import SieveError def setup_sieve (p): p.set_desc(_("@info sieve discription", "Convert delimiter-embedded context to Gettext context." )) - p.add_param("head", unicode, mandatory=True, + p.add_param("head", str, mandatory=True, metavar=_("@info sieve parameter value placeholder", "STRING"), desc=_("@info sieve parameter discription", "Start of the msgid field which indicates that the context follows." )) - p.add_param("tail", unicode, mandatory=True, + p.add_param("tail", str, mandatory=True, metavar=_("@info sieve parameter value placeholder", "STRING"), desc=_("@info sieve parameter discription", "End of context in msgid field, after which the text follows." )) class Sieve (object): def __init__ (self, params): self.nconv = 0 self.chead = unescape(params.head) if not self.chead: raise SieveError( _("@info", "Context head cannot be empty string.")) self.ctail = unescape(params.tail) if not self.ctail: raise SieveError( _("@info", "Context tail cannot be empty string.")) def process (self, msg, cat): # Skip messages already having Gettext context. if msg.msgctxt or msg.msgctxt_previous: return msrc = (cat.filename, msg.refline, msg.refentry) if msg.msgid.startswith(self.chead): pos = msg.msgid.find(self.ctail) if pos < 0: warning_on_msg(_("@info", "Malformed embedded context."), msg, cat) return ctxt = msg.msgid[len(self.chead):pos] text = msg.msgid[pos + len(self.ctail):] if not ctxt or not text: warning_on_msg(_("@info", "Empty context or text."), msg, cat) return exmsgs = cat.select_by_key(ctxt, text, wobs=True) if exmsgs: exmsg = exmsgs[0] if not msg.obsolete and exmsg.obsolete: cat.remove_on_sync(exmsg) elif msg.obsolete and not exmsg.obsolete: cat.remove_on_sync(msg) return else: warning_on_msg( _("@info", "A non-obsolete message with same context and text " "already exist."), msg, cat) return msg.msgctxt = ctxt msg.msgid = text self.nconv += 1 def finalize (self): if self.nconv > 0: msg = n_("@info:progress", "Converted %(num)d delimiter-embedded context.", "Converted %(num)d delimiter-embedded contexts.", num=self.nconv) report("===== " + msg) diff --git a/sieve/normctxt_sep.py b/sieve/normctxt_sep.py index d982b38e..77b85e0b 100644 --- a/sieve/normctxt_sep.py +++ b/sieve/normctxt_sep.py @@ -1,83 +1,83 @@ # -*- coding: UTF-8 -*- """ Convert separator-embedded context to Gettext context. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ from pology import _, n_ from pology.escape import unescape_c as unescape from pology.msgreport import warning_on_msg from pology.report import report from pology.sieve import SieveError def setup_sieve (p): p.set_desc(_("@info sieve discription", "Convert separator-embedded context to Gettext context." )) - p.add_param("sep", unicode, mandatory=True, + p.add_param("sep", str, mandatory=True, metavar=_("@info sieve parameter value placeholder", "STRING"), desc=_("@info sieve parameter discription", "Separator between the context and the text in msgid field." )) class Sieve (object): def __init__ (self, params): self.nconv = 0 self.csep = unescape(params.sep) if not self.csep: raise SieveError( _("@info", "Context separator cannot be empty string.")) def process (self, msg, cat): # Skip messages already having Gettext context. 
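# Illustration (not part of the sieve): with sep:"|", a message such as
#
#   msgid "Toolbar|Open"
#
# is rewritten to
#
#   msgctxt "Toolbar"
#   msgid "Open"
#
# but only when the separator occurs exactly once and both halves are
# non-empty; anything else is left untouched. First, as the comment
# above says, messages that already carry a context are skipped: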
if msg.msgctxt or msg.msgctxt_previous: return pos = msg.msgid.find(self.csep) if pos >= 0: if msg.msgid.find(self.csep, pos + len(self.csep)) >= 0: # If more than one delimiter, probably not context. return ctxt = msg.msgid[:pos] text = msg.msgid[pos + len(self.csep):] if not ctxt or not text: # Something is strange, skip. return exmsgs = cat.select_by_key(ctxt, text, wobs=True) if exmsgs: exmsg = exmsgs[0] if not msg.obsolete and exmsg.obsolete: cat.remove_on_sync(exmsg) elif msg.obsolete and not exmsg.obsolete: cat.remove_on_sync(msg) return else: return msg.msgctxt = ctxt msg.msgid = text self.nconv += 1 def finalize (self): if self.nconv > 0: msg = n_("@info:progress", "Converted %(num)d separator-embedded context.", "Converted %(num)d separator-embedded contexts.", num=self.nconv) report("===== " + msg) diff --git a/sieve/remove_fuzzy_comments.py b/sieve/remove_fuzzy_comments.py index e5aa88b2..e961d661 100644 --- a/sieve/remove_fuzzy_comments.py +++ b/sieve/remove_fuzzy_comments.py @@ -1,124 +1,124 @@ # -*- coding: UTF-8 -*- """ Remove selected manual comments in fuzzy messages. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import re from pology import _, n_ from pology.report import report def setup_sieve (p): p.set_desc(_("@info sieve discription", "Remove selected manual comments from fuzzy messages." )) p.add_param("all", bool, defval=False, desc=_("@info sieve parameter discription", "Remove all manual comments." )) p.add_param("nopipe", bool, defval=False, desc=_("@info sieve parameter discription", "Remove embedded lists of no-pipe flags (# |, foo, ...)." )) - p.add_param("pattern", unicode, + p.add_param("pattern", str, metavar=_("@info sieve parameter value placeholder", "REGEX"), desc=_("@info sieve parameter discription", "Remove comments matching the regular expression." )) - p.add_param("exclude", unicode, + p.add_param("exclude", str, metavar=_("@info sieve parameter value placeholder", "REGEX"), desc=_("@info sieve parameter discription", "Remove comments not matching the regular expression." )) p.add_param("case", bool, defval=False, desc=_("@info sieve parameter discription", "Case-sensitive pattern matching." )) class Sieve (object): def __init__ (self, params): self.sel_all = params.all self.sel_nopipe = params.nopipe self.rxflags = re.U if not params.case: self.rxflags |= re.I self.pattern = None if params.pattern: self.pattern = params.pattern self.pattern_rx = re.compile(self.pattern, self.rxflags) self.exclude = None if params.exclude: self.exclude = params.exclude self.exclude_rx = re.compile(self.exclude, self.rxflags) # Regex for matching no-pipe flag lists. self.nopipe_rx = re.compile(r"^\s*\|,") # Number of modified messages. self.nmod = 0 def process (self, msg, cat): # Process comments only for fuzzy messages. if not msg.fuzzy: return modcount = msg.modcount # Go through manual comments. i = 0 while i < len(msg.manual_comment): selected = False cmnt = msg.manual_comment[i] # Specific selections. if not selected and self.sel_all: selected = True if not selected and self.sel_nopipe: selected = self.nopipe_rx.search(cmnt) is not None # Inclusion pattern. if not selected and self.pattern is not None: selected = self.pattern_rx.search(cmnt) is not None # Exclusion pattern. if selected and self.exclude is not None: selected = self.exclude_rx.search(cmnt) is None # Apply selection. 
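# Net effect of the selection above (illustration): a comment is
# removed iff it is selected by 'all', 'nopipe' or 'pattern', and not
# deselected by 'exclude'. E.g. pattern:'^CHECKME' with exclude:'KEEP'
# removes the comment "CHECKME: rewrap" but keeps "CHECKME: KEEP as is".
# The removal itself: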
if selected: msg.manual_comment.pop(i) else: i += 1 if msg.modcount > modcount: self.nmod += 1 def finalize (self): if self.nmod > 0: msg = n_("@info:progress", "Removed some comments from %(num)d fuzzy message.", "Removed some comments from %(num)d fuzzy messages.", num=self.nmod) report("===== " + msg) diff --git a/sieve/resolve_aggregates.py b/sieve/resolve_aggregates.py index cce217a0..0c4fd16e 100644 --- a/sieve/resolve_aggregates.py +++ b/sieve/resolve_aggregates.py @@ -1,157 +1,157 @@ # -*- coding: UTF-8 -*- """ Resolve aggregate messages produced by C{msgcat}. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ # DESIGN NOTE: # If one of the messages is missing one of the parts that others have, # that part is silently not added to the aggregation -- there is no explicit # indicator to tell that it was missing. # PO file names need not be unique either (if collected from a directory tree), # so it is not possible to deduce this from file names; likewise for project ID. # This means that there is no way to reconstruct complete original messages, # so each part has to be resolved independently. import re from pology import _, n_ from pology.header import Header from pology.message import Message from pology.report import report from pology.sieve import SieveError def setup_sieve (p): p.set_desc(_("@info sieve discription", "Resolve aggregate messages produced by '%(cmd)s'.", cmd="msgcat" )) p.add_param("first", bool, defval=False, desc=_("@info sieve parameter discription", "Always pick the first variant (by default, aggregate messages " "are resolved by taking the most frequent variant)." )) p.add_param("unfuzzy", bool, defval=False, desc=_("@info sieve parameter discription", "Unfuzzy resolved messages. " "DANGEROUS: Use only if all messages in aggregation can be guaranteed " "not to be fuzzy." )) p.add_param("keepsrc", bool, defval=False, desc=_("@info sieve parameter discription", "Keep source reference on resolved messages instead of removing them." 
)) class Sieve (object): def __init__ (self, params): exclusive_picks = [params.first] if sum(exclusive_picks) > 1: raise SieveError( _("@info", "Only one resolution criterion for " "aggregate messages can be given.")) if params.first: self.selvar = _selvar_first else: self.selvar = _selvar_frequent self.unfuzzy = params.unfuzzy self.keepsrc = params.keepsrc self.nresolved = 0 self.nresolvedhdr = 0 def process_header (self, hdr, cat): hmsg = Message(hdr.to_msg()) if _resolve_msg(hmsg, self.selvar): self.nresolvedhdr += 1 cat.header = Header(hmsg) def process (self, msg, cat): if _resolve_msg(msg, self.selvar): self.nresolved += 1 if self.unfuzzy: msg.unfuzzy() if not self.keepsrc: msg.source[:] = [] def finalize (self): if self.nresolvedhdr > 0: msg = n_("@info:progress", "Resolved %(num)d aggregate header.", "Resolved %(num)d aggregate headers.", num=self.nresolvedhdr) report("===== " + msg) if self.nresolved > 0: msg = n_("@info:progress", "Resolved %(num)d aggregate message.", "Resolved %(num)d aggregate messages.", num=self.nresolved) report("===== " + msg) def _selvar_first (texts): return texts[0] def _selvar_frequent (texts): tinds_by_text = {} - for text, tind in zip(texts, range(len(texts))): + for text, tind in zip(texts, list(range(len(texts)))): if text not in tinds_by_text: tinds_by_text[text] = [] tinds_by_text[text].append(tind) - tinds = sorted(tinds_by_text.values(), key=lambda x: (-len(x), x)) + tinds = sorted(list(tinds_by_text.values()), key=lambda x: (-len(x), x)) return texts[tinds[0][0]] def _resolve_msg (msg, selvar): oldcount = msg.modcount if msg.manual_comment: aggtext = "\n".join(msg.manual_comment) msg.manual_comment[:] = _resolve_aggtext(aggtext, selvar).split("\n") if msg.auto_comment: aggtext = "\n".join(msg.auto_comment) msg.auto_comment[:] = _resolve_aggtext(aggtext, selvar).split("\n") # Separator swallows trailing newline, put it based on msgid. need_trailing_nl = msg.msgid.endswith("\n") for i in range(len(msg.msgstr)): nmsgstr = _resolve_aggtext(msg.msgstr[i], selvar) if need_trailing_nl and nmsgstr != msg.msgstr[i]: nmsgstr += "\n" msg.msgstr[i] = nmsgstr return msg.modcount > oldcount _splitter_rx = re.compile(r"\n?(?:#-){3,}# .*? (?:#-){3,}#\n?") def _resolve_aggtext (aggtext, selvar): texts = _splitter_rx.split(aggtext)[1:] - return unicode(selvar(texts)) if texts else aggtext + return str(selvar(texts)) if texts else aggtext diff --git a/sieve/resolve_alternatives.py b/sieve/resolve_alternatives.py index ccbc9bc1..f1579e60 100644 --- a/sieve/resolve_alternatives.py +++ b/sieve/resolve_alternatives.py @@ -1,103 +1,103 @@ # -*- coding: UTF-8 -*- """ Resolve alternative directives in translation. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import os import re import sys from pology import _, n_ from pology.msgreport import warning_on_msg from pology.report import report from pology.resolve import resolve_alternatives from pology.sieve import SieveError def setup_sieve (p): p.set_desc(_("@info sieve discription", "Resolve alternative directives in translation." )) - p.add_param("alt", unicode, mandatory=True, + p.add_param("alt", str, mandatory=True, metavar=_("@info sieve parameter value placeholder", "N,Mt"), desc=_("@info sieve parameter discription", "N is index (1-based) of the alternative to take from each directive, " "and M the number of alternatives per directive. 
Example:\n" "\n" "alt:1,2t" )) class Sieve (object): def __init__ (self, params): self.total = None self.select = None try: for spec in params.alt.split(","): if spec.endswith("t"): self.total = int(spec[:-1]) else: self.select = int(spec) except: raise SieveError( _("@info", "Malformed specification for " "resolution of alternatives '%(spec)s'.", spec=params.alt)) if self.total is None: raise SieveError( _("@info", "Number of alternatives per directive not given.")) if self.select is None: raise SieveError( _("@info", "Index of selected alternative not given.")) if self.total < 1: raise SieveError( _("@info", "Number of alternatives specified as %(num)d, " "but must be greater than 1.", num=self.total)) if self.select < 1 or self.select > self.total: raise SieveError( _("@info", "Selected alternative no. %(ord)d is out of range.", ord=self.select)) self.nresolved = 0 def process (self, msg, cat): for i in range(len(msg.msgstr)): msg.msgstr[i], nresolved, valid = \ resolve_alternatives(msg.msgstr[i], self.select, self.total, srcname=cat.filename) if valid: self.nresolved += nresolved else: warning_on_msg(_("@info", "Invalid alternatives directive " "in translation."), msg, cat) def finalize (self): if self.nresolved > 0: msg = n_("@info:progress", "Resolved %(num)d alternative in translation.", "Resolved %(num)d alternatives in translation.", num=self.nresolved) report("===== " + msg) diff --git a/sieve/resolve_entities.py b/sieve/resolve_entities.py index 04919f4f..87ef690a 100644 --- a/sieve/resolve_entities.py +++ b/sieve/resolve_entities.py @@ -1,75 +1,75 @@ # -*- coding: UTF-8 -*- """ Resolve XML entities in translation. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ from pology import _, n_ from pology.entities import read_entities from pology.msgreport import warning_on_msg from pology.report import report, format_item_list from pology.resolve import resolve_entities from pology.sieve import add_param_entdef def setup_sieve (p): p.set_desc(_("@info sieve discription", "Resolve XML entities in translation." )) add_param_entdef(p) - p.add_param("ignore", unicode, seplist=True, + p.add_param("ignore", str, seplist=True, metavar=_("@info sieve parameter value placeholder", "ENTITY1,..."), desc=_("@info sieve parameter discription", "Comma-separated list of entity names to ignore." )) class Sieve (object): def __init__ (self, params): self.entity_files = params.entdef or [] self.ignored_entities = ["lt", "gt", "apos", "quot", "amp"] if params.ignore: self.ignored_entities.extend(params.ignore) # Read entity definitions. self.entities = read_entities(self.entity_files) self.nresolved = 0 def process (self, msg, cat): for i in range(len(msg.msgstr)): msg.msgstr[i], resolved, unknown = \ resolve_entities(msg.msgstr[i], self.entities, self.ignored_entities, cat.filename) self.nresolved += len(resolved) if unknown: warning_on_msg(_("@info", "Unknown entities in translation: " "%(entlist)s.", entlist=format_item_list(unknown)), msg, cat) def finalize (self): if self.nresolved > 0: msg = n_("@info:progress", "Resolved %(num)d entity in translation.", "Resolved %(num)d entities in translation.", num=self.nresolved) report("===== " + msg) diff --git a/sieve/set_header.py b/sieve/set_header.py index f21942e6..4b085956 100644 --- a/sieve/set_header.py +++ b/sieve/set_header.py @@ -1,245 +1,245 @@ # -*- coding: UTF-8 -*- """ Set elements of the PO header. Documented in C{doc/user/sieving.docbook}. 
@author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import time import re from pology import _, n_ from pology.report import report, warning from pology.resolve import expand_vars from pology.sieve import SieveError def setup_sieve (p): p.set_desc(_("@info sieve discription", "Set elements of the PO header." "\n\n" "Sometimes a header field needs to be modified, added or removed " "in many catalogs, and this sieve serves that purpose." "\n\n" "%%-character in the value is used to expand known variables. " "Currently these are: %%%(var1)s - name of the catalog. " "If literal %% is needed (e.g. in plural forms header), " "it can be escaped as %%%%.", var1="poname" )) - p.add_param("field", unicode, multival=True, + p.add_param("field", str, multival=True, metavar=_("@info sieve parameter value placeholder", "FIELD:VALUE"), desc=_("@info sieve parameter discription", "Set a header field to the given value. " "This parameter can be repeated, to set several fields in single run." )) p.add_param("create", bool, defval=False, desc=_("@info sieve parameter discription", "Add the field if not present " "(by default the field value is set only if the field already exists " "in the header)." )) - p.add_param("after", unicode, + p.add_param("after", str, metavar=_("@info sieve parameter value placeholder", "FIELD"), desc=_("@info sieve parameter discription", "When the new field is being added, add it after this field. " "If such field does not exist, the new field is added as the last one." )) - p.add_param("before", unicode, + p.add_param("before", str, metavar=_("@info sieve parameter value placeholder", "FIELD"), desc=_("@info sieve parameter discription", "When the new field is being added, add it before this field. " "If such field does not exist, the new field is added as the last one." )) p.add_param("reorder", bool, defval=False, desc=_("@info sieve parameter discription", "If the field to be set is present, but not in the order implied by " "'%(par1)s' and '%(par2)s' parameters, reinsert it accordingly.", par1="after", par2="before" )) - p.add_param("remove", unicode, multival=True, + p.add_param("remove", str, multival=True, metavar=_("@info sieve parameter value placeholder", "FIELD"), desc=_("@info sieve parameter discription", "Remove the field." )) - p.add_param("removerx", unicode, multival=True, + p.add_param("removerx", str, multival=True, metavar=_("@info sieve parameter value placeholder", "REGEX"), desc=_("@info sieve parameter discription", "Remove all fields matching the regular expression. " "Matching is not case-sensitive." )) - p.add_param("title", unicode, multival=True, + p.add_param("title", str, multival=True, metavar=_("@info sieve parameter value placeholder", "VALUE"), desc=_("@info sieve parameter discription", "Set title comment to the given value." "Can be repeated to set several title lines. " "All existing title lines are removed before setting the new ones." )) p.add_param("rmtitle", bool, defval=False, desc=_("@info sieve parameter discription", "Remove title comments." )) - p.add_param("copyright", unicode, + p.add_param("copyright", str, metavar=_("@info sieve parameter value placeholder", "VALUE"), desc=_("@info sieve parameter discription", "Set copyright comment to the given value." )) p.add_param("rmcopyright", bool, defval=False, desc=_("@info sieve parameter discription", "Remove the copyright comment." 
)) - p.add_param("license", unicode, + p.add_param("license", str, metavar=_("@info sieve parameter value placeholder", "VALUE"), desc=_("@info sieve parameter discription", "Set license comment to the given value." )) p.add_param("rmlicense", bool, defval=False, desc=_("@info sieve parameter discription", "Remove the license comment." )) - p.add_param("author", unicode, multival=True, + p.add_param("author", str, multival=True, metavar=_("@info sieve parameter value placeholder", "VALUE"), desc=_("@info sieve parameter discription", "Set author comment to the given value. " "Can be repeated to set several authors. " "All existing authors are removed before setting the new ones." )) p.add_param("rmauthor", bool, defval=False, desc=_("@info sieve parameter discription", "Remove author comments." )) - p.add_param("comment", unicode, multival=True, + p.add_param("comment", str, multival=True, metavar=_("@info sieve parameter value placeholder", "VALUE"), desc=_("@info sieve parameter discription", "Set free comment to the given value. " "Can be repeated to set several free comment lines. " "All existing comment lines are removed before setting the new ones." )) p.add_param("rmcomment", bool, defval=False, desc=_("@info sieve parameter discription", "Remove free comments." )) p.add_param("rmallcomm", bool, defval=False, desc=_("@info sieve parameter discription", "Remove all header comments." )) class Sieve (object): def __init__ (self, params): # Parse field setting specifications. self.fields_values = [] for field_value_str in (params.field or []): field_value = field_value_str.split(":", 1) if len(field_value) != 2: raise SieveError( _("@info", "Invalid specification '%(spec)s' " "of header field and value.", spec=field_value_str)) self.fields_values.append(field_value) # Set fields in reverse, so that 'after' and 'before' parameters # are followed by the order of appearance of fields in command line. if params.after or params.before: self.fields_values.reverse() # Prepare matching for field removal. if params.removerx is not None: rxs = [] for rxstr in params.removerx: try: rx = re.compile(rxstr, re.U|re.I) except: raise SieveError( _("@info", "Invalid regular expression '%(regex)s' " "for removing fields.", regex=rxstr)) rxs.append(rx) params.removerx = rxs # Check validity of comment values. 
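# Illustrative invocation (assuming the usual posieve syntax):
#
#   posieve set-header -s create -s field:'Language:ca' \
#       -s after:'Language-Team' ca/
#
# sets the Language header field in every catalog, adding it right
# after Language-Team where missing (because of 'create'). The checks
# below reject comment values that look misplaced: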
for title in (params.title or []): if re.search(r"copyright|©|\(C\)|license|<.*?@.*?>", title, re.I|re.U): raise SieveError( _("@info", "Invalid value '%(val)s' for title comment " "(it contains some elements appropriate " "for other types of comments).", val=title)) if params.copyright is not None: if not re.search(r"copyright|©|\(C\)", params.copyright, re.I|re.U): raise SieveError( _("@info", "Invalid value '%(val)s' for copyright comment " "(missing the word 'copyright'?).", val=params.copyright)) if params.license is not None: if not re.search(r"license", params.license, re.I): raise SieveError( _("@info", "Invalid value '%(val)s' for license comment " "(missing the word 'license'?).", val=params.license)) for author in (params.author or []): if not re.search(r"<.*?@.*?>", author): raise SieveError( _("@info", "Invalid value '%(val)s' for author comment " "(missing the email address?).", val=author)) self.p = params def process_header (self, hdr, cat): pvars = {"poname" : cat.name} for rmname in self.p.remove or []: hdr.remove_field(rmname) for rmrx in self.p.removerx or []: to_remove = set() for name, value in hdr.field: if name not in to_remove and rmrx.search(name): to_remove.add(name) for name in to_remove: hdr.remove_field(name) for field, value in self.fields_values: if self.p.create or hdr.select_fields(field): hdr.set_field(field, expand_vars(value, pvars), after=self.p.after, before=self.p.before, reorder=self.p.reorder) if self.p.rmtitle or self.p.rmallcomm: hdr.title[:] = [] if self.p.title is not None: hdr.title[:] = [expand_vars(x, pvars) for x in self.p.title] if self.p.rmcopyright or self.p.rmallcomm: hdr.copyright = None if self.p.copyright is not None: hdr.copyright = expand_vars(self.p.copyright, pvars) if self.p.rmlicense or self.p.rmallcomm: hdr.license = None if self.p.license is not None: hdr.license = expand_vars(self.p.license, pvars) if self.p.rmauthor or self.p.rmallcomm: hdr.author[:] = [] if self.p.author is not None: hdr.author[:] = [expand_vars(x, pvars) for x in self.p.author] if self.p.rmcomment or self.p.rmallcomm: hdr.comment[:] = [] if self.p.comment is not None: hdr.comment[:] = [expand_vars(x, pvars) for x in self.p.comment] diff --git a/sieve/stats.py b/sieve/stats.py index 84e36e1b..03046772 100644 --- a/sieve/stats.py +++ b/sieve/stats.py @@ -1,961 +1,961 @@ # -*- coding: UTF-8 -*- """ Catalog statistics: message and word counts, etc. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import codecs import locale import os import sys from pology import _, n_ from pology.catalog import Catalog from pology.message import MessageUnsafe from pology.colors import ColorString, cjoin, cinterp from pology.comments import parse_summit_branches from pology.diff import tdiff from pology.fsops import collect_catalogs from pology.getfunc import get_hook_ireq from pology.report import report, warning, format_item_list from pology.split import proper_words from pology.tabulate import tabulate from pology.sieve import SieveError def setup_sieve (p): p.set_desc(_("@info sieve discription", "Compute translation statistics.\n" "\n" "Provides basic count of number of messages by type (translated, fuzzy, " "etc.), along with words and character counts, and some other derived " "statistics on request." 
)) - p.add_param("accel", unicode, multival=True, + p.add_param("accel", str, multival=True, metavar=_("@info sieve parameter value placeholder", "CHAR"), desc=_("@info sieve parameter discription", "Character which is used as UI accelerator marker in text fields, " "to remove it before counting. " "If a catalog defines accelerator marker in the header, " "this value overrides it." )) p.add_param("detail", bool, defval=False, desc=_("@info sieve parameter discription", "Compute and display some derived statistical quantities." )) p.add_param("incomplete", bool, defval=False, desc=_("@info sieve parameter discription", "List catalogs which are not fully translated, with incompletness counts." )) - p.add_param("incompfile", unicode, + p.add_param("incompfile", str, metavar=_("@info sieve parameter value placeholder", "FILE"), desc=_("@info sieve parameter discription", "Write paths of catalogs that are not fully translated into a file, " "one per line." )) - p.add_param("templates", unicode, + p.add_param("templates", str, metavar=_("@info sieve parameter value placeholder", "FIND:REPLACE"), desc=_("@info sieve parameter discription", "Count in templates without a corresponding catalog (i.e. translation on " "it has not started yet) into statistics. " "Assumes that translated catalogs and templates live in two root " "directories with same structure; then for each path of an existing " "catalog, its directory is taken and the path to corresponding templates " "directory constructed by replacing first occurence of FIND with REPLACE." )) - p.add_param("branch", unicode, seplist=True, + p.add_param("branch", str, seplist=True, metavar=_("@info sieve parameter value placeholder", "BRANCH"), desc=_("@info sieve parameter discription", "In summit catalogs, count in only messages belonging to given branch. " "Several branches can be given as comma-separated list." )) p.add_param("maxwords", int, metavar=_("@info sieve parameter value placeholder", "NUMBER"), desc=_("@info sieve parameter discription", "Count in only messages which have at most this many words, " "either in original or translation." )) p.add_param("minwords", int, metavar=_("@info sieve parameter value placeholder", "NUMBER"), desc=_("@info sieve parameter discription", "Count in only messages which have at least this many words, " "either in original or translation." )) - p.add_param("lspan", unicode, + p.add_param("lspan", str, metavar=_("@info sieve parameter value placeholder", "FROM:TO"), desc=_("@info sieve parameter discription", "Count in only messages at or after line FROM, and before line TO. " "If FROM is empty, 0 is assumed; " "if TO is empty, total number of lines is assumed." )) - p.add_param("espan", unicode, + p.add_param("espan", str, metavar=_("@info sieve parameter value placeholder", "FROM:TO"), desc=_("@info sieve parameter discription", "Count in only messages at or after entry FROM, and before entry TO. " "If FROM is empty, 0 is assumed; " "if TO is empty, total number of entries is assumed." )) p.add_param("bydir", bool, defval=False, desc=_("@info sieve parameter discription", "Report statistics per leaf directory in searched paths." )) p.add_param("byfile", bool, defval=False, desc=_("@info sieve parameter discription", "Report statistics per catalog." )) p.add_param("wbar", bool, defval=False, desc=_("@info sieve parameter discription", "Show statistics in form of word bars." )) p.add_param("msgbar", bool, defval=False, desc=_("@info sieve parameter discription", "Show statistics in form of message bars." 
)) p.add_param("msgfmt", bool, defval=False, desc=_("@info sieve parameter discription", "Show a minimal summary of the statistics (like msgfmt)." )) p.add_param("absolute", bool, defval=False, desc=_("@info sieve parameter discription", "Scale lengths of word and message bars to numbers they represent, " "rather than relative to percentage of translation state. " "Useful with '%(par1)s' and '%(par2)s' parameters, " "to compare sizes of different translation units.", par1="byfile", par2="bydir" )) p.add_param("ondiff", bool, defval=False, desc=_("@info sieve parameter discription", "Split word and character counts of fuzzy messages " "into translated and untranslated categories (leaving zero in fuzzy), " "based on difference ratio between current and previous original text." )) p.add_param("mincomp", float, defval=None, metavar=_("@info sieve parameter value placeholder", "RATIO"), desc=_("@info sieve parameter discription", "Include into statistics only catalogs with sufficient completeness, " "as ratio of translated to other messages (real value between 0 and 1)." )) - p.add_param("filter", unicode, multival=True, + p.add_param("filter", str, multival=True, metavar=_("@info sieve parameter value placeholder", "HOOK"), desc=_("@info sieve parameter discription", "F1A hook specification, to filter the translation through. " "Several filters can be specified by repeating the parameter." )) class Sieve (object): def __init__ (self, params): self.p = params # Templates correspondence. # Mapping of catalogs to templates, in form of :. # For each catalog file path, the first substring is replaced # by , and .po replaced with .pot, to construct its template # file path. All templates not found under such paths are reported. # Furthermore, all subdirs of these paths are searched for templates # without corresponding catalogs, and every such template is counted # as fully untranslated PO. if self.p.templates: if ":" not in self.p.templates: self.tspec_srch = self.p.templates self.tspec_repl = "" else: self.tspec_srch, self.tspec_repl = self.p.templates.split(":", 1) # Turn off table display if a bar view has been selected. self.p.table = True if self.p.msgbar or self.p.wbar or self.p.msgfmt: self.p.table = False # Filenames of catalogs which are not fully translated. self.incomplete_catalogs = {} # Counted categories. self.count_spec = ( ("trn", _("@title:row translated messages/words/characters", "translated")), ("fuz", _("@title:row fuzzy messages/words/characters", "fuzzy")), ("unt", _("@title:row untranslated messages/words/characters", "untranslated")), ("tot", _("@title:row fuzzy messages/words/characters", "total")), ("obs", _("@title:row fuzzy messages/words/characters", "obsolete")), ) # FIXME: After parameter parser can deliver requested sequence type. if self.p.branch is not None: self.p.branch = set(self.p.branch) # Parse line/entry spans. 
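# Expected forms (illustration): lspan:"100:200" parses to (100, 200),
# lspan:":50" to (None, 50), and lspan:"10:" to (10, None); None on
# either side means no bound on that side. The parser: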
def parse_span (spanspec): lst = spanspec is not None and spanspec.split(":") or ("", "") if len(lst) != 2: raise SieveError( _("@info", "Wrong number of elements in span " "specification '%(spec)s'.", spec=spanspec)) nlst = [] for el in lst: if not el: nlst.append(None) else: try: nlst.append(int(el)) except ValueError: raise SieveError( _("@info", "Not an integer number in span " "specification '%(spec)s'.", spec=spanspec)) return tuple(nlst) self.lspan = parse_span(self.p.lspan) self.espan = parse_span(self.p.espan) # Number of counts per category: # messages, words in original, words in translation, # characters in original, characters in translation. self.counts_per_cat = 5 # Category counts per catalog filename. self.counts = {} # Collections of all confirmed templates and tentative template subdirs. self.matched_templates = {} self.template_subdirs = [] if self.p.templates: for rpath in params.root_paths: if os.path.isfile(rpath): rpath = os.path.dirname(rpath) rpath = rpath.replace(self.tspec_srch, self.tspec_repl, 1) self.template_subdirs.append(rpath) # Map of template to translation subdirs. self.mapped_template_subdirs = {} # Some indicators of metamessages. self.xml2po_meta_msgid = dict([(x, True) for x in ("translator-credits",)]) self.xml2pot_meta_msgid = dict([(x, True) for x in ("ROLES_OF_TRANSLATORS", "CREDIT_FOR_TRANSLATORS")]) self.kde_meta_msgctxt = dict([(x, True) for x in ("NAME OF TRANSLATORS", "EMAIL OF TRANSLATORS")]) # Resolve filtering hooks. self.pfilters = [] for hreq in self.p.filter or []: self.pfilters.append(get_hook_ireq(hreq, abort=True)) # Indicators to the caller: self.caller_sync = False # no need to sync catalogs self.caller_monitored = False # no need for monitored messages def _count_zero (self): return dict([(x[0], [0] * self.counts_per_cat) for x in self.count_spec]) def _count_sum (self, c1, c2): cs = self._count_zero() for cat, catname in self.count_spec: for i in range(self.counts_per_cat): cs[cat][i] = c1[cat][i] + c2[cat][i] return cs def process_header (self, hdr, cat): # Establish counts for this file. if cat.filename not in self.counts: self.counts[cat.filename] = self._count_zero() self.count = self.counts[cat.filename] # If template correspondence requested, handle template matching. if ( self.p.templates and not cat.filename.endswith(".pot")): # Construct expected template path. tpath = cat.filename.replace(self.tspec_srch, self.tspec_repl, 1) pdot = tpath.rfind(".") if pdot >= 0: tpath = tpath[:pdot] + ".pot" # Inform if the template does not exist. if not os.path.isfile(tpath): warning(_("@info", "Expected template catalog '%(file)s' is missing.", file=tpath)) # Indicate the template has been matched. if tpath not in self.matched_templates: self.matched_templates[tpath] = True # Force explicitly given accelerators. if self.p.accel is not None: cat.set_accelerator(self.p.accel) def process (self, msg, cat): # Summit: if branches were given, skip the message if it does not # belong to any of the given branches. if self.p.branch: msg_branches = parse_summit_branches(msg) if not set.intersection(self.p.branch, msg_branches): return # If line/entry spans given, skip message if not in range. if self.lspan[0] is not None and msg.refline < self.lspan[0]: return if self.lspan[1] is not None and msg.refline >= self.lspan[1]: return if self.espan[0] is not None and msg.refentry < self.espan[0]: return if self.espan[1] is not None and msg.refentry >= self.espan[1]: return # Decide if a metamessage: ismeta = False # - msgid in form "@@: ..."
from xml2po if msg.msgid.startswith("@@"): ps = msg.msgid.find(":") ismeta = (ps >= 0 and msg.msgid[2:ps].isalnum()) # - translator credits from xml2po and xml2pot if ( msg.msgid in self.xml2po_meta_msgid or msg.msgid in self.xml2pot_meta_msgid ): ismeta = True # - translator credits in KDE GUI if msg.msgctxt in self.kde_meta_msgctxt: ismeta = True # Prepare filtered message for counting. if self.pfilters: msg = MessageUnsafe(msg) for pfilter in self.pfilters: for i in range(len(msg.msgstr)): msg.msgstr[i] = pfilter(msg.msgstr[i]) # Count the words and characters in original and translation. # Remove shortcut markers prior to counting; don't include words # which do not start with a letter; remove scripted part. # For plural messages compute averages of msgid and msgstr groups, # to normalize comparative counts on varying number of plural forms. nwords = {"orig" : 0, "tran" : 0} nchars = {"orig" : 0, "tran" : 0} msgids = [msg.msgid] if msg.msgid_plural is not None: msgids.append(msg.msgid_plural) for src, texts in (("orig", msgids), ("tran", msg.msgstr)): if ismeta: # consider metamessages as zero counts continue lnwords = [] # this group's word count, for averaging lnchars = [] # this group's character count, for averaging for text in texts: pf = text.find("|/|") if pf >= 0: text = text[0:pf] words = proper_words(text, True, cat.accelerator(), msg.format) # If there are no proper words but there are some characters, # set to one empty word in order for a fuzzy or # an untranslated message not to be considered translated # when only word counts are observed. if not words and text: words = [""] lnwords.append(len(words)) lnchars.append(len("".join(words))) nwords[src] += int(round(float(sum(lnwords)) / len(texts))) nchars[src] += int(round(float(sum(lnchars)) / len(texts))) #nchars[src] += (nwords[src] - 1) # nominal space per each two words # If the number of words has been limited, skip the message if it # does not fall in the range. if self.p.maxwords is not None: if not ( nwords["orig"] <= self.p.maxwords or nwords["tran"] <= self.p.maxwords): return if self.p.minwords is not None: if not ( nwords["orig"] >= self.p.minwords or nwords["tran"] >= self.p.minwords): return # Split word and character counts in fuzzy original if requested. nswords = {} nschars = {} if self.p.ondiff and msg.fuzzy and msg.msgid_previous is not None: diff, dr = tdiff(msg.msgid_previous, msg.msgid, diffr=True) # Reduce difference ratio to a smaller range by some threshold. # Texts more different than the threshold need full review. drth = 0.4 #dr2 = dr if dr < drth else 1.0 dr2 = min(dr / drth, 1.0) # Split counts between primary fuzzy count, and secondary # translated, so that total remains the same. nswords.update({"trn": {}, "fuz": {}, "unt": {}}) nschars.update({"trn": {}, "fuz": {}, "unt": {}}) for nitems, nitems2, src in ( (nwords, nswords, "orig"), (nwords, nswords, "tran"), (nchars, nschars, "orig"), (nchars, nschars, "tran"), ): num = nitems[src] # Difference ratio of 0 can happen if the new and old texts # are the same, normally when only the context has changed. # Fuzzy counts should not be totally eliminated then, # as it should be seen that message needs updating. if dr2 > 0.0: rnum = int(round(dr2 * num + 0.5)) # round up else: rnum = 1 rnum = min(rnum, num) # in case of rounding overflow nitems2["trn"][src] = num - rnum nitems2["fuz"][src] = 0 nitems2["unt"][src] = rnum # Detect categories and add the counts. 
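# Aside (illustration, not from the patch): the ondiff splitting above in
# numbers -- the difference ratio dr is clipped by the threshold drth, and
# the rounded-up share of the count is kept as "untranslated":
num, dr, drth = 10, 0.2, 0.4
dr2 = min(dr / drth, 1.0)                      # 0.5
rnum = min(int(round(dr2 * num + 0.5)), num)   # 6, rounded up
assert (num - rnum, rnum) == (4, 6)            # to "translated" / "untranslated"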
categories = set() if not msg.obsolete: # do not count obsolete into totals self.count["tot"][0] += 1 categories.add("tot") if nswords: - categories.update(nswords.keys()) + categories.update(list(nswords.keys())) if msg.obsolete: # do not split obsolete into fuzzy/translated self.count["obs"][0] += 1 categories.add("obs") nswords = {} nschars = {} elif msg.translated: self.count["trn"][0] += 1 categories.add("trn") elif msg.fuzzy: self.count["fuz"][0] += 1 categories.add("fuz") if cat.filename not in self.incomplete_catalogs: self.incomplete_catalogs[cat.filename] = True elif msg.untranslated: self.count["unt"][0] += 1 categories.add("unt") if cat.filename not in self.incomplete_catalogs: self.incomplete_catalogs[cat.filename] = True for cat in categories: nwords1 = nswords.get(cat, nwords) nchars1 = nschars.get(cat, nchars) self.count[cat][1] += nwords1["orig"] self.count[cat][2] += nwords1["tran"] self.count[cat][3] += nchars1["orig"] self.count[cat][4] += nchars1["tran"] # Sort filenames as if templates-only were within language subdirs. def _sort_equiv_filenames (self, filenames): def equiv_template_path (x): cdir = os.path.dirname(x) if cdir in self.mapped_template_subdirs: cdir = self.mapped_template_subdirs[cdir] return os.path.join(cdir, os.path.basename(x)) else: return x filenames.sort(key=lambda x: equiv_template_path(x)) def finalize (self): # If template correspondence requested, handle POTs without POs. if self.template_subdirs: # Collect all catalogs in template subdirs. tpaths = collect_catalogs(self.template_subdirs) - tpaths = filter(self.p.is_cat_included, tpaths) + tpaths = list(filter(self.p.is_cat_included, tpaths)) # Filter to have only POTs remain. tpaths = [x for x in tpaths if x.endswith(".pot")] # Filter to leave out matched templates. tpaths = [x for x in tpaths if x not in self.matched_templates] # Add stats on all unmatched templates. for tpath in tpaths: cat = Catalog(tpath, monitored=False) self.process_header(cat.header, cat) for msg in cat: self.process(msg, cat) # Map template to translation subdirs. for tpath in tpaths: tsubdir = os.path.dirname(tpath) subdir = tsubdir.replace(self.tspec_repl, self.tspec_srch, 1) self.mapped_template_subdirs[tsubdir] = subdir # If completeness limit in effect, eliminate catalogs not passing it. if self.p.mincomp is not None: ncounts = {} ninccats = {} - for filename, count in self.counts.iteritems(): + for filename, count in self.counts.items(): cr = float(count["trn"][0]) / (count["tot"][0] or 1) if cr >= self.p.mincomp: ncounts[filename] = count inccat = self.incomplete_catalogs.get(filename) if inccat is not None: ninccats[filename] = inccat self.counts = ncounts self.incomplete_catalogs = ninccats # Assemble sets of total counts by requested divisions. count_overall = self._count_zero() counts_bydir = {} filenames_bydir = {} - for filename, count in self.counts.iteritems(): + for filename, count in self.counts.items(): count_overall = self._count_sum(count_overall, count) if self.p.bydir: cdir = os.path.dirname(filename) if cdir in self.mapped_template_subdirs: # Pretend templates-only are within language subdir. cdir = self.mapped_template_subdirs[cdir] if cdir not in counts_bydir: counts_bydir[cdir] = self._count_zero() filenames_bydir[cdir] = [] counts_bydir[cdir] = self._count_sum(counts_bydir[cdir], count) filenames_bydir[cdir].append(filename) # Arrange sets into ordered list with titles. 
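# Aside (illustration, not from the patch): each per-category count list
# has the five-slot layout set up in __init__ -- [messages, words in
# original, words in translation, chars in original, chars in translation]
# -- and _count_sum() adds two such lists slot by slot:
c1 = [1, 10, 12, 50, 60]
c2 = [2, 5, 5, 20, 22]
assert [a + b for a, b in zip(c1, c2)] == [3, 15, 17, 70, 82]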
counts = [] if self.p.bydir: - cdirs = counts_bydir.keys(); + cdirs = list(counts_bydir.keys()); cdirs.sort() for cdir in cdirs: if self.p.byfile: self._sort_equiv_filenames(filenames_bydir[cdir]) for filename in filenames_bydir[cdir]: counts.append((filename, self.counts[filename], False)) counts.append(("%s/" % cdir, counts_bydir[cdir], False)) counts.append((_("@item:intable sum of all other entries", "(overall)"), count_overall, True)) elif self.p.byfile: - filenames = self.counts.keys() + filenames = list(self.counts.keys()) self._sort_equiv_filenames(filenames) for filename in filenames: counts.append((filename, self.counts[filename], False)) counts.append((_("@item:intable sum of all other entries", "(overall)"), count_overall, True)) else: counts.append((None, count_overall, False)) # Indicate conspicuously up front modifiers to counting. modstrs = [] if self.p.branch: fmtbranches = format_item_list(self.p.branch) modstrs.append(_("@item:intext", "branches (%(branchlist)s)", branchlist=fmtbranches)) if self.p.maxwords is not None and self.p.minwords is None: modstrs.append(n_("@item:intext", "at most %(num)d word", "at most %(num)d words", num=self.p.maxwords)) if self.p.minwords is not None and self.p.maxwords is None: modstrs.append(n_("@item:intext", "at least %(num)d word", "at least %(num)d words", num=self.p.minwords)) if self.p.minwords is not None and self.p.maxwords is not None: modstrs.append(n_("@item:intext", "from %(num1)d to %(num)d word", "from %(num1)d to %(num)d words", num1=self.p.minwords, num=self.p.maxwords)) if self.p.lspan: modstrs.append(_("@item:intext", "line span %(span)s", span=self.p.lspan)) if self.p.espan: modstrs.append(_("@item:intext", "entry span %(span)s", span=self.p.espan)) if self.p.ondiff: modstrs.append(_("@item:intext", "scaled fuzzy counts")) # Should titles be output in-line or on separate lines. self.inline = False maxtitlecw = 0 if (not self.p.wbar or not self.p.msgbar or not self.p.msgfmt) and (not self.p.table): for title, count, summed in counts: if title is not None: self.inline = True titlecw = len(title) if maxtitlecw < titlecw: maxtitlecw = titlecw # Output statistics in requested forms. for title, count, summed in counts: # Output the title if defined. if title is not None: if self.inline: ntitle = (("%%-%ds" % maxtitlecw) % title) else: ntitle = title # Must color after padding, to avoid it seeing the colors. ntitle = _("@title", "%(title)s", title=ntitle) if self.inline: report(ntitle + " ", newline=False) else: report(ntitle) if self.p.table: self._tabular_stats(counts, title, count) if self.p.msgbar: self._msg_bar_stats(counts, title, count, summed) if self.p.wbar: self._w_bar_stats(counts, title, count, summed) if self.p.msgfmt: self._msg_simple_stats(title, count, summed) # Output the table of catalogs which are not fully translated, # if requested. if self.p.incomplete and self.incomplete_catalogs: - filenames = self.incomplete_catalogs.keys() + filenames = list(self.incomplete_catalogs.keys()) self._sort_equiv_filenames(filenames) data = [] # Column of catalog filenames. data.append(filenames) data.append([self.counts[x]["fuz"][0] for x in filenames]) data.append([self.counts[x]["unt"][0] for x in filenames]) data.append([x + y for x, y in zip(data[1], data[2])]) data.append([self.counts[x]["fuz"][1] for x in filenames]) data.append([self.counts[x]["unt"][1] for x in filenames]) data.append([x + y for x, y in zip(data[4], data[5])]) # Columns of the two added. # Column names and formats. 
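# Aside (illustration, not from the patch): the "data" list built above is
# column-major, one inner list per table column -- data[0] the filenames,
# then fuzzy, untranslated and summed counts -- and dfmt supplies one
# printf-style format per column, e.g. a left-aligned filename field:
assert ("%-10s" % "kdeqt.po") == "kdeqt.po  "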
coln = [_("@title:column", "catalog"), _("@title:column fuzzy messages", "msg/f"), _("@title:column untranslated messages", "msg/u"), _("@title:column fuzzy and untranslated messages", "msg/f+u"), _("@title:column words in fuzzy messages", "w/f"), _("@title:column words in untranslated messages", "w/u"), _("@title:column words in fuzzy and untranslated messages", "w/f+u")] maxfl = max([len(x) for x in filenames]) dfmt = ["%%-%ds" % maxfl, "%d", "%d", "%d", "%d", "%d", "%d"] # Output. report("-") - report(tabulate(data, coln=coln, dfmt=dfmt, space=" ", none=u"-", + report(tabulate(data, coln=coln, dfmt=dfmt, space=" ", none="-", colorize=True)) # Write file names of catalogs which are not fully translated # into a file, if requested. if self.p.incompfile: filenames = sorted(self.incomplete_catalogs.keys()) cmdlenc = locale.getpreferredencoding() ofl = codecs.open(self.p.incompfile, "w", cmdlenc) ofl.writelines([x + "\n" for x in filenames]) ofl.close() if modstrs: report(_("@item:intable", "modifiers: %(modlist)s", modlist=format_item_list(modstrs))) def _tabular_stats (self, counts, title, count): # Order counts in tabular form. selected_cats = self.count_spec if False and self.p.incomplete: # skip this for the moment # Display only fuzzy and untranslated counts. selected_cats = (self.count_spec[1], self.count_spec[2]) # Skip display if complete. really_incomplete = True for tkey, tname in selected_cats: for col in range(self.counts_per_cat): if count[tkey][col] > 0: really_incomplete = False break if really_incomplete: return data = [[count[tkey][y] for tkey, tname in selected_cats] for y in range(self.counts_per_cat)] # Derived data: messages/words completition ratios. for col, ins in ((0, 1), (1, 3)): compr = [] for tkey, tname in selected_cats: if tkey not in ("tot", "obs") and count["tot"][col] > 0: r = float(count[tkey][col]) / count["tot"][col] compr.append(r * 100) else: compr.append(None) data.insert(ins, compr) if self.p.detail: # Derived data: word and character expansion factors. for o, t, ins, incsp in ((1, 2, 7, None), (3, 4, 8, (1, 2, 0.0))): ratio = [] for tkey, tname in selected_cats: if count[tkey][o] > 0 and count[tkey][t] > 0: inct, inco = 0.0, 0.0 if incsp: co, ct, fact = incsp inco = (count[tkey][co] - 1) * fact inct = (count[tkey][ct] - 1) * fact r = (count[tkey][t] + inct) / (count[tkey][o] + inco) ratio.append((r - 1) * 100) else: ratio.append(None) data.insert(ins, ratio) if self.p.detail: # Derived data: character/word ratio, word/message ratio. for w, c, ins in ((0, 1, 9), (0, 2, 10), (1, 3, 11), (2, 4, 12)): chpw = [] for tkey, tname in selected_cats: if count[tkey][w] > 0 and count[tkey][c] > 0: r = float(count[tkey][c]) / count[tkey][w] chpw.append(r) else: chpw.append(None) data.insert(ins, chpw) # Row, column names and formats. 
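# Aside (illustration, not from the patch): the derived "msg/tot" and
# "w/tot-or" columns above are plain ratios against the "total" row,
# reported in percent:
trn_msgs, tot_msgs = 42, 60
assert round(100.0 * trn_msgs / tot_msgs, 1) == 70.0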
rown = [tname for tkey, tname in selected_cats] coln = [_("@title:column messages", "msg"), _("@title:column percentage of total messages", "msg/tot"), _("@title:column words in original", "w-or"), _("@title:column percentage of words to total in original", "w/tot-or"), _("@title:column words in translation", "w-tr"), _("@title:column characters in original", "ch-or"), _("@title:column characters in translation", "ch-tr")] dfmt = ["%d", "%.1f%%", "%d", "%.1f%%", "%d", "%d", "%d"] if self.p.detail: coln.extend([_("@title:column word efficiency", "w-ef"), _("@title:column character efficiency", "ch-ef"), _("@title:column words per message in original", "w/msg-or"), _("@title:column words per message in translation", "w/msg-tr"), _("@title:column characters per message in original", "ch/w-or"), _("@title:column characters per message in translation", "ch/w-tr")]) dfmt.extend(["%+.1f%%", "%+.1f%%", "%.1f", "%.1f", "%.1f", "%.1f"]) # Output the table. report(tabulate(data, rown=rown, coln=coln, dfmt=dfmt, - space=" ", none=u"-", colorize=True)) + space=" ", none="-", colorize=True)) def _msg_bar_stats (self, counts, title, count, summed): self._bar_stats(counts, title, count, summed, _("@item:intable number of messages", "msgs"), 0) def _w_bar_stats (self, counts, title, count, summed): self._bar_stats(counts, title, count, summed, _("@item:intable number of words in original", "w-or"), 1) def _bar_stats (self, counts, title, count, summed, dlabel, dcolumn): # Count categories to display and chars/colors associated to them. # Note: Use only characters from Latin1. - tspecs = (("trn", u"×", "green"), - ("fuz", u"¤", "blue"), - ("unt", u"·", "red")) + tspecs = (("trn", "×", "green"), + ("fuz", "¤", "blue"), + ("unt", "·", "red")) # Find out maximum counts overall. maxcounts = dict(trn=0, fuz=0, unt=0, tot=0) maxcounts_jumbled = maxcounts.copy() for otitle, ocount, osummed in counts: # If absolute bars, compare counts only for non-summed counts. if self.p.absolute and osummed: continue # Count both messages and words, for the number display padding. for tkey in maxcounts_jumbled: for dcol in (0, 1): c = ocount[tkey][dcol] if maxcounts_jumbled[tkey] < c: maxcounts_jumbled[tkey] = c for tkey in maxcounts: c = ocount[tkey][dcolumn] if maxcounts[tkey] < c: maxcounts[tkey] = c # Character widths of maximum count categories. maxcountscw = {} - for tkey, tval in maxcounts.iteritems(): + for tkey, tval in maxcounts.items(): maxcountscw[tkey] = len(str(tval)) maxcountscw_jumbled = {} - for tkey, tval in maxcounts_jumbled.iteritems(): + for tkey, tval in maxcounts_jumbled.items(): maxcountscw_jumbled[tkey] = len(str(tval)) # Formatted counts by disjunct categories. fmt_counts = [] for tkey, tchar, tcol in tspecs: cstr = str(count[tkey][dcolumn]) if cstr == "0": cstr = "-" cfmt = ("%%%ds" % maxcountscw_jumbled[tkey]) % cstr if tcol is not None: fmt_counts.append((ColorString("<%s>%%s") % (tcol, tcol)) % cfmt) else: fmt_counts.append(cfmt) fmt_counts = cjoin(fmt_counts, "/") # Maximum and nominal bar widths in characters. # TODO: Make parameters. if self.inline: nombarcw = 20 maxbarcw = 50 else: nombarcw = 40 maxbarcw = 80 def roundnear (x): return int(round(x, 0)) def roundup (x): ix = int(x) if x - ix > 1e-16: ix += 1 return ix # Compute number of cells per category. n_cells = {} if self.p.absolute: # Absolute bar. 
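# Aside (illustration, not from the patch): the loop below picks the
# smallest "nice" step such that the largest total fits into maxbarcw
# cells; a hypothetical standalone equivalent:
def _pick_step (total, maxcells):
    for npc in (1, 2, 5, 10, 20, 50, 100):
        if npc * maxcells > total:
            return npc
    return 0
assert _pick_step(375, 80) == 5   # a count of 375 -> 75 cells of 5 each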
n_per_cell = 0 for npc in (1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 500000): if npc * maxbarcw > maxcounts["tot"]: n_per_cell = npc break if not n_per_cell: warning(_("@info", "Count too large, cannot display bar graph.")) return for tkey, roundf in (("fuz", roundup), ("unt", roundup), ("tot", roundnear)): c = count[tkey][dcolumn] n_cells[tkey] = roundf(float(c) / n_per_cell) # Correct the situation when there are no cells. if n_cells["tot"] < 1: n_cells["tot"] = 1 # Correct the situation when the sum of cells fuzzy+untranslated # goes over the total; give priority to untranslated when reducing. while n_cells["fuz"] + n_cells["unt"] > n_cells["tot"]: if n_cells["fuz"] >= n_cells["unt"]: n_cells["fuz"] -= 1 else: n_cells["unt"] -= 1 n_cells["trn"] = n_cells["tot"] - n_cells["fuz"] - n_cells["unt"] else: # Relative bar. if count["tot"][dcolumn] > 0: n_per_cell = float(nombarcw) / count["tot"][dcolumn] else: n_per_cell = 0 for tkey in ("fuz", "unt"): c = count[tkey][dcolumn] n_cells[tkey] = roundup(c * n_per_cell) # When there are almost none translated, it may have happened that # the sum of cells fuzzy+untranslated is over nominal; reduce. while n_cells["fuz"] + n_cells["unt"] > nombarcw: if n_cells["fuz"] >= n_cells["unt"]: n_cells["fuz"] -= 1 else: n_cells["unt"] -= 1 n_cells["trn"] = nombarcw - n_cells["fuz"] - n_cells["unt"] # Create the bar. fmt_bar = [] for tkey, tchar, tcol in tspecs: bar = tchar * n_cells[tkey] if tcol is not None: bar = (ColorString("<%s>%%s") % (tcol, tcol)) % bar fmt_bar.append(bar) fmt_bar = cjoin(fmt_bar) # Assemble final output. if not self.p.absolute or not summed: if count["tot"][dcolumn] == 0: fmt_bar = "" report(cinterp("%s %s |%s|", fmt_counts, dlabel, fmt_bar)) else: report(cinterp("%s %s", fmt_counts, dlabel)) def _msg_simple_stats (self, title, count, summed): """ msgfmt-style report """ fmt_trn = n_("@item:intext", "%(num)d translated message", "%(num)d translated messages", num=count["trn"][0]) fmt_fuz = n_("@item:intext", "%(num)d fuzzy translation", "%(num)d fuzzy translations", num=count["fuz"][0]) fmt_unt = n_("@item:intext", "%(num)d untranslated message", "%(num)d untranslated messages", num=count["unt"][0]) report(_("@info composition of three previous messages", "%(trn)s, %(fuz)s, %(unt)s", trn=fmt_trn, fuz=fmt_fuz, unt=fmt_unt)) diff --git a/sieve/tag_untranslated.py b/sieve/tag_untranslated.py index 97389794..0f572ce9 100644 --- a/sieve/tag_untranslated.py +++ b/sieve/tag_untranslated.py @@ -1,96 +1,96 @@ # -*- coding: UTF-8 -*- """ Tag untranslated messages. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ from pology import _, n_ from pology.comments import parse_summit_branches from pology.report import report def setup_sieve (p): p.set_desc(_("@info sieve discription", "Tag all untranslated messages with '%(flag)s' flag.", flag=_flag_untranslated )) p.add_param("strip", bool, desc=_("@info sieve parameter discription", "Remove tags from messages." )) p.add_param("wfuzzy", bool, desc=_("@info sieve parameter discription", "Also add tags to fuzzy messages." )) - p.add_param("branch", unicode, seplist=True, + p.add_param("branch", str, seplist=True, metavar=_("@info sieve parameter value placeholder", "BRANCH"), desc=_("@info sieve parameter discription", "In summit catalogs, consider only messages belonging to given branch. " "Several branches can be given as comma-separated list." 
)) -_flag_untranslated = u"untranslated" +_flag_untranslated = "untranslated" class Sieve (object): def __init__ (self, params): self.strip = params.strip self.wfuzzy = params.wfuzzy self.branches = set(params.branch or []) self.ntagged = 0 self.ncleared = 0 def process (self, msg, cat): # Skip obsolete messages. if msg.obsolete: return # Summit: if branches were given, consider the message for # tagging based on whether it belongs to any of the given branches. may_tag = True if self.branches: msg_branches = parse_summit_branches(msg) if not set.intersection(self.branches, msg_branches): may_tag = False ok_msg = msg.untranslated if self.wfuzzy and not ok_msg: ok_msg = msg.fuzzy if not self.strip and may_tag and ok_msg: if _flag_untranslated not in msg.flag: msg.flag.add(_flag_untranslated) self.ntagged += 1 else: if _flag_untranslated in msg.flag: msg.flag.remove(_flag_untranslated) self.ncleared += 1 def finalize (self): if self.ntagged > 0: msg = n_("@info:progress", "Tagged %(num)d untranslated message.", "Tagged %(num)d untranslated messages.", num=self.ntagged) report("===== " + msg) if self.ncleared > 0: msg = n_("@info:progress", "Cleared untranslated tag from %(num)d message.", "Cleared untranslated tag from %(num)d messages.", num=self.ncleared) report("===== " + msg) diff --git a/sieve/unfuzzy_context_only.py b/sieve/unfuzzy_context_only.py index 0a3cfb71..18de954b 100644 --- a/sieve/unfuzzy_context_only.py +++ b/sieve/unfuzzy_context_only.py @@ -1,123 +1,123 @@ # -*- coding: UTF-8 -*- """ Unfuzzy those messages fuzzied only due to a context change. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ from pology import _, n_ from pology.msgreport import report_msg_content from pology.msgreport import report_msg_to_lokalize from pology.report import report from pology.sieve import add_param_poeditors def setup_sieve (p): p.set_desc(_("@info sieve discription", "Unfuzzy messages which got fuzzy only due to changed context." "\n\n" "Possible only if catalogs were merged with --previous option." "\n\n" "By default, unfuzzied messages will get a translator comment with " "the string '%(str)s', so that they can be reviewed later.", str="unreviewed-context" )) p.add_param("noreview", bool, defval=False, desc=_("@info sieve parameter discription", "Do not add translator comment indicating unreviewed context." )) p.add_param("eqmsgid", bool, defval=False, desc=_("@info sieve parameter discription", "Do not unfuzzy messages which have same msgid as another message, " "and report them together with all other messages with the same msgid."
)) add_param_poeditors(p) class Sieve (object): def __init__ (self, params): self.p = params self.nunfuzz = 0 self.nrep = 0 def process_header (self, hdr, cat): self.msgs_by_msgid = {} self.msgs_to_unfuzzy_by_msgid = {} def process (self, msg, cat): if msg.obsolete: return if msg.msgid not in self.msgs_by_msgid: self.msgs_by_msgid[msg.msgid] = [] self.msgs_by_msgid[msg.msgid].append(msg) if ( msg.fuzzy and msg.msgid == msg.msgid_previous and msg.msgid_plural == msg.msgid_plural_previous ): if msg.msgid not in self.msgs_to_unfuzzy_by_msgid: self.msgs_to_unfuzzy_by_msgid[msg.msgid] = [] self.msgs_to_unfuzzy_by_msgid[msg.msgid].append(msg) def process_header_last (self, hdr, cat): msgs_to_report = [] keys_of_msgs_to_report = set() if self.p.eqmsgid: for msg in cat: if msg.obsolete: continue msgs = self.msgs_by_msgid.get(msg.msgid) msgs_to_unfuzzy = self.msgs_to_unfuzzy_by_msgid.get(msg.msgid) if msgs and msgs_to_unfuzzy and len(msgs) > 1: msgs_to_report.append(msg) keys_of_msgs_to_report.add(msg.key) - for msgs in self.msgs_to_unfuzzy_by_msgid.values(): + for msgs in list(self.msgs_to_unfuzzy_by_msgid.values()): for msg in msgs: if msg.key not in keys_of_msgs_to_report: msg.unfuzzy() self.nunfuzz += 1 for msg in msgs_to_report: if self.p.lokalize: report_msg_to_lokalize(msg, cat) else: report_msg_content(msg, cat, delim="-" * 20) self.nrep += 1 def finalize (self): if self.nunfuzz > 0: msg = n_("@info:progress", "Unfuzzied %(num)d message fuzzy due to " "difference in context only.", "Unfuzzied %(num)d messages fuzzy due to " "difference in context only.", num=self.nunfuzz) report("===== " + msg) if self.nrep > 0: msg = n_("@info:progress", "Reported %(num)d message due to equality " "of '%(field)s' field.", "Reported %(num)d messages due to equality " "of '%(field)s' field.", num=self.nrep, field="msgid") report("===== " + msg) diff --git a/sieve/unfuzzy_ctxmark_only.py b/sieve/unfuzzy_ctxmark_only.py index 593758d8..50c7800e 100644 --- a/sieve/unfuzzy_ctxmark_only.py +++ b/sieve/unfuzzy_ctxmark_only.py @@ -1,82 +1,82 @@ # -*- coding: UTF-8 -*- """ Unfuzzy messages fuzzied only due to a change in UI context marker. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import re from pology import _, n_ from pology.report import report def setup_sieve (p): p.set_desc(_("@info sieve discription", "Unfuzzy messages which got fuzzy only due to changed context marker." "\n\n" "Possible only if catalogs were merged with --previous option." "\n\n" "By default, unfuzzied messages will get a translator comment with " "the string '%(str)s', so that they can be reviewed later.", str="unreviewed-context" )) p.add_param("noreview", bool, defval=False, desc=_("@info sieve parameter discription", "Do not add translator comment indicating unreviewed context." )) _strip_rx = re.compile(r"^\s*@[^\s]+(.*)", re.U) _norm_rx = re.compile(r"[^\w]", re.U) # Strip the KUIT context marker, and normalize rest of the string. 
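# Illustration only (not from the patch): effect of _stripped() below --
# two contexts that differ only in the KUIT marker compare equal:
import re
_strip_demo = re.compile(r"^\s*@[^\s]+(.*)", re.U)
_norm_demo = re.compile(r"[^\w]", re.U)
def _stripped_demo (ctxt):
    m = _strip_demo.search(ctxt)
    return _norm_demo.sub("", (m.group(1) if m else ctxt).lower())
assert _stripped_demo("@info:tooltip Save the file") == _stripped_demo("Save the file")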
def _stripped (ctxt): m = _strip_rx.search(ctxt) if m: stripped = m.group(1) else: stripped = ctxt return _norm_rx.sub("", stripped.lower()) class Sieve (object): def __init__ (self, params): self.flag_review = not params.noreview self.nmatch = 0 def process (self, msg, cat): if ( msg.fuzzy and msg.msgid == msg.msgid_previous and msg.msgid_plural == msg.msgid_plural_previous - and ( _stripped(msg.msgctxt or u"") - == _stripped(msg.msgctxt_previous or u"")) + and ( _stripped(msg.msgctxt or "") + == _stripped(msg.msgctxt_previous or "")) ): msg.unfuzzy() if self.flag_review: # Add as manual comment, as any other type will vanish # when catalog is merged with template. - msg.manual_comment.append(u"unreviewed-context") + msg.manual_comment.append("unreviewed-context") self.nmatch += 1 def finalize (self): if self.nmatch > 0: msg = n_("@info:progress", "Unfuzzied %(num)d message fuzzy due to " "difference in context marker only.", "Unfuzzied %(num)d messages fuzzy due to " "difference in context marker only.", num=self.nmatch) report("===== " + msg) diff --git a/sieve/unfuzzy_inplace_only.py b/sieve/unfuzzy_inplace_only.py index a243e4fd..8770a922 100644 --- a/sieve/unfuzzy_inplace_only.py +++ b/sieve/unfuzzy_inplace_only.py @@ -1,100 +1,100 @@ # -*- coding: UTF-8 -*- """ Unfuzzy messages fuzzied only due to some tags being closed in-place (like C{
<br>} to C{
<br/>}). Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import re from pology import _, n_ from pology.report import report def setup_sieve (p): p.set_desc(_("@info sieve discription", "Unfuzzy messages fuzzied only due to some tags being closed in-place " "(like '%(tag1)s' to '%(tag2)s')." "\n\n" "Possible only if catalogs were merged with --previous option.", tag1="
", tag2="
")) _tags_inpl = r"(br|hr|nl)" _open_inpl_rx = re.compile(r"<\s*" + _tags_inpl + r"\s*>", re.U) _close_inpl_rx = re.compile(r"<\s*/\s*" + _tags_inpl + r"\s*>", re.U) _openclose_inpl_rx = re.compile(r"<\s*" + _tags_inpl + r"\s*/\s*>", re.U) # Replace any needed <...> with <.../> in the text. def _norm_inpl (text): text = _open_inpl_rx.sub(r"<\1/>", text) text = _openclose_inpl_rx.sub(r"<\1/>", text) # to normalize
, etc. return text class Sieve (object): def __init__ (self, params): self.caller_monitored = True self.nunfuzz = 0 self.nmodinpl = 0 def process (self, msg, cat): # Skip checks if the msgid contains closing , too odd. if _close_inpl_rx.search(msg.msgid): return # Unfuzzy message if closed <.../> are the only difference. if ( msg.fuzzy and msg.msgid_previous is not None and msg.msgctxt_previous == msg.msgctxt and _open_inpl_rx.search(msg.msgid_previous) ): # Normalize <...> tags for checking. msgid_previous_n = _norm_inpl(msg.msgid_previous) - msgid_plural_previous_n = _norm_inpl(msg.msgid_plural_previous or u"") + msgid_plural_previous_n = _norm_inpl(msg.msgid_plural_previous or "") msgid_n = _norm_inpl(msg.msgid) - msgid_plural_n = _norm_inpl(msg.msgid_plural or u"") + msgid_plural_n = _norm_inpl(msg.msgid_plural or "") if ( msgid_n == msgid_previous_n and msgid_plural_n == msgid_plural_previous_n ): msg.unfuzzy() self.nunfuzz += 1 # Replace any <...> with <.../> in the msgstr. for i in range(len(msg.msgstr)): if _open_inpl_rx.search(msg.msgstr[i]): msg.msgstr[i] = _open_inpl_rx.sub(r"<\1/>", msg.msgstr[i]) self.nmodinpl += 1 def finalize (self): if self.nunfuzz > 0: msg = n_("@info:progress", "Unfuzzied %(num)d message due to " "closing tags in-place.", "Unfuzzied %(num)d messages due to " "closing tags in-place.", num=self.nunfuzz) report("===== " + msg) if self.nmodinpl > 0: msg = n_("@info:progress", "Modified %(num)d translations by " "closing tags in-place.", "Modified %(num)d translations by " "closing tags in-place.", num=self.nmodinpl) report("===== " + msg) diff --git a/sieve/unfuzzy_qtclass_only.py b/sieve/unfuzzy_qtclass_only.py index e9bab284..06aeb860 100644 --- a/sieve/unfuzzy_qtclass_only.py +++ b/sieve/unfuzzy_qtclass_only.py @@ -1,66 +1,66 @@ # -*- coding: UTF-8 -*- """ Unfuzzy messages fuzzied only due to changed Qt class name. Documented in C{doc/user/sieving.docbook}. @author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import re from pology import _, n_ from pology.report import report def setup_sieve (p): p.set_desc(_("@info sieve discription", "Unfuzzy messages which got fuzzy only due to changed Qt class name." "\n\n" "Possible only if catalogs were merged with --previous option.", )) _strip_rx = re.compile(r"^[a-z][\w:]*\|(.*)", re.U|re.I) # Strip the Qt class. def _stripped (ctxt): m = _strip_rx.search(ctxt) stripped = m.group(1) if m else ctxt return stripped class Sieve (object): def __init__ (self, params): self.nmatch = 0 def process (self, msg, cat): if ( msg.fuzzy and msg.msgid == msg.msgid_previous and msg.msgid_plural == msg.msgid_plural_previous - and ( _stripped(msg.msgctxt or u"") - == _stripped(msg.msgctxt_previous or u"")) + and ( _stripped(msg.msgctxt or "") + == _stripped(msg.msgctxt_previous or "")) ): msg.unfuzzy() self.nmatch += 1 def finalize (self): if self.nmatch > 0: msg = n_("@info:progress", "Unfuzzied %(num)d message fuzzy due to " "difference in Qt class context only.", "Unfuzzied %(num)d messages fuzzy due to " "difference in Qt class context only.", num=self.nmatch) report("===== " + msg) diff --git a/sieve/update_header.py b/sieve/update_header.py index fccf0469..60d8da2d 100644 --- a/sieve/update_header.py +++ b/sieve/update_header.py @@ -1,132 +1,132 @@ # -*- coding: UTF-8 -*- """ Initialize and update the PO header with own translation data. Documented in C{doc/user/sieving.docbook}. 
@author: Chusslove Illich (Часлав Илић) @license: GPLv3 """ import os import re import time from pology import _, n_ import pology.config as config from pology.report import warning from pology.resolve import expand_vars from pology.sieve import SieveError def setup_sieve (p): p.set_desc(_("@info sieve discription", "Initialize or update the PO header with own translator data." )) - p.add_param("proj", unicode, mandatory=True, + p.add_param("proj", str, mandatory=True, metavar=_("@info sieve parameter value placeholder", "ID"), desc=_("@info sieve parameter discription", "Project ID in Pology configuration file, " "which contains the necessary project data to update the header." )) p.add_param("init", bool, defval=False, desc=_("@info sieve parameter discription", "Consider header as uninitialized, removing any existing information " "before adding own and project data." )) p.add_param("onmod", bool, defval=False, desc=_("@info sieve parameter discription", "Update header only if the catalog was otherwise modified " "(in sieve chains)." )) class Sieve (object): def __init__ (self, params): self.p = params # Collect user and project configuration. prjsect = "project-" + params.proj if not config.has_section(prjsect): raise SieveError( _("@info", "Project '%(id)s' is not defined in user configuration.", id=params.proj)) self.prjcfg = config.section(prjsect) prjcfg = config.section(prjsect) usrcfg = config.section("user") # Collect project data. self.name = prjcfg.string("name") or usrcfg.string("name") if not self.name: warning(_("@info", "Field '%(field)s' is not set in " "project or user configuration.", field="name")) self.email = prjcfg.string("email") or usrcfg.string("email") if not self.email: warning(_("@info", "Field '%(field)s' is not set in " "project or user configuration.", field="email")) self.langteam = prjcfg.string("language-team") if not self.langteam: warning(_("@info", "Field '%(field)s' is not set in " "project configuration.", field="language-team")) self.teamemail = prjcfg.string("team-email") # ok not to be present self.langcode = prjcfg.string("language") or usrcfg.string("language") if not self.langcode: warning(_("@info", "Field '%(field)s' is not set in " "project configuration.", field="language")) self.encoding = ( prjcfg.string("encoding") or usrcfg.string("encoding") - or u"UTF-8") + or "UTF-8") self.plforms = ( prjcfg.string("plural-forms") or usrcfg.string("plural-forms")) if not self.plforms: warning(_("@info", "Field '%(field)s' is not set in " "project configuration.", field="plural-forms")) self.poeditor = ( prjcfg.string("po-editor") or usrcfg.string("po-editor")) # ok not to be present def process_header_last (self, hdr, cat): if self.p.onmod and cat.modcount == 0: return if self.p.init: # Assemble translation title. if self.langteam: - title = (u"Translation of %(title)s into %(lang)s." + title = ("Translation of %(title)s into %(lang)s." % dict(title="%poname", lang="%langname")) else: - title = (u"Translation of %(title)s." + title = ("Translation of %(title)s." % dict(title="%poname")) # Remove some placeholders. 
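# Aside (illustration, not from the patch): the assembled title keeps the
# literal "%poname"/"%langname" placeholders, which are only expanded
# later by cat.update_header():
t = ("Translation of %(title)s into %(lang)s."
     % dict(title="%poname", lang="%langname"))
assert t == "Translation of %poname into %langname."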
if "YEAR" in hdr.copyright: hdr.copyright = None if "PACKAGE" in hdr.license: hdr.license = None cat.update_header(project=cat.name, title=title, name=self.name, email=self.email, teamemail=self.teamemail, langname=self.langteam, langcode=self.langcode, encoding=self.encoding, ctenc="8bit", plforms=self.plforms, poeditor=self.poeditor) else: cat.update_header(name=self.name, email=self.email, poeditor=self.poeditor) diff --git a/util/add-html-highlight.py b/util/add-html-highlight.py index ae94ad4c..482ad59b 100755 --- a/util/add-html-highlight.py +++ b/util/add-html-highlight.py @@ -1,221 +1,221 @@ #!/usr/bin/env python # -*- coding: UTF-8 -*- import os import re import sys from pygments import highlight from pygments.formatters import HtmlFormatter from pygments.lexers import get_lexer_by_name from pygments.lexer import RegexLexer, bygroups, include from pygments.token import Keyword, Comment, Name, String, Text, Number, Generic from pygments.util import ClassNotFound _cmd = os.path.basename(sys.argv[0]) def main (): for infile in sys.argv[1:]: add_html_highlight(infile) def add_html_highlight (infile): ifh = open(infile) htmlstr = ifh.read().decode("utf8") ifh.close() pre_rx = re.compile(r"(

)"
                         r"\s*\s*"
                         r"(.*?)"
                         r"(
)", re.S|re.U) p = 0 segs = [] while True: m = pre_rx.search(htmlstr, p) if m is None: segs.append(htmlstr[p:]) break p1, p2 = m.span() segs.append(htmlstr[p:p1]) otag, language, snippet, ctag = m.groups() try: lexer = get_custom_lexer_by_name(language) if lexer is None: lexer = get_lexer_by_name(language) except ClassNotFound: seg = snippet warning("Unknown language '%s'." % language) lexer = None if lexer: snippet, tags = hide_tags(snippet) snippet = unescape_xml(snippet) seg = highlight(snippet, lexer, HtmlFormatter(nowrap=True)) seg = unhide_tags(seg, tags) segs.extend((otag, seg, ctag)) p = p2 htmlstr_mod = "".join(segs) ofh = open(infile, "w") ofh.write(htmlstr_mod.encode("utf8")) ofh.close() def warning (msg): sys.stderr.write("%s: [warning] %s\n" % (_cmd, msg)) def unescape_xml (s): s = s.replace("<", "<") s = s.replace(">", ">") s = s.replace("'", "'") s = s.replace(""", '"') s = s.replace("&", "&") return s _hide_tags_rx = re.compile(r"<.*?>", re.S|re.U) -_hide_tags_rseq = u"⌒" +_hide_tags_rseq = "⌒" def hide_tags (s): tags = _hide_tags_rx.findall(s) s = _hide_tags_rx.sub(_hide_tags_rseq, s) return s, tags def unhide_tags (s, tags): segs = [] i = 0 p1 = 0 while True: p2 = s.find(_hide_tags_rseq, p1) if p2 < 0: p2 = len(s) segs.append(s[p1:p2]) if p2 == len(s): break assert i < len(tags) segs.append(tags[i]) i += 1 p1 = p2 + len(_hide_tags_rseq) assert i == len(tags) s = "".join(segs) return s _custom_lexers = set() def get_custom_lexer_by_name (language): for lexer_type in _custom_lexers: if language in lexer_type.aliases: return lexer_type() return None from pygments.lexers import GettextLexer class GettextXLexer (GettextLexer): pass GettextXLexer.tokens = { 'root': [ (r'^#,\s.*?$', Name.Decorator), (r'^#:\s.*?$', Name.Label), (r'^#\|\s*(msgid_plural|msgid)\s*"', Comment.Single, 'prevstring'), (r'^(#|#\.\s|#\|\s|#~\s|#\s).*$', Comment.Single), (r'^(msgstr\[)(\d)(\])', bygroups(Name.Variable, Number.Integer, Name.Variable)), (r'^(msgctxt|msgid_plural|msgid|msgstr|msgscr)', bygroups(Name.Variable)), (r'"', String, 'string'), (r'^\.\.\.$', Text), # for cutting out intermediate messages - (ur'\u2060', Text), # for not splitting on empty line in POT extraction + (r'\u2060', Text), # for not splitting on empty line in POT extraction (r'\s+', Text), ], 'string': [ (r'\\.', String.Escape), (r'\{\{|\}\}', String.Escape), (r'\{-.*?-\}', Generic.Deleted), (r'\{\+.*?\+\}', Generic.Inserted), (r'\{([a-z].*?|)\}', String.Interpol), (r'%[ -+]?\d*\.?\d*[idufFgGeEcs%]', String.Interpol), (r'<(?=[\w/])', String.Other, 'tag'), (r'~~', String.Escape), (r'~', String.Other), (r'\$\[', String.Symbol, 'script'), (r'"', String, '#pop'), (r'.', String), ], 'prevstring': [ (r'\{-.*?-\}', Generic.Deleted), (r'\{\+.*?\+\}', Generic.Inserted), (r'"', Comment.Single, '#pop'), (r'.', Comment.Single), ], 'tag': [ (r'>', String.Other, '#pop'), (r'.', String.Other), ], 'script': [ (r'\]', String.Symbol, '#pop'), (r"''", String.Escape), (r"'", String.Symbol, 'scriptquote'), include('string'), ], 'scriptquote': [ (r"''", String.Escape), (r"'", String.Symbol, '#pop'), include('string'), ], } _custom_lexers.add(GettextXLexer) from pygments.lexers import CppLexer class CppXLexer (CppLexer): pass CppXLexer.tokens = CppLexer.tokens.copy() CppXLexer.tokens.update({ 'string': [ (r'"', String, '#pop'), (r'\\([\\abfnrtv"\']|x[a-fA-F0-9]{2,4}|[0-7]{1,3})', String.Escape), (r'%(\([a-zA-Z0-9_]+\))?[-#0 +]*([0-9]+|[*])?(\.([0-9]+|[*]))?' 
r'[hlL]?[diouxXeEfFgGcrs%]', String.Interpol), (r'\{\{|\}\}', String.Escape), (r'\{.*?\}', String.Interpol), (r'%\d+', String.Interpol), (r'[^\\"\n%{}]+', String), # all other characters (r'\\\n', String), # line continuation (r'\\', String), # stray backslash (r'[%{}]', String), ], }) _custom_lexers.add(CppXLexer) from pygments.lexers import PythonLexer class PythonXLexer (PythonLexer): pass PythonXLexer.tokens.update({ 'strings': [ (r'%(\([a-zA-Z0-9_]+\))?[-#0 +]*([0-9]+|[*])?(\.([0-9]+|[*]))?' r'[hlL]?[diouxXeEfFgGcrs%]', String.Interpol), (r'\{\{|\}\}', String.Escape), (r'\{.*?\}', String.Interpol), (r'[^\\\'"%{}\n]+', String), (r'[\'"\\]', String), (r'[%{}]', String), ], }) _custom_lexers.add(PythonXLexer) if __name__ == "__main__": main()
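# Editorial aside (not part of the patch): add-html-highlight.py above
# still does ifh.read().decode("utf8") and ofh.write(....encode("utf8")),
# which fail under Python 3, where reading a file opened in text mode
# already yields str and text files reject bytes. A Py3-safe sketch,
# assuming the processed files are UTF-8:
import io
def read_text (path):
    with io.open(path, "r", encoding="utf8") as f:
        return f.read()
def write_text (path, s):
    with io.open(path, "w", encoding="utf8") as f:
        f.write(s)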