Changeset View
Changeset View
Standalone View
Standalone View
ps/trunk/source/tools/i18n/checkTranslationsForSpam.py
#!/usr/bin/env python2 | #!/usr/bin/env python3 | ||||
# -*- coding:utf-8 -*- | |||||
# | # | ||||
# Copyright (C) 2014 Wildfire Games. | # Copyright (C) 2020 Wildfire Games. | ||||
# This file is part of 0 A.D. | # This file is part of 0 A.D. | ||||
# | # | ||||
# 0 A.D. is free software: you can redistribute it and/or modify | # 0 A.D. is free software: you can redistribute it and/or modify | ||||
# it under the terms of the GNU General Public License as published by | # it under the terms of the GNU General Public License as published by | ||||
# the Free Software Foundation, either version 2 of the License, or | # the Free Software Foundation, either version 2 of the License, or | ||||
# (at your option) any later version. | # (at your option) any later version. | ||||
# | # | ||||
# 0 A.D. is distributed in the hope that it will be useful, | # 0 A.D. is distributed in the hope that it will be useful, | ||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||||
# GNU General Public License for more details. | # GNU General Public License for more details. | ||||
# | # | ||||
# You should have received a copy of the GNU General Public License | # You should have received a copy of the GNU General Public License | ||||
# along with 0 A.D. If not, see <http://www.gnu.org/licenses/>. | # along with 0 A.D. If not, see <http://www.gnu.org/licenses/>. | ||||
from __future__ import absolute_import, division, print_function, unicode_literals | import os, re, sys | ||||
import multiprocessing | |||||
import codecs, os, re, sys | from i18n_helper import l10nToolsDirectory, projectRootDirectory | ||||
from i18n_helper.catalog import Catalog | |||||
from i18n_helper.globber import getCatalogs | |||||
from pology.catalog import Catalog | |||||
from pology.message import Message | |||||
# Directory that contains this script.
l10nToolsDirectory = os.path.dirname(os.path.realpath(__file__))

# Root of the checkout: three directory levels above the tools directory.
projectRootDirectory = os.path.abspath(os.path.join(l10nToolsDirectory, os.pardir, os.pardir, os.pardir))

# Name of the folders that hold the '.pot' templates and '.po' translations.
l10nFolderName = "l10n"
def checkTranslationsForSpam(inputFilePath):
    """Report URLs found in translations that are absent from the template.

    Loads the '.pot' template at ``inputFilePath`` together with the
    translation catalogs that ``getCatalogs`` returns for it, then prints one
    line for every translated message containing a URL that does not appear
    in the corresponding source string.

    Language codes passed on the command line (``sys.argv[1:]``) are handed
    to ``getCatalogs`` as filters.

    NOTE(review): the message attributes used below (``id``, ``context``,
    ``string``, ``pluralizable``) and ``catalog.get`` / ``catalog.locale``
    look like the Babel catalog API -- confirm against ``i18n_helper``.
    """
    print(f"Checking {inputFilePath}")
    # The template must be read from the file being checked; the previous
    # implementation loaded it with Catalog(inputFilePath).
    templateCatalog = Catalog.readFrom(inputFilePath)

    # If language codes were specified on the command line, filter by those.
    filters = sys.argv[1:]

    # Load existing translation catalogs.
    existingTranslationCatalogs = getCatalogs(inputFilePath, filters)

    urlPattern = re.compile(r"https?://(?:[a-z0-9-_$@./&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", re.IGNORECASE)

    # Check that there are no spam URLs.
    # Loop through all messages in the .POT catalog for URLs.
    # For each, check for the corresponding key in the .PO catalogs.
    # If found, check that URLs in the .PO keys are the same as those in the .POT key.
    for templateMessage in templateCatalog:
        templateUrls = set(urlPattern.findall(
            templateMessage.id[0] if templateMessage.pluralizable else templateMessage.id
        ))
        allowedUrls = set(templateUrls)

        # As a sanity check, verify that the template message is coherent.
        if templateMessage.pluralizable:
            pluralUrls = set(urlPattern.findall(templateMessage.id[1]))
            if pluralUrls.difference(templateUrls):
                print(f"{inputFilePath} - Different URLs in singular and plural source strings "
                      f"for '{templateMessage}' in '{inputFilePath}'")
            # URLs of the plural source string are legitimate in translations too.
            allowedUrls.update(pluralUrls)

        for translationCatalog in existingTranslationCatalogs:
            translationMessage = translationCatalog.get(templateMessage.id, templateMessage.context)
            if not translationMessage:
                continue

            # Inspect every plural form, not only the first one, so that a
            # spam URL cannot hide in a secondary form.
            if translationMessage.pluralizable:
                translationStrings = translationMessage.string
            else:
                translationStrings = (translationMessage.string,)

            translationUrls = set()
            for translationString in translationStrings:
                # Skip empty or missing (untranslated) forms.
                if translationString:
                    translationUrls.update(urlPattern.findall(translationString))

            unknown_urls = translationUrls.difference(allowedUrls)
            if unknown_urls:
                # Sort so that the report is stable from one run to the next.
                print(f'{inputFilePath} - {translationCatalog.locale}: '
                      f'Found unknown URL(s) {", ".join(sorted(unknown_urls))} in the translation '
                      f'which do not match any of the URLs in the template: {", ".join(sorted(allowedUrls))}')

    print(f"Done checking {inputFilePath}")
def main():
    """Check every '.pot' template below the project root for spam URLs.

    Walks ``projectRootDirectory`` and, for each '.pot' file located in a
    folder named 'l10n', runs ``checkTranslationsForSpam`` on it in a
    separate process. Waits for all of those processes before returning.
    Prints a hint if no template was found at all.
    """
    print("\n\tWARNING: Remember to regenerate the POT files with “updateTemplates.py” "
          "before you run this script.\n\tPOT files are not in the repository.\n")

    workers = []
    for root, _folders, filenames in os.walk(projectRootDirectory):
        # Templates only live in folders named 'l10n'.
        if os.path.basename(root) != "l10n":
            continue
        for filename in filenames:
            # The length check rejects a file that is named exactly '.pot'.
            if len(filename) > 4 and filename.endswith(".pot"):
                worker = multiprocessing.Process(
                    target=checkTranslationsForSpam,
                    args=(os.path.join(root, filename), )
                )
                worker.start()
                workers.append(worker)

    if not workers:
        print(
            "This script did not work because no '.pot' files were found. "
            "Please run 'updateTemplates.py' to generate the '.pot' files, "
            "and run 'pullTranslations.py' to pull the latest translations from Transifex. "
            "Then you can run this script to check for spam in translations.")
        return

    # Wait explicitly for every check instead of relying on interpreter exit.
    for worker in workers:
        worker.join()


# The guard keeps child processes that re-import this module from
# starting the whole scan again.
if __name__ == "__main__":
    main()
Wildfire Games · Phabricator