Changeset View
Changeset View
Standalone View
Standalone View
source/tools/i18n/checkTranslationsForSpam.py
#!/usr/bin/env python2 | #!/usr/bin/env python3 | ||||||||
# -*- coding:utf-8 -*- | |||||||||
# | # | ||||||||
# Copyright (C) 2014 Wildfire Games. | # Copyright (C) 2020 Wildfire Games. | ||||||||
# This file is part of 0 A.D. | # This file is part of 0 A.D. | ||||||||
# | # | ||||||||
# 0 A.D. is free software: you can redistribute it and/or modify | # 0 A.D. is free software: you can redistribute it and/or modify | ||||||||
# it under the terms of the GNU General Public License as published by | # it under the terms of the GNU General Public License as published by | ||||||||
# the Free Software Foundation, either version 2 of the License, or | # the Free Software Foundation, either version 2 of the License, or | ||||||||
# (at your option) any later version. | # (at your option) any later version. | ||||||||
# | # | ||||||||
# 0 A.D. is distributed in the hope that it will be useful, | # 0 A.D. is distributed in the hope that it will be useful, | ||||||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||||||||
# GNU General Public License for more details. | # GNU General Public License for more details. | ||||||||
# | # | ||||||||
# You should have received a copy of the GNU General Public License | # You should have received a copy of the GNU General Public License | ||||||||
# along with 0 A.D. If not, see <http://www.gnu.org/licenses/>. | # along with 0 A.D. If not, see <http://www.gnu.org/licenses/>. | ||||||||
from __future__ import absolute_import, division, print_function, unicode_literals | import os, re, sys | ||||||||
import codecs, os, re, sys | from i18n_helper.catalog import Catalog | ||||||||
from i18n_helper.globber import getCatalogs | |||||||||
from pology.catalog import Catalog | |||||||||
from pology.message import Message | |||||||||
import multiprocessing | |||||||||
l10nToolsDirectory = os.path.dirname(os.path.realpath(__file__)) | l10nToolsDirectory = os.path.dirname(os.path.realpath(__file__)) | ||||||||
projectRootDirectory = os.path.abspath(os.path.join(l10nToolsDirectory, os.pardir, os.pardir, os.pardir)) | projectRootDirectory = os.path.abspath(os.path.join(l10nToolsDirectory, os.pardir, os.pardir, os.pardir)) | ||||||||
l10nFolderName = "l10n" | l10nFolderName = "l10n" | ||||||||
def checkTranslationsForSpam(inputFilePath): | def checkTranslationsForSpam(inputFilePath): | ||||||||
print(f"Checking {inputFilePath}") | |||||||||
print(u"Checking", inputFilePath) | templateCatalog = Catalog.readFrom() | ||||||||
templateCatalog = Catalog(inputFilePath) | |||||||||
# If language codes were specified on the command line, filter by those. | # If language codes were specified on the command line, filter by those. | ||||||||
filters = sys.argv[1:] | filters = sys.argv[1:] | ||||||||
# Load existing translation catalogs. | # Load existing translation catalogs. | ||||||||
existingTranslationCatalogs = [] | existingTranslationCatalogs = getCatalogs(inputFilePath, filters) | ||||||||
l10nFolderPath = os.path.dirname(inputFilePath) | |||||||||
# .pot is one letter longer than .po, but the dot that separates the locale | urlPattern = re.compile(r"https?:\/\/(?:[a-zA-Z]|[0-9]|[-_$@./&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+") | ||||||||
Gallaecio: I see you removed `[!*\(\),]`, I’ll trust that is on purpose, I’m not familiar enough with the URL spec. | |||||||||
Done Inline Actions: wraitii: `[!*\(\),]` seemed unnecessary (and even buggy?) in my testing. Since all that's important is to recognise URLs, we don't really need to parse every possible component of a URL. This seems fine to me. | |||||||||
# code from the rest of the filename in .po files makes up for that. | |||||||||
charactersToSkip = len(os.path.basename(inputFilePath)) | |||||||||
for filename in os.listdir(l10nFolderPath): | |||||||||
if len(filename) > 3 and filename[-3:] == ".po" and filename[:4] != "long": | |||||||||
if not filters or filename[:-charactersToSkip] in filters: | |||||||||
if os.path.basename(inputFilePath)[:-4] == filename.split('.')[-2]: | |||||||||
existingTranslationCatalogs.append([filename[:-charactersToSkip], os.path.join(l10nFolderPath, filename)]) | |||||||||
urlPattern = re.compile(u"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+") | |||||||||
# Check the URLs in translations against the URLs in the translation template. | |||||||||
for languageCode, pofile in existingTranslationCatalogs: | |||||||||
translationCatalog = Catalog(pofile) | |||||||||
for templateMessage in templateCatalog: | |||||||||
translationMessage = translationCatalog.select_by_key(templateMessage.msgctxt, templateMessage.msgid) | |||||||||
if translationMessage: | |||||||||
templateSingularString = templateMessage.msgid | |||||||||
templateUrls = urlPattern.findall(templateMessage.msgid) | |||||||||
# Assert that the same URL is used in both the plural and singular forms. | |||||||||
if templateMessage.msgid_plural and len(templateMessage.msgstr) > 1: | |||||||||
pluralUrls = urlPattern.findall(templateMessage.msgstr[0]) | |||||||||
for url in pluralUrls: | |||||||||
if url not in templateUrls: | |||||||||
print(u"Different URLs in singular and plural source strings for ‘{}’ in ‘{}’".format( | |||||||||
templateMessage.msgid, | |||||||||
inputFilePath)) | |||||||||
for translationString in translationMessage[0].msgstr: | |||||||||
translationUrls = urlPattern.findall(translationString) | |||||||||
for translationUrl in translationUrls: | |||||||||
if translationUrl not in templateUrls: | |||||||||
print(u"{}: Found the “{}” URL in the translation, which does not match any of the URLs in the translation template: {}".format( | |||||||||
languageCode, | |||||||||
translationUrl, | |||||||||
u", ".join(templateUrls))) | |||||||||
# Check that there are no spam URLs. | |||||||||
# Loop through all messages in the .POT catalog for URLs. | |||||||||
# For each, check for the corresponding key in the .PO catalogs. | |||||||||
# If found, check that URLS in the .PO keys are the same as those in the .POT key. | |||||||||
for templateMessage in templateCatalog: | |||||||||
templateUrls = set(urlPattern.findall( | |||||||||
templateMessage.id[0] if templateMessage.pluralizable else templateMessage.id | |||||||||
)) | |||||||||
# As a sanity check, verify that the template message is coherent | |||||||||
if templateMessage.pluralizable: | |||||||||
pluralUrls = set(urlPattern.findall(templateMessage.id[1])) | |||||||||
if pluralUrls.difference(templateUrls): | |||||||||
print(f"{inputFilePath} - Different URLs in singular and plural source strings" | |||||||||
f"for '{templateMessage}' in '{inputFilePath}'") | |||||||||
for translationCatalog in existingTranslationCatalogs: | |||||||||
translationMessage = translationCatalog.get(templateMessage.id, templateMessage.context) | |||||||||
if not translationMessage: | |||||||||
continue | |||||||||
translationUrls = set(urlPattern.findall( | |||||||||
translationMessage.string[0] if translationMessage.pluralizable else translationMessage.string | |||||||||
)) | |||||||||
unknown_urls = translationUrls.difference(templateUrls) | |||||||||
if unknown_urls: | |||||||||
print(f'{inputFilePath} - {translationCatalog.locale}: ' | |||||||||
f'Found unknown URL(s) {", ".join(unknown_urls)} in the translation ' | |||||||||
f'which do not match any of the URLs in the template: {", ".join(templateUrls)}') | |||||||||
print(f"Done checking {inputFilePath}") | |||||||||
def main(): | def main(): | ||||||||
print("\n\tWARNING: Remember to regenerate the POT files with “updateTemplates.py”" | |||||||||
print(u"\n WARNING: Remember to regenerate the POT files with “updateTemplates.py” before you run this script.\n POT files are not in the repository.\n") | "before you run this script.\n\tPOT files are not in the repository.\n") | ||||||||
Done Inline Actions
Gallaecio: This looks like a common issue in print usages in this changeset. Please make sure you are adding spaces or new lines at the end where needed. | |||||||||
foundPots = 0 | foundPots = 0 | ||||||||
for root, folders, filenames in os.walk(projectRootDirectory): | for root, folders, filenames in os.walk(projectRootDirectory): | ||||||||
root = root.decode("utf-8") | |||||||||
for filename in filenames: | for filename in filenames: | ||||||||
if len(filename) > 4 and filename[-4:] == ".pot" and os.path.basename(root) == "l10n": | if len(filename) > 4 and filename[-4:] == ".pot" and os.path.basename(root) == "l10n": | ||||||||
foundPots += 1 | foundPots += 1 | ||||||||
checkTranslationsForSpam(os.path.join(root, filename)) | multiprocessing.Process( | ||||||||
target=checkTranslationsForSpam, | |||||||||
args=(os.path.join(root, filename), ) | |||||||||
).start() | |||||||||
Done Inline Actions: Gallaecio: Awesome! | |||||||||
if foundPots == 0: | if foundPots == 0: | ||||||||
print(u"This script did not work because no ‘.pot’ files were found.") | print( | ||||||||
print(u"Please, run ‘updateTemplates.py’ to generate the ‘.pot’ files, and run ‘pullTranslations.py’ to pull the latest translations from Transifex.") | "This script did not work because no '.pot' files were found. " | ||||||||
print(u"Then you can run this script to generate ‘.po’ files with the longest strings.") | "Please run 'updateTemplates.py' to generate the '.pot' files, " | ||||||||
"and run 'pullTranslations.py' to pull the latest translations from Transifex. " | |||||||||
"Then you can run this script to check for spam in translations.") | |||||||||
if __name__ == "__main__": | if __name__ == "__main__": | ||||||||
main() | main() |
Wildfire Games · Phabricator
I see you removed [!*\(\),], I’ll trust that is on purpose, I’m not familiar enough with the URL spec.
- `/` does not need escaping.
- The alternation of character groups comes from the original string, but I see no reason not to merge those groups into a single `[-a-zA-Z\d_$@./&+]`.