Differential D2757 Diff 12028 source/tools/i18n/checkTranslationsForSpam.py

Changeset View

Standalone View

View Options

source/tools/i18n/checkTranslationsForSpam.py

#!/usr/bin/env python2 #!/usr/bin/env python3

# -*- coding:utf-8 -*-

# #

# This file is part of 0 A.D. # This file is part of 0 A.D.

# #

# 0 A.D. is free software: you can redistribute it and/or modify # 0 A.D. is free software: you can redistribute it and/or modify

# it under the terms of the GNU General Public License as published by # it under the terms of the GNU General Public License as published by

# the Free Software Foundation, either version 2 of the License, or # the Free Software Foundation, either version 2 of the License, or

# (at your option) any later version. # (at your option) any later version.

# #

# 0 A.D. is distributed in the hope that it will be useful, # 0 A.D. is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of # but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

# GNU General Public License for more details. # GNU General Public License for more details.

# #

# You should have received a copy of the GNU General Public License # You should have received a copy of the GNU General Public License

# along with 0 A.D. If not, see <http://www.gnu.org/licenses/>. # along with 0 A.D. If not, see <http://www.gnu.org/licenses/>.

from __future__ import absolute_import, division, print_function, unicode_literals import os, re, sys

import codecs, os, re, sys from i18n_helper.catalog import Catalog

from i18n_helper.globber import getCatalogs

from pology.catalog import Catalog

from pology.message import Message

import multiprocessing

l10nToolsDirectory = os.path.dirname(os.path.realpath(__file__)) l10nToolsDirectory = os.path.dirname(os.path.realpath(__file__))

projectRootDirectory = os.path.abspath(os.path.join(l10nToolsDirectory, os.pardir, os.pardir, os.pardir)) projectRootDirectory = os.path.abspath(os.path.join(l10nToolsDirectory, os.pardir, os.pardir, os.pardir))

l10nFolderName = "l10n" l10nFolderName = "l10n"

def checkTranslationsForSpam(inputFilePath): def checkTranslationsForSpam(inputFilePath):

print(f"Checking {inputFilePath}")

print(u"Checking", inputFilePath) templateCatalog = Catalog.readFrom()

templateCatalog = Catalog(inputFilePath)

# If language codes were specified on the command line, filter by those. # If language codes were specified on the command line, filter by those.

filters = sys.argv[1:] filters = sys.argv[1:]

# Load existing translation catalogs. # Load existing translation catalogs.

existingTranslationCatalogs = [] existingTranslationCatalogs = getCatalogs(inputFilePath, filters)

l10nFolderPath = os.path.dirname(inputFilePath)

# .pot is one letter longer than .po, but the dot that separates the locale urlPattern = re.compile(r"https?:\/\/(?:[a-zA-Z]|[0-9]|[-_$@./&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")

GallaecioUnsubmitted

Not Done

I see you removed `[!*,]`; I’ll trust that is on purpose, as I’m not familiar enough with the URL spec.

- `/` does not need escaping.

- It comes from the original string, but I see no reason not to merge those character groups into a single `[-a-zA-Z\d_$@./&+]`.

Gallaecio: I see you removed `[!*,]`, I’ll trust that is on purpose, I’m not familiar enough with the…

wraitiiAuthorUnsubmitted

Done

`[!*,]` seemed unnecessary (and even buggy?) in my testing. Since all that's important is to recognise URLs, we don't really need to parse all possible items of a URL. This seems fine to me.

wraitii: `[!*,]` seemed unnecessary (and even buggy?) in my testing. Since all that's important is…

# code from the rest of the filename in .po files makes up for that.

charactersToSkip = len(os.path.basename(inputFilePath))

for filename in os.listdir(l10nFolderPath):

if len(filename) > 3 and filename[-3:] == ".po" and filename[:4] != "long":

if not filters or filename[:-charactersToSkip] in filters:

if os.path.basename(inputFilePath)[:-4] == filename.split('.')[-2]:

existingTranslationCatalogs.append([filename[:-charactersToSkip], os.path.join(l10nFolderPath, filename)])

urlPattern = re.compile(u"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*,]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")

# Check the URLs in translations against the URLs in the translation template.

for languageCode, pofile in existingTranslationCatalogs:

translationCatalog = Catalog(pofile)

for templateMessage in templateCatalog:

translationMessage = translationCatalog.select_by_key(templateMessage.msgctxt, templateMessage.msgid)

if translationMessage:

templateSingularString = templateMessage.msgid

templateUrls = urlPattern.findall(templateMessage.msgid)

# Assert that the same URL is used in both the plural and singular forms.

if templateMessage.msgid_plural and len(templateMessage.msgstr) > 1:

pluralUrls = urlPattern.findall(templateMessage.msgstr[0])

for url in pluralUrls:

if url not in templateUrls:

print(u"Different URLs in singular and plural source strings for ‘{}’ in ‘{}’".format(

templateMessage.msgid,

inputFilePath))

for translationString in translationMessage[0].msgstr:

translationUrls = urlPattern.findall(translationString)

for translationUrl in translationUrls:

if translationUrl not in templateUrls:

print(u"{}: Found the “{}” URL in the translation, which does not match any of the URLs in the translation template: {}".format(

languageCode,

translationUrl,

u", ".join(templateUrls)))

# Check that there are no spam URLs.

# Loop through all messages in the .POT catalog for URLs.

# For each, check for the corresponding key in the .PO catalogs.

# If found, check that URLS in the .PO keys are the same as those in the .POT key.

for templateMessage in templateCatalog:

templateUrls = set(urlPattern.findall(

templateMessage.id[0] if templateMessage.pluralizable else templateMessage.id

))

# As a sanity check, verify that the template message is coherent

if templateMessage.pluralizable:

pluralUrls = set(urlPattern.findall(templateMessage.id[1]))

if pluralUrls.difference(templateUrls):

print(f"{inputFilePath} - Different URLs in singular and plural source strings"

f"for '{templateMessage}' in '{inputFilePath}'")

for translationCatalog in existingTranslationCatalogs:

translationMessage = translationCatalog.get(templateMessage.id, templateMessage.context)

if not translationMessage:

continue

translationUrls = set(urlPattern.findall(

translationMessage.string[0] if translationMessage.pluralizable else translationMessage.string

))

unknown_urls = translationUrls.difference(templateUrls)

if unknown_urls:

print(f'{inputFilePath} - {translationCatalog.locale}: '

f'Found unknown URL(s) {", ".join(unknown_urls)} in the translation '

f'which do not match any of the URLs in the template: {", ".join(templateUrls)}')

print(f"Done checking {inputFilePath}")

def main(): def main():

print("\n\tWARNING: Remember to regenerate the POT files with “updateTemplates.py”"

print(u"\n WARNING: Remember to regenerate the POT files with “updateTemplates.py” before you run this script.\n POT files are not in the repository.\n") "before you run this script.\n\tPOT files are not in the repository.\n")

GallaecioUnsubmitted

Done

def main():

- print("\n\tWARNING: Remember to regenerate the POT files with “updateTemplates.py”"

+ print("\n\tWARNING: Remember to regenerate the POT files with “updateTemplates.py” "

"before you run this script.\n\tPOT files are not in the repository.\n")

This looks like a common issue in the `print` calls in this changeset: adjacent string literals are concatenated without any separator between them. Please make sure you add a space or a newline at the end of each literal where needed.

Gallaecio: This looks like a common issue in print usages in this changeset. Please make sure you are…

foundPots = 0 foundPots = 0

for root, folders, filenames in os.walk(projectRootDirectory): for root, folders, filenames in os.walk(projectRootDirectory):

root = root.decode("utf-8")

for filename in filenames: for filename in filenames:

if len(filename) > 4 and filename[-4:] == ".pot" and os.path.basename(root) == "l10n": if len(filename) > 4 and filename[-4:] == ".pot" and os.path.basename(root) == "l10n":

foundPots += 1 foundPots += 1

checkTranslationsForSpam(os.path.join(root, filename)) multiprocessing.Process(

target=checkTranslationsForSpam,

args=(os.path.join(root, filename), )

).start()

GallaecioUnsubmitted

Done

Awesome!

Gallaecio: Awesome!

if foundPots == 0: if foundPots == 0:

print(u"This script did not work because no ‘.pot’ files were found.") print(

print(u"Please, run ‘updateTemplates.py’ to generate the ‘.pot’ files, and run ‘pullTranslations.py’ to pull the latest translations from Transifex.") "This script did not work because no '.pot' files were found. "

print(u"Then you can run this script to generate ‘.po’ files with the longest strings.") "Please run 'updateTemplates.py' to generate the '.pot' files, "

"and run 'pullTranslations.py' to pull the latest translations from Transifex. "

"Then you can run this script to check for spam in translations.")

if __name__ == "__main__": if __name__ == "__main__":

main() main()