From 08071e5634a1354c645c447d64773d8fab1a0c95 Mon Sep 17 00:00:00 2001 From: Pete Bacon Darwin Date: Sun, 1 Mar 2020 22:16:15 +0000 Subject: [PATCH] fix(localize): improve matching and parsing of XLIFF 2.0 translation files (#35793) Previously, the `Xliff2TranslationParser` only matched files that had a narrow choice of extensions (e.g. `xlf`) and also relied upon a regular expression match of an optional XML namespace directive. This commit relaxes the requirement on both of these and, instead, relies upon parsing the file into XML and identifying an element of the form `` which is the minimal requirement for such files. PR Close #35793 --- .../xliff2_translation_parser.ts | 183 +++--- .../xliff2_translation_parser_spec.ts | 581 +++++++++++++++++- 2 files changed, 678 insertions(+), 86 deletions(-) diff --git a/packages/localize/src/tools/src/translate/translation_files/translation_parsers/xliff2_translation_parser.ts b/packages/localize/src/tools/src/translate/translation_files/translation_parsers/xliff2_translation_parser.ts index 12f35626a8..0dbd7f54ec 100644 --- a/packages/localize/src/tools/src/translate/translation_files/translation_parsers/xliff2_translation_parser.ts +++ b/packages/localize/src/tools/src/translate/translation_files/translation_parsers/xliff2_translation_parser.ts @@ -5,20 +5,16 @@ * Use of this source code is governed by an MIT-style license that can be * found in the LICENSE file at https://angular.io/license */ -import {Element, Node, XmlParser, visitAll} from '@angular/compiler'; -import {ɵMessageId, ɵParsedTranslation} from '@angular/localize'; -import {extname} from 'path'; +import {Element, Node, ParseErrorLevel, visitAll} from '@angular/compiler'; +import {ɵParsedTranslation} from '@angular/localize'; import {Diagnostics} from '../../../diagnostics'; import {BaseVisitor} from '../base_visitor'; import {MessageSerializer} from '../message_serialization/message_serializer'; import {TargetMessageRenderer} from '../message_serialization/target_message_renderer'; -import {TranslationParseError} from './translation_parse_error'; import {ParsedTranslationBundle, TranslationParser} from './translation_parser'; -import {getAttrOrThrow, getAttribute, parseInnerRange} from './translation_utils'; - -const XLIFF_2_0_NS_REGEX = /xmlns="urn:oasis:names:tc:xliff:document:2.0"/; +import {XmlTranslationParserHint, addParseDiagnostic, addParseError, canParseXml, getAttribute, isNamedElement, parseInnerRange} from './translation_utils'; /** * A translation parser that can load translations from XLIFF 2 files. @@ -26,85 +22,132 @@ const XLIFF_2_0_NS_REGEX = /xmlns="urn:oasis:names:tc:xliff:document:2.0"/; * http://docs.oasis-open.org/xliff/xliff-core/v2.0/os/xliff-core-v2.0-os.html * */ -export class Xliff2TranslationParser implements TranslationParser { - canParse(filePath: string, contents: string): boolean { - return (extname(filePath) === '.xlf') && XLIFF_2_0_NS_REGEX.test(contents); +export class Xliff2TranslationParser implements TranslationParser { + canParse(filePath: string, contents: string): XmlTranslationParserHint|false { + return canParseXml(filePath, contents, 'xliff', {version: '2.0'}); } - parse(filePath: string, contents: string): ParsedTranslationBundle { - const xmlParser = new XmlParser(); - const xml = xmlParser.parse(contents, filePath); - const bundle = Xliff2TranslationBundleVisitor.extractBundle(xml.rootNodes); - if (bundle === undefined) { + parse(filePath: string, contents: string, hint?: XmlTranslationParserHint): + ParsedTranslationBundle { + if (hint) { + return this.extractBundle(hint); + } else { + return this.extractBundleDeprecated(filePath, contents); + } + } + + private extractBundle({element, errors}: XmlTranslationParserHint): ParsedTranslationBundle { + const diagnostics = new Diagnostics(); + errors.forEach(e => addParseError(diagnostics, e)); + + if (element.children.length === 0) { + addParseDiagnostic( + diagnostics, element.sourceSpan, 'Missing expected element', + ParseErrorLevel.WARNING); + return {locale: undefined, translations: {}, diagnostics}; + } + + const locale = getAttribute(element, 'trgLang'); + const files = element.children.filter(isFileElement); + if (files.length === 0) { + addParseDiagnostic( + diagnostics, element.sourceSpan, 'No elements found in ', + ParseErrorLevel.WARNING); + } else if (files.length > 1) { + addParseDiagnostic( + diagnostics, files[1].sourceSpan, 'More than one element found in ', + ParseErrorLevel.WARNING); + } + + const bundle = {locale, translations: {}, diagnostics}; + const translationVisitor = new Xliff2TranslationVisitor(); + visitAll(translationVisitor, files[0].children, {bundle}); + + return bundle; + } + + private extractBundleDeprecated(filePath: string, contents: string) { + const hint = this.canParse(filePath, contents); + if (!hint) { throw new Error(`Unable to parse "${filePath}" as XLIFF 2.0 format.`); } + const bundle = this.extractBundle(hint); + if (bundle.diagnostics.hasErrors) { + const message = + bundle.diagnostics.formatDiagnostics(`Failed to parse "${filePath}" as XLIFF 2.0 format`); + throw new Error(message); + } return bundle; } } -interface BundleVisitorContext { - parsedLocale?: string; -} -class Xliff2TranslationBundleVisitor extends BaseVisitor { - private bundle: ParsedTranslationBundle|undefined; - - static extractBundle(xliff: Node[]): ParsedTranslationBundle|undefined { - const visitor = new this(); - visitAll(visitor, xliff, {}); - return visitor.bundle; - } - - visitElement(element: Element, {parsedLocale}: BundleVisitorContext): any { - if (element.name === 'xliff') { - parsedLocale = getAttribute(element, 'trgLang'); - return visitAll(this, element.children, {parsedLocale}); - } else if (element.name === 'file') { - this.bundle = { - locale: parsedLocale, - translations: Xliff2TranslationVisitor.extractTranslations(element), - diagnostics: new Diagnostics(), - }; - } else { - return visitAll(this, element.children, {parsedLocale}); - } - } +interface TranslationVisitorContext { + unit?: string; + bundle: ParsedTranslationBundle; } class Xliff2TranslationVisitor extends BaseVisitor { - private translations: Record<ɵMessageId, ɵParsedTranslation> = {}; - - static extractTranslations(file: Element): Record { - const visitor = new this(); - visitAll(visitor, file.children); - return visitor.translations; - } - - visitElement(element: Element, context: any): any { + visitElement(element: Element, {bundle, unit}: TranslationVisitorContext): any { if (element.name === 'unit') { - const externalId = getAttrOrThrow(element, 'id'); - if (this.translations[externalId] !== undefined) { - throw new TranslationParseError( - element.sourceSpan, `Duplicated translations for message "${externalId}"`); - } - visitAll(this, element.children, {unit: externalId}); + this.visitUnitElement(element, bundle); } else if (element.name === 'segment') { - assertTranslationUnit(element, context); - const targetMessage = element.children.find(isTargetElement); - if (targetMessage === undefined) { - throw new TranslationParseError(element.sourceSpan, 'Missing required element'); - } - this.translations[context.unit] = serializeTargetMessage(targetMessage); + this.visitSegmentElement(element, bundle, unit); } else { - return visitAll(this, element.children); + visitAll(this, element.children, {bundle, unit}); } } -} -function assertTranslationUnit(segment: Element, context: any) { - if (context === undefined || context.unit === undefined) { - throw new TranslationParseError( - segment.sourceSpan, 'Invalid element: should be a child of a element.'); + private visitUnitElement(element: Element, bundle: ParsedTranslationBundle): void { + // Error if no `id` attribute + const externalId = getAttribute(element, 'id'); + if (externalId === undefined) { + addParseDiagnostic( + bundle.diagnostics, element.sourceSpan, + `Missing required "id" attribute on element.`, ParseErrorLevel.ERROR); + return; + } + + // Error if there is already a translation with the same id + if (bundle.translations[externalId] !== undefined) { + addParseDiagnostic( + bundle.diagnostics, element.sourceSpan, + `Duplicated translations for message "${externalId}"`, ParseErrorLevel.ERROR); + return; + } + + visitAll(this, element.children, {bundle, unit: externalId}); + } + + private visitSegmentElement( + element: Element, bundle: ParsedTranslationBundle, unit: string|undefined): void { + // A `` element must be below a `` element + if (unit === undefined) { + addParseDiagnostic( + bundle.diagnostics, element.sourceSpan, + 'Invalid element: should be a child of a element.', + ParseErrorLevel.ERROR); + return; + } + + const targetMessage = element.children.find(isNamedElement('target')); + if (targetMessage === undefined) { + addParseDiagnostic( + bundle.diagnostics, element.sourceSpan, 'Missing required element', + ParseErrorLevel.ERROR); + return; + } + + try { + bundle.translations[unit] = serializeTargetMessage(targetMessage); + } catch (e) { + // Capture any errors from serialize the target message + if (e.span && e.msg && e.level) { + addParseDiagnostic(bundle.diagnostics, e.span, e.msg, e.level); + } else { + throw e; + } + } } } @@ -118,6 +161,6 @@ function serializeTargetMessage(source: Element): ɵParsedTranslation { return serializer.serialize(parseInnerRange(source)); } -function isTargetElement(node: Node): node is Element { - return node instanceof Element && node.name === 'target'; +function isFileElement(node: Node): node is Element { + return node instanceof Element && node.name === 'file'; } diff --git a/packages/localize/src/tools/test/translate/translation_files/translation_parsers/xliff2_translation_parser_spec.ts b/packages/localize/src/tools/test/translate/translation_files/translation_parsers/xliff2_translation_parser_spec.ts index 10381a3b2b..1ee37f708c 100644 --- a/packages/localize/src/tools/test/translate/translation_files/translation_parsers/xliff2_translation_parser_spec.ts +++ b/packages/localize/src/tools/test/translate/translation_files/translation_parsers/xliff2_translation_parser_spec.ts @@ -10,24 +10,27 @@ import {Xliff2TranslationParser} from '../../../../src/translate/translation_fil describe('Xliff2TranslationParser', () => { describe('canParse()', () => { - it('should return true if the file extension is `.xlf` and it contains the XLIFF namespace', () => { - const parser = new Xliff2TranslationParser(); - expect( - parser.canParse( - '/some/file.xlf', - '')) - .toBe(true); - expect( - parser.canParse( - '/some/file.json', - '')) - .toBe(false); - expect(parser.canParse('/some/file.xlf', '')).toBe(false); - expect(parser.canParse('/some/file.json', '')).toBe(false); - }); + it('should return true if the file contains an element with version="2.0" attribute', + () => { + const parser = new Xliff2TranslationParser(); + expect(parser.canParse( + '/some/file.xlf', + '')) + .toBeTruthy(); + expect(parser.canParse( + '/some/file.json', + '')) + .toBeTruthy(); + expect(parser.canParse('/some/file.xliff', '')).toBeTruthy(); + expect(parser.canParse('/some/file.json', '')).toBeTruthy(); + expect(parser.canParse('/some/file.xlf', '')).toBe(false); + expect(parser.canParse('/some/file.xlf', '')).toBe(false); + expect(parser.canParse('/some/file.xlf', '')).toBe(false); + expect(parser.canParse('/some/file.json', '')).toBe(false); + }); }); - describe('parse()', () => { + describe('parse() [without hint]', () => { it('should extract the locale from the file contents', () => { const XLIFF = ` @@ -475,4 +478,550 @@ describe('Xliff2TranslationParser', () => { }); }); }); + + describe('parse() [with hint]', () => { + it('should extract the locale from the file contents', () => { + const XLIFF = ` + + + + `; + const parser = new Xliff2TranslationParser(); + const hint = parser.canParse('/some/file.xlf', XLIFF); + if (!hint) { + return fail('expected XLIFF to be valid'); + } + const result = parser.parse('/some/file.xlf', XLIFF, hint); + expect(result.locale).toEqual('fr'); + }); + + it('should return undefined locale if there is no locale in the file', () => { + const XLIFF = ` + + + + `; + const parser = new Xliff2TranslationParser(); + const hint = parser.canParse('/some/file.xlf', XLIFF); + if (!hint) { + return fail('expected XLIFF to be valid'); + } + const result = parser.parse('/some/file.xlf', XLIFF, hint); + expect(result.locale).toBeUndefined(); + }); + + it('should extract basic messages', () => { + /** + * Source HTML: + * + * ``` + *
translatable attribute
+ * ``` + */ + const XLIFF = ` + + + + + file.ts:2 + + + translatable attribute + etubirtta elbatalsnart + + + + `; + const parser = new Xliff2TranslationParser(); + const hint = parser.canParse('/some/file.xlf', XLIFF); + if (!hint) { + return fail('expected XLIFF to be valid'); + } + const result = parser.parse('/some/file.xlf', XLIFF, hint); + + expect(result.translations[ɵcomputeMsgId('translatable attribute', '')]) + .toEqual(ɵmakeParsedTranslation(['etubirtta elbatalsnart'])); + }); + + it('should extract translations with simple placeholders', () => { + /** + * Source HTML: + * + * ``` + *
translatable element >with placeholders {{ interpolation}}
+ * ``` + */ + const XLIFF = ` + + + + + file.ts:3 + + + translatable element with placeholders + tnemele elbatalsnart sredlohecalp htiw + + + + `; + const parser = new Xliff2TranslationParser(); + const hint = parser.canParse('/some/file.xlf', XLIFF); + if (!hint) { + return fail('expected XLIFF to be valid'); + } + const result = parser.parse('/some/file.xlf', XLIFF, hint); + + expect( + result.translations[ɵcomputeMsgId( + 'translatable element {$START_BOLD_TEXT}with placeholders{$LOSE_BOLD_TEXT} {$INTERPOLATION}')]) + .toEqual(ɵmakeParsedTranslation( + ['', ' tnemele elbatalsnart ', 'sredlohecalp htiw', ''], + ['INTERPOLATION', 'START_BOLD_TEXT', 'CLOSE_BOLD_TEXT'])); + }); + + it('should extract translations with simple ICU expressions', () => { + /** + * Source HTML: + * + * ``` + *
{VAR_PLURAL, plural, =0 {

test

} }
+ * ``` + */ + const XLIFF = ` + + + + + file.ts:4 + + + {VAR_PLURAL, plural, =0 {test} } + {VAR_PLURAL, plural, =0 {TEST} } + + + + `; + const parser = new Xliff2TranslationParser(); + const hint = parser.canParse('/some/file.xlf', XLIFF); + if (!hint) { + return fail('expected XLIFF to be valid'); + } + const result = parser.parse('/some/file.xlf', XLIFF, hint); + + expect(result.translations[ɵcomputeMsgId( + '{VAR_PLURAL, plural, =0 {{START_PARAGRAPH}test{CLOSE_PARAGRAPH}}}')]) + .toEqual(ɵmakeParsedTranslation( + ['{VAR_PLURAL, plural, =0 {{START_PARAGRAPH}TEST{CLOSE_PARAGRAPH}}}'], [])); + }); + + it('should extract translations with duplicate source messages', () => { + /** + * Source HTML: + * + * ``` + *
foo
+ *
foo
+ *
foo
+ * ``` + */ + const XLIFF = ` + + + + + d + m + file.ts:5 + + + foo + oof + + + + + d + m + file.ts:5 + + + foo + toto + + + + + d + m + file.ts:5 + + + foo + tata + + + + `; + const parser = new Xliff2TranslationParser(); + const hint = parser.canParse('/some/file.xlf', XLIFF); + if (!hint) { + return fail('expected XLIFF to be valid'); + } + const result = parser.parse('/some/file.xlf', XLIFF, hint); + + expect(result.translations[ɵcomputeMsgId('foo')]).toEqual(ɵmakeParsedTranslation(['oof'])); + expect(result.translations['i']).toEqual(ɵmakeParsedTranslation(['toto'])); + expect(result.translations['bar']).toEqual(ɵmakeParsedTranslation(['tata'])); + }); + + it('should extract translations with only placeholders, which are re-ordered', () => { + /** + * Source HTML: + * + * ``` + *

+ * ``` + */ + const XLIFF = ` + + + + + ph names + file.ts:7 + + + + + + + + `; + const parser = new Xliff2TranslationParser(); + const hint = parser.canParse('/some/file.xlf', XLIFF); + if (!hint) { + return fail('expected XLIFF to be valid'); + } + const result = parser.parse('/some/file.xlf', XLIFF, hint); + + expect(result.translations[ɵcomputeMsgId('{$LINE_BREAK}{$TAG_IMG}{$TAG_IMG_1}')]) + .toEqual( + ɵmakeParsedTranslation(['', '', '', ''], ['TAG_IMG_1', 'TAG_IMG', 'LINE_BREAK'])); + }); + + it('should extract translations with empty target', () => { + /** + * Source HTML: + * + * ``` + *
hello
+ * ``` + */ + const XLIFF = ` + + + + + empty element + file.ts:8 + + + hello + + + + + `; + const parser = new Xliff2TranslationParser(); + const hint = parser.canParse('/some/file.xlf', XLIFF); + if (!hint) { + return fail('expected XLIFF to be valid'); + } + const result = parser.parse('/some/file.xlf', XLIFF, hint); + + expect(result.translations[ɵcomputeMsgId('hello {$START_TAG_SPAN}{$CLOSE_TAG_SPAN}')]) + .toEqual(ɵmakeParsedTranslation([''])); + }); + + it('should extract translations with deeply nested ICUs', () => { + /** + * Source HTML: + * + * ``` + * Test: { count, plural, =0 { { sex, select, other {

deeply nested

}} } =other {a lot}} + * ``` + * + * Note that the message gets split into two translation units: + * * The first one contains the outer message with an `ICU` placeholder + * * The second one is the ICU expansion itself + * + * Note that special markers `VAR_PLURAL` and `VAR_SELECT` are added, which are then replaced + * by IVY at runtime with the actual values being rendered by the ICU expansion. + */ + const XLIFF = ` + + + + + file.ts:10 + + + Test: + Le test: + + + + + file.ts:10 + + + {VAR_PLURAL, plural, =0 {{VAR_SELECT, select, other {deeply nested}}} =other {a lot}} + {VAR_PLURAL, plural, =0 {{VAR_SELECT, select, other {profondément imbriqué}}} =other {beaucoup}} + + + + `; + const parser = new Xliff2TranslationParser(); + const hint = parser.canParse('/some/file.xlf', XLIFF); + if (!hint) { + return fail('expected XLIFF to be valid'); + } + const result = parser.parse('/some/file.xlf', XLIFF, hint); + + expect(result.translations[ɵcomputeMsgId('Test: {$ICU}')]) + .toEqual(ɵmakeParsedTranslation(['Le test: ', ''], ['ICU'])); + + expect( + result.translations[ɵcomputeMsgId( + '{VAR_PLURAL, plural, =0 {{VAR_SELECT, select, other {{START_PARAGRAPH}deeply nested{CLOSE_PARAGRAPH}}}} =other {beaucoup}}')]) + .toEqual(ɵmakeParsedTranslation([ + '{VAR_PLURAL, plural, =0 {{VAR_SELECT, select, other {{START_PARAGRAPH}profondément imbriqué{CLOSE_PARAGRAPH}}}} =other {beaucoup}}' + ])); + }); + + it('should extract translations containing multiple lines', () => { + /** + * Source HTML: + * + * ``` + *
multi + * lines
+ * ``` + */ + const XLIFF = ` + + + + + file.ts:11,12 + + + multi\nlines + multi\nlignes + + + + `; + const parser = new Xliff2TranslationParser(); + const hint = parser.canParse('/some/file.xlf', XLIFF); + if (!hint) { + return fail('expected XLIFF to be valid'); + } + const result = parser.parse('/some/file.xlf', XLIFF, hint); + + expect(result.translations[ɵcomputeMsgId('multi\nlines')]) + .toEqual(ɵmakeParsedTranslation(['multi\nlignes'])); + }); + + it('should extract translations with elements', () => { + const XLIFF = ` + + + + + First sentence. + Translated first sentence. + + + + + First sentence. Second sentence. + Translated first sentence. + + + + `; + const parser = new Xliff2TranslationParser(); + const hint = parser.canParse('/some/file.xlf', XLIFF); + if (!hint) { + return fail('expected XLIFF to be valid'); + } + const result = parser.parse('/some/file.xlf', XLIFF, hint); + + expect(result.translations['mrk-test']) + .toEqual(ɵmakeParsedTranslation(['Translated first sentence.'])); + + expect(result.translations['mrk-test2']) + .toEqual(ɵmakeParsedTranslation(['Translated first sentence.'])); + }); + + describe('[structure errors]', () => { + it('should provide a diagnostic error when a trans-unit has no translation', () => { + const XLIFF = ` + + + + + + + + + `; + + const parser = new Xliff2TranslationParser(); + const hint = parser.canParse('/some/file.xlf', XLIFF); + if (!hint) { + return fail('expected XLIFF to be valid'); + } + const result = parser.parse('/some/file.xlf', XLIFF, hint); + expect(result.diagnostics.messages.length).toEqual(1); + expect(result.diagnostics.messages[0].message).toEqual(`Missing required element (" + + + [ERROR ->] + + +"): /some/file.xlf@4:12`); + }); + + + it('should provide a diagnostic error when a trans-unit has no id attribute', () => { + const XLIFF = ` + + + + + + + + + + `; + + const parser = new Xliff2TranslationParser(); + const hint = parser.canParse('/some/file.xlf', XLIFF); + if (!hint) { + return fail('expected XLIFF to be valid'); + } + const result = parser.parse('/some/file.xlf', XLIFF, hint); + expect(result.diagnostics.messages.length).toEqual(1); + expect(result.diagnostics.messages[0].message) + .toEqual( + `Missing required "id" attribute on element. ("ocument:2.0" srcLang="en" trgLang="fr"> + + [ERROR ->] + + +"): /some/file.xlf@3:10`); + }); + + it('should provide a diagnostic error on duplicate trans-unit id', () => { + const XLIFF = ` + + + + + + + + + + + + + + + + `; + + const parser = new Xliff2TranslationParser(); + const hint = parser.canParse('/some/file.xlf', XLIFF); + if (!hint) { + return fail('expected XLIFF to be valid'); + } + const result = parser.parse('/some/file.xlf', XLIFF, hint); + expect(result.diagnostics.messages.length).toEqual(1); + expect(result.diagnostics.messages[0].message) + .toEqual(`Duplicated translations for message "deadbeef" (" + + + [ERROR ->] + + +"): /some/file.xlf@9:10`); + }); + }); + + describe('[message errors]', () => { + it('should provide a diagnostic error on unknown message tags', () => { + const XLIFF = ` + + + + + + msg should contain only ph and pc tags + + + + `; + + const parser = new Xliff2TranslationParser(); + const hint = parser.canParse('/some/file.xlf', XLIFF); + if (!hint) { + return fail('expected XLIFF to be valid'); + } + const result = parser.parse('/some/file.xlf', XLIFF, hint); + expect(result.diagnostics.messages.length).toEqual(1); + expect(result.diagnostics.messages[0].message).toEqual(`Invalid element found in message. (" + + + [ERROR ->]msg should contain only ph and pc tags + + +"): /some/file.xlf@6:22`); + }); + + it('should provide a diagnostic error when a placeholder misses an id attribute', () => { + const XLIFF = ` + + + + + + + + + + `; + + const parser = new Xliff2TranslationParser(); + const hint = parser.canParse('/some/file.xlf', XLIFF); + if (!hint) { + return fail('expected XLIFF to be valid'); + } + const result = parser.parse('/some/file.xlf', XLIFF, hint); + expect(result.diagnostics.messages.length).toEqual(1); + expect(result.diagnostics.messages[0].message) + .toEqual(`Missing required "equiv" attribute: (" + + + [ERROR ->] + + +"): /some/file.xlf@6:22`); + }); + }); + }); });