Fix for incorrect tokenization due to index difference of Unicode character/scalar (#286)

* Fix: `Scanner` now uses indices of the respective UnicodeScalarView * Fix: `Scanner` now uses indices of the respective UnicodeScalarView * Extended test for Unicode `Combining Diaeresis` * Fixed test for combining diaeresis * Inlined template for testing Unicode combining diaeresis Co-authored-by: Ilya Puchka <ilyapuchka@gmail.com>
2020-01-12 16:21:02 +01:00
parent 5b2d5dc5e0
commit a1718ae350
2 changed files with 18 additions and 8 deletions
--- a/Sources/Lexer.swift
+++ b/Sources/Lexer.swift
@@ -114,7 +114,7 @@ struct Lexer {
 class Scanner {
  let originalContent: String
  var content: String
-  var range: Range<String.Index>
+  var range: Range<String.UnicodeScalarView.Index>

  /// The start delimiter for a token.
  private static let tokenStartDelimiter: Unicode.Scalar = "{"
@@ -124,7 +124,7 @@ class Scanner {
  init(_ content: String) {
    self.originalContent = content
    self.content = content
-    range = content.startIndex..<content.startIndex
+    range = content.unicodeScalars.startIndex..<content.unicodeScalars.startIndex
  }

  var isEmpty: Bool {
@@ -147,9 +147,9 @@ class Scanner {

    for (index, char) in content.unicodeScalars.enumerated() {
      if foundChar && char == Scanner.tokenEndDelimiter {
-        let result = String(content.prefix(index + 1))
-        content = String(content.dropFirst(index + 1))
-        range = range.upperBound..<originalContent.index(range.upperBound, offsetBy: index + 1)
+        let result = String(content.unicodeScalars.prefix(index + 1))
+        content = String(content.unicodeScalars.dropFirst(index + 1))
+        range = range.upperBound..<originalContent.unicodeScalars.index(range.upperBound, offsetBy: index + 1)
        return result
      } else {
        foundChar = (char == tokenChar)
@@ -181,9 +181,9 @@ class Scanner {
    range = range.upperBound..<range.upperBound
    for (index, char) in content.unicodeScalars.enumerated() {
      if foundBrace && tokenChars.contains(char) {
-        let result = String(content.prefix(index - 1))
-        content = String(content.dropFirst(index - 1))
-        range = range.upperBound..<originalContent.index(range.upperBound, offsetBy: index - 1)
+        let result = String(content.unicodeScalars.prefix(index - 1))
+        content = String(content.unicodeScalars.dropFirst(index - 1))
+        range = range.upperBound..<originalContent.unicodeScalars.index(range.upperBound, offsetBy: index - 1)
        return (char, result)
      } else {
        foundBrace = (char == Scanner.tokenStartDelimiter)
--- a/Tests/StencilTests/LexerSpec.swift
+++ b/Tests/StencilTests/LexerSpec.swift
@@ -126,6 +126,16 @@ final class LexerTests: XCTestCase {
    }
  }

+  func testCombiningDiaeresis() throws {
+    // "ü" is written with escapes as U+0075 U+0308 (LATIN SMALL LETTER U + COMBINING DIAERESIS) so that no editor or tool can silently normalize it to the precomposed U+00FC; here one Character spans two Unicode scalars, the mismatch that caused incorrect tokenization.
+    let templateString = "u\u{0308}\n{% if test %}u\u{0308}{% endif %}\n{% if u\u{0308} %}u\u{0308}{% endif %}\n"
+    let lexer = Lexer(templateString: templateString)
+    let tokens = lexer.tokenize()
+
+    try expect(tokens.count) == 9
+    try expect(tokens[1].contents) == "if test"
+  }
+
  private func makeSourceMap(_ token: String, for lexer: Lexer, options: String.CompareOptions = []) -> SourceMap {
    guard let range = lexer.templateString.range(of: token, options: options) else { fatalError("Token not found") }
    return SourceMap(location: lexer.rangeLocation(range))