Fix for incorrect tokenization due to index difference of Unicode character/scalar (#286)
* Fix: `Scanner` now uses indices of the respective UnicodeScalarView * Extended test for Unicode `Combining Diaeresis` * Fixed test for combining diaeresis * Inlined template for testing Unicode combining diaeresis Co-authored-by: Ilya Puchka <ilyapuchka@gmail.com>
This commit is contained in:
@@ -114,7 +114,7 @@ struct Lexer {
|
|||||||
class Scanner {
|
class Scanner {
|
||||||
let originalContent: String
|
let originalContent: String
|
||||||
var content: String
|
var content: String
|
||||||
var range: Range<String.Index>
|
var range: Range<String.UnicodeScalarView.Index>
|
||||||
|
|
||||||
/// The start delimiter for a token.
|
/// The start delimiter for a token.
|
||||||
private static let tokenStartDelimiter: Unicode.Scalar = "{"
|
private static let tokenStartDelimiter: Unicode.Scalar = "{"
|
||||||
@@ -124,7 +124,7 @@ class Scanner {
|
|||||||
init(_ content: String) {
|
init(_ content: String) {
|
||||||
self.originalContent = content
|
self.originalContent = content
|
||||||
self.content = content
|
self.content = content
|
||||||
range = content.startIndex..<content.startIndex
|
range = content.unicodeScalars.startIndex..<content.unicodeScalars.startIndex
|
||||||
}
|
}
|
||||||
|
|
||||||
var isEmpty: Bool {
|
var isEmpty: Bool {
|
||||||
@@ -147,9 +147,9 @@ class Scanner {
|
|||||||
|
|
||||||
for (index, char) in content.unicodeScalars.enumerated() {
|
for (index, char) in content.unicodeScalars.enumerated() {
|
||||||
if foundChar && char == Scanner.tokenEndDelimiter {
|
if foundChar && char == Scanner.tokenEndDelimiter {
|
||||||
let result = String(content.prefix(index + 1))
|
let result = String(content.unicodeScalars.prefix(index + 1))
|
||||||
content = String(content.dropFirst(index + 1))
|
content = String(content.unicodeScalars.dropFirst(index + 1))
|
||||||
range = range.upperBound..<originalContent.index(range.upperBound, offsetBy: index + 1)
|
range = range.upperBound..<originalContent.unicodeScalars.index(range.upperBound, offsetBy: index + 1)
|
||||||
return result
|
return result
|
||||||
} else {
|
} else {
|
||||||
foundChar = (char == tokenChar)
|
foundChar = (char == tokenChar)
|
||||||
@@ -181,9 +181,9 @@ class Scanner {
|
|||||||
range = range.upperBound..<range.upperBound
|
range = range.upperBound..<range.upperBound
|
||||||
for (index, char) in content.unicodeScalars.enumerated() {
|
for (index, char) in content.unicodeScalars.enumerated() {
|
||||||
if foundBrace && tokenChars.contains(char) {
|
if foundBrace && tokenChars.contains(char) {
|
||||||
let result = String(content.prefix(index - 1))
|
let result = String(content.unicodeScalars.prefix(index - 1))
|
||||||
content = String(content.dropFirst(index - 1))
|
content = String(content.unicodeScalars.dropFirst(index - 1))
|
||||||
range = range.upperBound..<originalContent.index(range.upperBound, offsetBy: index - 1)
|
range = range.upperBound..<originalContent.unicodeScalars.index(range.upperBound, offsetBy: index - 1)
|
||||||
return (char, result)
|
return (char, result)
|
||||||
} else {
|
} else {
|
||||||
foundBrace = (char == Scanner.tokenStartDelimiter)
|
foundBrace = (char == Scanner.tokenStartDelimiter)
|
||||||
|
|||||||
@@ -126,6 +126,16 @@ final class LexerTests: XCTestCase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func testCombiningDiaeresis() throws {
|
||||||
|
// the symbol "ü" in the `templateString` is unusually encoded as 0x75 0xCC 0x88 (LATIN SMALL LETTER U + COMBINING DIAERESIS) instead of 0xC3 0xBC (LATIN SMALL LETTER U WITH DIAERESIS)
|
||||||
|
let templateString = "ü\n{% if test %}ü{% endif %}\n{% if ü %}ü{% endif %}\n"
|
||||||
|
let lexer = Lexer(templateString: templateString)
|
||||||
|
let tokens = lexer.tokenize()
|
||||||
|
|
||||||
|
try expect(tokens.count) == 9
|
||||||
|
assert(tokens[1].contents == "if test")
|
||||||
|
}
|
||||||
|
|
||||||
private func makeSourceMap(_ token: String, for lexer: Lexer, options: String.CompareOptions = []) -> SourceMap {
|
private func makeSourceMap(_ token: String, for lexer: Lexer, options: String.CompareOptions = []) -> SourceMap {
|
||||||
guard let range = lexer.templateString.range(of: token, options: options) else { fatalError("Token not found") }
|
guard let range = lexer.templateString.range(of: token, options: options) else { fatalError("Token not found") }
|
||||||
return SourceMap(location: lexer.rangeLocation(range))
|
return SourceMap(location: lexer.rangeLocation(range))
|
||||||
|
|||||||
Reference in New Issue
Block a user