Merge pull request #226 from Liquidsoul/faster-scanner

Optimise Scanner performance
David Jennes, 2018-09-26 00:38:05 +02:00, committed by GitHub
5 changed files with 1279 additions and 80 deletions

View File

@@ -40,6 +40,11 @@
 - Update to Spectre 0.9.0.
   [Ilya Puchka](https://github.com/ilyapuchka)
   [#247](https://github.com/stencilproject/Stencil/pull/247)
+- Optimise Scanner performance.
+  [Eric Thorpe](https://github.com/trametheka)
+  [Sébastien Duperron](https://github.com/Liquidsoul)
+  [David Jennes](https://github.com/djbe)
+  [#226](https://github.com/stencilproject/Stencil/pull/226)
 ## 0.12.1

View File

@@ -7,6 +7,18 @@ struct Lexer {
   let templateString: String
   let lines: [Line]
 
+  /// The potential token start characters. In a template these appear after a
+  /// `{` character, for example `{{`, `{%`, `{#`, ...
+  private static let tokenChars: [Unicode.Scalar] = ["{", "%", "#"]
+
+  /// The token end characters, corresponding to their token start characters.
+  /// For example, a variable token starts with `{{` and ends with `}}`
+  private static let tokenCharMap: [Unicode.Scalar: Unicode.Scalar] = [
+    "{": "}",
+    "%": "%",
+    "#": "#"
+  ]
+
   init(templateName: String? = nil, templateString: String) {
     self.templateName = templateName
     self.templateString = templateString
@@ -17,12 +29,19 @@ struct Lexer {
     }
   }
 
+  /// Create a token that will be passed on to the parser, with the given
+  /// content and a range. The content will be tested to see if it's a
+  /// `variable`, a `block` or a `comment`, otherwise it'll default to a simple
+  /// `text` token.
+  ///
+  /// - Parameters:
+  ///   - string: The content string of the token
+  ///   - range: The range within the template content, used for smart
+  ///            error reporting
   func createToken(string: String, at range: Range<String.Index>) -> Token {
     func strip() -> String {
       guard string.count > 4 else { return "" }
-      let start = string.index(string.startIndex, offsetBy: 2)
-      let end = string.index(string.endIndex, offsetBy: -2)
-      let trimmed = String(string[start..<end])
+      let trimmed = String(string.dropFirst(2).dropLast(2))
         .components(separatedBy: "\n")
         .filter({ !$0.isEmpty })
         .map({ $0.trim(character: " ") })
@@ -50,26 +69,22 @@ struct Lexer {
     return .text(value: string, at: sourceMap)
   }
 
-  /// Returns an array of tokens from a given template string.
+  /// Transforms the template into a list of tokens, that will eventually be
+  /// passed on to the parser.
+  ///
+  /// - Returns: The list of tokens (see `createToken(string: at:)`).
   func tokenize() -> [Token] {
     var tokens: [Token] = []
 
     let scanner = Scanner(templateString)
 
-    let map = [
-      "{{": "}}",
-      "{%": "%}",
-      "{#": "#}",
-    ]
-
     while !scanner.isEmpty {
-      if let text = scanner.scan(until: ["{{", "{%", "{#"]) {
-        if !text.1.isEmpty {
-          tokens.append(createToken(string: text.1, at: scanner.range))
+      if let (char, text) = scanner.scanForTokenStart(Lexer.tokenChars) {
+        if !text.isEmpty {
+          tokens.append(createToken(string: text, at: scanner.range))
         }
 
-        let end = map[text.0]!
-        let result = scanner.scan(until: end, returnUntil: true)
+        guard let end = Lexer.tokenCharMap[char] else { continue }
+        let result = scanner.scanForTokenEnd(end)
         tokens.append(createToken(string: result, at: scanner.range))
       } else {
         tokens.append(createToken(string: scanner.content, at: scanner.range))
@@ -80,6 +95,11 @@ struct Lexer {
     return tokens
   }
 
+  /// Finds the line matching the given range (for a token)
+  ///
+  /// - Parameter range: The range to search for.
+  /// - Returns: The content for that line, the line number and offset within
+  ///            the line.
   func rangeLocation(_ range: Range<String.Index>) -> ContentLocation {
     guard let line = self.lines.first(where: { $0.range.contains(range.lowerBound) }) else {
       return ("", 0, 0)
@@ -95,6 +115,11 @@ class Scanner {
   var content: String
   var range: Range<String.Index>
 
+  /// The start delimiter for a token.
+  private static let tokenStartDelimiter: Unicode.Scalar = "{"
+  /// And the corresponding end delimiter for a token.
+  private static let tokenEndDelimiter: Unicode.Scalar = "}"
+
   init(_ content: String) {
     self.originalContent = content
     self.content = content
@@ -105,64 +130,69 @@ class Scanner {
     return content.isEmpty
   }
 
-  func scan(until: String, returnUntil: Bool = false) -> String {
-    var index = content.startIndex
-
-    if until.isEmpty {
-      return ""
-    }
-
-    range = range.upperBound..<range.upperBound
-    while index != content.endIndex {
-      let substring = String(content[index...])
-      if substring.hasPrefix(until) {
-        let result = String(content[..<index])
-
-        if returnUntil {
-          range = range.lowerBound..<originalContent.index(range.upperBound, offsetBy: until.count)
-          content = String(substring[until.endIndex...])
-          return result + until
-        }
-
-        content = substring
-        return result
-      }
-
-      index = content.index(after: index)
-      range = range.lowerBound..<originalContent.index(after: range.upperBound)
-    }
-
-    content = ""
-    return ""
-  }
-
-  func scan(until: [String]) -> (String, String)? {
-    if until.isEmpty {
-      return nil
-    }
-
-    var index = content.startIndex
-    range = range.upperBound..<range.upperBound
-    while index != content.endIndex {
-      let substring = String(content[index...])
-      for string in until {
-        if substring.hasPrefix(string) {
-          let result = String(content[..<index])
-          content = substring
-          return (string, result)
-        }
-      }
-
-      index = content.index(after: index)
-      range = range.lowerBound..<originalContent.index(after: range.upperBound)
-    }
-
-    return nil
-  }
+  /// Scans for the end of a token, with a specific ending character. If we're
+  /// searching for the end of a block token `%}`, this method receives a `%`.
+  /// The scanner will search for that `%` followed by a `}`.
+  ///
+  /// Note: if the end of a token is found, the `content` and `range`
+  /// properties are updated to reflect this. `content` will be set to what
+  /// remains of the template after the token. `range` will be set to the range
+  /// of the token within the template.
+  ///
+  /// - Parameter tokenChar: The token end character to search for.
+  /// - Returns: The content of a token, or "" if no token end was found.
+  func scanForTokenEnd(_ tokenChar: Unicode.Scalar) -> String {
+    var foundChar = false
+
+    for (index, char) in content.unicodeScalars.enumerated() {
+      if foundChar && char == Scanner.tokenEndDelimiter {
+        let result = String(content.prefix(index))
+        content = String(content.dropFirst(index + 1))
+        range = range.upperBound..<originalContent.index(range.upperBound, offsetBy: index + 1)
+        return result
+      } else {
+        foundChar = (char == tokenChar)
+      }
+    }
+
+    content = ""
+    return ""
+  }
+
+  /// Scans for the start of a token, with a list of potential starting
+  /// characters. To scan for the start of variables (`{{`), blocks (`{%`) and
+  /// comments (`{#`), this method receives the characters `{`, `%` and `#`.
+  /// The scanner will search for a `{`, followed by one of the search
+  /// characters. It will give the found character, and the content that came
+  /// before the token.
+  ///
+  /// Note: if the start of a token is found, the `content` and `range`
+  /// properties are updated to reflect this. `content` will be set to what
+  /// remains of the template starting with the token. `range` will be set to
+  /// the start of the token within the template.
+  ///
+  /// - Parameter tokenChars: List of token start characters to search for.
+  /// - Returns: The found token start character, together with the content
+  ///            before the token, or nil of no token start was found.
+  func scanForTokenStart(_ tokenChars: [Unicode.Scalar]) -> (Unicode.Scalar, String)? {
+    var foundBrace = false
+
+    range = range.upperBound..<range.upperBound
+    for (index, char) in content.unicodeScalars.enumerated() {
+      if foundBrace && tokenChars.contains(char) {
+        let result = String(content.prefix(index - 1))
+        content = String(content.dropFirst(index - 1))
+        range = range.upperBound..<originalContent.index(range.upperBound, offsetBy: index - 1)
+        return (char, result)
+      } else {
+        foundBrace = (char == Scanner.tokenStartDelimiter)
+      }
+    }
+
+    return nil
+  }
 }
 
 extension String {
   func findFirstNot(character: Character) -> String.Index? {
     var index = startIndex
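The performance gain in this file comes from the scanning strategy: the removed scan(until:) methods rebuilt a String of the remaining content and called hasPrefix at every position, which is quadratic in the template size, while the new scanForTokenStart and scanForTokenEnd walk the unicode scalars once and only remember whether the previous scalar was the delimiter. Below is a minimal standalone sketch of that single-pass idea; the findTokenStart name and setup are illustrative only, not part of Stencil's API.

// Standalone sketch of the single-pass token-start scan used above.
// Instead of materialising a substring at every position, walk the unicode
// scalars once and remember whether the previous scalar was the `{` delimiter.
func findTokenStart(in content: String, tokenChars: [Unicode.Scalar]) -> (char: Unicode.Scalar, textBefore: String)? {
  var foundBrace = false
  for (index, char) in content.unicodeScalars.enumerated() {
    if foundBrace && tokenChars.contains(char) {
      // `index - 1` points at the `{` that opens the token.
      return (char, String(content.prefix(index - 1)))
    }
    foundBrace = (char == "{")
  }
  return nil
}

// "Hello {{ name }}" yields ("{", "Hello "); a string with no tokens yields nil.
if let (char, text) = findTokenStart(in: "Hello {{ name }}", tokenChars: ["{", "%", "#"]) {
  print(char, text)
}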

View File

@@ -1,10 +1,16 @@
-import XCTest
+import PathKit
 import Spectre
 @testable import Stencil
+import XCTest
 
 class LexerTests: XCTestCase {
   func testLexer() {
     describe("Lexer") {
+      func makeSourceMap(_ token: String, for lexer: Lexer, options: String.CompareOptions = []) -> SourceMap {
+        guard let range = lexer.templateString.range(of: token, options: options) else { fatalError("Token not found") }
+        return SourceMap(location: lexer.rangeLocation(range))
+      }
+
       $0.it("can tokenize text") {
         let lexer = Lexer(templateString: "Hello World")
         let tokens = lexer.tokenize()
@@ -44,9 +50,9 @@ class LexerTests: XCTestCase {
         let tokens = lexer.tokenize()
 
         try expect(tokens.count) == 3
-        try expect(tokens[0]) == Token.text(value: "My name is ", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: "My name is ")!)))
-        try expect(tokens[1]) == Token.variable(value: "myname", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: "myname")!)))
-        try expect(tokens[2]) == Token.text(value: ".", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: ".")!)))
+        try expect(tokens[0]) == Token.text(value: "My name is ", at: makeSourceMap("My name is ", for: lexer))
+        try expect(tokens[1]) == Token.variable(value: "myname", at: makeSourceMap("myname", for: lexer))
+        try expect(tokens[2]) == Token.text(value: ".", at: makeSourceMap(".", for: lexer))
       }
 
       $0.it("can tokenize two variables without being greedy") {
@@ -55,43 +61,69 @@ class LexerTests: XCTestCase {
         let tokens = lexer.tokenize()
 
         try expect(tokens.count) == 2
-        try expect(tokens[0]) == Token.variable(value: "thing", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: "thing")!)))
-        try expect(tokens[1]) == Token.variable(value: "name", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: "name")!)))
+        try expect(tokens[0]) == Token.variable(value: "thing", at: makeSourceMap("thing", for: lexer))
+        try expect(tokens[1]) == Token.variable(value: "name", at: makeSourceMap("name", for: lexer))
       }
 
       $0.it("can tokenize an unclosed block") {
         let lexer = Lexer(templateString: "{%}")
-        let _ = lexer.tokenize()
+        _ = lexer.tokenize()
+      }
+
+      $0.it("can tokenize incorrect syntax without crashing") {
+        let lexer = Lexer(templateString: "func some() {{% if %}")
+        _ = lexer.tokenize()
       }
 
       $0.it("can tokenize an empty variable") {
         let lexer = Lexer(templateString: "{{}}")
-        let _ = lexer.tokenize()
+        _ = lexer.tokenize()
       }
 
       $0.it("can tokenize with new lines") {
         let templateString = """
           My name is {%
           if name
           and
           name
           %}{{
           name
           }}{%
           endif %}.
           """
         let lexer = Lexer(templateString: templateString)
         let tokens = lexer.tokenize()
 
         try expect(tokens.count) == 5
-        try expect(tokens[0]) == Token.text(value: "My name is ", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: "My name is")!)))
-        try expect(tokens[1]) == Token.block(value: "if name and name", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: "{%")!)))
-        try expect(tokens[2]) == Token.variable(value: "name", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: "name", options: [.backwards])!)))
-        try expect(tokens[3]) == Token.block(value: "endif", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: "endif")!)))
-        try expect(tokens[4]) == Token.text(value: ".", at: SourceMap(location: lexer.rangeLocation(templateString.range(of: ".")!)))
+        try expect(tokens[0]) == Token.text(value: "My name is ", at: makeSourceMap("My name is", for: lexer))
+        try expect(tokens[1]) == Token.block(value: "if name and name", at: makeSourceMap("{%", for: lexer))
+        try expect(tokens[2]) == Token.variable(value: "name", at: makeSourceMap("name", for: lexer, options: .backwards))
+        try expect(tokens[3]) == Token.block(value: "endif", at: makeSourceMap("endif", for: lexer))
+        try expect(tokens[4]) == Token.text(value: ".", at: makeSourceMap(".", for: lexer))
       }
+
+      $0.it("can tokenize escape sequences") {
+        let templateString = "class Some {{ '{' }}{% if true %}{{ stuff }}{% endif %}"
+        let lexer = Lexer(templateString: templateString)
+        let tokens = lexer.tokenize()
+
+        try expect(tokens.count) == 5
+        try expect(tokens[0]) == Token.text(value: "class Some ", at: makeSourceMap("class Some ", for: lexer))
+        try expect(tokens[1]) == Token.variable(value: "'{'", at: makeSourceMap("'{'", for: lexer))
+        try expect(tokens[2]) == Token.block(value: "if true", at: makeSourceMap("if true", for: lexer))
+        try expect(tokens[3]) == Token.variable(value: "stuff", at: makeSourceMap("stuff", for: lexer))
+        try expect(tokens[4]) == Token.block(value: "endif", at: makeSourceMap("endif", for: lexer))
+      }
     }
   }
+
+  func testPerformance() throws {
+    let path = Path(#file) + ".." + "fixtures" + "huge.html"
+    let content: String = try path.read()
+
+    measure {
+      let lexer = Lexer(templateString: content)
+      _ = lexer.tokenize()
+    }
+  }
 }

View File

@@ -57,6 +57,7 @@ extension InheritenceTests {
 extension LexerTests {
   static let __allTests = [
     ("testLexer", testLexer),
+    ("testPerformance", testPerformance),
   ]
 }

File diff suppressed because it is too large