// Parser.swift // // Half inspired by Reader class from John Sundell's Ink project // https://github.com/JohnSundell/Ink/blob/master/Sources/Ink/Internal/Reader.swift // with optimisation working ie removing String and doing my own UTF8 processing inspired by Fabian Fett's work in // https://github.com/swift-extras/swift-extras-json/blob/main/Sources/ExtrasJSON/Parsing/DocumentReader.swift // // This is a copy of the parser from Hummingbird. I am not using the version in Hummingbird to avoid the dependency import Foundation /// Reader object for parsing String buffers struct HBParser { enum Error: Swift.Error { case overflow case unexpected case emptyString case invalidUTF8 case invalidPosition } /// Create a Parser object /// - Parameter string: UTF8 data to parse init?(_ utf8Data: Bytes, validateUTF8: Bool = true) where Bytes.Element == UInt8 { if let buffer = utf8Data as? [UInt8] { self.buffer = buffer } else { buffer = Array(utf8Data) } index = 0 range = 0 ..< buffer.endIndex // should check that the data is valid utf8 if validateUTF8 == true, self.validateUTF8() == false { return nil } } init(_ string: String) { buffer = Array(string.utf8) index = 0 range = 0 ..< buffer.endIndex } /// Return contents of parser as a string var count: Int { return range.count } /// Return contents of parser as a string var string: String { return makeString(buffer[range]) } private var buffer: [UInt8] private var index: Int private let range: Range } // MARK: sub-parsers extension HBParser { /// initialise a parser that parses a section of the buffer attached to another parser init(_ parser: HBParser, range: Range) { buffer = parser.buffer index = range.startIndex self.range = range precondition(range.startIndex >= 0 && range.endIndex <= buffer.endIndex) precondition(buffer[range.startIndex] & 0xC0 != 0x80) // check we arent in the middle of a UTF8 character } /// initialise a parser that parses a section of the buffer attached to this parser func subParser(_ range: Range) -> HBParser { if range.startIndex == range.endIndex { return HBParser(self, range: self.range.startIndex ..< self.range.startIndex) } return HBParser(self, range: range) } } extension HBParser { /// Return current character /// - Throws: .overflow /// - Returns: Current character mutating func character() throws -> Unicode.Scalar { guard !reachedEnd() else { throw Error.overflow } return unsafeCurrentAndAdvance() } /// Read the current character and return if it is as intended. If character test returns true then move forward 1 /// - Parameter char: character to compare against /// - Throws: .overflow /// - Returns: If current character was the one we expected mutating func read(_ char: Unicode.Scalar) throws -> Bool { let initialIndex = index let c = try character() guard c == char else { index = initialIndex; return false } return true } /// Read the current character and check if it is in a set of characters If character test returns true then move forward 1 /// - Parameter characterSet: Set of characters to compare against /// - Throws: .overflow /// - Returns: If current character is in character set mutating func read(_ characterSet: Set) throws -> Bool { let initialIndex = index let c = try character() guard characterSet.contains(c) else { index = initialIndex; return false } return true } /// Compare characters at current position against provided string. If the characters are the same as string provided advance past string /// - Parameter string: String to compare against /// - Throws: .overflow, .emptyString /// - Returns: If characters at current position equal string mutating func read(_ string: String) throws -> Bool { let initialIndex = index guard string.count > 0 else { throw Error.emptyString } let subString = try read(count: string.count) guard subString.string == string else { index = initialIndex; return false } return true } /// Read next so many characters from buffer /// - Parameter count: Number of characters to read /// - Throws: .overflow /// - Returns: The string read from the buffer mutating func read(count: Int) throws -> HBParser { var count = count var readEndIndex = index while count > 0 { guard readEndIndex != range.endIndex else { throw Error.overflow } readEndIndex = skipUTF8Character(at: readEndIndex) count -= 1 } let result = subParser(index ..< readEndIndex) index = readEndIndex return result } /// Read from buffer until we hit a character. Position after this is of the character we were checking for /// - Parameter until: Unicode.Scalar to read until /// - Throws: .overflow if we hit the end of the buffer before reading character /// - Returns: String read from buffer @discardableResult mutating func read(until: Unicode.Scalar, throwOnOverflow: Bool = true) throws -> HBParser { let startIndex = index while !reachedEnd() { if unsafeCurrent() == until { return subParser(startIndex ..< index) } unsafeAdvance() } if throwOnOverflow { _setPosition(startIndex) throw Error.overflow } return subParser(startIndex ..< index) } /// Read from buffer until we hit a character in supplied set. Position after this is of the character we were checking for /// - Parameter characterSet: Unicode.Scalar set to check against /// - Throws: .overflow /// - Returns: String read from buffer @discardableResult mutating func read(until characterSet: Set, throwOnOverflow: Bool = true) throws -> HBParser { let startIndex = index while !reachedEnd() { if characterSet.contains(unsafeCurrent()) { return subParser(startIndex ..< index) } unsafeAdvance() } if throwOnOverflow { _setPosition(startIndex) throw Error.overflow } return subParser(startIndex ..< index) } /// Read from buffer until we hit a character that returns true for supplied closure. Position after this is of the character we were checking for /// - Parameter until: Function to test /// - Throws: .overflow /// - Returns: String read from buffer @discardableResult mutating func read(until: (Unicode.Scalar) -> Bool, throwOnOverflow: Bool = true) throws -> HBParser { let startIndex = index while !reachedEnd() { if until(unsafeCurrent()) { return subParser(startIndex ..< index) } unsafeAdvance() } if throwOnOverflow { _setPosition(startIndex) throw Error.overflow } return subParser(startIndex ..< index) } /// Read from buffer until we hit a character where supplied KeyPath is true. Position after this is of the character we were checking for /// - Parameter characterSet: Unicode.Scalar set to check against /// - Throws: .overflow /// - Returns: String read from buffer @discardableResult mutating func read(until keyPath: KeyPath, throwOnOverflow: Bool = true) throws -> HBParser { let startIndex = index while !reachedEnd() { if unsafeCurrent()[keyPath: keyPath] { return subParser(startIndex ..< index) } unsafeAdvance() } if throwOnOverflow { _setPosition(startIndex) throw Error.overflow } return subParser(startIndex ..< index) } /// Read from buffer until we hit a string. By default the position after this is of the beginning of the string we were checking for /// - Parameter untilString: String to check for /// - Parameter throwOnOverflow: Throw errors if we hit the end of the buffer /// - Parameter skipToEnd: Should we set the position to after the found string /// - Throws: .overflow, .emptyString /// - Returns: String read from buffer @discardableResult mutating func read(untilString: String, throwOnOverflow: Bool = true, skipToEnd: Bool = false) throws -> HBParser { var untilString = untilString return try untilString.withUTF8 { utf8 in guard utf8.count > 0 else { throw Error.emptyString } let startIndex = index var foundIndex = index var untilIndex = 0 while !reachedEnd() { if buffer[index] == utf8[untilIndex] { if untilIndex == 0 { foundIndex = index } untilIndex += 1 if untilIndex == utf8.endIndex { unsafeAdvance() if skipToEnd == false { index = foundIndex } let result = subParser(startIndex ..< foundIndex) return result } } else { untilIndex = 0 } index += 1 } if throwOnOverflow { _setPosition(startIndex) throw Error.overflow } return subParser(startIndex ..< index) } } /// Read from buffer from current position until the end of the buffer /// - Returns: String read from buffer @discardableResult mutating func readUntilTheEnd() -> HBParser { let startIndex = index index = range.endIndex return subParser(startIndex ..< index) } /// Read while character at current position is the one supplied /// - Parameter while: Unicode.Scalar to check against /// - Returns: String read from buffer @discardableResult mutating func read(while: Unicode.Scalar) -> Int { var count = 0 while !reachedEnd(), unsafeCurrent() == `while` { unsafeAdvance() count += 1 } return count } /// Read while character at current position is in supplied set /// - Parameter while: character set to check /// - Returns: String read from buffer @discardableResult mutating func read(while characterSet: Set) -> HBParser { let startIndex = index while !reachedEnd(), characterSet.contains(unsafeCurrent()) { unsafeAdvance() } return subParser(startIndex ..< index) } /// Read while character returns true for supplied closure /// - Parameter while: character set to check /// - Returns: String read from buffer @discardableResult mutating func read(while: (Unicode.Scalar) -> Bool) -> HBParser { let startIndex = index while !reachedEnd(), `while`(unsafeCurrent()) { unsafeAdvance() } return subParser(startIndex ..< index) } /// Read while character returns true for supplied KeyPath /// - Parameter while: character set to check /// - Returns: String read from buffer @discardableResult mutating func read(while keyPath: KeyPath) -> HBParser { let startIndex = index while !reachedEnd(), unsafeCurrent()[keyPath: keyPath] { unsafeAdvance() } return subParser(startIndex ..< index) } /// Split parser into sections separated by character /// - Parameter separator: Separator character /// - Returns: arrays of sub parsers mutating func split(separator: Unicode.Scalar) -> [HBParser] { var subParsers: [HBParser] = [] while !reachedEnd() { do { let section = try read(until: separator) subParsers.append(section) unsafeAdvance() } catch { if !reachedEnd() { subParsers.append(readUntilTheEnd()) } } } return subParsers } /// Return whether we have reached the end of the buffer /// - Returns: Have we reached the end func reachedEnd() -> Bool { return index == range.endIndex } } /// Public versions of internal functions which include tests for overflow extension HBParser { /// Return the character at the current position /// - Throws: .overflow /// - Returns: Unicode.Scalar func current() -> Unicode.Scalar { guard !reachedEnd() else { return Unicode.Scalar(0) } return unsafeCurrent() } /// Move forward one character /// - Throws: .overflow mutating func advance() throws { guard !reachedEnd() else { throw Error.overflow } return unsafeAdvance() } /// Move forward so many character /// - Parameter amount: number of characters to move forward /// - Throws: .overflow mutating func advance(by amount: Int) throws { var amount = amount while amount > 0 { guard !reachedEnd() else { throw Error.overflow } index = skipUTF8Character(at: index) amount -= 1 } } /// Move backwards one character /// - Throws: .overflow mutating func retreat() throws { guard index > range.startIndex else { throw Error.overflow } index = backOneUTF8Character(at: index) } /// Move back so many characters /// - Parameter amount: number of characters to move back /// - Throws: .overflow mutating func retreat(by amount: Int) throws { var amount = amount while amount > 0 { guard index > range.startIndex else { throw Error.overflow } index = backOneUTF8Character(at: index) amount -= 1 } } mutating func unsafeAdvance() { index = skipUTF8Character(at: index) } mutating func unsafeAdvance(by amount: Int) { var amount = amount while amount > 0 { index = skipUTF8Character(at: index) amount -= 1 } } func getPosition() -> Int { return index } mutating func setPosition(_ index: Int) throws { if index == self.range.endIndex { _setPosition(index) return } guard range.contains(index) else { throw Error.invalidPosition } guard validateUTF8Character(at: index).0 != nil else { throw Error.invalidPosition } _setPosition(index) } } /// extend Parser to conform to Sequence extension HBParser: Sequence { typealias Element = Unicode.Scalar __consuming func makeIterator() -> Iterator { return Iterator(self) } struct Iterator: IteratorProtocol { typealias Element = Unicode.Scalar var parser: HBParser init(_ parser: HBParser) { self.parser = parser } mutating func next() -> Unicode.Scalar? { guard !parser.reachedEnd() else { return nil } return parser.unsafeCurrentAndAdvance() } } } // internal versions without checks private extension HBParser { func unsafeCurrent() -> Unicode.Scalar { return decodeUTF8Character(at: index).0 } mutating func unsafeCurrentAndAdvance() -> Unicode.Scalar { let (unicodeScalar, index) = decodeUTF8Character(at: self.index) self.index = index return unicodeScalar } mutating func _setPosition(_ index: Int) { self.index = index } func makeString(_ bytes: Bytes) -> String where Bytes.Element == UInt8, Bytes.Index == Int { if let string = bytes.withContiguousStorageIfAvailable({ String(decoding: $0, as: Unicode.UTF8.self) }) { return string } else { return String(decoding: bytes, as: Unicode.UTF8.self) } } } // UTF8 parsing extension HBParser { func decodeUTF8Character(at index: Int) -> (Unicode.Scalar, Int) { var index = index let byte1 = UInt32(buffer[index]) var value: UInt32 if byte1 & 0xC0 == 0xC0 { index += 1 let byte2 = UInt32(buffer[index] & 0x3F) if byte1 & 0xE0 == 0xE0 { index += 1 let byte3 = UInt32(buffer[index] & 0x3F) if byte1 & 0xF0 == 0xF0 { index += 1 let byte4 = UInt32(buffer[index] & 0x3F) value = (byte1 & 0x7) << 18 + byte2 << 12 + byte3 << 6 + byte4 } else { value = (byte1 & 0xF) << 12 + byte2 << 6 + byte3 } } else { value = (byte1 & 0x1F) << 6 + byte2 } } else { value = byte1 & 0x7F } let unicodeScalar = Unicode.Scalar(value)! return (unicodeScalar, index + 1) } func skipUTF8Character(at index: Int) -> Int { if buffer[index] & 0x80 != 0x80 { return index + 1 } if buffer[index + 1] & 0xC0 == 0x80 { return index + 2 } if buffer[index + 2] & 0xC0 == 0x80 { return index + 3 } return index + 4 } func backOneUTF8Character(at index: Int) -> Int { if buffer[index - 1] & 0xC0 != 0x80 { return index - 1 } if buffer[index - 2] & 0xC0 != 0x80 { return index - 2 } if buffer[index - 3] & 0xC0 != 0x80 { return index - 3 } return index - 4 } /// same as `decodeUTF8Character` but adds extra validation, so we can make assumptions later on in decode and skip func validateUTF8Character(at index: Int) -> (Unicode.Scalar?, Int) { var index = index let byte1 = UInt32(buffer[index]) var value: UInt32 if byte1 & 0xC0 == 0xC0 { index += 1 let byte = UInt32(buffer[index]) guard byte & 0xC0 == 0x80 else { return (nil, index) } let byte2 = UInt32(byte & 0x3F) if byte1 & 0xE0 == 0xE0 { index += 1 let byte = UInt32(buffer[index]) guard byte & 0xC0 == 0x80 else { return (nil, index) } let byte3 = UInt32(byte & 0x3F) if byte1 & 0xF0 == 0xF0 { index += 1 let byte = UInt32(buffer[index]) guard byte & 0xC0 == 0x80 else { return (nil, index) } let byte4 = UInt32(byte & 0x3F) value = (byte1 & 0x7) << 18 + byte2 << 12 + byte3 << 6 + byte4 } else { value = (byte1 & 0xF) << 12 + byte2 << 6 + byte3 } } else { value = (byte1 & 0x1F) << 6 + byte2 } } else { value = byte1 & 0x7F } let unicodeScalar = Unicode.Scalar(value) return (unicodeScalar, index + 1) } /// return if the buffer is valid UTF8 func validateUTF8() -> Bool { var index = range.startIndex while index < range.endIndex { let (scalar, newIndex) = validateUTF8Character(at: index) guard scalar != nil else { return false } index = newIndex } return true } private static let asciiHexValues: [UInt8] = [ /* 00 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 08 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 10 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 18 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 20 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 28 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 30 */ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 38 */ 0x08, 0x09, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 40 */ 0x80, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x80, /* 48 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 50 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 58 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 60 */ 0x80, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x80, /* 68 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 70 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 78 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 80 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 88 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 90 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* 98 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* A0 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* A8 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* B0 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* B8 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* C0 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* C8 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* D0 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* D8 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* E0 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* E8 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* F0 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, /* F8 */ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, ] /// percent decode UTF8 func percentDecode() -> String? { struct DecodeError: Swift.Error {} func _percentDecode(_ original: ArraySlice, _ bytes: UnsafeMutableBufferPointer) throws -> Int { var newIndex = 0 var index = original.startIndex while index < original.endIndex { // if we have found a percent sign if original[index] == 0x25 { let high = Self.asciiHexValues[Int(original[index + 1])] let low = Self.asciiHexValues[Int(original[index + 2])] index += 3 if ((high | low) & 0x80) != 0 { throw DecodeError() } bytes[newIndex] = (high << 4) | low newIndex += 1 } else { bytes[newIndex] = original[index] newIndex += 1 index += 1 } } return newIndex } guard index != range.endIndex else { return "" } do { if #available(macOS 11, *) { return try String(unsafeUninitializedCapacity: range.endIndex - index) { bytes -> Int in try _percentDecode(self.buffer[self.index ..< range.endIndex], bytes) } } else { let newBuffer = try [UInt8].init(unsafeUninitializedCapacity: range.endIndex - index) { bytes, count in try count = _percentDecode(self.buffer[self.index ..< range.endIndex], bytes) } return makeString(newBuffer) } } catch { return nil } } } extension Unicode.Scalar { var isWhitespace: Bool { return properties.isWhitespace } var isNewline: Bool { switch value { case 0x000A ... 0x000D /* LF ... CR */: return true case 0x0085 /* NEXT LINE (NEL) */: return true case 0x2028 /* LINE SEPARATOR */: return true case 0x2029 /* PARAGRAPH SEPARATOR */: return true default: return false } } var isNumber: Bool { return properties.numericType != nil } var isLetter: Bool { return properties.isAlphabetic } var isLetterOrNumber: Bool { return isLetter || isNumber } } extension Set where Element == Unicode.Scalar { init(_ string: String) { self = Set(string.unicodeScalars) } }