Mercurial > core / rust/lib/sxp/src/read.rs
changeset 698: |
96958d3eb5b0 |
parent: |
3d78bed56188
|
author: |
Richard Westhaver <ellis@rwest.io> |
date: |
Fri, 04 Oct 2024 22:04:59 -0400 |
permissions: |
-rw-r--r-- |
description: |
fixes |
1 //! read.rs --- sxp reader 2 use crate::{err::ErrorCode, Error, Result}; 4 use core::{char, cmp, ops::Deref, str}; 6 #[cfg(feature = "std")] 8 #[cfg(feature = "std")] 9 use crate::iter::LineColIterator; 11 /// Trait used by the deserializer for iterating over input. This is manually 12 /// "specialized" for iterating over &[u8]. Once feature(specialization) is 13 /// stable we can use actual specialization. 14 // TODO 2023-07-14: should we use specialization anyways? we are on 15 // nightly 99% of the time for better or worse 18 fn next(&mut self) -> Result<Option<u8>>; 20 fn peek(&mut self) -> Result<Option<u8>>; 22 /// Only valid after a call to peek(). Discards the peeked byte. 24 fn discard(&mut self); 26 /// Position of the most recent call to next(). 28 /// The most recent call was probably next() and not peek(), but this method 29 /// should try to return a sensible result if the most recent call was 30 /// actually peek() because we don't always know. 32 /// Only called in case of an error, so performance is not important. 34 fn position(&self) -> Position; 36 /// Position of the most recent call to peek(). 38 /// The most recent call was probably peek() and not next(), but this method 39 /// should try to return a sensible result if the most recent call was 40 /// actually next() because we don't always know. 42 /// Only called in case of an error, so performance is not important. 44 fn peek_position(&self) -> Position; 46 /// Offset from the beginning of the input to the next byte that would be 47 /// returned by next() or peek(). 49 fn byte_offset(&self) -> usize; 51 /// Assumes the previous byte was a quotation mark. Parses an escaped 52 /// string until the next quotation mark using the given scratch space if 53 /// necessary. The scratch space is initially empty. 57 scratch: &'s mut Vec<u8>, 58 ) -> Result<Reference<'de, 's, str>>; 60 /// Assumes the previous byte was a quotation mark. 
Parses an escaped 61 /// string until the next quotation mark using the given scratch space if 62 /// necessary. The scratch space is initially empty. 64 /// This function returns the raw bytes in the string with escape sequences 65 /// expanded but without performing unicode validation. 69 scratch: &'s mut Vec<u8>, 70 ) -> Result<Reference<'de, 's, [u8]>>; 72 /// Assumes the previous byte was a quotation mark. Parses a 73 /// string until the next quotation mark but discards the data. 75 fn ignore_str(&mut self) -> Result<()>; 77 /// Assumes the previous byte was a hex escape sequence ('\u') in a string. 78 /// Parses next hexadecimal sequence. 80 fn decode_hex_escape(&mut self) -> Result<u16>; 82 /// Whether StreamDeserializer::next needs to check the failed flag. True 83 /// for IoRead, false for StrRead and SliceRead which can track failure by 84 /// truncating their input slice to avoid the extra check on every next 87 const SHOULD_EARLY_RETURN_IF_FAILED: bool; 89 /// Mark a persistent failure of StreamDeserializer, either by setting the 90 /// flag or by truncating the input data. 92 fn set_failed(&mut self, failed: &mut bool); 100 pub enum Reference<'b, 'c, T> 108 impl<'b, 'c, T> Deref for Reference<'b, 'c, T> 114 fn deref(&self) -> &Self::Target { 116 Reference::Borrowed(b) => b, 117 Reference::Copied(c) => c, 122 /// SXP input source that reads from a std::io input stream. 123 #[cfg(feature = "std")] 124 #[cfg_attr(docsrs, doc(cfg(feature = "std")))] 129 iter: LineColIterator<io::Bytes<R>>, 130 /// Temporary storage of peeked byte. 134 /// SXP input source that reads from a slice of bytes. 136 // This is more efficient than other iterators because peek() can be read-only 137 // and we can compute line/col position only if an error happens. 138 pub struct SliceRead<'a> { 140 /// Index of the *next* byte that will be returned by next() or peek(). 144 /// SXP input source that reads from a UTF-8 string. 
146 // Able to elide UTF-8 checks by assuming that the input is valid UTF-8. 147 pub struct StrRead<'a> { 148 delegate: SliceRead<'a>, 151 #[cfg(feature = "std")] 156 /// Create a SXP input source to read from a std::io input stream. 157 pub fn new(reader: R) -> Self { 159 iter: LineColIterator::new(reader.bytes()), 165 #[cfg(feature = "std")] 170 fn parse_str_bytes<'s, T, F>( 172 scratch: &'s mut Vec<u8>, 178 F: FnOnce(&'s Self, &'s [u8]) -> Result<T>, 181 let ch = e!(next_or_eof(self)); 182 if !ESCAPE[ch as usize] { 188 return result(self, scratch); 191 e!(parse_escape(self, validate, scratch)); 195 return error(self, ErrorCode::ControlCharacterWhileParsingString); 204 #[cfg(feature = "std")] 205 impl<'de, R> Read<'de> for IoRead<R> 210 fn next(&mut self) -> Result<Option<u8>> { 211 match self.ch.take() { 212 Some(ch) => Ok(Some(ch)), 213 None => match self.iter.next() { 214 Some(Err(err)) => Err(Error::io(err)), 215 Some(Ok(ch)) => Ok(Some(ch)), 222 fn peek(&mut self) -> Result<Option<u8>> { 224 Some(ch) => Ok(Some(ch)), 225 None => match self.iter.next() { 226 Some(Err(err)) => Err(Error::io(err)), 237 fn discard(&mut self) { 241 fn position(&self) -> Position { 243 line: self.iter.line(), 244 column: self.iter.col(), 248 fn peek_position(&self) -> Position { 249 // The LineColIterator updates its position during peek() so it has the 254 fn byte_offset(&self) -> usize { 256 Some(_) => self.iter.byte_offset() - 1, 257 None => self.iter.byte_offset(), 263 scratch: &'s mut Vec<u8>, 264 ) -> Result<Reference<'de, 's, str>> { 266 .parse_str_bytes(scratch, true, as_str) 267 .map(Reference::Copied) 270 fn parse_str_raw<'s>( 272 scratch: &'s mut Vec<u8>, 273 ) -> Result<Reference<'de, 's, [u8]>> { 275 .parse_str_bytes(scratch, false, |_, bytes| Ok(bytes)) 276 .map(Reference::Copied) 279 fn ignore_str(&mut self) -> Result<()> { 281 let ch = e!(next_or_eof(self)); 282 if !ESCAPE[ch as usize] { 290 e!(ignore_escape(self)); 293 return error(self, 
ErrorCode::ControlCharacterWhileParsingString); 299 fn decode_hex_escape(&mut self) -> Result<u16> { 302 match decode_hex_val(e!(next_or_eof(self))) { 303 None => return error(self, ErrorCode::InvalidEscape), 312 const SHOULD_EARLY_RETURN_IF_FAILED: bool = true; 316 fn set_failed(&mut self, failed: &mut bool) { 321 ////////////////////////////////////////////////////////////////////////////// 323 impl<'a> SliceRead<'a> { 324 /// Create a SXP input source to read from a slice of bytes. 325 pub fn new(slice: &'a [u8]) -> Self { 326 SliceRead { slice, index: 0 } 329 fn position_of_index(&self, i: usize) -> Position { 330 let mut position = Position { line: 1, column: 0 }; 331 for ch in &self.slice[..i] { 338 position.column += 1; 345 /// The big optimization here over IoRead is that if the string contains no 346 /// backslash escape sequences, the returned &str is a slice of the raw SXP 347 /// data so we avoid copying into the scratch space. 348 fn parse_str_bytes<'s, T, F>( 350 scratch: &'s mut Vec<u8>, 353 ) -> Result<Reference<'a, 's, T>> 356 F: for<'f> FnOnce(&'s Self, &'f [u8]) -> Result<&'f T>, 358 // Index of the first byte not yet copied into the scratch space. 
359 let mut start = self.index; 362 while self.index < self.slice.len() 363 && !ESCAPE[self.slice[self.index] as usize] 367 if self.index == self.slice.len() { 368 return error(self, ErrorCode::EofWhileParsingString); 370 match self.slice[self.index] { 372 if scratch.is_empty() { 373 // Fast path: return a slice of the raw SXP without any 375 let borrowed = &self.slice[start..self.index]; 377 return result(self, borrowed).map(Reference::Borrowed); 379 scratch.extend_from_slice(&self.slice[start..self.index]); 381 return result(self, scratch).map(Reference::Copied); 385 scratch.extend_from_slice(&self.slice[start..self.index]); 387 e!(parse_escape(self, validate, scratch)); 393 return error(self, ErrorCode::ControlCharacterWhileParsingString); 401 impl<'a> Read<'a> for SliceRead<'a> { 403 fn next(&mut self) -> Result<Option<u8>> { 404 // `Ok(self.slice.get(self.index).map(|ch| { self.index += 1; *ch }))` 405 // is about 10% slower. 406 Ok(if self.index < self.slice.len() { 407 let ch = self.slice[self.index]; 416 fn peek(&mut self) -> Result<Option<u8>> { 417 // `Ok(self.slice.get(self.index).map(|ch| *ch))` is about 10% slower 419 Ok(if self.index < self.slice.len() { 420 Some(self.slice[self.index]) 427 fn discard(&mut self) { 431 fn position(&self) -> Position { 432 self.position_of_index(self.index) 435 fn peek_position(&self) -> Position { 436 // Cap it at slice.len() just in case the most recent call was next() 437 // and it returned the last byte. 
438 self.position_of_index(cmp::min(self.slice.len(), self.index + 1)) 441 fn byte_offset(&self) -> usize { 447 scratch: &'s mut Vec<u8>, 448 ) -> Result<Reference<'a, 's, str>> { 449 self.parse_str_bytes(scratch, true, as_str) 452 fn parse_str_raw<'s>( 454 scratch: &'s mut Vec<u8>, 455 ) -> Result<Reference<'a, 's, [u8]>> { 456 self.parse_str_bytes(scratch, false, |_, bytes| Ok(bytes)) 459 fn ignore_str(&mut self) -> Result<()> { 461 while self.index < self.slice.len() 462 && !ESCAPE[self.slice[self.index] as usize] 466 if self.index == self.slice.len() { 467 return error(self, ErrorCode::EofWhileParsingString); 469 match self.slice[self.index] { 476 e!(ignore_escape(self)); 479 return error(self, ErrorCode::ControlCharacterWhileParsingString); 485 fn decode_hex_escape(&mut self) -> Result<u16> { 486 if self.index + 4 > self.slice.len() { 487 self.index = self.slice.len(); 488 return error(self, ErrorCode::EofWhileParsingString); 493 let ch = decode_hex_val(self.slice[self.index]); 496 None => return error(self, ErrorCode::InvalidEscape), 505 const SHOULD_EARLY_RETURN_IF_FAILED: bool = false; 509 fn set_failed(&mut self, _failed: &mut bool) { 510 self.slice = &self.slice[..self.index]; 514 ////////////////////////////////////////////////////////////////////////////// 516 impl<'a> StrRead<'a> { 517 /// Create a SXP input source to read from a UTF-8 string. 
518 pub fn new(s: &'a str) -> Self { 520 delegate: SliceRead::new(s.as_bytes()), 525 impl<'a> Read<'a> for StrRead<'a> { 527 fn next(&mut self) -> Result<Option<u8>> { 532 fn peek(&mut self) -> Result<Option<u8>> { 537 fn discard(&mut self) { 538 self.delegate.discard(); 541 fn position(&self) -> Position { 542 self.delegate.position() 545 fn peek_position(&self) -> Position { 546 self.delegate.peek_position() 549 fn byte_offset(&self) -> usize { 550 self.delegate.byte_offset() 555 scratch: &'s mut Vec<u8>, 556 ) -> Result<Reference<'a, 's, str>> { 557 self.delegate.parse_str_bytes(scratch, true, |_, bytes| { 558 // The deserialization input came in as &str with a UTF-8 guarantee, 559 // and the \u-escapes are checked along the way, so don't need to 561 Ok(unsafe { str::from_utf8_unchecked(bytes) }) 565 fn parse_str_raw<'s>( 567 scratch: &'s mut Vec<u8>, 568 ) -> Result<Reference<'a, 's, [u8]>> { 569 self.delegate.parse_str_raw(scratch) 572 fn ignore_str(&mut self) -> Result<()> { 573 self.delegate.ignore_str() 576 fn decode_hex_escape(&mut self) -> Result<u16> { 577 self.delegate.decode_hex_escape() 580 const SHOULD_EARLY_RETURN_IF_FAILED: bool = false; 584 fn set_failed(&mut self, failed: &mut bool) { 585 self.delegate.set_failed(failed); 589 ////////////////////////////////////////////////////////////////////////////// 591 impl<'a, 'de, R> Read<'de> for &'a mut R 595 fn next(&mut self) -> Result<Option<u8>> { 599 fn peek(&mut self) -> Result<Option<u8>> { 603 fn discard(&mut self) { 607 fn position(&self) -> Position { 611 fn peek_position(&self) -> Position { 612 R::peek_position(self) 615 fn byte_offset(&self) -> usize { 621 scratch: &'s mut Vec<u8>, 622 ) -> Result<Reference<'de, 's, str>> { 623 R::parse_str(self, scratch) 626 fn parse_str_raw<'s>( 628 scratch: &'s mut Vec<u8>, 629 ) -> Result<Reference<'de, 's, [u8]>> { 630 R::parse_str_raw(self, scratch) 633 fn ignore_str(&mut self) -> Result<()> { 637 fn decode_hex_escape(&mut self) -> Result<u16> { 638 
R::decode_hex_escape(self) 641 const SHOULD_EARLY_RETURN_IF_FAILED: bool = R::SHOULD_EARLY_RETURN_IF_FAILED; 643 fn set_failed(&mut self, failed: &mut bool) { 644 R::set_failed(self, failed); 648 /// Marker for whether StreamDeserializer can implement FusedIterator. 650 impl<'a> Fused for SliceRead<'a> {} 651 impl<'a> Fused for StrRead<'a> {} 653 // Lookup table of bytes that must be escaped. A value of true at index i means 654 // that byte i requires an escape sequence in the input. 655 static ESCAPE: [bool; 256] = { 656 const CT: bool = true; // control character \x00..=\x1F 657 const QU: bool = true; // quote \x22 658 const BS: bool = true; // backslash \x5C 659 const __: bool = false; // allow unescaped 661 // 1 2 3 4 5 6 7 8 9 A B C D E F 662 CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 0 663 CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, CT, // 1 664 __, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2 665 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3 666 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4 667 __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5 668 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6 669 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7 670 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8 671 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9 672 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A 673 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B 674 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C 675 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D 676 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E 677 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F 681 fn next_or_eof<'de, R>(read: &mut R) -> Result<u8> 683 R: ?Sized + 
Read<'de>, 685 match e!(read.next()) { 687 None => error(read, ErrorCode::EofWhileParsingString), 691 fn peek_or_eof<'de, R>(read: &mut R) -> Result<u8> 693 R: ?Sized + Read<'de>, 695 match e!(read.peek()) { 697 None => error(read, ErrorCode::EofWhileParsingString), 701 fn error<'de, R, T>(read: &R, reason: ErrorCode) -> Result<T> 703 R: ?Sized + Read<'de>, 705 let position = read.position(); 706 Err(Error::syntax(reason, position.line, position.column)) 709 fn as_str<'de, 's, R: Read<'de>>(read: &R, slice: &'s [u8]) -> Result<&'s str> { 710 str::from_utf8(slice) 711 .or_else(|_| error(read, ErrorCode::InvalidUnicodeCodePoint)) 714 /// Parses a SXP escape sequence and appends it into the scratch space. Assumes 715 /// the previous byte read was a backslash. 716 fn parse_escape<'de, R: Read<'de>>( 719 scratch: &mut Vec<u8>, 721 let ch = e!(next_or_eof(read)); 724 b'"' => scratch.push(b'"'), 725 b'\\' => scratch.push(b'\\'), 726 b'/' => scratch.push(b'/'), 727 b'b' => scratch.push(b'\x08'), 728 b'f' => scratch.push(b'\x0c'), 729 b'n' => scratch.push(b'\n'), 730 b'r' => scratch.push(b'\r'), 731 b't' => scratch.push(b'\t'), 733 fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) { 734 scratch.extend_from_slice(&[ 735 (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000, 736 (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000, 737 (n & 0b0011_1111) as u8 | 0b1000_0000, 741 let c = match e!(read.decode_hex_escape()) { 742 n @ 0xDC00..=0xDFFF => { 744 error(read, ErrorCode::LoneLeadingSurrogateInHexEscape) 746 encode_surrogate(scratch, n); 751 // Non-BMP characters are encoded as a sequence of two hex 752 // escapes, representing UTF-16 surrogates. If deserializing a 753 // utf-8 string the surrogates are required to be paired, 754 // whereas deserializing a byte string accepts lone surrogates. 
755 n1 @ 0xD800..=0xDBFF => { 756 if e!(peek_or_eof(read)) == b'\\' { 761 error(read, ErrorCode::UnexpectedEndOfHexEscape) 763 encode_surrogate(scratch, n1); 768 if e!(peek_or_eof(read)) == b'u' { 773 error(read, ErrorCode::UnexpectedEndOfHexEscape) 775 encode_surrogate(scratch, n1); 776 // The \ prior to this byte started an escape sequence, 777 // so we need to parse that now. This recursive call 778 // does not blow the stack on malicious input because 779 // the escape is not \u, so it will be handled by one 780 // of the easy nonrecursive cases. 781 parse_escape(read, validate, scratch) 785 let n2 = e!(read.decode_hex_escape()); 787 if !(0xDC00..=0xDFFF).contains(&n2) { 788 return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); 792 (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; 794 match char::from_u32(n) { 797 return error(read, ErrorCode::InvalidUnicodeCodePoint); 802 // Every u16 outside of the surrogate ranges above is guaranteed 803 // to be a legal char. 804 n => char::from_u32(n as u32).unwrap(), 807 scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes()); 810 return error(read, ErrorCode::InvalidEscape); 817 /// Parses a SXP escape sequence and discards the value. Assumes the previous 818 /// byte read was a backslash. 819 fn ignore_escape<'de, R>(read: &mut R) -> Result<()> 821 R: ?Sized + Read<'de>, 823 let ch = e!(next_or_eof(read)); 826 b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => {} 828 // At this point we don't care if the codepoint is valid. We just 829 // want to consume it. 
We don't actually know what is valid or not 830 // at this point, because that depends on if this string will 831 // ultimately be parsed into a string or a byte buffer in the "real" 834 e!(read.decode_hex_escape()); 837 return error(read, ErrorCode::InvalidEscape); 844 static HEX: [u8; 256] = { 845 const __: u8 = 255; // not a hex digit 847 // 1 2 3 4 5 6 7 8 9 A B C D E F 848 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 0 849 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 1 850 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2 851 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, __, __, __, __, __, __, // 3 852 __, 10, 11, 12, 13, 14, 15, __, __, __, __, __, __, __, __, __, // 4 853 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 5 854 __, 10, 11, 12, 13, 14, 15, __, __, __, __, __, __, __, __, __, // 6 855 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7 856 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8 857 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9 858 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A 859 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B 860 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C 861 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D 862 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E 863 __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F 867 fn decode_hex_val(val: u8) -> Option<u16> { 868 let n = HEX[val as usize] as u16;