changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / rust/lib/sxp/src/read.rs

changeset 698: 96958d3eb5b0
parent: 3d78bed56188
author: Richard Westhaver <ellis@rwest.io>
date: Fri, 04 Oct 2024 22:04:59 -0400
permissions: -rw-r--r--
description: fixes
1 //! read.rs --- sxp reader
2 use crate::{err::ErrorCode, Error, Result};
3 use alloc::vec::Vec;
4 use core::{char, cmp, ops::Deref, str};
5 
6 #[cfg(feature = "std")]
7 use crate::io;
8 #[cfg(feature = "std")]
9 use crate::iter::LineColIterator;
10 
11 /// Trait used by the deserializer for iterating over input. This is manually
12 /// "specialized" for iterating over &[u8]. Once feature(specialization) is
13 /// stable we can use actual specialization.
14 // TODO 2023-07-14: should we use specialization anyways? we are on
15 // nightly 99% of the time for better or worse
16 pub trait Read<'de> {
 /// Consumes and returns the next input byte. The implementations in this
 /// file return `Ok(None)` once the input is exhausted.
17  #[doc(hidden)]
18  fn next(&mut self) -> Result<Option<u8>>;
 /// Returns the next input byte without consuming it. The implementations
 /// in this file return `Ok(None)` once the input is exhausted.
19  #[doc(hidden)]
20  fn peek(&mut self) -> Result<Option<u8>>;
21 
22  /// Only valid after a call to peek(). Discards the peeked byte.
23  #[doc(hidden)]
24  fn discard(&mut self);
25 
26  /// Position of the most recent call to next().
27  ///
28  /// The most recent call was probably next() and not peek(), but this method
29  /// should try to return a sensible result if the most recent call was
30  /// actually peek() because we don't always know.
31  ///
32  /// Only called in case of an error, so performance is not important.
33  #[doc(hidden)]
34  fn position(&self) -> Position;
35 
36  /// Position of the most recent call to peek().
37  ///
38  /// The most recent call was probably peek() and not next(), but this method
39  /// should try to return a sensible result if the most recent call was
40  /// actually next() because we don't always know.
41  ///
42  /// Only called in case of an error, so performance is not important.
43  #[doc(hidden)]
44  fn peek_position(&self) -> Position;
45 
46  /// Offset from the beginning of the input to the next byte that would be
47  /// returned by next() or peek().
48  #[doc(hidden)]
49  fn byte_offset(&self) -> usize;
50 
51  /// Assumes the previous byte was a quotation mark. Parses an escaped
52  /// string until the next quotation mark using the given scratch space if
53  /// necessary. The scratch space is initially empty.
54  #[doc(hidden)]
55  fn parse_str<'s>(
56  &'s mut self,
57  scratch: &'s mut Vec<u8>,
58  ) -> Result<Reference<'de, 's, str>>;
59 
60  /// Assumes the previous byte was a quotation mark. Parses an escaped
61  /// string until the next quotation mark using the given scratch space if
62  /// necessary. The scratch space is initially empty.
63  ///
64  /// This function returns the raw bytes in the string with escape sequences
65  /// expanded but without performing unicode validation.
66  #[doc(hidden)]
67  fn parse_str_raw<'s>(
68  &'s mut self,
69  scratch: &'s mut Vec<u8>,
70  ) -> Result<Reference<'de, 's, [u8]>>;
71 
72  /// Assumes the previous byte was a quotation mark. Parses a
73  /// string until the next quotation mark but discards the data.
74  #[doc(hidden)]
75  fn ignore_str(&mut self) -> Result<()>;
76 
77  /// Assumes the previous bytes were the start of a hex escape sequence
78  /// ('\u') in a string. Parses the four hexadecimal digits that follow.
79  #[doc(hidden)]
80  fn decode_hex_escape(&mut self) -> Result<u16>;
81 
82  /// Whether StreamDeserializer::next needs to check the failed flag. True
83  /// for IoRead, false for StrRead and SliceRead which can track failure by
84  /// truncating their input slice to avoid the extra check on every next
85  /// call.
86  #[doc(hidden)]
87  const SHOULD_EARLY_RETURN_IF_FAILED: bool;
88 
89  /// Mark a persistent failure of StreamDeserializer, either by setting the
90  /// flag or by truncating the input data.
91  #[doc(hidden)]
92  fn set_failed(&mut self, failed: &mut bool);
93 }
94 
/// Line/column location within the input, used when reporting syntax errors.
///
/// The common traits are derived so a position can be debug-printed, copied
/// and compared by callers and tests.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Position {
  /// Line number. `SliceRead` starts counting at 1 and adds one for every
  /// `\n` byte seen so far.
  pub line: usize,
  /// Column within the line. `SliceRead` counts the bytes that follow the
  /// most recent `\n` (or the start of the input).
  pub column: usize,
}
99 
/// A reference to parsed data that either points straight into the input
/// (`Borrowed`, lifetime `'b`) or into some shorter-lived buffer such as the
/// scratch space (`Copied`, lifetime `'c`).
pub enum Reference<'b, 'c, T>
where
  T: ?Sized + 'static,
{
  /// Data borrowed for lifetime `'b`.
  Borrowed(&'b T),
  /// Data borrowed for lifetime `'c`.
  Copied(&'c T),
}

impl<'b, 'c, T> Deref for Reference<'b, 'c, T>
where
  T: ?Sized + 'static,
{
  type Target = T;

  /// Yields the underlying `&T` regardless of which variant holds it.
  fn deref(&self) -> &Self::Target {
    match self {
      Self::Borrowed(from_input) => *from_input,
      Self::Copied(from_buffer) => *from_buffer,
    }
  }
}
121 
122 /// SXP input source that reads from a std::io input stream.
123 #[cfg(feature = "std")]
124 #[cfg_attr(docsrs, doc(cfg(feature = "std")))]
125 pub struct IoRead<R>
126 where
127  R: io::Read,
128 {
 /// Byte iterator over the reader. It is also queried for the line, column
 /// and byte offset reported by position() and byte_offset().
129  iter: LineColIterator<io::Bytes<R>>,
130  /// Temporary storage of the byte fetched by peek() that has not yet been
 /// consumed by next() or dropped by discard().
131  ch: Option<u8>,
132 }
133 
134 /// SXP input source that reads from a slice of bytes.
135 //
136 // This is more efficient than other iterators because peek() can be read-only
137 // and we can compute line/col position only if an error happens.
138 pub struct SliceRead<'a> {
 /// The input bytes. set_failed() truncates this to the bytes consumed so
 /// far, so later reads see end of input.
139  slice: &'a [u8],
140  /// Index of the *next* byte that will be returned by next() or peek().
141  index: usize,
142 }
143 
144 /// SXP input source that reads from a UTF-8 string.
145 //
146 // Able to elide UTF-8 checks by assuming that the input is valid UTF-8.
147 pub struct StrRead<'a> {
 /// Byte-level reader over the string's bytes; every `Read` method of
 /// `StrRead` forwards to it.
148  delegate: SliceRead<'a>,
149 }
150 
#[cfg(feature = "std")]
impl<R> IoRead<R>
where
  R: io::Read,
{
  /// Create a SXP input source to read from a std::io input stream.
  ///
  /// The reader is consumed byte by byte; no byte is peeked initially.
  pub fn new(reader: R) -> Self {
    let iter = LineColIterator::new(reader.bytes());
    Self { iter, ch: None }
  }
}
164 
#[cfg(feature = "std")]
impl<R> IoRead<R>
where
  R: io::Read,
{
  /// Reads string content up to the closing quotation mark, appending the
  /// unescaped bytes to `scratch`, then hands `scratch` to `result` to build
  /// the return value.
  ///
  /// When `validate` is true, a raw control character inside the string is
  /// rejected with `ControlCharacterWhileParsingString`; otherwise it is
  /// copied through like any other byte. `validate` is also forwarded to
  /// `parse_escape` for backslash sequences.
  fn parse_str_bytes<'s, T, F>(
    &'s mut self,
    scratch: &'s mut Vec<u8>,
    validate: bool,
    result: F,
  ) -> Result<T>
  where
    T: 's,
    F: FnOnce(&'s Self, &'s [u8]) -> Result<T>,
  {
    loop {
      let byte = e!(next_or_eof(self));
      match byte {
        // Closing quote: the string is complete.
        b'"' => return result(self, scratch),
        b'\\' => e!(parse_escape(self, validate, scratch)),
        _ => {
          // The only other bytes flagged in ESCAPE are control characters.
          if validate && ESCAPE[byte as usize] {
            return error(self, ErrorCode::ControlCharacterWhileParsingString);
          }
          scratch.push(byte);
        }
      }
    }
  }
}
203 
#[cfg(feature = "std")]
impl<'de, R> Read<'de> for IoRead<R>
where
  R: io::Read,
{
  /// Returns the pending peeked byte if there is one; otherwise pulls the
  /// next byte from the underlying iterator. I/O failures are wrapped with
  /// `Error::io`, and `Ok(None)` means the iterator is exhausted.
  #[inline]
  fn next(&mut self) -> Result<Option<u8>> {
    if let Some(peeked) = self.ch.take() {
      return Ok(Some(peeked));
    }
    match self.iter.next() {
      Some(Ok(byte)) => Ok(Some(byte)),
      Some(Err(err)) => Err(Error::io(err)),
      None => Ok(None),
    }
  }

  /// Returns the next byte without consuming it, fetching and caching it in
  /// `self.ch` when nothing is cached yet.
  #[inline]
  fn peek(&mut self) -> Result<Option<u8>> {
    if self.ch.is_none() {
      match self.iter.next() {
        Some(Ok(byte)) => self.ch = Some(byte),
        Some(Err(err)) => return Err(Error::io(err)),
        None => return Ok(None),
      }
    }
    Ok(self.ch)
  }

  /// Drops the cached peeked byte.
  #[inline]
  fn discard(&mut self) {
    self.ch = None;
  }

  /// Line and column as tracked by the underlying `LineColIterator`.
  fn position(&self) -> Position {
    let line = self.iter.line();
    let column = self.iter.col();
    Position { line, column }
  }

  fn peek_position(&self) -> Position {
    // peek() already advanced the LineColIterator, so its current position
    // is the position of the peeked byte.
    self.position()
  }

  /// Offset of the next byte to be returned. A cached peeked byte has
  /// already been pulled from the iterator, so it is subtracted again.
  fn byte_offset(&self) -> usize {
    let pulled = self.iter.byte_offset();
    if self.ch.is_some() {
      pulled - 1
    } else {
      pulled
    }
  }

  /// Parses a string with validation enabled; the result always lives in
  /// `scratch`, hence `Reference::Copied`.
  fn parse_str<'s>(
    &'s mut self,
    scratch: &'s mut Vec<u8>,
  ) -> Result<Reference<'de, 's, str>> {
    let text = e!(self.parse_str_bytes(scratch, true, as_str));
    Ok(Reference::Copied(text))
  }

  /// Parses a string without validation and returns the raw bytes held in
  /// `scratch`.
  fn parse_str_raw<'s>(
    &'s mut self,
    scratch: &'s mut Vec<u8>,
  ) -> Result<Reference<'de, 's, [u8]>> {
    let raw = e!(self.parse_str_bytes(scratch, false, |_, bytes| Ok(bytes)));
    Ok(Reference::Copied(raw))
  }

  /// Skips over a string up to and including its closing quotation mark.
  fn ignore_str(&mut self) -> Result<()> {
    loop {
      match e!(next_or_eof(self)) {
        b'"' => return Ok(()),
        b'\\' => e!(ignore_escape(self)),
        // Remaining bytes flagged in ESCAPE are control characters.
        ctrl if ESCAPE[ctrl as usize] => {
          return error(self, ErrorCode::ControlCharacterWhileParsingString);
        }
        _ => {}
      }
    }
  }

  /// Reads four hexadecimal digits and folds them into a `u16`, most
  /// significant digit first. A non-hex byte yields `InvalidEscape`.
  fn decode_hex_escape(&mut self) -> Result<u16> {
    let mut value = 0;
    for _ in 0..4 {
      let digit = match decode_hex_val(e!(next_or_eof(self))) {
        Some(digit) => digit,
        None => return error(self, ErrorCode::InvalidEscape),
      };
      value = (value << 4) + digit;
    }
    Ok(value)
  }

  const SHOULD_EARLY_RETURN_IF_FAILED: bool = true;

  /// Records the failure by raising the caller's flag.
  #[inline]
  #[cold]
  fn set_failed(&mut self, failed: &mut bool) {
    *failed = true;
  }
}
320 
321 //////////////////////////////////////////////////////////////////////////////
322 
323 impl<'a> SliceRead<'a> {
324  /// Create a SXP input source to read from a slice of bytes.
325  pub fn new(slice: &'a [u8]) -> Self {
326  SliceRead { slice, index: 0 }
327  }
328 
329  fn position_of_index(&self, i: usize) -> Position {
330  let mut position = Position { line: 1, column: 0 };
331  for ch in &self.slice[..i] {
332  match *ch {
333  b'\n' => {
334  position.line += 1;
335  position.column = 0;
336  }
337  _ => {
338  position.column += 1;
339  }
340  }
341  }
342  position
343  }
344 
345  /// The big optimization here over IoRead is that if the string contains no
346  /// backslash escape sequences, the returned &str is a slice of the raw SXP
347  /// data so we avoid copying into the scratch space.
348  fn parse_str_bytes<'s, T, F>(
349  &'s mut self,
350  scratch: &'s mut Vec<u8>,
351  validate: bool,
352  result: F,
353  ) -> Result<Reference<'a, 's, T>>
354  where
355  T: ?Sized + 's,
356  F: for<'f> FnOnce(&'s Self, &'f [u8]) -> Result<&'f T>,
357  {
358  // Index of the first byte not yet copied into the scratch space.
359  let mut start = self.index;
360 
361  loop {
362  while self.index < self.slice.len()
363  && !ESCAPE[self.slice[self.index] as usize]
364  {
365  self.index += 1;
366  }
367  if self.index == self.slice.len() {
368  return error(self, ErrorCode::EofWhileParsingString);
369  }
370  match self.slice[self.index] {
371  b'"' => {
372  if scratch.is_empty() {
373  // Fast path: return a slice of the raw SXP without any
374  // copying.
375  let borrowed = &self.slice[start..self.index];
376  self.index += 1;
377  return result(self, borrowed).map(Reference::Borrowed);
378  } else {
379  scratch.extend_from_slice(&self.slice[start..self.index]);
380  self.index += 1;
381  return result(self, scratch).map(Reference::Copied);
382  }
383  }
384  b'\\' => {
385  scratch.extend_from_slice(&self.slice[start..self.index]);
386  self.index += 1;
387  e!(parse_escape(self, validate, scratch));
388  start = self.index;
389  }
390  _ => {
391  self.index += 1;
392  if validate {
393  return error(self, ErrorCode::ControlCharacterWhileParsingString);
394  }
395  }
396  }
397  }
398  }
399 }
400 
401 impl<'a> Read<'a> for SliceRead<'a> {
402  #[inline]
403  fn next(&mut self) -> Result<Option<u8>> {
404  // `Ok(self.slice.get(self.index).map(|ch| { self.index += 1; *ch }))`
405  // is about 10% slower.
406  Ok(if self.index < self.slice.len() {
407  let ch = self.slice[self.index];
408  self.index += 1;
409  Some(ch)
410  } else {
411  None
412  })
413  }
414 
415  #[inline]
416  fn peek(&mut self) -> Result<Option<u8>> {
417  // `Ok(self.slice.get(self.index).map(|ch| *ch))` is about 10% slower
418  // for some reason.
419  Ok(if self.index < self.slice.len() {
420  Some(self.slice[self.index])
421  } else {
422  None
423  })
424  }
425 
426  #[inline]
427  fn discard(&mut self) {
428  self.index += 1;
429  }
430 
431  fn position(&self) -> Position {
432  self.position_of_index(self.index)
433  }
434 
435  fn peek_position(&self) -> Position {
436  // Cap it at slice.len() just in case the most recent call was next()
437  // and it returned the last byte.
438  self.position_of_index(cmp::min(self.slice.len(), self.index + 1))
439  }
440 
441  fn byte_offset(&self) -> usize {
442  self.index
443  }
444 
445  fn parse_str<'s>(
446  &'s mut self,
447  scratch: &'s mut Vec<u8>,
448  ) -> Result<Reference<'a, 's, str>> {
449  self.parse_str_bytes(scratch, true, as_str)
450  }
451 
452  fn parse_str_raw<'s>(
453  &'s mut self,
454  scratch: &'s mut Vec<u8>,
455  ) -> Result<Reference<'a, 's, [u8]>> {
456  self.parse_str_bytes(scratch, false, |_, bytes| Ok(bytes))
457  }
458 
459  fn ignore_str(&mut self) -> Result<()> {
460  loop {
461  while self.index < self.slice.len()
462  && !ESCAPE[self.slice[self.index] as usize]
463  {
464  self.index += 1;
465  }
466  if self.index == self.slice.len() {
467  return error(self, ErrorCode::EofWhileParsingString);
468  }
469  match self.slice[self.index] {
470  b'"' => {
471  self.index += 1;
472  return Ok(());
473  }
474  b'\\' => {
475  self.index += 1;
476  e!(ignore_escape(self));
477  }
478  _ => {
479  return error(self, ErrorCode::ControlCharacterWhileParsingString);
480  }
481  }
482  }
483  }
484 
485  fn decode_hex_escape(&mut self) -> Result<u16> {
486  if self.index + 4 > self.slice.len() {
487  self.index = self.slice.len();
488  return error(self, ErrorCode::EofWhileParsingString);
489  }
490 
491  let mut n = 0;
492  for _ in 0..4 {
493  let ch = decode_hex_val(self.slice[self.index]);
494  self.index += 1;
495  match ch {
496  None => return error(self, ErrorCode::InvalidEscape),
497  Some(val) => {
498  n = (n << 4) + val;
499  }
500  }
501  }
502  Ok(n)
503  }
504 
505  const SHOULD_EARLY_RETURN_IF_FAILED: bool = false;
506 
507  #[inline]
508  #[cold]
509  fn set_failed(&mut self, _failed: &mut bool) {
510  self.slice = &self.slice[..self.index];
511  }
512 }
513 
514 //////////////////////////////////////////////////////////////////////////////
515 
516 impl<'a> StrRead<'a> {
517  /// Create a SXP input source to read from a UTF-8 string.
518  pub fn new(s: &'a str) -> Self {
519  StrRead {
520  delegate: SliceRead::new(s.as_bytes()),
521  }
522  }
523 }
524 
525 impl<'a> Read<'a> for StrRead<'a> {
526  #[inline]
527  fn next(&mut self) -> Result<Option<u8>> {
528  self.delegate.next()
529  }
530 
531  #[inline]
532  fn peek(&mut self) -> Result<Option<u8>> {
533  self.delegate.peek()
534  }
535 
536  #[inline]
537  fn discard(&mut self) {
538  self.delegate.discard();
539  }
540 
541  fn position(&self) -> Position {
542  self.delegate.position()
543  }
544 
545  fn peek_position(&self) -> Position {
546  self.delegate.peek_position()
547  }
548 
549  fn byte_offset(&self) -> usize {
550  self.delegate.byte_offset()
551  }
552 
553  fn parse_str<'s>(
554  &'s mut self,
555  scratch: &'s mut Vec<u8>,
556  ) -> Result<Reference<'a, 's, str>> {
557  self.delegate.parse_str_bytes(scratch, true, |_, bytes| {
558  // The deserialization input came in as &str with a UTF-8 guarantee,
559  // and the \u-escapes are checked along the way, so don't need to
560  // check here.
561  Ok(unsafe { str::from_utf8_unchecked(bytes) })
562  })
563  }
564 
565  fn parse_str_raw<'s>(
566  &'s mut self,
567  scratch: &'s mut Vec<u8>,
568  ) -> Result<Reference<'a, 's, [u8]>> {
569  self.delegate.parse_str_raw(scratch)
570  }
571 
572  fn ignore_str(&mut self) -> Result<()> {
573  self.delegate.ignore_str()
574  }
575 
576  fn decode_hex_escape(&mut self) -> Result<u16> {
577  self.delegate.decode_hex_escape()
578  }
579 
580  const SHOULD_EARLY_RETURN_IF_FAILED: bool = false;
581 
582  #[inline]
583  #[cold]
584  fn set_failed(&mut self, failed: &mut bool) {
585  self.delegate.set_failed(failed);
586  }
587 }
588 
589 //////////////////////////////////////////////////////////////////////////////
590 
591 impl<'a, 'de, R> Read<'de> for &'a mut R
592 where
593  R: Read<'de>,
594 {
595  fn next(&mut self) -> Result<Option<u8>> {
596  R::next(self)
597  }
598 
599  fn peek(&mut self) -> Result<Option<u8>> {
600  R::peek(self)
601  }
602 
603  fn discard(&mut self) {
604  R::discard(self);
605  }
606 
607  fn position(&self) -> Position {
608  R::position(self)
609  }
610 
611  fn peek_position(&self) -> Position {
612  R::peek_position(self)
613  }
614 
615  fn byte_offset(&self) -> usize {
616  R::byte_offset(self)
617  }
618 
619  fn parse_str<'s>(
620  &'s mut self,
621  scratch: &'s mut Vec<u8>,
622  ) -> Result<Reference<'de, 's, str>> {
623  R::parse_str(self, scratch)
624  }
625 
626  fn parse_str_raw<'s>(
627  &'s mut self,
628  scratch: &'s mut Vec<u8>,
629  ) -> Result<Reference<'de, 's, [u8]>> {
630  R::parse_str_raw(self, scratch)
631  }
632 
633  fn ignore_str(&mut self) -> Result<()> {
634  R::ignore_str(self)
635  }
636 
637  fn decode_hex_escape(&mut self) -> Result<u16> {
638  R::decode_hex_escape(self)
639  }
640 
641  const SHOULD_EARLY_RETURN_IF_FAILED: bool = R::SHOULD_EARLY_RETURN_IF_FAILED;
642 
643  fn set_failed(&mut self, failed: &mut bool) {
644  R::set_failed(self, failed);
645  }
646 }
647 
648 /// Marker for whether StreamDeserializer can implement FusedIterator.
 // Implemented below for the two in-memory readers only; no impl for IoRead
 // appears in this file.
649 pub trait Fused {}
650 impl<'a> Fused for SliceRead<'a> {}
651 impl<'a> Fused for StrRead<'a> {}
652 
// Lookup table of bytes that must be escaped. A value of true at index i means
// that byte i requires an escape sequence in the input.
//
// Flagged bytes: the control characters \x00..=\x1F, the quote \x22 and the
// backslash \x5C. Every other byte is allowed unescaped.
static ESCAPE: [bool; 256] = {
  let mut table = [false; 256];

  // Control characters \x00..=\x1F.
  let mut ctrl = 0;
  while ctrl < 0x20 {
    table[ctrl] = true;
    ctrl += 1;
  }

  table[b'"' as usize] = true; // quote \x22
  table[b'\\' as usize] = true; // backslash \x5C
  table
};
680 
681 fn next_or_eof<'de, R>(read: &mut R) -> Result<u8>
682 where
683  R: ?Sized + Read<'de>,
684 {
685  match e!(read.next()) {
686  Some(b) => Ok(b),
687  None => error(read, ErrorCode::EofWhileParsingString),
688  }
689 }
690 
691 fn peek_or_eof<'de, R>(read: &mut R) -> Result<u8>
692 where
693  R: ?Sized + Read<'de>,
694 {
695  match e!(read.peek()) {
696  Some(b) => Ok(b),
697  None => error(read, ErrorCode::EofWhileParsingString),
698  }
699 }
700 
701 fn error<'de, R, T>(read: &R, reason: ErrorCode) -> Result<T>
702 where
703  R: ?Sized + Read<'de>,
704 {
705  let position = read.position();
706  Err(Error::syntax(reason, position.line, position.column))
707 }
708 
709 fn as_str<'de, 's, R: Read<'de>>(read: &R, slice: &'s [u8]) -> Result<&'s str> {
710  str::from_utf8(slice)
711  .or_else(|_| error(read, ErrorCode::InvalidUnicodeCodePoint))
712 }
713 
714 /// Parses a SXP escape sequence and appends it into the scratch space. Assumes
715 /// the previous byte read was a backslash.
716 fn parse_escape<'de, R: Read<'de>>(
717  read: &mut R,
718  validate: bool,
719  scratch: &mut Vec<u8>,
720 ) -> Result<()> {
721  let ch = e!(next_or_eof(read));
722 
723  match ch {
724  b'"' => scratch.push(b'"'),
725  b'\\' => scratch.push(b'\\'),
726  b'/' => scratch.push(b'/'),
727  b'b' => scratch.push(b'\x08'),
728  b'f' => scratch.push(b'\x0c'),
729  b'n' => scratch.push(b'\n'),
730  b'r' => scratch.push(b'\r'),
731  b't' => scratch.push(b'\t'),
732  b'u' => {
733  fn encode_surrogate(scratch: &mut Vec<u8>, n: u16) {
734  scratch.extend_from_slice(&[
735  (n >> 12 & 0b0000_1111) as u8 | 0b1110_0000,
736  (n >> 6 & 0b0011_1111) as u8 | 0b1000_0000,
737  (n & 0b0011_1111) as u8 | 0b1000_0000,
738  ]);
739  }
740 
741  let c = match e!(read.decode_hex_escape()) {
742  n @ 0xDC00..=0xDFFF => {
743  return if validate {
744  error(read, ErrorCode::LoneLeadingSurrogateInHexEscape)
745  } else {
746  encode_surrogate(scratch, n);
747  Ok(())
748  };
749  }
750 
751  // Non-BMP characters are encoded as a sequence of two hex
752  // escapes, representing UTF-16 surrogates. If deserializing a
753  // utf-8 string the surrogates are required to be paired,
754  // whereas deserializing a byte string accepts lone surrogates.
755  n1 @ 0xD800..=0xDBFF => {
756  if e!(peek_or_eof(read)) == b'\\' {
757  read.discard();
758  } else {
759  return if validate {
760  read.discard();
761  error(read, ErrorCode::UnexpectedEndOfHexEscape)
762  } else {
763  encode_surrogate(scratch, n1);
764  Ok(())
765  };
766  }
767 
768  if e!(peek_or_eof(read)) == b'u' {
769  read.discard();
770  } else {
771  return if validate {
772  read.discard();
773  error(read, ErrorCode::UnexpectedEndOfHexEscape)
774  } else {
775  encode_surrogate(scratch, n1);
776  // The \ prior to this byte started an escape sequence,
777  // so we need to parse that now. This recursive call
778  // does not blow the stack on malicious input because
779  // the escape is not \u, so it will be handled by one
780  // of the easy nonrecursive cases.
781  parse_escape(read, validate, scratch)
782  };
783  }
784 
785  let n2 = e!(read.decode_hex_escape());
786 
787  if !(0xDC00..=0xDFFF).contains(&n2) {
788  return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape);
789  }
790 
791  let n =
792  (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000;
793 
794  match char::from_u32(n) {
795  Some(c) => c,
796  None => {
797  return error(read, ErrorCode::InvalidUnicodeCodePoint);
798  }
799  }
800  }
801 
802  // Every u16 outside of the surrogate ranges above is guaranteed
803  // to be a legal char.
804  n => char::from_u32(n as u32).unwrap(),
805  };
806 
807  scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes());
808  }
809  _ => {
810  return error(read, ErrorCode::InvalidEscape);
811  }
812  }
813 
814  Ok(())
815 }
816 
817 /// Parses a SXP escape sequence and discards the value. Assumes the previous
818 /// byte read was a backslash.
819 fn ignore_escape<'de, R>(read: &mut R) -> Result<()>
820 where
821  R: ?Sized + Read<'de>,
822 {
823  let ch = e!(next_or_eof(read));
824 
825  match ch {
826  b'"' | b'\\' | b'/' | b'b' | b'f' | b'n' | b'r' | b't' => {}
827  b'u' => {
828  // At this point we don't care if the codepoint is valid. We just
829  // want to consume it. We don't actually know what is valid or not
830  // at this point, because that depends on if this string will
831  // ultimately be parsed into a string or a byte buffer in the "real"
832  // parse.
833 
834  e!(read.decode_hex_escape());
835  }
836  _ => {
837  return error(read, ErrorCode::InvalidEscape);
838  }
839  }
840 
841  Ok(())
842 }
843 
/// Marker stored in `HEX` for bytes that are not hexadecimal digits.
const NOT_HEX: u8 = 255;

// Lookup table mapping an ASCII byte to its hexadecimal digit value
// (`0`-`9`, `A`-`F`, `a`-`f`), or to `NOT_HEX` for every other byte.
static HEX: [u8; 256] = {
  let mut table = [NOT_HEX; 256];

  let mut digit = 0u8;
  while digit < 10 {
    table[(b'0' + digit) as usize] = digit;
    digit += 1;
  }

  let mut letter = 0u8;
  while letter < 6 {
    table[(b'A' + letter) as usize] = 10 + letter;
    table[(b'a' + letter) as usize] = 10 + letter;
    letter += 1;
  }

  table
};

/// Returns the value (0..=15) of the hexadecimal digit `val`, or `None` if
/// `val` is not an ASCII hex digit.
fn decode_hex_val(val: u8) -> Option<u16> {
  match HEX[val as usize] {
    NOT_HEX => None,
    digit => Some(digit as u16),
  }
}