1
1
use std:: fmt;
2
- use std:: ops :: RangeInclusive ;
2
+ use std:: iter :: Peekable ;
3
3
use std:: sync:: atomic:: { AtomicU32 , Ordering } ;
4
4
5
5
use super :: { Byte , Ref , Tree , Uninhabited } ;
@@ -211,15 +211,15 @@ where
211
211
let b_transitions =
212
212
b_src. and_then ( |b_src| b. transitions . get ( & b_src) ) . unwrap_or ( & empty_transitions) ;
213
213
214
- let byte_transitions =
215
- a_transitions. byte_transitions . union ( & b_transitions. byte_transitions ) ;
216
-
217
- let byte_transitions = byte_transitions. map_states ( |( a_dst, b_dst) | {
218
- assert ! ( a_dst. is_some( ) || b_dst. is_some( ) ) ;
214
+ let byte_transitions = a_transitions. byte_transitions . union (
215
+ & b_transitions. byte_transitions ,
216
+ |a_dst, b_dst| {
217
+ assert ! ( a_dst. is_some( ) || b_dst. is_some( ) ) ;
219
218
220
- queue. enqueue ( a_dst, b_dst) ;
221
- mapped ( ( a_dst, b_dst) )
222
- } ) ;
219
+ queue. enqueue ( a_dst, b_dst) ;
220
+ mapped ( ( a_dst, b_dst) )
221
+ } ,
222
+ ) ;
223
223
224
224
let ref_transitions =
225
225
a_transitions. ref_transitions . keys ( ) . chain ( b_transitions. ref_transitions . keys ( ) ) ;
@@ -245,18 +245,6 @@ where
245
245
Self { transitions, start, accept }
246
246
}
247
247
248
- pub ( crate ) fn states_from (
249
- & self ,
250
- state : State ,
251
- src_validity : RangeInclusive < u8 > ,
252
- ) -> impl Iterator < Item = ( Byte , State ) > {
253
- self . transitions
254
- . get ( & state)
255
- . map ( move |t| t. byte_transitions . states_from ( src_validity) )
256
- . into_iter ( )
257
- . flatten ( )
258
- }
259
-
260
248
pub ( crate ) fn get_uninit_edge_dst ( & self , state : State ) -> Option < State > {
261
249
let transitions = self . transitions . get ( & state) ?;
262
250
transitions. byte_transitions . get_uninit_edge_dst ( )
@@ -334,95 +322,31 @@ where
334
322
335
323
use edge_set:: EdgeSet ;
336
324
mod edge_set {
337
- use std:: cmp;
338
-
339
- use run:: * ;
340
- use smallvec:: { SmallVec , smallvec} ;
325
+ use smallvec:: SmallVec ;
341
326
342
327
use super :: * ;
343
- mod run {
344
- use std:: ops:: { Range , RangeInclusive } ;
345
-
346
- use super :: * ;
347
- use crate :: layout:: Byte ;
348
-
349
- /// A logical set of edges.
350
- ///
351
- /// A `Run` encodes one edge for every byte value in `start..=end`
352
- /// pointing to `dst`.
353
- #[ derive( Eq , PartialEq , Copy , Clone , Debug ) ]
354
- pub ( super ) struct Run < S > {
355
- // `start` and `end` are both inclusive (ie, closed) bounds, as this
356
- // is required in order to be able to store 0..=255. We provide
357
- // setters and getters which operate on closed/open ranges, which
358
- // are more intuitive and easier for performing offset math.
359
- start : u8 ,
360
- end : u8 ,
361
- pub ( super ) dst : S ,
362
- }
363
-
364
- impl < S > Run < S > {
365
- pub ( super ) fn new ( range : RangeInclusive < u8 > , dst : S ) -> Self {
366
- Self { start : * range. start ( ) , end : * range. end ( ) , dst }
367
- }
368
-
369
- pub ( super ) fn from_inclusive_exclusive ( range : Range < u16 > , dst : S ) -> Self {
370
- Self {
371
- start : range. start . try_into ( ) . unwrap ( ) ,
372
- end : ( range. end - 1 ) . try_into ( ) . unwrap ( ) ,
373
- dst,
374
- }
375
- }
376
-
377
- pub ( super ) fn contains ( & self , idx : u16 ) -> bool {
378
- idx >= u16:: from ( self . start ) && idx <= u16:: from ( self . end )
379
- }
380
-
381
- pub ( super ) fn as_inclusive_exclusive ( & self ) -> ( u16 , u16 ) {
382
- ( u16:: from ( self . start ) , u16:: from ( self . end ) + 1 )
383
- }
384
-
385
- pub ( super ) fn as_byte ( & self ) -> Byte {
386
- Byte :: new ( self . start ..=self . end )
387
- }
388
328
389
- pub ( super ) fn map_state < SS > ( self , f : impl FnOnce ( S ) -> SS ) -> Run < SS > {
390
- let Run { start, end, dst } = self ;
391
- Run { start, end, dst : f ( dst) }
392
- }
393
-
394
- /// Produces a new `Run` whose lower bound is the greater of
395
- /// `self`'s existing lower bound and `lower_bound`.
396
- pub ( super ) fn clamp_lower ( self , lower_bound : u8 ) -> Self {
397
- let Run { start, end, dst } = self ;
398
- Run { start : cmp:: max ( start, lower_bound) , end, dst }
399
- }
400
- }
401
- }
402
-
403
- /// The set of outbound byte edges associated with a DFA node (not including
404
- /// reference edges).
329
+ /// The set of outbound byte edges associated with a DFA node.
405
330
#[ derive( Eq , PartialEq , Clone , Debug ) ]
406
331
pub ( super ) struct EdgeSet < S = State > {
407
- // A sequence of runs stored in ascending order. Since the graph is a
408
- // DFA, these must be non-overlapping with one another.
409
- runs : SmallVec < [ Run < S > ; 1 ] > ,
410
- // The edge labeled with the uninit byte, if any.
332
+ // A sequence of byte edges with contiguous byte values and a common
333
+ // destination is stored as a single run.
411
334
//
412
- // FIXME(@joshlf): Make `State` a `NonZero` so that this is NPO'd .
413
- uninit : Option < S > ,
335
+ // Runs are non-empty, non-overlapping, and stored in ascending order .
336
+ runs : SmallVec < [ ( Byte , S ) ; 1 ] > ,
414
337
}
415
338
416
339
impl < S > EdgeSet < S > {
417
- pub ( crate ) fn new ( byte : Byte , dst : S ) -> Self {
418
- match byte . range ( ) {
419
- Some ( range ) => Self { runs : smallvec ! [ Run :: new ( range, dst ) ] , uninit : None } ,
420
- None => Self { runs : SmallVec :: new ( ) , uninit : Some ( dst) } ,
340
+ pub ( crate ) fn new ( range : Byte , dst : S ) -> Self {
341
+ let mut this = Self { runs : SmallVec :: new ( ) } ;
342
+ if ! range. is_empty ( ) {
343
+ this . runs . push ( ( range , dst) ) ;
421
344
}
345
+ this
422
346
}
423
347
424
348
pub ( crate ) fn empty ( ) -> Self {
425
- Self { runs : SmallVec :: new ( ) , uninit : None }
349
+ Self { runs : SmallVec :: new ( ) }
426
350
}
427
351
428
352
#[ cfg( test) ]
@@ -431,43 +355,23 @@ mod edge_set {
431
355
S : Ord ,
432
356
{
433
357
edges. sort ( ) ;
434
- Self {
435
- runs : edges
436
- . into_iter ( )
437
- . map ( |( byte, state) | Run :: new ( byte. range ( ) . unwrap ( ) , state) )
438
- . collect ( ) ,
439
- uninit : None ,
440
- }
358
+ Self { runs : edges. into ( ) }
441
359
}
442
360
443
361
pub ( crate ) fn iter ( & self ) -> impl Iterator < Item = ( Byte , S ) >
444
362
where
445
363
S : Copy ,
446
364
{
447
- self . uninit
448
- . map ( |dst| ( Byte :: uninit ( ) , dst) )
449
- . into_iter ( )
450
- . chain ( self . runs . iter ( ) . map ( |run| ( run. as_byte ( ) , run. dst ) ) )
451
- }
452
-
453
- pub ( crate ) fn states_from (
454
- & self ,
455
- byte : RangeInclusive < u8 > ,
456
- ) -> impl Iterator < Item = ( Byte , S ) >
457
- where
458
- S : Copy ,
459
- {
460
- // FIXME(@joshlf): Optimize this. A manual scan over `self.runs` may
461
- // permit us to more efficiently discard runs which will not be
462
- // produced by this iterator.
463
- self . iter ( ) . filter ( move |( o, _) | Byte :: new ( byte. clone ( ) ) . transmutable_into ( & o) )
365
+ self . runs . iter ( ) . copied ( )
464
366
}
465
367
466
368
pub ( crate ) fn get_uninit_edge_dst ( & self ) -> Option < S >
467
369
where
468
370
S : Copy ,
469
371
{
470
- self . uninit
372
+ // Uninit is ordered last.
373
+ let & ( range, dst) = self . runs . last ( ) ?;
374
+ if range. contains_uninit ( ) { Some ( dst) } else { None }
471
375
}
472
376
473
377
pub ( crate ) fn map_states < SS > ( self , mut f : impl FnMut ( S ) -> SS ) -> EdgeSet < SS > {
@@ -478,95 +382,106 @@ mod edge_set {
478
382
// allocates the correct number of elements once up-front [1].
479
383
//
480
384
// [1] https://doc.rust-lang.org/1.85.0/src/alloc/vec/spec_from_iter_nested.rs.html#47
481
- runs : self . runs . into_iter ( ) . map ( |run| run. map_state ( & mut f) ) . collect ( ) ,
482
- uninit : self . uninit . map ( f) ,
385
+ runs : self . runs . into_iter ( ) . map ( |( b, s) | ( b, f ( s) ) ) . collect ( ) ,
483
386
}
484
387
}
485
388
486
389
/// Unions two edge sets together.
487
390
///
488
391
/// If `u = a.union(b)`, then for each byte value, `u` will have an edge
489
- /// with that byte value and with the destination `(Some(_), None)`,
490
- /// `(None, Some(_))`, or `(Some(_), Some(_))` depending on whether `a`,
392
+ /// with that byte value and with the destination `join (Some(_), None)`,
393
+ /// `join (None, Some(_))`, or `join (Some(_), Some(_))` depending on whether `a`,
491
394
/// `b`, or both have an edge with that byte value.
492
395
///
493
396
/// If neither `a` nor `b` have an edge with a particular byte value,
494
397
/// then no edge with that value will be present in `u`.
495
- pub ( crate ) fn union ( & self , other : & Self ) -> EdgeSet < ( Option < S > , Option < S > ) >
398
+ pub ( crate ) fn union (
399
+ & self ,
400
+ other : & Self ,
401
+ mut join : impl FnMut ( Option < S > , Option < S > ) -> S ,
402
+ ) -> EdgeSet < S >
496
403
where
497
404
S : Copy ,
498
405
{
499
- let uninit = match ( self . uninit , other. uninit ) {
500
- ( None , None ) => None ,
501
- ( s, o) => Some ( ( s, o) ) ,
502
- } ;
503
-
504
- let mut runs = SmallVec :: new ( ) ;
505
-
506
- // Iterate over `self.runs` and `other.runs` simultaneously,
507
- // advancing `idx` as we go. At each step, we advance `idx` as far
508
- // as we can without crossing a run boundary in either `self.runs`
509
- // or `other.runs`.
510
-
511
- // INVARIANT: `idx < s[0].end && idx < o[0].end`.
512
- let ( mut s, mut o) = ( self . runs . as_slice ( ) , other. runs . as_slice ( ) ) ;
513
- let mut idx = 0u16 ;
514
- while let ( Some ( ( s_run, s_rest) ) , Some ( ( o_run, o_rest) ) ) =
515
- ( s. split_first ( ) , o. split_first ( ) )
516
- {
517
- let ( s_start, s_end) = s_run. as_inclusive_exclusive ( ) ;
518
- let ( o_start, o_end) = o_run. as_inclusive_exclusive ( ) ;
519
-
520
- // Compute `end` as the end of the current run (which starts
521
- // with `idx`).
522
- let ( end, dst) = match ( s_run. contains ( idx) , o_run. contains ( idx) ) {
523
- // `idx` is in an existing run in both `s` and `o`, so `end`
524
- // is equal to the smallest of the two ends of those runs.
525
- ( true , true ) => ( cmp:: min ( s_end, o_end) , ( Some ( s_run. dst ) , Some ( o_run. dst ) ) ) ,
526
- // `idx` is in an existing run in `s`, but not in any run in
527
- // `o`. `end` is either the end of the `s` run or the
528
- // beginning of the next `o` run, whichever comes first.
529
- ( true , false ) => ( cmp:: min ( s_end, o_start) , ( Some ( s_run. dst ) , None ) ) ,
530
- // The inverse of the previous case.
531
- ( false , true ) => ( cmp:: min ( s_start, o_end) , ( None , Some ( o_run. dst ) ) ) ,
532
- // `idx` is not in a run in either `s` or `o`, so advance it
533
- // to the beginning of the next run.
534
- ( false , false ) => {
535
- idx = cmp:: min ( s_start, o_start) ;
536
- continue ;
537
- }
538
- } ;
406
+ let xs = self . runs . iter ( ) . copied ( ) ;
407
+ let ys = other. runs . iter ( ) . copied ( ) ;
408
+ // FIXME(@joshlf): Merge contiguous runs with common destination.
409
+ EdgeSet { runs : union ( xs, ys) . map ( |( range, ( x, y) ) | ( range, join ( x, y) ) ) . collect ( ) }
410
+ }
411
+ }
412
+ }
413
+
414
+ /// Merges two sorted sequences into one sorted sequence.
415
+ pub ( crate ) fn union < S : Copy , X : Iterator < Item = ( Byte , S ) > , Y : Iterator < Item = ( Byte , S ) > > (
416
+ xs : X ,
417
+ ys : Y ,
418
+ ) -> UnionIter < X , Y > {
419
+ UnionIter { xs : xs. peekable ( ) , ys : ys. peekable ( ) }
420
+ }
421
+
422
/// Iterator state for merging two sequences: both inputs are held behind
/// [`Peekable`] so that their front items can be examined (and adjusted)
/// before they are consumed.
pub(crate) struct UnionIter<X, Y>
where
    X: Iterator,
    Y: Iterator,
{
    xs: Peekable<X>,
    ys: Peekable<Y>,
}
426
+
427
+ // FIXME(jswrenn) we'd likely benefit from specializing try_fold here.
428
+ impl < S : Copy , X : Iterator < Item = ( Byte , S ) > , Y : Iterator < Item = ( Byte , S ) > > Iterator
429
+ for UnionIter < X , Y >
430
+ {
431
+ type Item = ( Byte , ( Option < S > , Option < S > ) ) ;
539
432
540
- // FIXME(@joshlf): If this is contiguous with the previous run
541
- // and has the same `dst`, just merge it into that run rather
542
- // than adding a new one.
543
- runs. push ( Run :: from_inclusive_exclusive ( idx..end, dst) ) ;
544
- idx = end;
433
+ fn next ( & mut self ) -> Option < Self :: Item > {
434
+ use std:: cmp:: { self , Ordering } ;
545
435
546
- if idx >= s_end {
547
- s = s_rest;
436
+ let ret;
437
+ match ( self . xs . peek_mut ( ) , self . ys . peek_mut ( ) ) {
438
+ ( None , None ) => {
439
+ ret = None ;
440
+ }
441
+ ( Some ( x) , None ) => {
442
+ ret = Some ( ( x. 0 , ( Some ( x. 1 ) , None ) ) ) ;
443
+ self . xs . next ( ) ;
444
+ }
445
+ ( None , Some ( y) ) => {
446
+ ret = Some ( ( y. 0 , ( None , Some ( y. 1 ) ) ) ) ;
447
+ self . ys . next ( ) ;
448
+ }
449
+ ( Some ( x) , Some ( y) ) => {
450
+ let start;
451
+ let end;
452
+ let dst;
453
+ match x. 0 . start . cmp ( & y. 0 . start ) {
454
+ Ordering :: Less => {
455
+ start = x. 0 . start ;
456
+ end = cmp:: min ( x. 0 . end , y. 0 . start ) ;
457
+ dst = ( Some ( x. 1 ) , None ) ;
458
+ }
459
+ Ordering :: Greater => {
460
+ start = y. 0 . start ;
461
+ end = cmp:: min ( x. 0 . start , y. 0 . end ) ;
462
+ dst = ( None , Some ( y. 1 ) ) ;
463
+ }
464
+ Ordering :: Equal => {
465
+ start = x. 0 . start ;
466
+ end = cmp:: min ( x. 0 . end , y. 0 . end ) ;
467
+ dst = ( Some ( x. 1 ) , Some ( y. 1 ) ) ;
468
+ }
548
469
}
549
- if idx >= o_end {
550
- o = o_rest;
470
+ ret = Some ( ( Byte { start, end } , dst) ) ;
471
+ if start == x. 0 . start {
472
+ x. 0 . start = end;
473
+ }
474
+ if start == y. 0 . start {
475
+ y. 0 . start = end;
476
+ }
477
+ if x. 0 . is_empty ( ) {
478
+ self . xs . next ( ) ;
479
+ }
480
+ if y. 0 . is_empty ( ) {
481
+ self . ys . next ( ) ;
551
482
}
552
483
}
553
-
554
- // At this point, either `s` or `o` have been exhausted, so the
555
- // remaining elements in the other slice are guaranteed to be
556
- // non-overlapping. We can add all remaining runs to `runs` with no
557
- // further processing.
558
- if let Ok ( idx) = u8:: try_from ( idx) {
559
- let ( slc, map) = if !s. is_empty ( ) {
560
- let map: fn ( _) -> _ = |st| ( Some ( st) , None ) ;
561
- ( s, map)
562
- } else {
563
- let map: fn ( _) -> _ = |st| ( None , Some ( st) ) ;
564
- ( o, map)
565
- } ;
566
- runs. extend ( slc. iter ( ) . map ( |run| run. clamp_lower ( idx) . map_state ( map) ) ) ;
567
- }
568
-
569
- EdgeSet { runs, uninit }
570
484
}
485
+ ret
571
486
}
572
487
}
0 commit comments