@@ -28,7 +28,7 @@ U_NAMESPACE_BEGIN
2828
2929BMPSet::BMPSet (const int32_t *parentList, int32_t parentListLength) :
3030 list(parentList), listLength(parentListLength) {
31- uprv_memset (asciiBytes , 0 , sizeof (asciiBytes ));
31+ uprv_memset (latin1Contains , 0 , sizeof (latin1Contains ));
3232 uprv_memset (table7FF, 0 , sizeof (table7FF));
3333 uprv_memset (bmpBlockBits, 0 , sizeof (bmpBlockBits));
3434
@@ -45,14 +45,16 @@ BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
4545 list4kStarts[i]=findCodePoint (i<<12 , list4kStarts[i-1 ], listLength-1 );
4646 }
4747 list4kStarts[0x11 ]=listLength-1 ;
48+ containsFFFD=containsSlow (0xfffd , list4kStarts[0xf ], list4kStarts[0x10 ]);
4849
4950 initBits ();
5051 overrideIllegal ();
5152}
5253
5354BMPSet::BMPSet (const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
55+ containsFFFD(otherBMPSet.containsFFFD),
5456 list(newParentList), listLength(newParentListLength) {
55- uprv_memcpy (asciiBytes , otherBMPSet.asciiBytes , sizeof (asciiBytes ));
57+ uprv_memcpy (latin1Contains , otherBMPSet.latin1Contains , sizeof (latin1Contains ));
5658 uprv_memcpy (table7FF, otherBMPSet.table7FF , sizeof (table7FF));
5759 uprv_memcpy (bmpBlockBits, otherBMPSet.bmpBlockBits , sizeof (bmpBlockBits));
5860 uprv_memcpy (list4kStarts, otherBMPSet.list4kStarts , sizeof (list4kStarts));
@@ -120,21 +122,38 @@ void BMPSet::initBits() {
120122 UChar32 start, limit;
121123 int32_t listIndex=0 ;
122124
123- // Set asciiBytes [].
125+ // Set latin1Contains [].
124126 do {
125127 start=list[listIndex++];
126128 if (listIndex<listLength) {
127129 limit=list[listIndex++];
128130 } else {
129131 limit=0x110000 ;
130132 }
131- if (start>=0x80 ) {
133+ if (start>=0x100 ) {
132134 break ;
133135 }
134136 do {
135- asciiBytes[start++]=1 ;
136- } while (start<limit && start<0x80 );
137- } while (limit<=0x80 );
137+ latin1Contains[start++]=1 ;
138+ } while (start<limit && start<0x100 );
139+ } while (limit<=0x100 );
140+
141+ // Restart at the beginning of the list and find the first range that overlaps
142+ // with (or lies after) 80..FF, so that these code points are set in table7FF as well.
143+ for (listIndex=0 ;;) {
144+ start=list[listIndex++];
145+ if (listIndex<listLength) {
146+ limit=list[listIndex++];
147+ } else {
148+ limit=0x110000 ;
149+ }
150+ if (limit>0x80 ) {
151+ if (start<0x80 ) {
152+ start=0x80 ;
153+ }
154+ break ;
155+ }
156+ }
138157
139158 // Set table7FF[].
140159 while (start<0x800 ) {
@@ -204,19 +223,14 @@ void BMPSet::initBits() {
204223 * for faster validity checking at runtime.
205224 * No need to set 0 values where they were reset to 0 in the constructor
206225 * and not modified by initBits().
207- * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
226+ * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
208227 * Need to set 0 values for surrogates D800..DFFF.
209228 */
210229void BMPSet::overrideIllegal () {
211230 uint32_t bits, mask;
212231 int32_t i;
213232
214- if (containsSlow (0xfffd , list4kStarts[0xf ], list4kStarts[0x10 ])) {
215- // contains(FFFD)==TRUE
216- for (i=0x80 ; i<0xc0 ; ++i) {
217- asciiBytes[i]=1 ;
218- }
219-
233+ if (containsFFFD) {
220234 bits=3 ; // Lead bytes 0xC0 and 0xC1.
221235 for (i=0 ; i<64 ; ++i) {
222236 table7FF[i]|=bits;
@@ -233,7 +247,6 @@ void BMPSet::overrideIllegal() {
233247 bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
234248 }
235249 } else {
236- // contains(FFFD)==FALSE
237250 mask=~(0x10001 <<0xd ); // Lead byte 0xED.
238251 for (i=32 ; i<64 ; ++i) { // Second half of 4k block.
239252 bmpBlockBits[i]&=mask;
@@ -277,8 +290,8 @@ int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
277290
278291UBool
279292BMPSet::contains (UChar32 c) const {
280- if ((uint32_t )c<=0x7f ) {
281- return (UBool)asciiBytes [c];
293+ if ((uint32_t )c<=0xff ) {
294+ return (UBool)latin1Contains [c];
282295 } else if ((uint32_t )c<=0x7ff ) {
283296 return (UBool)((table7FF[c&0x3f ]&((uint32_t )1 <<(c>>6 )))!=0 );
284297 } else if ((uint32_t )c<0xd800 || (c>=0xe000 && c<=0xffff )) {
@@ -314,8 +327,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
314327 // span
315328 do {
316329 c=*s;
317- if (c<=0x7f ) {
318- if (!asciiBytes [c]) {
330+ if (c<=0xff ) {
331+ if (!latin1Contains [c]) {
319332 break ;
320333 }
321334 } else if (c<=0x7ff ) {
@@ -354,8 +367,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
354367 // span not
355368 do {
356369 c=*s;
357- if (c<=0x7f ) {
358- if (asciiBytes [c]) {
370+ if (c<=0xff ) {
371+ if (latin1Contains [c]) {
359372 break ;
360373 }
361374 } else if (c<=0x7ff ) {
@@ -403,8 +416,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
403416 // span
404417 for (;;) {
405418 c=*(--limit);
406- if (c<=0x7f ) {
407- if (!asciiBytes [c]) {
419+ if (c<=0xff ) {
420+ if (!latin1Contains [c]) {
408421 break ;
409422 }
410423 } else if (c<=0x7ff ) {
@@ -446,8 +459,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
446459 // span not
447460 for (;;) {
448461 c=*(--limit);
449- if (c<=0x7f ) {
450- if (asciiBytes [c]) {
462+ if (c<=0xff ) {
463+ if (latin1Contains [c]) {
451464 break ;
452465 }
453466 } else if (c<=0x7ff ) {
@@ -497,22 +510,22 @@ const uint8_t *
497510BMPSet::spanUTF8 (const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
498511 const uint8_t *limit=s+length;
499512 uint8_t b=*s;
500- if (( int8_t )b>= 0 ) {
513+ if (U8_IS_SINGLE (b) ) {
501514 // Initial all-ASCII span.
502515 if (spanCondition) {
503516 do {
504- if (!asciiBytes [b] || ++s==limit) {
517+ if (!latin1Contains [b] || ++s==limit) {
505518 return s;
506519 }
507520 b=*s;
508- } while (( int8_t )b>= 0 );
521+ } while (U8_IS_SINGLE (b) );
509522 } else {
510523 do {
511- if (asciiBytes [b] || ++s==limit) {
524+ if (latin1Contains [b] || ++s==limit) {
512525 return s;
513526 }
514527 b=*s;
515- } while (( int8_t )b>= 0 );
528+ } while (U8_IS_SINGLE (b) );
516529 }
517530 length=(int32_t )(limit-s);
518531 }
@@ -540,20 +553,20 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
540553 // single trail byte, check for preceding 3- or 4-byte lead byte
541554 if (length>=2 && (b=*(limit-2 ))>=0xe0 ) {
542555 limit-=2 ;
543- if (asciiBytes[ 0x80 ] !=spanCondition) {
556+ if (containsFFFD !=spanCondition) {
544557 limit0=limit;
545558 }
546559 } else if (b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3 ))>=0xf0 ) {
547560 // 4-byte lead byte with only two trail bytes
548561 limit-=3 ;
549- if (asciiBytes[ 0x80 ] !=spanCondition) {
562+ if (containsFFFD !=spanCondition) {
550563 limit0=limit;
551564 }
552565 }
553566 } else {
554567 // lead byte with no trail bytes
555568 --limit;
556- if (asciiBytes[ 0x80 ] !=spanCondition) {
569+ if (containsFFFD !=spanCondition) {
557570 limit0=limit;
558571 }
559572 }
@@ -563,26 +576,26 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
563576
564577 while (s<limit) {
565578 b=*s;
566- if (b< 0xc0 ) {
567- // ASCII; or trail bytes with the result of contains(FFFD).
579+ if (U8_IS_SINGLE (b) ) {
580+ // ASCII
568581 if (spanCondition) {
569582 do {
570- if (!asciiBytes [b]) {
583+ if (!latin1Contains [b]) {
571584 return s;
572585 } else if (++s==limit) {
573586 return limit0;
574587 }
575588 b=*s;
576- } while (b< 0xc0 );
589+ } while (U8_IS_SINGLE (b) );
577590 } else {
578591 do {
579- if (asciiBytes [b]) {
592+ if (latin1Contains [b]) {
580593 return s;
581594 } else if (++s==limit) {
582595 return limit0;
583596 }
584597 b=*s;
585- } while (b< 0xc0 );
598+ } while (U8_IS_SINGLE (b) );
586599 }
587600 }
588601 ++s; // Advance past the lead byte.
@@ -619,16 +632,17 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
619632 UChar32 c=((UChar32)(b-0xf0 )<<18 )|((UChar32)t1<<12 )|(t2<<6 )|t3;
620633 if ( ( (0x10000 <=c && c<=0x10ffff ) ?
621634 containsSlow (c, list4kStarts[0x10 ], list4kStarts[0x11 ]) :
622- asciiBytes[ 0x80 ]
635+ containsFFFD
623636 ) != spanCondition
624637 ) {
625638 return s-1 ;
626639 }
627640 s+=3 ;
628641 continue ;
629642 }
630- } else /* 0xc0<=b<0xe0 */ {
643+ } else {
631644 if ( /* handle U+0000..U+07FF inline */
645+ b>=0xc0 &&
632646 (t1=(uint8_t )(*s-0x80 )) <= 0x3f
633647 ) {
634648 if ((USetSpanCondition)((table7FF[t1]&((uint32_t )1 <<(b&0x1f )))!=0 ) != spanCondition) {
@@ -642,7 +656,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
642656 // Give an illegal sequence the same value as the result of contains(FFFD).
643657 // Handle each byte of an illegal sequence separately to simplify the code;
644658 // no need to optimize error handling.
645- if (asciiBytes[ 0x80 ] !=spanCondition) {
659+ if (containsFFFD !=spanCondition) {
646660 return s-1 ;
647661 }
648662 }
@@ -667,26 +681,26 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon
667681
668682 do {
669683 b=s[--length];
670- if (( int8_t )b>= 0 ) {
684+ if (U8_IS_SINGLE (b) ) {
671685 // ASCII sub-span
672686 if (spanCondition) {
673687 do {
674- if (!asciiBytes [b]) {
688+ if (!latin1Contains [b]) {
675689 return length+1 ;
676690 } else if (length==0 ) {
677691 return 0 ;
678692 }
679693 b=s[--length];
680- } while (( int8_t )b>= 0 );
694+ } while (U8_IS_SINGLE (b) );
681695 } else {
682696 do {
683- if (asciiBytes [b]) {
697+ if (latin1Contains [b]) {
684698 return length+1 ;
685699 } else if (length==0 ) {
686700 return 0 ;
687701 }
688702 b=s[--length];
689- } while (( int8_t )b>= 0 );
703+ } while (U8_IS_SINGLE (b) );
690704 }
691705 }
692706
0 commit comments