@@ -202,6 +202,11 @@ static PyObject *
202202unicode_decode_utf8 (const char * s , Py_ssize_t size ,
203203 _Py_error_handler error_handler , const char * errors ,
204204 Py_ssize_t * consumed );
/* Forward declaration of the writer-based UTF-8 decoder, which is defined
   later in this file.  It appends the decoded text to an existing
   _PyUnicodeWriter and returns an int status (0 on success, -1 on failure)
   instead of creating a new string object. */
205+ static int
206+ unicode_decode_utf8_writer (_PyUnicodeWriter * writer ,
207+ const char * s , Py_ssize_t size ,
208+ _Py_error_handler error_handler , const char * errors ,
209+ Py_ssize_t * consumed );
205210#ifdef Py_DEBUG
206211static inline int unicode_is_finalizing (void );
207212static int unicode_is_singleton (PyObject * unicode );
@@ -2377,14 +2382,11 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
23772382}
23782383
23792384static int
2380- unicode_fromformat_write_cstr (_PyUnicodeWriter * writer , const char * str ,
2385+ unicode_fromformat_write_utf8 (_PyUnicodeWriter * writer , const char * str ,
23812386 Py_ssize_t width , Py_ssize_t precision , int flags )
23822387{
23832388 /* UTF-8 */
23842389 Py_ssize_t length ;
2385- PyObject * unicode ;
2386- int res ;
2387-
23882390 if (precision == -1 ) {
23892391 length = strlen (str );
23902392 }
@@ -2394,11 +2396,19 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
23942396 length ++ ;
23952397 }
23962398 }
2397- unicode = PyUnicode_DecodeUTF8Stateful (str , length , "replace" , NULL );
2399+
2400+ if (width < 0 ) {
2401+ return unicode_decode_utf8_writer (writer , str , length ,
2402+ _Py_ERROR_REPLACE , "replace" , NULL );
2403+ }
2404+
2405+ PyObject * unicode = PyUnicode_DecodeUTF8Stateful (str , length ,
2406+ "replace" , NULL );
23982407 if (unicode == NULL )
23992408 return -1 ;
24002409
2401- res = unicode_fromformat_write_str (writer , unicode , width , -1 , flags );
2410+ int res = unicode_fromformat_write_str (writer , unicode ,
2411+ width , -1 , flags );
24022412 Py_DECREF (unicode );
24032413 return res ;
24042414}
@@ -2700,7 +2710,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
27002710 else {
27012711 /* UTF-8 */
27022712 const char * s = va_arg (* vargs , const char * );
2703- if (unicode_fromformat_write_cstr (writer , s , width , precision , flags ) < 0 )
2713+ if (unicode_fromformat_write_utf8 (writer , s , width , precision , flags ) < 0 )
27042714 return NULL ;
27052715 }
27062716 break ;
@@ -2739,7 +2749,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
27392749 }
27402750 else {
27412751 assert (str != NULL );
2742- if (unicode_fromformat_write_cstr (writer , str , width , precision , flags ) < 0 )
2752+ if (unicode_fromformat_write_utf8 (writer , str , width , precision , flags ) < 0 )
27432753 return NULL ;
27442754 }
27452755 break ;
@@ -4737,65 +4747,33 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
47374747 return p - start ;
47384748}
47394749
4740- static PyObject *
4741- unicode_decode_utf8 (const char * s , Py_ssize_t size ,
4742- _Py_error_handler error_handler , const char * errors ,
4743- Py_ssize_t * consumed )
4744- {
4745- if (size == 0 ) {
4746- if (consumed )
4747- * consumed = 0 ;
4748- _Py_RETURN_UNICODE_EMPTY ();
4749- }
4750-
4751- /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4752- if (size == 1 && (unsigned char )s [0 ] < 128 ) {
4753- if (consumed ) {
4754- * consumed = 1 ;
4755- }
4756- return get_latin1_char ((unsigned char )s [0 ]);
4757- }
4758-
4759- const char * starts = s ;
4760- const char * end = s + size ;
4761-
4762- // fast path: try ASCII string.
4763- PyObject * u = PyUnicode_New (size , 127 );
4764- if (u == NULL ) {
4765- return NULL ;
4766- }
4767- s += ascii_decode (s , end , PyUnicode_1BYTE_DATA (u ));
4768- if (s == end ) {
4769- if (consumed ) {
4770- * consumed = size ;
4771- }
4772- return u ;
4773- }
4774-
4775- // Use _PyUnicodeWriter after fast path is failed.
4776- _PyUnicodeWriter writer ;
4777- _PyUnicodeWriter_InitWithBuffer (& writer , u );
4778- writer .pos = s - starts ;
47794750
4751+ static int
4752+ unicode_decode_utf8_impl (_PyUnicodeWriter * writer ,
4753+ const char * starts , const char * s , const char * end ,
4754+ _Py_error_handler error_handler ,
4755+ const char * errors ,
4756+ Py_ssize_t * consumed )
4757+ {
47804758 Py_ssize_t startinpos , endinpos ;
47814759 const char * errmsg = "" ;
47824760 PyObject * error_handler_obj = NULL ;
47834761 PyObject * exc = NULL ;
47844762
47854763 while (s < end ) {
47864764 Py_UCS4 ch ;
4787- int kind = writer . kind ;
4765+ int kind = writer -> kind ;
47884766
47894767 if (kind == PyUnicode_1BYTE_KIND ) {
4790- if (PyUnicode_IS_ASCII (writer . buffer ))
4791- ch = asciilib_utf8_decode (& s , end , writer . data , & writer . pos );
4768+ if (PyUnicode_IS_ASCII (writer -> buffer ))
4769+ ch = asciilib_utf8_decode (& s , end , writer -> data , & writer -> pos );
47924770 else
4793- ch = ucs1lib_utf8_decode (& s , end , writer . data , & writer . pos );
4771+ ch = ucs1lib_utf8_decode (& s , end , writer -> data , & writer -> pos );
47944772 } else if (kind == PyUnicode_2BYTE_KIND ) {
4795- ch = ucs2lib_utf8_decode (& s , end , writer . data , & writer . pos );
4773+ ch = ucs2lib_utf8_decode (& s , end , writer -> data , & writer -> pos );
47964774 } else {
47974775 assert (kind == PyUnicode_4BYTE_KIND );
4798- ch = ucs4lib_utf8_decode (& s , end , writer . data , & writer . pos );
4776+ ch = ucs4lib_utf8_decode (& s , end , writer -> data , & writer -> pos );
47994777 }
48004778
48014779 switch (ch ) {
@@ -4826,7 +4804,9 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48264804 endinpos = startinpos + ch - 1 ;
48274805 break ;
48284806 default :
4829- if (_PyUnicodeWriter_WriteCharInline (& writer , ch ) < 0 )
4807+ // ch doesn't fit into kind, so change the buffer kind to write
4808+ // the character
4809+ if (_PyUnicodeWriter_WriteCharInline (writer , ch ) < 0 )
48304810 goto onError ;
48314811 continue ;
48324812 }
@@ -4840,7 +4820,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48404820 break ;
48414821
48424822 case _Py_ERROR_REPLACE :
4843- if (_PyUnicodeWriter_WriteCharInline (& writer , 0xfffd ) < 0 )
4823+ if (_PyUnicodeWriter_WriteCharInline (writer , 0xfffd ) < 0 )
48444824 goto onError ;
48454825 s += (endinpos - startinpos );
48464826 break ;
@@ -4849,13 +4829,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48494829 {
48504830 Py_ssize_t i ;
48514831
4852- if (_PyUnicodeWriter_PrepareKind (& writer , PyUnicode_2BYTE_KIND ) < 0 )
4832+ if (_PyUnicodeWriter_PrepareKind (writer , PyUnicode_2BYTE_KIND ) < 0 )
48534833 goto onError ;
48544834 for (i = startinpos ; i < endinpos ; i ++ ) {
48554835 ch = (Py_UCS4 )(unsigned char )(starts [i ]);
4856- PyUnicode_WRITE (writer . kind , writer . data , writer . pos ,
4836+ PyUnicode_WRITE (writer -> kind , writer -> data , writer -> pos ,
48574837 ch + 0xdc00 );
4858- writer . pos ++ ;
4838+ writer -> pos ++ ;
48594839 }
48604840 s += (endinpos - startinpos );
48614841 break ;
@@ -4866,8 +4846,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48664846 errors , & error_handler_obj ,
48674847 "utf-8" , errmsg ,
48684848 & starts , & end , & startinpos , & endinpos , & exc , & s ,
4869- & writer ))
4849+ writer )) {
48704850 goto onError ;
4851+ }
4852+
4853+ if (_PyUnicodeWriter_Prepare (writer , end - s , 127 ) < 0 ) {
4854+ return -1 ;
4855+ }
48714856 }
48724857 }
48734858
@@ -4877,13 +4862,107 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
48774862
48784863 Py_XDECREF (error_handler_obj );
48794864 Py_XDECREF (exc );
4880- return _PyUnicodeWriter_Finish ( & writer ) ;
4865+ return 0 ;
48814866
48824867onError :
48834868 Py_XDECREF (error_handler_obj );
48844869 Py_XDECREF (exc );
4885- _PyUnicodeWriter_Dealloc (& writer );
4886- return NULL ;
4870+ return -1 ;
4871+ }
4872+
4873+
4874+ static PyObject *
4875+ unicode_decode_utf8 (const char * s , Py_ssize_t size ,
4876+ _Py_error_handler error_handler , const char * errors ,
4877+ Py_ssize_t * consumed )
4878+ {
4879+ if (size == 0 ) {
4880+ if (consumed ) {
4881+ * consumed = 0 ;
4882+ }
4883+ _Py_RETURN_UNICODE_EMPTY ();
4884+ }
4885+
4886+ /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4887+ if (size == 1 && (unsigned char )s [0 ] < 128 ) {
4888+ if (consumed ) {
4889+ * consumed = 1 ;
4890+ }
4891+ return get_latin1_char ((unsigned char )s [0 ]);
4892+ }
4893+
4894+ // fast path: try ASCII string.
4895+ const char * starts = s ;
4896+ const char * end = s + size ;
4897+ PyObject * u = PyUnicode_New (size , 127 );
4898+ if (u == NULL ) {
4899+ return NULL ;
4900+ }
4901+ Py_ssize_t decoded = ascii_decode (s , end , PyUnicode_1BYTE_DATA (u ));
4902+ if (decoded == size ) {
4903+ if (consumed ) {
4904+ * consumed = size ;
4905+ }
4906+ return u ;
4907+ }
4908+ s += decoded ;
4909+ size -= decoded ;
4910+
4911+ // Use _PyUnicodeWriter after fast path is failed.
4912+ _PyUnicodeWriter writer ;
4913+ _PyUnicodeWriter_InitWithBuffer (& writer , u );
4914+ writer .pos = decoded ;
4915+
4916+ if (unicode_decode_utf8_impl (& writer , starts , s , end ,
4917+ error_handler , errors ,
4918+ consumed ) < 0 ) {
4919+ _PyUnicodeWriter_Dealloc (& writer );
4920+ return NULL ;
4921+ }
4922+ return _PyUnicodeWriter_Finish (& writer );
4923+ }
4924+
4925+
4926+ static int
4927+ unicode_decode_utf8_writer (_PyUnicodeWriter * writer ,
4928+ const char * s , Py_ssize_t size ,
4929+ _Py_error_handler error_handler , const char * errors ,
4930+ Py_ssize_t * consumed )
4931+ {
4932+ if (size == 0 ) {
4933+ if (consumed ) {
4934+ * consumed = 0 ;
4935+ }
4936+ return 0 ;
4937+ }
4938+
4939+ // fast path: try ASCII string.
4940+ if (_PyUnicodeWriter_Prepare (writer , size , 127 ) < 0 ) {
4941+ return -1 ;
4942+ }
4943+
4944+ const char * starts = s ;
4945+ const char * end = s + size ;
4946+ Py_ssize_t decoded = 0 ;
4947+ Py_UCS1 * dest = (Py_UCS1 * )writer -> data + writer -> pos * writer -> kind ;
4948+ if (writer -> kind == PyUnicode_1BYTE_KIND
4949+ && _Py_IS_ALIGNED (dest , ALIGNOF_SIZE_T ))
4950+ {
4951+ decoded = ascii_decode (s , end , dest );
4952+ writer -> pos += decoded ;
4953+
4954+ if (decoded == size ) {
4955+ if (consumed ) {
4956+ * consumed = size ;
4957+ }
4958+ return 0 ;
4959+ }
4960+ s += decoded ;
4961+ size -= decoded ;
4962+ }
4963+
4964+ return unicode_decode_utf8_impl (writer , starts , s , end ,
4965+ error_handler , errors , consumed );
48874966}
48884967
48894968
0 commit comments