Skip to content

Commit 50fd46f

Browse files
committed
[WIP] Move meat of mb_str_split to mbfl_str_split (in mbfilter.c)
This follows the pattern of most other PHP-level functions in mbstring: mbstring.c handles marshalling of PHP-level arguments and return values, as well as .INI parameters, built-in globals, and so on, and calls into functions in mbfilter.c for the core logic. Note: this also fixes a bug. After feeding the entire input string into a conversion filter function, mb_str_split never called the corresponding flush function. That is a no-no in mbfl: conversion filters may cache bytes and only send them to the output when flushed, so trailing input could be silently dropped. I still need to add a test to expose the bug.
1 parent 2a94cff commit 50fd46f

File tree

3 files changed

+102
-112
lines changed

3 files changed

+102
-112
lines changed

ext/mbstring/libmbfl/mbfl/mbfilter.c

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@
9595
#include "eaw_table.h"
9696
#include "filters/unicode_prop.h"
9797

98+
#include "zend_API.h"
99+
98100
static inline void mbfl_filter_feed_char(int c, mbfl_convert_filter *filter)
99101
{
100102
(filter->filter_function)(c, filter);
@@ -1510,3 +1512,91 @@ mbfl_string *mbfl_html_numeric_entity_decode(mbfl_string *string, mbfl_string *r
15101512
mbfl_convert_filter_delete(decoder);
15111513
return result;
15121514
}
1515+
1516+
/*
 * Split `string` into consecutive chunks of at most `split_length` characters
 * (in the string's own encoding) and return them as a new array of PHP string
 * zvals, in input order.
 *
 * string        input bytes, byte length and encoding; the bytes are only read
 * split_length  number of characters per chunk; must be non-zero, since it is
 *               used as a divisor when sizing the result array
 *
 * Returns a HashTable created with zend_new_array(). An empty input yields an
 * empty array. The final chunk may hold fewer than `split_length` characters.
 *
 * Three strategies are used, depending on the encoding:
 *   1. fixed-width encodings (1, 2 or 4 bytes per character): slice by bytes
 *   2. encodings with a byte-length table: walk the string using the table
 *   3. everything else: convert to wchar, then re-encode chunk by chunk
 */
HashTable *mbfl_str_split(mbfl_string *string, unsigned int split_length)
{
	HashTable *result;
	zval chunk;
	unsigned char *p = string->val, *last = p + string->len;
	const mbfl_encoding *encoding = string->encoding;
	size_t char_width = 0; /* bytes per character; 0 means "not fixed-width" */

	/* A zero split_length would divide by zero below and never make progress */
	ZEND_ASSERT(split_length > 0);

	if (encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */
		char_width = 1;
	} else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { /* 2 bytes */
		char_width = 2;
	} else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { /* 4 bytes */
		char_width = 4;
	}

	if (char_width != 0) {
		/* first scenario: 1/2/4-byte fixed width encoding */
		size_t mb_len = string->len / char_width;
		/* Widen before multiplying so the product is not computed in 32 bits */
		size_t chunk_len = (size_t)split_length * char_width; /* chunk length in bytes */

		if (chunk_len / char_width != split_length) {
			/* Product does not fit in size_t, so it necessarily exceeds the
			 * input length: the whole string is a single chunk */
			chunk_len = string->len;
		}

		result = zend_new_array((mb_len + split_length - 1) / split_length); /* round up */

		while (p < last) {
			/* Clamp the final chunk to the bytes actually left, so we never
			 * read (or advance the cursor) past the end of the input */
			size_t remaining = last - p;
			size_t n = (remaining < chunk_len) ? remaining : chunk_len;

			ZVAL_STRINGL(&chunk, (const char*)p, n);
			zend_hash_next_index_insert(result, &chunk);
			p += n;
		}

		return result;
	}

	/* Size hint for the remaining scenarios: assume 1-byte characters */
	result = zend_new_array((string->len + split_length - 1) / split_length); /* round up */

	if (encoding->mblen_table) {
		/* second scenario: variable width encoding with length table */
		const unsigned char *mbtab = encoding->mblen_table;

		while (p < last) {
			unsigned char *chunk_p = p; /* pointer to first byte in chunk */

			for (unsigned int char_count = 0; char_count < split_length && p < last; char_count++) {
				/* The table is indexed by the lead byte. A truncated trailing
				 * character is clamped to the end of the input rather than
				 * letting the cursor run past it */
				size_t char_len = mbtab[*p];
				size_t remaining = last - p;

				p += (char_len < remaining) ? char_len : remaining;
			}

			ZVAL_STRINGL(&chunk, (const char*)chunk_p, p - chunk_p);
			zend_hash_next_index_insert(result, &chunk);
		}

		return result;
	}

	/* third scenario: other multibyte encodings */

	/* Filter which re-encodes wchars into the target encoding, collecting the
	 * output bytes of the current chunk in `device` */
	mbfl_memory_device device;
	mbfl_memory_device_init(&device, split_length + 1, 0);
	mbfl_convert_filter *decoder = mbfl_convert_filter_new(&mbfl_encoding_wchar, encoding,
		mbfl_memory_device_output, NULL, &device);
	ZEND_ASSERT(decoder);

	size_t len;
	unsigned int *wc_buffer = convert_string_to_wchar(string, &len);
	unsigned int *w = wc_buffer, *e = wc_buffer + len;
	unsigned int chars_in_chunk = 0;

	while (w < e) {
		(decoder->filter_function)(*w++, decoder);

		if (++chars_in_chunk == split_length) { /* current chunk is full */
			/* Conversion filters may buffer output; flush before reading the device */
			mbfl_convert_filter_flush(decoder);
			ZVAL_STRINGL(&chunk, (const char*)device.buffer, device.pos);
			zend_hash_next_index_insert(result, &chunk);
			mbfl_memory_device_reset(&device);
			chars_in_chunk = 0;
		}
	}

	if (chars_in_chunk > 0) {
		/* Trailing, partially filled chunk */
		mbfl_convert_filter_flush(decoder);
		ZVAL_STRINGL(&chunk, (const char*)device.buffer, device.pos);
		zend_hash_next_index_insert(result, &chunk);
	}

	efree(wc_buffer);
	mbfl_memory_device_clear(&device);
	mbfl_convert_filter_delete(decoder);
	return result;
}

ext/mbstring/libmbfl/mbfl/mbfilter.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,4 +309,9 @@ mbfl_html_numeric_entity_decode(mbfl_string *string, mbfl_string *result, int *c
309309
*/
310310
MBFLAPI extern mbfl_string* mbfl_ja_jp_hantozen(mbfl_string *string, mbfl_string *result, intptr_t mode);
311311

312+
/*
313+
* str_split
314+
*/
315+
MBFLAPI extern HashTable* mbfl_str_split(mbfl_string *string, unsigned int split_length);
316+
312317
#endif /* MBFL_MBFILTER_H */

ext/mbstring/mbstring.c

Lines changed: 7 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -1712,41 +1712,10 @@ PHP_FUNCTION(mb_output_handler)
17121712
/* {{{ Convert a multibyte string to an array. If split_length is specified,
17131713
break the string down into chunks each split_length characters long. */
17141714

1715-
/* structure to pass split params to the callback */
1716-
struct mbfl_split_params {
1717-
zval *return_value; /* php function return value structure pointer */
1718-
mbfl_string *result_string; /* string to store result chunk */
1719-
size_t mb_chunk_length; /* actual chunk length in chars */
1720-
size_t split_length; /* split length in chars */
1721-
mbfl_convert_filter *next_filter; /* widechar to encoding converter */
1722-
};
1723-
1724-
/* callback function to fill split array */
1725-
static int mbfl_split_output(int c, void *data)
1726-
{
1727-
struct mbfl_split_params *params = (struct mbfl_split_params *)data; /* cast passed data */
1728-
1729-
(*params->next_filter->filter_function)(c, params->next_filter); /* decoder filter */
1730-
1731-
if (params->split_length == ++params->mb_chunk_length) { /* if current chunk size reached defined chunk size or last char reached */
1732-
mbfl_convert_filter_flush(params->next_filter);/* concatenate separate decoded chars to the solid string */
1733-
mbfl_memory_device *device = (mbfl_memory_device *)params->next_filter->data; /* chars container */
1734-
mbfl_string *chunk = params->result_string;
1735-
mbfl_memory_device_result(device, chunk); /* make chunk */
1736-
add_next_index_stringl(params->return_value, (const char *)chunk->val, chunk->len); /* add chunk to the array */
1737-
efree(chunk->val);
1738-
params->mb_chunk_length = 0; /* reset mb_chunk size */
1739-
}
1740-
1741-
return 0;
1742-
}
1743-
17441715
/* TODO Document this function on php.net */
17451716
PHP_FUNCTION(mb_str_split)
17461717
{
17471718
zend_string *str, *encoding = NULL;
1748-
size_t mb_len, chunks, chunk_len;
1749-
mbfl_string string, result_string;
17501719
zend_long split_length = 1;
17511720

17521721
ZEND_PARSE_PARAMETERS_START(1, 3)
@@ -1761,91 +1730,17 @@ PHP_FUNCTION(mb_str_split)
17611730
RETURN_THROWS();
17621731
}
17631732

1764-
string.val = (unsigned char *) ZSTR_VAL(str);
1765-
string.len = ZSTR_LEN(str);
1766-
const mbfl_encoding *mbfl_encoding = string.encoding = php_mb_get_encoding(encoding, 3);
1767-
if (!string.encoding) {
1733+
const mbfl_encoding *mbfl_encoding = php_mb_get_encoding(encoding, 3);
1734+
if (!mbfl_encoding) {
17681735
RETURN_THROWS();
17691736
}
17701737

1771-
const char *p = ZSTR_VAL(str);
1772-
const char *last = ZSTR_VAL(str) + ZSTR_LEN(str);
1773-
1774-
/* first scenario: 1/2/4-byte fixed width encoding */
1775-
if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */
1776-
mb_len = string.len;
1777-
chunk_len = (size_t)split_length; /* chunk length in bytes */
1778-
} else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { /* 2 bytes */
1779-
mb_len = string.len / 2;
1780-
chunk_len = split_length * 2;
1781-
} else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { /* 4 bytes */
1782-
mb_len = string.len / 4;
1783-
chunk_len = split_length * 4;
1784-
} else if (mbfl_encoding->mblen_table) {
1785-
/* second scenario: variable width encoding with length table */
1786-
const unsigned char *mbtab = mbfl_encoding->mblen_table;
1787-
1788-
/* assume that we have 1-byte characters */
1789-
array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1790-
1791-
while (p < last) {
1792-
char *chunk_p = p; /* pointer to first byte in chunk */
1793-
1794-
for (int char_count = 0; char_count < split_length && p < last; char_count++) {
1795-
p += mbtab[*(unsigned char*)p]; /* character byte length table */
1796-
}
1797-
if (p > last) { /* check if chunk is in bounds */
1798-
p = last;
1799-
}
1800-
add_next_index_stringl(return_value, chunk_p, p - chunk_p);
1801-
}
1802-
return;
1803-
} else {
1804-
/* third scenario: other multibyte encodings */
1805-
/* assume that we have 1-byte characters */
1806-
array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1807-
1808-
/* decoder filter to decode wchar to encoding */
1809-
mbfl_memory_device device;
1810-
mbfl_memory_device_init(&device, split_length + 1, 0);
1811-
1812-
mbfl_convert_filter *decoder = mbfl_convert_filter_new(&mbfl_encoding_wchar, string.encoding,
1813-
mbfl_memory_device_output, NULL, &device);
1814-
ZEND_ASSERT(decoder);
1815-
1816-
/* wchar filter */
1817-
mbfl_string_init(&result_string); /* mbfl_string to store chunk in the callback */
1818-
struct mbfl_split_params params = { /* init callback function params structure */
1819-
.return_value = return_value,
1820-
.result_string = &result_string,
1821-
.mb_chunk_length = 0,
1822-
.split_length = (size_t)split_length,
1823-
.next_filter = decoder,
1824-
};
1825-
1826-
mbfl_convert_filter *filter = mbfl_convert_filter_new(string.encoding, &mbfl_encoding_wchar,
1827-
mbfl_split_output, NULL, &params);
1828-
ZEND_ASSERT(filter);
1829-
1830-
while (p < last - 1) { /* cycle each byte except last with callback function */
1831-
(*filter->filter_function)(*p++, filter);
1832-
}
1833-
params.mb_chunk_length = split_length - 1; /* force to finish current chunk */
1834-
(*filter->filter_function)(*p++, filter); /* process last char */
1835-
1836-
mbfl_convert_filter_delete(decoder);
1837-
mbfl_convert_filter_delete(filter);
1838-
mbfl_memory_device_clear(&device);
1839-
return;
1840-
}
1738+
mbfl_string string;
1739+
string.val = (unsigned char*)ZSTR_VAL(str);
1740+
string.len = ZSTR_LEN(str);
1741+
string.encoding = mbfl_encoding;
18411742

1842-
/* first scenario: 1/2/4-byte fixed width encoding */
1843-
chunks = (mb_len + split_length - 1) / split_length; /* round up */
1844-
array_init_size(return_value, chunks);
1845-
while (chunks--) {
1846-
add_next_index_stringl(return_value, p, chunk_len);
1847-
p += chunk_len;
1848-
}
1743+
RETVAL_ARR(mbfl_str_split(&string, split_length));
18491744
}
18501745
/* }}} */
18511746

0 commit comments

Comments
 (0)