Skip to content

Commit c5c8d74

Browse files
committed
[WIP] Move meat of mb_str_split to mbfl_str_split (in mbfilter.c)
This follows the pattern of most other PHP-level functions in mbstring: mbstring.c handles marshalling of PHP-level arguments and return values, as well as .INI parameters, built-in globals, and so on, and calls into functions in mbfilter.c for the core logic. Note: this also fixes a bug. After feeding the entire input string into a conversion filter, mb_str_split never called the corresponding flush function. That is a no-no in mbfl: conversion filters may cache bytes and only send them to the output when flushed. I still need to add a test that exposes the bug.
1 parent 3297f92 commit c5c8d74

File tree

3 files changed

+105
-112
lines changed

3 files changed

+105
-112
lines changed

ext/mbstring/libmbfl/mbfl/mbfilter.c

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@
9696
#include "eaw_table.h"
9797
#include "filters/unicode_prop.h"
9898

99+
#include "zend_API.h"
100+
99101
static inline void mbfl_filter_feed_char(int c, mbfl_convert_filter *filter)
100102
{
101103
(filter->filter_function)(c, filter);
@@ -1595,3 +1597,94 @@ mbfl_string *mbfl_html_numeric_entity_decode(mbfl_string *string, mbfl_string *r
15951597
mbfl_convert_filter_delete(decoder);
15961598
return result;
15971599
}
1600+
1601+
/*
1602+
* str_split
1603+
*/
1604+
HashTable *mbfl_str_split(mbfl_string *string, unsigned int split_length)
1605+
{
1606+
HashTable *result = NULL;
1607+
unsigned int mb_len;
1608+
size_t chunk_len = 0;
1609+
zval chunk;
1610+
unsigned char *p = string->val, *last = p + string->len;
1611+
const mbfl_encoding *encoding = string->encoding;
1612+
1613+
/* first scenario: 1/2/4-byte fixed width encoding */
1614+
if (encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */
1615+
mb_len = string->len;
1616+
chunk_len = split_length; /* chunk length in bytes */
1617+
} else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { /* 2 bytes */
1618+
mb_len = string->len / 2;
1619+
chunk_len = split_length * 2;
1620+
} else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { /* 4 bytes */
1621+
mb_len = string->len / 4;
1622+
chunk_len = split_length * 4;
1623+
} else if (encoding->mblen_table) {
1624+
/* second scenario: variable width encoding with length table */
1625+
const unsigned char *mbtab = encoding->mblen_table;
1626+
1627+
/* assume that we have 1-byte characters */
1628+
result = zend_new_array((string->len + split_length - 1) / split_length); /* round up */
1629+
1630+
while (p < last) {
1631+
unsigned char *chunk_p = p; /* pointer to first byte in chunk */
1632+
1633+
for (int char_count = 0; char_count < split_length && p < last; char_count++) {
1634+
p += mbtab[*(unsigned char*)p]; /* character byte length table */
1635+
}
1636+
if (p > last) { /* check if chunk is in bounds */
1637+
p = last;
1638+
}
1639+
ZVAL_STRINGL(&chunk, (const char*)chunk_p, p - chunk_p);
1640+
zend_hash_next_index_insert(result, &chunk);
1641+
}
1642+
1643+
return result;
1644+
} else {
1645+
/* third scenario: other multibyte encodings */
1646+
/* assume that we have 1-byte characters */
1647+
result = zend_new_array((string->len + split_length - 1) / split_length); /* round up */
1648+
1649+
/* decoder filter to decode wchar to encoding */
1650+
mbfl_memory_device device;
1651+
mbfl_memory_device_init(&device, split_length + 1, 0);
1652+
mbfl_convert_filter *decoder = mbfl_convert_filter_new(&mbfl_encoding_wchar, encoding,
1653+
mbfl_memory_device_output, NULL, &device);
1654+
ZEND_ASSERT(decoder);
1655+
1656+
size_t len;
1657+
unsigned int *wc_buffer = convert_string_to_wchar(string, &len), *w = wc_buffer, *e = wc_buffer + len;
1658+
1659+
while (w < e) {
1660+
(decoder->filter_function)(*w++, decoder);
1661+
1662+
if (split_length == ++chunk_len) { /* if current chunk size reached defined chunk size */
1663+
mbfl_convert_filter_flush(decoder);
1664+
ZVAL_STRINGL(&chunk, (const char*)device.buffer, device.pos);
1665+
zend_hash_next_index_insert(result, &chunk);
1666+
mbfl_memory_device_reset(&device);
1667+
chunk_len = 0;
1668+
}
1669+
}
1670+
1671+
if (chunk_len > 0) {
1672+
mbfl_convert_filter_flush(decoder);
1673+
ZVAL_STRINGL(&chunk, (const char*)device.buffer, device.pos);
1674+
zend_hash_next_index_insert(result, &chunk);
1675+
}
1676+
1677+
efree(wc_buffer);
1678+
mbfl_memory_device_clear(&device);
1679+
mbfl_convert_filter_delete(decoder);
1680+
return result;
1681+
}
1682+
1683+
/* first scenario: 1/2/4-byte fixed width encoding */
1684+
result = zend_new_array((mb_len + split_length - 1) / split_length); /* round up */
1685+
for (; p < last; p += chunk_len) {
1686+
ZVAL_STRINGL(&chunk, (const char*)p, chunk_len);
1687+
zend_hash_next_index_insert(result, &chunk);
1688+
}
1689+
return result;
1690+
}

ext/mbstring/libmbfl/mbfl/mbfilter.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,4 +309,9 @@ mbfl_html_numeric_entity_decode(mbfl_string *string, mbfl_string *result, int *c
309309
*/
310310
MBFLAPI extern mbfl_string* mbfl_ja_jp_hantozen(mbfl_string *string, mbfl_string *result, intptr_t mode);
311311

312+
/*
313+
* str_split
314+
*/
315+
/* Splits `string` into chunks of `split_length` characters (the last chunk may
 * be shorter) and returns them as a newly created HashTable of PHP strings.
 * `split_length` is used as a divisor and must be greater than zero. */
MBFLAPI extern HashTable* mbfl_str_split(mbfl_string *string, unsigned int split_length);
316+
312317
#endif /* MBFL_MBFILTER_H */

ext/mbstring/mbstring.c

Lines changed: 7 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -1712,41 +1712,10 @@ PHP_FUNCTION(mb_output_handler)
17121712
/* {{{ Convert a multibyte string to an array. If split_length is specified,
17131713
break the string down into chunks each split_length characters long. */
17141714

1715-
/* structure to pass split params to the callback */
1716-
struct mbfl_split_params {
1717-
zval *return_value; /* php function return value structure pointer */
1718-
mbfl_string *result_string; /* string to store result chunk */
1719-
size_t mb_chunk_length; /* actual chunk length in chars */
1720-
size_t split_length; /* split length in chars */
1721-
mbfl_convert_filter *next_filter; /* widechar to encoding converter */
1722-
};
1723-
1724-
/* callback function to fill split array */
1725-
static int mbfl_split_output(int c, void *data)
1726-
{
1727-
struct mbfl_split_params *params = (struct mbfl_split_params *)data; /* cast passed data */
1728-
1729-
(*params->next_filter->filter_function)(c, params->next_filter); /* decoder filter */
1730-
1731-
if (params->split_length == ++params->mb_chunk_length) { /* if current chunk size reached defined chunk size or last char reached */
1732-
mbfl_convert_filter_flush(params->next_filter);/* concatenate separate decoded chars to the solid string */
1733-
mbfl_memory_device *device = (mbfl_memory_device *)params->next_filter->data; /* chars container */
1734-
mbfl_string *chunk = params->result_string;
1735-
mbfl_memory_device_result(device, chunk); /* make chunk */
1736-
add_next_index_stringl(params->return_value, (const char *)chunk->val, chunk->len); /* add chunk to the array */
1737-
efree(chunk->val);
1738-
params->mb_chunk_length = 0; /* reset mb_chunk size */
1739-
}
1740-
1741-
return 0;
1742-
}
1743-
17441715
/* TODO Document this function on php.net */
17451716
PHP_FUNCTION(mb_str_split)
17461717
{
17471718
zend_string *str, *encoding = NULL;
1748-
size_t mb_len, chunks, chunk_len;
1749-
mbfl_string string, result_string;
17501719
zend_long split_length = 1;
17511720

17521721
ZEND_PARSE_PARAMETERS_START(1, 3)
@@ -1761,91 +1730,17 @@ PHP_FUNCTION(mb_str_split)
17611730
RETURN_THROWS();
17621731
}
17631732

1764-
string.val = (unsigned char *) ZSTR_VAL(str);
1765-
string.len = ZSTR_LEN(str);
1766-
const mbfl_encoding *mbfl_encoding = string.encoding = php_mb_get_encoding(encoding, 3);
1767-
if (!string.encoding) {
1733+
const mbfl_encoding *mbfl_encoding = php_mb_get_encoding(encoding, 3);
1734+
if (!mbfl_encoding) {
17681735
RETURN_THROWS();
17691736
}
17701737

1771-
const char *p = ZSTR_VAL(str);
1772-
const char *last = ZSTR_VAL(str) + ZSTR_LEN(str);
1773-
1774-
/* first scenario: 1/2/4-byte fixed width encoding */
1775-
if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */
1776-
mb_len = string.len;
1777-
chunk_len = (size_t)split_length; /* chunk length in bytes */
1778-
} else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { /* 2 bytes */
1779-
mb_len = string.len / 2;
1780-
chunk_len = split_length * 2;
1781-
} else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { /* 4 bytes */
1782-
mb_len = string.len / 4;
1783-
chunk_len = split_length * 4;
1784-
} else if (mbfl_encoding->mblen_table) {
1785-
/* second scenario: variable width encoding with length table */
1786-
const unsigned char *mbtab = mbfl_encoding->mblen_table;
1787-
1788-
/* assume that we have 1-byte characters */
1789-
array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1790-
1791-
while (p < last) {
1792-
char *chunk_p = p; /* pointer to first byte in chunk */
1793-
1794-
for (int char_count = 0; char_count < split_length && p < last; char_count++) {
1795-
p += mbtab[*(unsigned char*)p]; /* character byte length table */
1796-
}
1797-
if (p > last) { /* check if chunk is in bounds */
1798-
p = last;
1799-
}
1800-
add_next_index_stringl(return_value, chunk_p, p - chunk_p);
1801-
}
1802-
return;
1803-
} else {
1804-
/* third scenario: other multibyte encodings */
1805-
/* assume that we have 1-byte characters */
1806-
array_init_size(return_value, (string.len + split_length) / split_length); /* round up */
1807-
1808-
/* decoder filter to decode wchar to encoding */
1809-
mbfl_memory_device device;
1810-
mbfl_memory_device_init(&device, split_length + 1, 0);
1811-
1812-
mbfl_convert_filter *decoder = mbfl_convert_filter_new(&mbfl_encoding_wchar, string.encoding,
1813-
mbfl_memory_device_output, NULL, &device);
1814-
ZEND_ASSERT(decoder);
1815-
1816-
/* wchar filter */
1817-
mbfl_string_init(&result_string); /* mbfl_string to store chunk in the callback */
1818-
struct mbfl_split_params params = { /* init callback function params structure */
1819-
.return_value = return_value,
1820-
.result_string = &result_string,
1821-
.mb_chunk_length = 0,
1822-
.split_length = (size_t)split_length,
1823-
.next_filter = decoder,
1824-
};
1825-
1826-
mbfl_convert_filter *filter = mbfl_convert_filter_new(string.encoding, &mbfl_encoding_wchar,
1827-
mbfl_split_output, NULL, &params);
1828-
ZEND_ASSERT(filter);
1829-
1830-
while (p < last - 1) { /* cycle each byte except last with callback function */
1831-
(*filter->filter_function)(*p++, filter);
1832-
}
1833-
params.mb_chunk_length = split_length - 1; /* force to finish current chunk */
1834-
(*filter->filter_function)(*p++, filter); /* process last char */
1835-
1836-
mbfl_convert_filter_delete(decoder);
1837-
mbfl_convert_filter_delete(filter);
1838-
mbfl_memory_device_clear(&device);
1839-
return;
1840-
}
1738+
mbfl_string string;
1739+
string.val = (unsigned char*)ZSTR_VAL(str);
1740+
string.len = ZSTR_LEN(str);
1741+
string.encoding = mbfl_encoding;
18411742

1842-
/* first scenario: 1/2/4-byte fixed width encoding */
1843-
chunks = (mb_len + split_length - 1) / split_length; /* round up */
1844-
array_init_size(return_value, chunks);
1845-
while (chunks--) {
1846-
add_next_index_stringl(return_value, p, chunk_len);
1847-
p += chunk_len;
1848-
}
1743+
RETVAL_ARR(mbfl_str_split(&string, split_length));
18491744
}
18501745
/* }}} */
18511746

0 commit comments

Comments
 (0)