Skip to content

Commit 896f396

Browse files
Ivan Koptelov authored and kyukhin committed
sql: rename instr to position & add collation usage
Before this patch we had the instr() SQL function. After the patch it is renamed to position() for better ANSI compatibility. A few other things have also changed: the argument order, the allowed argument types, and the usage of collations. Note: after the patch the position() syntax still differs from ANSI. The desired syntax is POSITION(substring IN string). It is not possible to implement this right now, because our SQL grammar lacks expression types: we have only the basic 'expr' type, so it is not possible to write an unambiguous rule for POSITION IN. To solve this we have to refactor the grammar and add something like 'string_expr' (as is done in other DBs' grammars). Workaround for #3933 @TarantoolBot document Title: instr() is replaced with position() The name and order of the arguments have changed for better ANSI compatibility: Before: instr(haystack, needle). After: position(needle, haystack). Type checking became stricter: before, it was possible to call the function with INTEGER arguments or with arguments of different types. Now both arguments must have the same type and be either text or binary strings. Before the patch, collations were not taken into consideration during the search. Now this is fixed, and both implicit (column) collations and explicit ones (specified with a COLLATE expression) are used. The single collation used by the function is determined using the ANSI “Type combination” rules.
1 parent 86e8054 commit 896f396

File tree

6 files changed

+996
-761
lines changed

6 files changed

+996
-761
lines changed

src/box/sql/func.c

Lines changed: 126 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
#include "vdbeInt.h"
3939
#include "version.h"
4040
#include "coll/coll.h"
41+
#include "tarantoolInt.h"
4142
#include <unicode/ustring.h>
4243
#include <unicode/ucasemap.h>
4344
#include <unicode/ucnv.h>
@@ -211,61 +212,140 @@ absFunc(sql_context * context, int argc, sql_value ** argv)
211212
}
212213
}
213214

214-
/*
215-
* Implementation of the instr() function.
215+
/**
216+
* Implementation of the position() function.
216217
*
217-
* instr(haystack,needle) finds the first occurrence of needle
218-
* in haystack and returns the number of previous characters plus 1,
219-
* or 0 if needle does not occur within haystack.
218+
* position(needle, haystack) finds the first occurrence of needle
219+
* in haystack and returns the number of previous characters
220+
* plus 1, or 0 if needle does not occur within haystack.
220221
*
221-
* If both haystack and needle are BLOBs, then the result is one more than
222-
* the number of bytes in haystack prior to the first occurrence of needle,
223-
* or 0 if needle never occurs in haystack.
222+
* If both haystack and needle are BLOBs, then the result is one
223+
* more than the number of bytes in haystack prior to the first
224+
* occurrence of needle, or 0 if needle never occurs in haystack.
224225
*/
225226
static void
226-
instrFunc(sql_context * context, int argc, sql_value ** argv)
227+
position_func(struct sql_context *context, int argc, struct Mem **argv)
227228
{
228-
const unsigned char *zHaystack;
229-
const unsigned char *zNeedle;
230-
int nHaystack;
231-
int nNeedle;
232-
int typeHaystack, typeNeedle;
233-
int N = 1;
234-
int isText;
235-
236229
UNUSED_PARAMETER(argc);
237-
typeHaystack = sql_value_type(argv[0]);
238-
typeNeedle = sql_value_type(argv[1]);
239-
if (typeHaystack == SQL_NULL || typeNeedle == SQL_NULL)
230+
struct Mem *needle = argv[0];
231+
struct Mem *haystack = argv[1];
232+
int needle_type = sql_value_type(needle);
233+
int haystack_type = sql_value_type(haystack);
234+
235+
if (haystack_type == SQL_NULL || needle_type == SQL_NULL)
236+
return;
237+
/*
238+
* Position function can be called only with string
239+
* or blob params.
240+
*/
241+
struct Mem *inconsistent_type_arg = NULL;
242+
if (needle_type != SQL_TEXT && needle_type != SQL_BLOB)
243+
inconsistent_type_arg = needle;
244+
if (haystack_type != SQL_TEXT && haystack_type != SQL_BLOB)
245+
inconsistent_type_arg = haystack;
246+
if (inconsistent_type_arg != NULL) {
247+
diag_set(ClientError, ER_INCONSISTENT_TYPES, "TEXT or BLOB",
248+
mem_type_to_str(inconsistent_type_arg));
249+
context->isError = SQL_TARANTOOL_ERROR;
250+
context->fErrorOrAux = 1;
251+
return;
252+
}
253+
/*
254+
* Both params of Position function must be of the same
255+
* type.
256+
*/
257+
if (haystack_type != needle_type) {
258+
diag_set(ClientError, ER_INCONSISTENT_TYPES,
259+
mem_type_to_str(needle), mem_type_to_str(haystack));
260+
context->isError = SQL_TARANTOOL_ERROR;
261+
context->fErrorOrAux = 1;
240262
return;
241-
nHaystack = sql_value_bytes(argv[0]);
242-
nNeedle = sql_value_bytes(argv[1]);
243-
if (nNeedle > 0) {
244-
if (typeHaystack == SQL_BLOB && typeNeedle == SQL_BLOB) {
245-
zHaystack = sql_value_blob(argv[0]);
246-
zNeedle = sql_value_blob(argv[1]);
247-
assert(zNeedle != 0);
248-
assert(zHaystack != 0 || nHaystack == 0);
249-
isText = 0;
263+
}
264+
265+
int n_needle_bytes = sql_value_bytes(needle);
266+
int n_haystack_bytes = sql_value_bytes(haystack);
267+
int position = 1;
268+
if (n_needle_bytes > 0) {
269+
const unsigned char *haystack_str;
270+
const unsigned char *needle_str;
271+
if (haystack_type == SQL_BLOB) {
272+
needle_str = sql_value_blob(needle);
273+
haystack_str = sql_value_blob(haystack);
274+
assert(needle_str != NULL);
275+
assert(haystack_str != NULL || n_haystack_bytes == 0);
276+
/*
277+
* Naive implementation of substring
278+
* searching: matching time O(n * m).
279+
* Can be improved.
280+
*/
281+
while (n_needle_bytes <= n_haystack_bytes &&
282+
memcmp(haystack_str, needle_str, n_needle_bytes) != 0) {
283+
position++;
284+
n_haystack_bytes--;
285+
haystack_str++;
286+
}
287+
if (n_needle_bytes > n_haystack_bytes)
288+
position = 0;
250289
} else {
251-
zHaystack = sql_value_text(argv[0]);
252-
zNeedle = sql_value_text(argv[1]);
253-
isText = 1;
254-
if (zHaystack == 0 || zNeedle == 0)
255-
return;
256-
}
257-
while (nNeedle <= nHaystack
258-
&& memcmp(zHaystack, zNeedle, nNeedle) != 0) {
259-
N++;
260-
do {
261-
nHaystack--;
262-
zHaystack++;
263-
} while (isText && (zHaystack[0] & 0xc0) == 0x80);
290+
/*
291+
* Code below handles not only simple
292+
* cases like position('a', 'bca'), but
293+
* also more complex ones:
294+
* position('a', 'bcá' COLLATE "unicode_ci")
295+
* To do so, we need to use comparison
296+
* window, which has constant character
297+
* size, but variable byte size.
298+
* Character size is equal to
299+
* needle char size.
300+
*/
301+
haystack_str = sql_value_text(haystack);
302+
needle_str = sql_value_text(needle);
303+
304+
int n_needle_chars =
305+
sql_utf8_char_count(needle_str, n_needle_bytes);
306+
int n_haystack_chars =
307+
sql_utf8_char_count(haystack_str,
308+
n_haystack_bytes);
309+
310+
if (n_haystack_chars < n_needle_chars) {
311+
position = 0;
312+
goto finish;
313+
}
314+
/*
315+
* Comparison window is determined by
316+
* beg_offset and end_offset. beg_offset
317+
* is offset in bytes from haystack
318+
* beginning to window beginning.
319+
* end_offset is offset in bytes from
320+
* haystack beginning to window end.
321+
*/
322+
int end_offset = 0;
323+
for (int c = 0; c < n_needle_chars; c++) {
324+
SQL_UTF8_FWD_1(haystack_str, end_offset,
325+
n_haystack_bytes);
326+
}
327+
int beg_offset = 0;
328+
struct coll *coll = sqlGetFuncCollSeq(context);
329+
int c;
330+
for (c = 0; c + n_needle_chars <= n_haystack_chars; c++) {
331+
if (coll->cmp((const char *) haystack_str + beg_offset,
332+
end_offset - beg_offset,
333+
(const char *) needle_str,
334+
n_needle_bytes, coll) == 0)
335+
goto finish;
336+
position++;
337+
/* Update offsets. */
338+
SQL_UTF8_FWD_1(haystack_str, beg_offset,
339+
n_haystack_bytes);
340+
SQL_UTF8_FWD_1(haystack_str, end_offset,
341+
n_haystack_bytes);
342+
}
343+
/* Needle was not found in the haystack. */
344+
position = 0;
264345
}
265-
if (nNeedle > nHaystack)
266-
N = 0;
267346
}
268-
sql_result_int(context, N);
347+
finish:
348+
sql_result_int(context, position);
269349
}
270350

271351
/*
@@ -1756,7 +1836,7 @@ sqlRegisterBuiltinFunctions(void)
17561836
FIELD_TYPE_STRING),
17571837
FUNCTION2(length, 1, 0, 0, lengthFunc, SQL_FUNC_LENGTH,
17581838
FIELD_TYPE_INTEGER),
1759-
FUNCTION(instr, 2, 0, 0, instrFunc, FIELD_TYPE_INTEGER),
1839+
FUNCTION(position, 2, 0, 1, position_func, FIELD_TYPE_INTEGER),
17601840
FUNCTION(printf, -1, 0, 0, printfFunc, FIELD_TYPE_STRING),
17611841
FUNCTION(unicode, 1, 0, 0, unicodeFunc, FIELD_TYPE_STRING),
17621842
FUNCTION(char, -1, 0, 0, charFunc, FIELD_TYPE_STRING),

test/sql-tap/func.test.lua

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2860,25 +2860,25 @@ test:do_execsql_test(
28602860
SELECT GROUP_CONCAT(b, '') FROM t100;",
28612861
{string.char(00,65,00,65,00)})
28622862

2863-
-- INSTR
2863+
-- POSITION
28642864
test:do_execsql_test(
28652865
"func-73",
2866-
"SELECT INSTR(CHAR(00,65,00,66,00), CHAR(65));",
2866+
"SELECT POSITION(CHAR(65), CHAR(00,65,00,66,00));",
28672867
{2})
28682868

28692869
test:do_execsql_test(
28702870
"func-74",
2871-
"SELECT INSTR(CHAR(00,65,00,66,00), CHAR(66));",
2871+
"SELECT POSITION(CHAR(66), CHAR(00,65,00,66,00));",
28722872
{4})
28732873

28742874
test:do_execsql_test(
28752875
"func-75",
2876-
"SELECT INSTR(CHAR(00,65,00,66,00), CHAR(00));",
2876+
"SELECT POSITION(CHAR(00), CHAR(00,65,00,66,00));",
28772877
{1})
28782878

28792879
test:do_execsql_test(
28802880
"func-76",
2881-
"SELECT INSTR(CHAR(65,66), CHAR(00));",
2881+
"SELECT POSITION(CHAR(00), CHAR(65,66));",
28822882
{0})
28832883

28842884
test:finish_test()

test/sql-tap/func5.test.lua

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ test:do_execsql_test(
3030
INSERT INTO t1 VALUES(3,'pqr','fuzzy',99);
3131
INSERT INTO t1 VALUES(4,'abcdefg','xy',22);
3232
INSERT INTO t1 VALUES(5,'shoe','mayer',2953);
33-
SELECT x FROM t1 WHERE c=instr('abcdefg',b) OR a='abcdefg' ORDER BY +x;
33+
SELECT x FROM t1 WHERE c=position(b, 'abcdefg') OR a='abcdefg' ORDER BY +x;
3434
]], {
3535
-- <func5-1.1>
3636
2, 4
@@ -40,7 +40,7 @@ test:do_execsql_test(
4040
test:do_execsql_test(
4141
"func5-1.2",
4242
[[
43-
SELECT x FROM t1 WHERE a='abcdefg' OR c=instr('abcdefg',b) ORDER BY +x;
43+
SELECT x FROM t1 WHERE a='abcdefg' OR c=position(b, 'abcdefg') ORDER BY +x;
4444
]], {
4545
-- <func5-1.1>
4646
2, 4

0 commit comments

Comments
 (0)