1 /* -*- c-basic-offset: 2 -*- */
2 /*
3   Copyright(C) 2012-2018  Brazil
4   Copyright(C) 2020-2021  Sutou Kouhei <kou@clear-code.com>
5 
6   This library is free software; you can redistribute it and/or
7   modify it under the terms of the GNU Lesser General Public
8   License version 2.1 as published by the Free Software Foundation.
9 
10   This library is distributed in the hope that it will be useful,
11   but WITHOUT ANY WARRANTY; without even the implied warranty of
12   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13   Lesser General Public License for more details.
14 
15   You should have received a copy of the GNU Lesser General Public
16   License along with this library; if not, write to the Free Software
17   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18 */
19 module groonga_d.tokenizer;
20 
21 
22 private static import groonga_d.groonga;
23 private static import groonga_d.token;
24 private static import groonga_d.tokenizer_query_deprecated;
25 
26 extern(C):
27 nothrow @nogc:
28 
29 /+
30 #include <groonga/plugin.h>
31 #include <groonga/tokenizer_query_deprecated.h>
32 +/
33 
/*
  Special marker characters used by tokenizers, as UTF-8 byte sequences.
  The tokenized delimiter is U+FFFE ("\xEF\xBF\xBE"); the begin mark is
  U+FFEF and the end mark is U+FFF0. Each is 3 bytes long in UTF-8, which
  is what the *_LEN constants record.
 */
enum GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8 = "\xEF\xBF\xBE";
enum GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN = 3;

enum GRN_TOKENIZER_BEGIN_MARK_UTF8 = "\xEF\xBF\xAF";
enum GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN = 3;
enum GRN_TOKENIZER_END_MARK_UTF8 = "\xEF\xBF\xB0";
enum GRN_TOKENIZER_END_MARK_UTF8_LEN = 3;
41 
42 /*
43   grn_tokenizer_charlen() returns the length (#bytes) of the first character
44   in the string specified by `str_ptr' and `str_length'. If the starting bytes
45   are invalid as a character, grn_tokenizer_charlen() returns 0. See
46   groonga_d.groonga.grn_encoding in "groonga.h" for more details of `encoding'
47 
48   Deprecated. Use grn_plugin_charlen() instead.
49  */
50 int grn_tokenizer_charlen(groonga_d.groonga.grn_ctx* ctx, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);
51 
52 /*
53   grn_tokenizer_isspace() returns the length (#bytes) of the first character
54   in the string specified by `str_ptr' and `str_length' if it is a space
55   character. Otherwise, grn_tokenizer_isspace() returns 0.
56 
57   Deprecated. Use grn_plugin_isspace() instead.
58  */
59 int grn_tokenizer_isspace(groonga_d.groonga.grn_ctx* ctx, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);
60 
61 /*
62   grn_tokenizer_is_tokenized_delimiter() returns whether is the first
63   character in the string specified by `str_ptr' and `str_length' the
64   special tokenized delimiter character or not.
65  */
66 groonga_d.groonga.grn_bool grn_tokenizer_is_tokenized_delimiter(groonga_d.groonga.grn_ctx* ctx, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);
67 
68 /*
69   grn_tokenizer_have_tokenized_delimiter() returns whether is there
70   the special delimiter character in the string specified by `str_ptr'
71   and `str_length' the special tokenized delimiter character or not.
72  */
73 
74 //GRN_PLUGIN_EXPORT
75 export groonga_d.groonga.grn_bool grn_tokenizer_have_tokenized_delimiter(groonga_d.groonga.grn_ctx* ctx, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);
76 
77 /*
78   grn_tokenizer_query_open() parses `args' and returns a new object of
79   grn_tokenizer_query. The new object stores information of the query.
80   grn_tokenizer_query_open() normalizes the query if the target table
81   requires normalization. grn_tokenizer_query_open() returns NULL if
82   something goes wrong. Note that grn_tokenizer_query_open() must be called
83   just once in the function that initializes a tokenizer.
84 
85   See `GRN_STRING_*' flags for `normalize_flags'.
86  */
87 
88 //GRN_PLUGIN_EXPORT
89 export groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* grn_tokenizer_query_open(groonga_d.groonga.grn_ctx* ctx, int num_args, groonga_d.groonga.grn_obj** args, uint normalize_flags);
90 
91 /*
92   grn_tokenizer_query_create() is deprecated. Use grn_tokenizer_query_open()
93   instead.
94 */
95 
96 groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* grn_tokenizer_query_create(groonga_d.groonga.grn_ctx* ctx, int num_args, groonga_d.groonga.grn_obj** args);
97 
98 /*
99   grn_tokenizer_query_close() finalizes an object of grn_tokenizer_query
100   and then frees memory allocated for that object.
101  */
102 
103 //GRN_PLUGIN_EXPORT
104 export void grn_tokenizer_query_close(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
105 
106 /*
107   grn_tokenizer_query_destroy() is deprecated. Use grn_tokenizer_query_close()
108   instead.
109  */
110 void grn_tokenizer_query_destroy(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
111 
/* Sets the `GRN_STRING_*' normalize flags to be used for `query'. */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc
grn_tokenizer_query_set_normalize_flags(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query, uint flags);

/* Returns the `GRN_STRING_*' normalize flags of `query'. */
//GRN_PLUGIN_EXPORT
export uint
grn_tokenizer_query_get_normalize_flags(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/* Returns the normalized string of `query' as a grn_obj. */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj*
grn_tokenizer_query_get_normalized_string(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/* Returns the raw (unnormalized) string of `query'; its byte length is
   stored into `*length'. */
//GRN_PLUGIN_EXPORT
export const (char)*
grn_tokenizer_query_get_raw_string(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query, size_t* length);

/* Returns the encoding of `query'. */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_encoding
grn_tokenizer_query_get_encoding(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/* Returns the flags of `query'. */
//GRN_PLUGIN_EXPORT
export uint
grn_tokenizer_query_get_flags(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/* Returns whether `query' contains the special tokenized delimiter
   character or not. */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_bool
grn_tokenizer_query_have_tokenized_delimiter(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/* Returns the tokenize mode of `query' (see grn_tokenize_mode in
   groonga_d.token). */
//GRN_PLUGIN_EXPORT
export groonga_d.token.grn_tokenize_mode
grn_tokenizer_query_get_mode(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/* Returns the lexicon table associated with `query'. */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj*
grn_tokenizer_query_get_lexicon(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/* Returns the index of the current token filter for `query'. */
//GRN_PLUGIN_EXPORT
export uint
grn_tokenizer_query_get_token_filter_index(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/* Returns the source column of `query'. */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj* grn_tokenizer_query_get_source_column(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/* Returns the source record ID of `query'. */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_id grn_tokenizer_query_get_source_id(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/* Returns the index column of `query'. */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj* grn_tokenizer_query_get_index_column(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/* Returns the options object of `query'. */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj* grn_tokenizer_query_get_options(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
163 
164 /*
165   grn_tokenizer_token is needed to return tokens. A grn_tokenizer_token object
166   stores a token to be returned and it must be maintained until a request for
167   next token or finalization comes.
168  */
169 alias grn_tokenizer_token = _grn_tokenizer_token;
170 
171 struct _grn_tokenizer_token
172 {
173 	groonga_d.groonga.grn_obj str;
174 	groonga_d.groonga.grn_obj status;
175 }
176 
177 /*
178   grn_tokenizer_token_init() initializes `token'. Note that an initialized
179   object must be finalized by grn_tokenizer_token_fin().
180  */
181 
182 //GRN_PLUGIN_EXPORT
183 export void grn_tokenizer_token_init(groonga_d.groonga.grn_ctx* ctx, grn_tokenizer_token* token);
184 
185 /*
186   grn_tokenizer_token_fin() finalizes `token' that has been initialized by
187   grn_tokenizer_token_init().
188  */
189 
190 //GRN_PLUGIN_EXPORT
191 export void grn_tokenizer_token_fin(groonga_d.groonga.grn_ctx* ctx, grn_tokenizer_token* token);
192 
193 /*
194  * grn_tokenizer_status is a flag set for tokenizer status codes.
195  * If a document or query contains no tokens, push an empty string with
196  * GRN_TOKENIZER_TOKEN_LAST as a token.
197  *
198  * @deprecated since 4.0.8. Use grn_token_status instead.
199  */
200 alias grn_tokenizer_status = groonga_d.token.grn_token_status;
201 
202 /*
203  * GRN_TOKENIZER_TOKEN_CONTINUE means that the next token is not the last one.
204  *
205  * @deprecated since 4.0.8. Use GRN_TOKEN_CONTINUE instead.
206  */
207 enum GRN_TOKENIZER_TOKEN_CONTINUE = groonga_d.token.GRN_TOKEN_CONTINUE;
208 /*
209  * GRN_TOKENIZER_TOKEN_LAST means that the next token is the last one.
210  *
211  * @deprecated since 4.0.8. Use GRN_TOKEN_LAST instead.
212  */
213 enum GRN_TOKENIZER_TOKEN_LAST = groonga_d.token.GRN_TOKEN_LAST;
214 /*
215  * GRN_TOKENIZER_TOKEN_OVERLAP means that ...
216  *
217  * @deprecated since 4.0.8. Use GRN_TOKEN_OVERLAP instead.
218  */
219 enum GRN_TOKENIZER_TOKEN_OVERLAP = groonga_d.token.GRN_TOKEN_OVERLAP;
220 /*
221  * GRN_TOKENIZER_TOKEN_UNMATURED means that ...
222  *
223  * @deprecated since 4.0.8. Use GRN_TOKEN_UNMATURED instead.
224  */
225 enum GRN_TOKENIZER_TOKEN_UNMATURED = groonga_d.token.GRN_TOKEN_UNMATURED;
226 /*
227  * GRN_TOKENIZER_TOKEN_REACH_END means that ...
228  *
229  * @deprecated since 4.0.8. Use GRN_TOKEN_REACH_END instead.
230  */
231 enum GRN_TOKENIZER_TOKEN_REACH_END = groonga_d.token.GRN_TOKEN_REACH_END;
232 /*
233  * GRN_TOKENIZER_TOKEN_SKIP means that the token is skipped
234  *
235  * @deprecated since 4.0.8. Use GRN_TOKEN_SKIP instead.
236  */
237 enum GRN_TOKENIZER_TOKEN_SKIP = groonga_d.token.GRN_TOKEN_SKIP;
238 /*
239  * GRN_TOKENIZER_TOKEN_SKIP_WITH_POSITION means that the token and postion is skipped
240  *
241  * @deprecated since 4.0.8. Use GRN_TOKEN_SKIP_WITH_POSITION instead.
242  */
243 enum GRN_TOKENIZER_TOKEN_SKIP_WITH_POSITION = groonga_d.token.GRN_TOKEN_SKIP_WITH_POSITION;
244 /*
245  * GRN_TOKENIZER_TOKEN_FORCE_PREIX that the token is used common prefix search
246  *
247  * @deprecated since 4.0.8. Use GRN_TOKEN_FORCE_PREIX instead.
248  */
249 enum GRN_TOKENIZER_TOKEN_FORCE_PREFIX = groonga_d.token.GRN_TOKEN_FORCE_PREFIX;
250 
251 /*
252  * GRN_TOKENIZER_CONTINUE and GRN_TOKENIZER_LAST are deprecated. They
253  * are just for backward compatibility. Use
254  * GRN_TOKENIZER_TOKEN_CONTINUE and GRN_TOKENIZER_TOKEN_LAST
255  * instead.
256  */
257 enum GRN_TOKENIZER_CONTINUE = GRN_TOKENIZER_TOKEN_CONTINUE;
258 enum GRN_TOKENIZER_LAST = GRN_TOKENIZER_TOKEN_LAST;
259 
260 /*
261   grn_tokenizer_token_push() pushes the next token into `token'. Note that
262   grn_tokenizer_token_push() does not make a copy of the given string. This
263   means that you have to maintain a memory space allocated to the string.
264   Also note that the grn_tokenizer_token object must be maintained until the
265   request for the next token or finalization comes. See grn_token_status in
266   this header for more details of `status'.
267  */
268 
269 //GRN_PLUGIN_EXPORT
270 export void grn_tokenizer_token_push(groonga_d.groonga.grn_ctx* ctx, grn_tokenizer_token* token, const (char)* str_ptr, uint str_length, groonga_d.token.grn_token_status status);
271 
272 /*
273   grn_tokenizer_tokenized_delimiter_next() extracts the next token
274   from the string specified by `str_ptr' and `str_length' and pushes
275   the next token into `token'. It returns the string after the next
276   token. The returned string may be `NULL' when all tokens are
277   extracted.
278 
279   @deprecated since 8.0.9. It's for old tokenizer next API. Use
280   grn_tokenizer_next_by_tokenized_delimiter() for new tokenizer next
281   API (grn_tokenizer_next_func).
282  */
283 
284 //GRN_PLUGIN_EXPORT
285 export const (char)* grn_tokenizer_tokenized_delimiter_next(groonga_d.groonga.grn_ctx* ctx, grn_tokenizer_token* token, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);
286 
287 /*
288   Extract the next token by delimiting by
289   GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8.
290 
291   This is for grn_tokenizer_next_func.
292 
293   @since 8.0.9.
294  */
295 
296 //GRN_PLUGIN_EXPORT
297 export const (char)*
298 grn_tokenizer_next_by_tokenized_delimiter(groonga_d.groonga.grn_ctx* ctx, groonga_d.token.grn_token* token, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);
299 
300 /*
301   grn_tokenizer_register() registers a plugin to the database which is
302   associated with `ctx'. `plugin_name_ptr' and `plugin_name_length' specify the
303   plugin name. Alphabetic letters ('A'-'Z' and 'a'-'z'), digits ('0'-'9') and
304   an underscore ('_') are capable characters. `init', `next' and `fin' specify
305   the plugin functions. `init' is called for initializing a tokenizer for a
306   document or query. `next' is called for extracting tokens one by one. `fin'
307   is called for finalizing a tokenizer. grn_tokenizer_register() returns
308   GRN_SUCCESS on success, an error code on failure. See "groonga.h" for more
309   details of grn_proc_func and grn_user_data, that is used as an argument of
310   grn_proc_func.
311 
312   @deprecated since 8.0.2. Use grn_tokenizer_create() and
313   grn_tokenizer_set_*_func().
314  */
315 
316 //GRN_PLUGIN_EXPORT
317 export groonga_d.groonga.grn_rc
318 grn_tokenizer_register(groonga_d.groonga.grn_ctx* ctx, const (char)* plugin_name_ptr, uint plugin_name_length, groonga_d.groonga.grn_proc_func* init, groonga_d.groonga.grn_proc_func* next, groonga_d.groonga.grn_proc_func* fin);
319 
320 //GRN_PLUGIN_EXPORT
321 export groonga_d.groonga.grn_obj*
322 grn_tokenizer_create(groonga_d.groonga.grn_ctx* ctx, const (char)* name, int name_length);
323 
324 //typedef void* grn_tokenizer_init_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
325 alias grn_tokenizer_init_func = extern (C) nothrow @nogc void* function(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
326 
327 //typedef void grn_tokenizer_next_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query, groonga_d.token.grn_token* token, void* user_data);
328 alias grn_tokenizer_next_func = extern (C) nothrow @nogc void function(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query, groonga_d.token.grn_token* token, void* user_data);
329 
330 //typedef void grn_tokenizer_fin_func(groonga_d.groonga.grn_ctx* ctx, void* user_data);
331 alias grn_tokenizer_fin_func = extern (C) nothrow @nogc void function(groonga_d.groonga.grn_ctx* ctx, void* user_data);
332 
/*
  grn_tokenizer_set_init_func() / _set_next_func() / _set_fin_func() install
  the init/next/fin callbacks on `tokenizer' and return a grn_rc status code.

  FIX: the C prototypes are `grn_rc grn_tokenizer_set_init_func(grn_ctx *,
  grn_obj *, grn_tokenizer_init_func *init)' where the typedef is a function
  TYPE, so C receives a plain function pointer. The D aliases are already
  function-pointer types, so the original binding's `grn_tokenizer_init_func*'
  parameter was a pointer-to-function-pointer — one level of indirection too
  many at the C ABI (extern(C) mangles by name only, so this compiled and
  linked but passed the wrong value). The stray `*' is removed so the D
  argument matches what the C library expects.
 */

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc
grn_tokenizer_set_init_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.groonga.grn_obj* tokenizer, grn_tokenizer_init_func init);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc
grn_tokenizer_set_next_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.groonga.grn_obj* tokenizer, grn_tokenizer_next_func next);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc
grn_tokenizer_set_fin_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.groonga.grn_obj* tokenizer, grn_tokenizer_fin_func fin);