1 /* -*- c-basic-offset: 2 -*- */
2 /*
3   Copyright(C) 2012-2018 Brazil
4 
5   This library is free software; you can redistribute it and/or
6   modify it under the terms of the GNU Lesser General Public
7   License version 2.1 as published by the Free Software Foundation.
8 
9   This library is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12   Lesser General Public License for more details.
13 
14   You should have received a copy of the GNU Lesser General Public
15   License along with this library; if not, write to the Free Software
16   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
17 */
18 module groonga_d.tokenizer;
19 
20 
private static import groonga_d.groonga;
private static import groonga_d.token;
private static import groonga_d.tokenizer_query_deprecated;
23 
24 extern(C):
25 nothrow @nogc:
26 
27 /+
28 #include <groonga/plugin.h>
29 #include <groonga/tokenizer_query_deprecated.h>
30 +/
31 
/*
  Marker byte sequences used by tokenizers, each paired with a constant
  holding the length of the literal in bytes. Every *_LEN value must be kept
  in sync with the literal it accompanies.
 */

/* Tokenized delimiter: the three bytes EF BF BE. */
enum GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8 = "\xEF\xBF\xBE";
enum GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN = 3;

/* Begin mark: the three bytes EF BF AF. */
enum GRN_TOKENIZER_BEGIN_MARK_UTF8 = "\xEF\xBF\xAF";
enum GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN = 3;
/* End mark: the three bytes EF BF B0. */
enum GRN_TOKENIZER_END_MARK_UTF8 = "\xEF\xBF\xB0";
enum GRN_TOKENIZER_END_MARK_UTF8_LEN = 3;
39 
/*
  Binding for the C function grn_tokenizer_charlen().

  Yields the length, in bytes, of the first character of the string given by
  `str_ptr' and `str_length'. When the leading bytes do not form a valid
  character the result is 0. `encoding' is a groonga_d.groonga.grn_encoding
  value; see "groonga.h" for more details.

  Deprecated. Use grn_plugin_charlen() instead.
 */
int grn_tokenizer_charlen(
	groonga_d.groonga.grn_ctx* ctx,
	const (char)* str_ptr,
	uint str_length,
	groonga_d.groonga.grn_encoding encoding);
49 
/*
  Binding for the C function grn_tokenizer_isspace().

  If the first character of the string given by `str_ptr' and `str_length'
  is a space character, its length in bytes is returned; otherwise the
  result is 0.

  Deprecated. Use grn_plugin_isspace() instead.
 */
int grn_tokenizer_isspace(
	groonga_d.groonga.grn_ctx* ctx,
	const (char)* str_ptr,
	uint str_length,
	groonga_d.groonga.grn_encoding encoding);
58 
/*
  Binding for the C function grn_tokenizer_is_tokenized_delimiter().

  Reports whether the first character of the string given by `str_ptr' and
  `str_length' is the special tokenized delimiter character. The answer is
  delivered as a ubyte.
 */
ubyte grn_tokenizer_is_tokenized_delimiter(
	groonga_d.groonga.grn_ctx* ctx,
	const (char)* str_ptr,
	uint str_length,
	groonga_d.groonga.grn_encoding encoding);
65 
/*
  Binding for the C function grn_tokenizer_have_tokenized_delimiter().

  Reports whether the string given by `str_ptr' and `str_length' contains
  the special tokenized delimiter character. The answer is delivered as a
  ubyte.
 */

//GRN_PLUGIN_EXPORT
export ubyte grn_tokenizer_have_tokenized_delimiter(
	groonga_d.groonga.grn_ctx* ctx,
	const (char)* str_ptr,
	uint str_length,
	groonga_d.groonga.grn_encoding encoding);
74 
/*
  Binding for the C function grn_tokenizer_query_open().

  Parses `args' and returns a newly created grn_tokenizer_query that stores
  information about the query. The query is normalized when the target table
  requires normalization. NULL is returned if something goes wrong.

  This function must be called exactly once, in the function that
  initializes a tokenizer.

  See the `GRN_STRING_*' flags for `normalize_flags'.
 */

//GRN_PLUGIN_EXPORT
export groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* grn_tokenizer_query_open(
	groonga_d.groonga.grn_ctx* ctx,
	int num_args,
	groonga_d.groonga.grn_obj** args,
	uint normalize_flags);
88 
/*
  Binding for the C function grn_tokenizer_query_create().

  Deprecated. Use grn_tokenizer_query_open() instead.
 */

groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* grn_tokenizer_query_create(
	groonga_d.groonga.grn_ctx* ctx,
	int num_args,
	groonga_d.groonga.grn_obj** args);
95 
/*
  Binding for the C function grn_tokenizer_query_close().

  Finalizes the grn_tokenizer_query object `query' and then frees the memory
  that was allocated for it.
 */

//GRN_PLUGIN_EXPORT
export void grn_tokenizer_query_close(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
103 
/*
  Binding for the C function grn_tokenizer_query_destroy().

  Deprecated. Use grn_tokenizer_query_close() instead.
 */
void grn_tokenizer_query_destroy(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
109 
/*
  Binding for the C function grn_tokenizer_query_set_normalize_flags().

  Takes a query object and a `flags' value and returns a
  groonga_d.groonga.grn_rc.
  NOTE(review): presumably stores `flags' as the normalize flags of `query';
  this is inferred from the name only - confirm against the Groonga headers.
 */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc grn_tokenizer_query_set_normalize_flags(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query,
	uint flags);
113 
/*
  Binding for the C function grn_tokenizer_query_get_normalize_flags().

  Takes a query object and returns a uint.
  NOTE(review): presumably the normalize flags currently held by `query';
  inferred from the name only - confirm against the Groonga headers.
 */
//GRN_PLUGIN_EXPORT
export uint grn_tokenizer_query_get_normalize_flags(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
117 
/*
  Binding for the C function grn_tokenizer_query_get_normalized_string().

  Takes a query object and returns a pointer to a groonga_d.groonga.grn_obj.
  NOTE(review): presumably the normalized form of the query string; ownership
  of the returned object is not stated here - confirm against the Groonga
  headers.
 */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj* grn_tokenizer_query_get_normalized_string(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
121 
/*
  Binding for the C function grn_tokenizer_query_get_raw_string().

  Takes a query object plus a `length' pointer and returns a
  pointer to const char.
  NOTE(review): presumably returns the unnormalized query text and writes its
  size in bytes through `length'; inferred from the names only - confirm
  against the Groonga headers.
 */
//GRN_PLUGIN_EXPORT
export const (char)* grn_tokenizer_query_get_raw_string(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query,
	size_t* length);
125 
/*
  Binding for the C function grn_tokenizer_query_get_encoding().

  Takes a query object and returns a groonga_d.groonga.grn_encoding.
  NOTE(review): presumably the encoding associated with `query'; inferred
  from the name only - confirm against the Groonga headers.
 */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_encoding grn_tokenizer_query_get_encoding(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
129 
/*
  Binding for the C function grn_tokenizer_query_get_flags().

  Takes a query object and returns a uint.
  NOTE(review): which flag set this refers to is not stated here - confirm
  against the Groonga headers.
 */
//GRN_PLUGIN_EXPORT
export uint grn_tokenizer_query_get_flags(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
133 
/*
  Binding for the C function grn_tokenizer_query_have_tokenized_delimiter().

  Takes a query object and returns a ubyte.
  NOTE(review): presumably reports whether the query text contains the
  tokenized delimiter character; inferred from the name only - confirm
  against the Groonga headers.
 */
//GRN_PLUGIN_EXPORT
export ubyte grn_tokenizer_query_have_tokenized_delimiter(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
137 
/*
  Binding for the C function grn_tokenizer_query_get_mode().

  Takes a query object and returns a groonga_d.token.grn_tokenize_mode.
  NOTE(review): presumably the tokenize mode associated with `query';
  inferred from the name only - confirm against the Groonga headers.
 */
//GRN_PLUGIN_EXPORT
export groonga_d.token.grn_tokenize_mode grn_tokenizer_query_get_mode(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
141 
/*
  Binding for the C function grn_tokenizer_query_get_lexicon().

  Takes a query object and returns a pointer to a groonga_d.groonga.grn_obj.
  NOTE(review): presumably the lexicon associated with `query'; inferred from
  the name only - confirm against the Groonga headers.
 */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj* grn_tokenizer_query_get_lexicon(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
145 
/*
  Binding for the C function grn_tokenizer_query_get_token_filter_index().

  Takes a query object and returns a uint.
  NOTE(review): the meaning of the returned index is not stated here -
  confirm against the Groonga headers.
 */
//GRN_PLUGIN_EXPORT
export uint grn_tokenizer_query_get_token_filter_index(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
149 
/*
  A grn_tokenizer_token is what a tokenizer uses to hand tokens back. It
  holds the token to be returned and has to be kept alive until either the
  next token is requested or finalization takes place.
 */
struct _grn_tokenizer_token
{
	/* NOTE(review): presumably the token text - confirm. */
	groonga_d.groonga.grn_obj str;

	/* NOTE(review): presumably the status pushed with the token - confirm. */
	groonga_d.groonga.grn_obj status;
}

/* Public name of the structure above. */
alias grn_tokenizer_token = _grn_tokenizer_token;
162 
/*
  Binding for the C function grn_tokenizer_token_init().

  Initializes `token'. Every object initialized here must later be finalized
  with grn_tokenizer_token_fin().
 */

//GRN_PLUGIN_EXPORT
export void grn_tokenizer_token_init(
	groonga_d.groonga.grn_ctx* ctx,
	grn_tokenizer_token* token);
170 
/*
  Binding for the C function grn_tokenizer_token_fin().

  Finalizes a `token' that was previously initialized by
  grn_tokenizer_token_init().
 */

//GRN_PLUGIN_EXPORT
export void grn_tokenizer_token_fin(
	groonga_d.groonga.grn_ctx* ctx,
	grn_tokenizer_token* token);
178 
/*
 * grn_tokenizer_status is a flag set for tokenizer status codes. It is an
 * alias of groonga_d.token.grn_token_status.
 * If a document or query contains no tokens, push an empty string with
 * GRN_TOKENIZER_TOKEN_LAST as a token.
 *
 * @deprecated since 4.0.8. Use grn_token_status instead.
 */
alias grn_tokenizer_status = groonga_d.token.grn_token_status;
187 
/*
 * GRN_TOKENIZER_TOKEN_CONTINUE means that the next token is not the last one.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_CONTINUE instead.
 */
enum GRN_TOKENIZER_TOKEN_CONTINUE = groonga_d.token.GRN_TOKEN_CONTINUE;
/*
 * GRN_TOKENIZER_TOKEN_LAST means that the next token is the last one.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_LAST instead.
 */
enum GRN_TOKENIZER_TOKEN_LAST = groonga_d.token.GRN_TOKEN_LAST;
/*
 * GRN_TOKENIZER_TOKEN_OVERLAP mirrors GRN_TOKEN_OVERLAP. Its meaning is not
 * documented in this file.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_OVERLAP instead.
 */
enum GRN_TOKENIZER_TOKEN_OVERLAP = groonga_d.token.GRN_TOKEN_OVERLAP;
/*
 * GRN_TOKENIZER_TOKEN_UNMATURED mirrors GRN_TOKEN_UNMATURED. Its meaning is
 * not documented in this file.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_UNMATURED instead.
 */
enum GRN_TOKENIZER_TOKEN_UNMATURED = groonga_d.token.GRN_TOKEN_UNMATURED;
/*
 * GRN_TOKENIZER_TOKEN_REACH_END mirrors GRN_TOKEN_REACH_END. Its meaning is
 * not documented in this file.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_REACH_END instead.
 */
enum GRN_TOKENIZER_TOKEN_REACH_END = groonga_d.token.GRN_TOKEN_REACH_END;
/*
 * GRN_TOKENIZER_TOKEN_SKIP means that the token is skipped.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_SKIP instead.
 */
enum GRN_TOKENIZER_TOKEN_SKIP = groonga_d.token.GRN_TOKEN_SKIP;
/*
 * GRN_TOKENIZER_TOKEN_SKIP_WITH_POSITION means that the token and its
 * position are skipped.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_SKIP_WITH_POSITION instead.
 */
enum GRN_TOKENIZER_TOKEN_SKIP_WITH_POSITION = groonga_d.token.GRN_TOKEN_SKIP_WITH_POSITION;
/*
 * GRN_TOKENIZER_TOKEN_FORCE_PREFIX means that the token is used for common
 * prefix search.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_FORCE_PREFIX instead.
 */
enum GRN_TOKENIZER_TOKEN_FORCE_PREFIX = groonga_d.token.GRN_TOKEN_FORCE_PREFIX;
236 
/*
 * GRN_TOKENIZER_CONTINUE and GRN_TOKENIZER_LAST are deprecated. They
 * exist only for backward compatibility and simply repeat the values of
 * GRN_TOKENIZER_TOKEN_CONTINUE and GRN_TOKENIZER_TOKEN_LAST; use those
 * names instead.
 */
enum GRN_TOKENIZER_CONTINUE = GRN_TOKENIZER_TOKEN_CONTINUE;
enum GRN_TOKENIZER_LAST = GRN_TOKENIZER_TOKEN_LAST;
245 
/*
  Binding for the C function grn_tokenizer_token_push().

  Pushes the next token into `token'. The string given by `str_ptr' and
  `str_length' is NOT copied, so the caller has to keep the memory behind it
  alive. The grn_tokenizer_token object itself must also stay alive until
  the next token is requested or finalization takes place.

  See grn_token_status for more details of `status'.
 */

//GRN_PLUGIN_EXPORT
export void grn_tokenizer_token_push(
	groonga_d.groonga.grn_ctx* ctx,
	grn_tokenizer_token* token,
	const (char)* str_ptr,
	uint str_length,
	groonga_d.token.grn_token_status status);
257 
/*
  Binding for the C function grn_tokenizer_tokenized_delimiter_next().

  Extracts the next token from the string given by `str_ptr' and
  `str_length' and pushes it into `token'. The return value is the string
  that follows the extracted token; it may be `NULL' once all tokens have
  been extracted.

  @deprecated since 8.0.9. This belongs to the old tokenizer next API. With
  the new tokenizer next API (grn_tokenizer_next_func) use
  grn_tokenizer_next_by_tokenized_delimiter() instead.
 */

//GRN_PLUGIN_EXPORT
export const (char)* grn_tokenizer_tokenized_delimiter_next(
	groonga_d.groonga.grn_ctx* ctx,
	grn_tokenizer_token* token,
	const (char)* str_ptr,
	uint str_length,
	groonga_d.groonga.grn_encoding encoding);
272 
/*
  Binding for the C function grn_tokenizer_next_by_tokenized_delimiter().

  Extracts the next token, using GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8 as
  the delimiter.

  Intended for use from a grn_tokenizer_next_func.

  @since 8.0.9.
 */

//GRN_PLUGIN_EXPORT
export const (char)* grn_tokenizer_next_by_tokenized_delimiter(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.token.grn_token* token,
	const (char)* str_ptr,
	uint str_length,
	groonga_d.groonga.grn_encoding encoding);
285 
/*
  Binding for the C function grn_tokenizer_register().

  Registers a plugin to the database associated with `ctx'.

  `plugin_name_ptr' and `plugin_name_length' give the plugin name. Allowed
  characters are alphabetic letters ('A'-'Z' and 'a'-'z'), digits ('0'-'9')
  and the underscore ('_').

  `init', `next' and `fin' are the plugin functions:
    - `init' is called to initialize a tokenizer for a document or query,
    - `next' is called to extract tokens one by one,
    - `fin' is called to finalize a tokenizer.

  Returns GRN_SUCCESS on success and an error code on failure. See
  "groonga.h" for details of grn_proc_func and of grn_user_data, which is
  used as an argument of grn_proc_func.

  NOTE(review): grn_proc_func is declared in groonga_d.groonga and is not
  visible here. If it is a D `function' alias (already a pointer), the
  `grn_proc_func*' parameters below are pointers to function pointers and do
  not match the C prototype - confirm.

  @deprecated since 8.0.2. Use grn_tokenizer_create() and
  grn_tokenizer_set_*_func().
 */

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc grn_tokenizer_register(
	groonga_d.groonga.grn_ctx* ctx,
	const (char)* plugin_name_ptr,
	uint plugin_name_length,
	groonga_d.groonga.grn_proc_func* init,
	groonga_d.groonga.grn_proc_func* next,
	groonga_d.groonga.grn_proc_func* fin);
305 
/*
  Binding for the C function grn_tokenizer_create().

  Takes a name given as pointer plus length and returns a pointer
  to a groonga_d.groonga.grn_obj.
  NOTE(review): presumably creates a tokenizer object called `name' whose
  callbacks are then installed with grn_tokenizer_set_*_func(); the failure
  behavior is not stated here - confirm against the Groonga headers.
 */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj* grn_tokenizer_create(
	groonga_d.groonga.grn_ctx* ctx,
	const (char)* name,
	int name_length);
309 
/*
  Callback types for tokenizers. Each alias below is built with D's
  `function' keyword and therefore names a *pointer* to an extern (C)
  nothrow @nogc function, not a bare function type as the C typedefs quoted
  above each alias do.
 */

//typedef void* grn_tokenizer_init_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
/* Receives the context and the query; returns a void pointer. */
alias grn_tokenizer_init_func = extern (C) nothrow @nogc void* function(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//typedef void grn_tokenizer_next_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query, groonga_d.token.grn_token* token, void* user_data);
/* Receives the context, the query, a token and a `user_data' pointer. */
alias grn_tokenizer_next_func = extern (C) nothrow @nogc void function(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query,
	groonga_d.token.grn_token* token,
	void* user_data);

//typedef void grn_tokenizer_fin_func(groonga_d.groonga.grn_ctx* ctx, void* user_data);
/* Receives the context and a `user_data' pointer. */
alias grn_tokenizer_fin_func = extern (C) nothrow @nogc void function(
	groonga_d.groonga.grn_ctx* ctx,
	void* user_data);
318 
/*
  Binding for the C function grn_tokenizer_set_init_func().

  Takes the tokenizer object and the `init' callback and returns a
  groonga_d.groonga.grn_rc.

  `init' is passed by value. grn_tokenizer_init_func is declared with D's
  `function' keyword, so it already is a function pointer and corresponds to
  the C parameter type `grn_tokenizer_init_func *'. Declaring the parameter
  as `grn_tokenizer_init_func*' would hand the C side the address of a
  function pointer variable instead of the address of the function.

  NOTE(review): presumably installs `init' as the initializer of
  `tokenizer'; inferred from the name - confirm against the Groonga headers.
 */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc grn_tokenizer_set_init_func(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.groonga.grn_obj* tokenizer,
	grn_tokenizer_init_func init);
322 
/*
  Binding for the C function grn_tokenizer_set_next_func().

  Takes the tokenizer object and the `next' callback and returns a
  groonga_d.groonga.grn_rc.

  `next' is passed by value. grn_tokenizer_next_func is declared with D's
  `function' keyword, so it already is a function pointer and corresponds to
  the C parameter type `grn_tokenizer_next_func *'. Declaring the parameter
  as `grn_tokenizer_next_func*' would hand the C side the address of a
  function pointer variable instead of the address of the function.

  NOTE(review): presumably installs `next' as the token extractor of
  `tokenizer'; inferred from the name - confirm against the Groonga headers.
 */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc grn_tokenizer_set_next_func(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.groonga.grn_obj* tokenizer,
	grn_tokenizer_next_func next);
326 
/*
  Binding for the C function grn_tokenizer_set_fin_func().

  Takes the tokenizer object and the `fin' callback and returns a
  groonga_d.groonga.grn_rc.

  `fin' is passed by value. grn_tokenizer_fin_func is declared with D's
  `function' keyword, so it already is a function pointer and corresponds to
  the C parameter type `grn_tokenizer_fin_func *'. Declaring the parameter
  as `grn_tokenizer_fin_func*' would hand the C side the address of a
  function pointer variable instead of the address of the function.

  NOTE(review): presumably installs `fin' as the finalizer of `tokenizer';
  inferred from the name - confirm against the Groonga headers.
 */
//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc grn_tokenizer_set_fin_func(
	groonga_d.groonga.grn_ctx* ctx,
	groonga_d.groonga.grn_obj* tokenizer,
	grn_tokenizer_fin_func fin);