/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2012-2018 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
*/
module groonga_d.tokenizer;


private static import groonga_d.groonga;
private static import groonga_d.token;
/*
  Required: every query-related prototype below names
  groonga_d.tokenizer_query_deprecated.grn_tokenizer_query by its fully
  qualified name, which only resolves when the module is imported here.
*/
private static import groonga_d.tokenizer_query_deprecated;

extern(C):
nothrow @nogc:

/+
#include <groonga/plugin.h>
#include <groonga/tokenizer_query_deprecated.h>
+/

/* UTF-8 byte sequence of the tokenized delimiter and its length in bytes. */
enum GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8 = "\xEF\xBF\xBE";
enum GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN = 3;

/* UTF-8 byte sequences of the begin/end marks and their lengths in bytes. */
enum GRN_TOKENIZER_BEGIN_MARK_UTF8 = "\xEF\xBF\xAF";
enum GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN = 3;
enum GRN_TOKENIZER_END_MARK_UTF8 = "\xEF\xBF\xB0";
enum GRN_TOKENIZER_END_MARK_UTF8_LEN = 3;

/*
  grn_tokenizer_charlen() returns the length (#bytes) of the first character
  in the string specified by `str_ptr' and `str_length'. If the starting bytes
  are invalid as a character, grn_tokenizer_charlen() returns 0. See
  groonga_d.groonga.grn_encoding in "groonga.h" for more details of `encoding'.

  Deprecated. Use grn_plugin_charlen() instead.
*/
int grn_tokenizer_charlen(groonga_d.groonga.grn_ctx* ctx, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);

/*
  grn_tokenizer_isspace() returns the length (#bytes) of the first character
  in the string specified by `str_ptr' and `str_length' if it is a space
  character. Otherwise, grn_tokenizer_isspace() returns 0.

  Deprecated. Use grn_plugin_isspace() instead.
*/
int grn_tokenizer_isspace(groonga_d.groonga.grn_ctx* ctx, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);

/*
  grn_tokenizer_is_tokenized_delimiter() returns whether the first
  character in the string specified by `str_ptr' and `str_length' is the
  special tokenized delimiter character or not.
*/
ubyte grn_tokenizer_is_tokenized_delimiter(groonga_d.groonga.grn_ctx* ctx, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);

/*
  grn_tokenizer_have_tokenized_delimiter() returns whether the string
  specified by `str_ptr' and `str_length' contains the special tokenized
  delimiter character or not.
*/

//GRN_PLUGIN_EXPORT
export ubyte grn_tokenizer_have_tokenized_delimiter(groonga_d.groonga.grn_ctx* ctx, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);

/*
  grn_tokenizer_query_open() parses `args' and returns a new object of
  grn_tokenizer_query. The new object stores information of the query.
  grn_tokenizer_query_open() normalizes the query if the target table
  requires normalization. grn_tokenizer_query_open() returns NULL if
  something goes wrong. Note that grn_tokenizer_query_open() must be called
  just once in the function that initializes a tokenizer.

  See `GRN_STRING_*' flags for `normalize_flags'.
*/

//GRN_PLUGIN_EXPORT
export groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* grn_tokenizer_query_open(groonga_d.groonga.grn_ctx* ctx, int num_args, groonga_d.groonga.grn_obj** args, uint normalize_flags);

/*
  grn_tokenizer_query_create() is deprecated. Use grn_tokenizer_query_open()
  instead.
*/

groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* grn_tokenizer_query_create(groonga_d.groonga.grn_ctx* ctx, int num_args, groonga_d.groonga.grn_obj** args);

/*
  grn_tokenizer_query_close() finalizes an object of grn_tokenizer_query
  and then frees memory allocated for that object.
*/

//GRN_PLUGIN_EXPORT
export void grn_tokenizer_query_close(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/*
  grn_tokenizer_query_destroy() is deprecated. Use grn_tokenizer_query_close()
  instead.
*/
void grn_tokenizer_query_destroy(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/*
  Accessors for a grn_tokenizer_query object. Each one takes the context
  and the query; see the Groonga C API documentation for the exact
  semantics of the returned values.
*/

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc grn_tokenizer_query_set_normalize_flags(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query, uint flags);

//GRN_PLUGIN_EXPORT
export uint grn_tokenizer_query_get_normalize_flags(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj* grn_tokenizer_query_get_normalized_string(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export const (char)* grn_tokenizer_query_get_raw_string(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query, size_t* length);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_encoding grn_tokenizer_query_get_encoding(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export uint grn_tokenizer_query_get_flags(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export ubyte grn_tokenizer_query_have_tokenized_delimiter(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export groonga_d.token.grn_tokenize_mode grn_tokenizer_query_get_mode(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj* grn_tokenizer_query_get_lexicon(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export uint grn_tokenizer_query_get_token_filter_index(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/*
  grn_tokenizer_token is needed to return tokens. A grn_tokenizer_token object
  stores a token to be returned and it must be maintained until a request for
  next token or finalization comes.
*/
alias grn_tokenizer_token = _grn_tokenizer_token;

struct _grn_tokenizer_token
{
	groonga_d.groonga.grn_obj str;
	groonga_d.groonga.grn_obj status;
}

/*
  grn_tokenizer_token_init() initializes `token'. Note that an initialized
  object must be finalized by grn_tokenizer_token_fin().
*/

//GRN_PLUGIN_EXPORT
export void grn_tokenizer_token_init(groonga_d.groonga.grn_ctx* ctx, grn_tokenizer_token* token);

/*
  grn_tokenizer_token_fin() finalizes `token' that has been initialized by
  grn_tokenizer_token_init().
*/

//GRN_PLUGIN_EXPORT
export void grn_tokenizer_token_fin(groonga_d.groonga.grn_ctx* ctx, grn_tokenizer_token* token);

/*
 * grn_tokenizer_status is a flag set for tokenizer status codes.
 * If a document or query contains no tokens, push an empty string with
 * GRN_TOKENIZER_TOKEN_LAST as a token.
 *
 * @deprecated since 4.0.8. Use grn_token_status instead.
 */
alias grn_tokenizer_status = groonga_d.token.grn_token_status;

/*
 * GRN_TOKENIZER_TOKEN_CONTINUE means that the next token is not the last one.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_CONTINUE instead.
 */
enum GRN_TOKENIZER_TOKEN_CONTINUE = groonga_d.token.GRN_TOKEN_CONTINUE;

/*
 * GRN_TOKENIZER_TOKEN_LAST means that the next token is the last one.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_LAST instead.
 */
enum GRN_TOKENIZER_TOKEN_LAST = groonga_d.token.GRN_TOKEN_LAST;

/*
 * GRN_TOKENIZER_TOKEN_OVERLAP means that ...
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_OVERLAP instead.
 */
enum GRN_TOKENIZER_TOKEN_OVERLAP = groonga_d.token.GRN_TOKEN_OVERLAP;

/*
 * GRN_TOKENIZER_TOKEN_UNMATURED means that ...
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_UNMATURED instead.
 */
enum GRN_TOKENIZER_TOKEN_UNMATURED = groonga_d.token.GRN_TOKEN_UNMATURED;

/*
 * GRN_TOKENIZER_TOKEN_REACH_END means that ...
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_REACH_END instead.
 */
enum GRN_TOKENIZER_TOKEN_REACH_END = groonga_d.token.GRN_TOKEN_REACH_END;

/*
 * GRN_TOKENIZER_TOKEN_SKIP means that the token is skipped.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_SKIP instead.
 */
enum GRN_TOKENIZER_TOKEN_SKIP = groonga_d.token.GRN_TOKEN_SKIP;

/*
 * GRN_TOKENIZER_TOKEN_SKIP_WITH_POSITION means that the token and its
 * position are skipped.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_SKIP_WITH_POSITION instead.
 */
enum GRN_TOKENIZER_TOKEN_SKIP_WITH_POSITION = groonga_d.token.GRN_TOKEN_SKIP_WITH_POSITION;

/*
 * GRN_TOKENIZER_TOKEN_FORCE_PREFIX means that the token is used for
 * common prefix search.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_FORCE_PREFIX instead.
 */
enum GRN_TOKENIZER_TOKEN_FORCE_PREFIX = groonga_d.token.GRN_TOKEN_FORCE_PREFIX;

/*
 * GRN_TOKENIZER_CONTINUE and GRN_TOKENIZER_LAST are deprecated. They
 * are just for backward compatibility. Use
 * GRN_TOKENIZER_TOKEN_CONTINUE and GRN_TOKENIZER_TOKEN_LAST
 * instead.
 */
enum GRN_TOKENIZER_CONTINUE = GRN_TOKENIZER_TOKEN_CONTINUE;
enum GRN_TOKENIZER_LAST = GRN_TOKENIZER_TOKEN_LAST;

/*
  grn_tokenizer_token_push() pushes the next token into `token'. Note that
  grn_tokenizer_token_push() does not make a copy of the given string. This
  means that you have to maintain a memory space allocated to the string.
  Also note that the grn_tokenizer_token object must be maintained until the
  request for the next token or finalization comes. See
  groonga_d.token.grn_token_status for more details of `status'.
*/

//GRN_PLUGIN_EXPORT
export void grn_tokenizer_token_push(groonga_d.groonga.grn_ctx* ctx, grn_tokenizer_token* token, const (char)* str_ptr, uint str_length, groonga_d.token.grn_token_status status);

/*
  grn_tokenizer_tokenized_delimiter_next() extracts the next token
  from the string specified by `str_ptr' and `str_length' and pushes
  the next token into `token'. It returns the string after the next
  token. The returned string may be `NULL' when all tokens are
  extracted.

  @deprecated since 8.0.9. It's for old tokenizer next API. Use
  grn_tokenizer_next_by_tokenized_delimiter() for new tokenizer next
  API (grn_tokenizer_next_func).
*/

//GRN_PLUGIN_EXPORT
export const (char)* grn_tokenizer_tokenized_delimiter_next(groonga_d.groonga.grn_ctx* ctx, grn_tokenizer_token* token, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);

/*
  Extract the next token by delimiting by
  GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8.

  This is for grn_tokenizer_next_func.

  @since 8.0.9.
*/

//GRN_PLUGIN_EXPORT
export const (char)* grn_tokenizer_next_by_tokenized_delimiter(groonga_d.groonga.grn_ctx* ctx, groonga_d.token.grn_token* token, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);

/*
  grn_tokenizer_register() registers a plugin to the database which is
  associated with `ctx'. `plugin_name_ptr' and `plugin_name_length' specify the
  plugin name. Alphabetic letters ('A'-'Z' and 'a'-'z'), digits ('0'-'9') and
  an underscore ('_') are capable characters. `init', `next' and `fin' specify
  the plugin functions. `init' is called for initializing a tokenizer for a
  document or query. `next' is called for extracting tokens one by one. `fin'
  is called for finalizing a tokenizer. grn_tokenizer_register() returns
  GRN_SUCCESS on success, an error code on failure. See "groonga.h" for more
  details of grn_proc_func and grn_user_data, that is used as an argument of
  grn_proc_func.

  @deprecated since 8.0.2. Use grn_tokenizer_create() and
  grn_tokenizer_set_*_func().

  NOTE(review): groonga_d.groonga.grn_proc_func is declared outside this
  module. If it is aliased to a D `function' pointer type, the `*' on the
  parameters below adds an extra level of indirection compared with the C
  prototype -- confirm against groonga_d.groonga.
*/

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc grn_tokenizer_register(groonga_d.groonga.grn_ctx* ctx, const (char)* plugin_name_ptr, uint plugin_name_length, groonga_d.groonga.grn_proc_func* init, groonga_d.groonga.grn_proc_func* next, groonga_d.groonga.grn_proc_func* fin);

/*
  grn_tokenizer_create() takes the context and a tokenizer name given by
  `name' and `name_length', and returns a grn_obj pointer. Callbacks are
  attached to a tokenizer object with grn_tokenizer_set_*_func().
*/

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj* grn_tokenizer_create(groonga_d.groonga.grn_ctx* ctx, const (char)* name, int name_length);

/*
  Tokenizer callback types.

  The C header declares these as function types and passes them around as
  `grn_tokenizer_*_func *'. A D `function' type is already a pointer to a
  function, so each alias below corresponds to the C pointer type and must
  be used without an additional `*'.
*/

//typedef void* grn_tokenizer_init_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
alias grn_tokenizer_init_func = extern (C) nothrow @nogc void* function(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//typedef void grn_tokenizer_next_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query, groonga_d.token.grn_token* token, void* user_data);
alias grn_tokenizer_next_func = extern (C) nothrow @nogc void function(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query, groonga_d.token.grn_token* token, void* user_data);

//typedef void grn_tokenizer_fin_func(groonga_d.groonga.grn_ctx* ctx, void* user_data);
alias grn_tokenizer_fin_func = extern (C) nothrow @nogc void function(groonga_d.groonga.grn_ctx* ctx, void* user_data);

/*
  grn_tokenizer_set_{init,next,fin}_func() take the context, a tokenizer
  object and the callback to attach, and return a grn_rc.

  The callback is passed by value: the alias is already a function pointer,
  which is what the C parameter `grn_tokenizer_*_func *' denotes.
*/

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc grn_tokenizer_set_init_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.groonga.grn_obj* tokenizer, grn_tokenizer_init_func init);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc grn_tokenizer_set_next_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.groonga.grn_obj* tokenizer, grn_tokenizer_next_func next);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc grn_tokenizer_set_fin_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.groonga.grn_obj* tokenizer, grn_tokenizer_fin_func fin);