/* -*- c-basic-offset: 2 -*- */
/*
  Copyright(C) 2012-2018 Brazil
  Copyright(C) 2020-2021 Sutou Kouhei <kou@clear-code.com>

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License version 2.1 as published by the Free Software Foundation.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
module groonga_d.tokenizer;


private static import groonga_d.groonga;
private static import groonga_d.token;
private static import groonga_d.tokenizer_query_deprecated;

extern(C):
nothrow @nogc:

/+
#include <groonga/plugin.h>
#include <groonga/tokenizer_query_deprecated.h>
+/

/*
  Special marker byte sequences used by tokenizers, together with their
  lengths in bytes.
*/
enum GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8 = "\xEF\xBF\xBE";
enum GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8_LEN = 3;

enum GRN_TOKENIZER_BEGIN_MARK_UTF8 = "\xEF\xBF\xAF";
enum GRN_TOKENIZER_BEGIN_MARK_UTF8_LEN = 3;
enum GRN_TOKENIZER_END_MARK_UTF8 = "\xEF\xBF\xB0";
enum GRN_TOKENIZER_END_MARK_UTF8_LEN = 3;

/*
  grn_tokenizer_charlen() gives the byte length of the first character of
  the string described by `str_ptr' and `str_length'. When the leading bytes
  do not form a valid character, the result is 0. `encoding' is a
  groonga_d.groonga.grn_encoding value; see "groonga.h" for its details.

  Deprecated. Use grn_plugin_charlen() instead.
*/
int grn_tokenizer_charlen(groonga_d.groonga.grn_ctx* ctx, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);

/*
  grn_tokenizer_isspace() gives the byte length of the first character of
  the string described by `str_ptr' and `str_length' when that character is
  a space character, and 0 otherwise.

  Deprecated. Use grn_plugin_isspace() instead.
*/
int grn_tokenizer_isspace(groonga_d.groonga.grn_ctx* ctx, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);

/*
  grn_tokenizer_is_tokenized_delimiter() tells whether the first character
  of the string described by `str_ptr' and `str_length' is the special
  tokenized delimiter character.
*/
groonga_d.groonga.grn_bool grn_tokenizer_is_tokenized_delimiter(groonga_d.groonga.grn_ctx* ctx, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);

/*
  grn_tokenizer_have_tokenized_delimiter() tells whether the string
  described by `str_ptr' and `str_length' contains the special tokenized
  delimiter character.
*/

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_bool grn_tokenizer_have_tokenized_delimiter(groonga_d.groonga.grn_ctx* ctx, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);

/*
  grn_tokenizer_query_open() parses `args' and returns a newly created
  grn_tokenizer_query object holding the information of the query. The
  query is normalized when the target table requires normalization. NULL
  is returned when something goes wrong. This function must be called
  exactly once, in the function that initializes a tokenizer.

  See the `GRN_STRING_*' flags for `normalize_flags'.
*/

//GRN_PLUGIN_EXPORT
export groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* grn_tokenizer_query_open(groonga_d.groonga.grn_ctx* ctx, int num_args, groonga_d.groonga.grn_obj** args, uint normalize_flags);

/*
  grn_tokenizer_query_create() is deprecated. Use grn_tokenizer_query_open()
  instead.
*/
groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* grn_tokenizer_query_create(groonga_d.groonga.grn_ctx* ctx, int num_args, groonga_d.groonga.grn_obj** args);

/*
  grn_tokenizer_query_close() finalizes a grn_tokenizer_query object and
  then frees the memory allocated for it.
*/

//GRN_PLUGIN_EXPORT
export void grn_tokenizer_query_close(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/*
  grn_tokenizer_query_destroy() is deprecated. Use grn_tokenizer_query_close()
  instead.
*/
void grn_tokenizer_query_destroy(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/*
  Accessors for a grn_tokenizer_query object. Every function takes the
  context and the query; their exact semantics are defined by the C
  library and are not described in this file.
*/

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc grn_tokenizer_query_set_normalize_flags(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query, uint flags);

//GRN_PLUGIN_EXPORT
export uint grn_tokenizer_query_get_normalize_flags(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj* grn_tokenizer_query_get_normalized_string(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export const (char)* grn_tokenizer_query_get_raw_string(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query, size_t* length);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_encoding grn_tokenizer_query_get_encoding(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export uint grn_tokenizer_query_get_flags(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_bool grn_tokenizer_query_have_tokenized_delimiter(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export groonga_d.token.grn_tokenize_mode grn_tokenizer_query_get_mode(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj* grn_tokenizer_query_get_lexicon(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export uint grn_tokenizer_query_get_token_filter_index(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj* grn_tokenizer_query_get_source_column(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_id grn_tokenizer_query_get_source_id(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj* grn_tokenizer_query_get_index_column(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj* grn_tokenizer_query_get_options(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

/*
  grn_tokenizer_token is the object through which tokens are returned. It
  stores the token to be returned and must be kept alive until the request
  for the next token, or finalization, arrives.
*/
struct _grn_tokenizer_token
{
	groonga_d.groonga.grn_obj str;
	groonga_d.groonga.grn_obj status;
}

alias grn_tokenizer_token = _grn_tokenizer_token;

/*
  grn_tokenizer_token_init() initializes `token'. An initialized object
  must later be finalized with grn_tokenizer_token_fin().
*/

//GRN_PLUGIN_EXPORT
export void grn_tokenizer_token_init(groonga_d.groonga.grn_ctx* ctx, grn_tokenizer_token* token);

/*
  grn_tokenizer_token_fin() finalizes a `token' previously initialized by
  grn_tokenizer_token_init().
*/

//GRN_PLUGIN_EXPORT
export void grn_tokenizer_token_fin(groonga_d.groonga.grn_ctx* ctx, grn_tokenizer_token* token);

/*
 * grn_tokenizer_status is a flag set of tokenizer status codes.
 * When a document or query contains no tokens, push an empty string with
 * GRN_TOKENIZER_TOKEN_LAST as a token.
 *
 * @deprecated since 4.0.8. Use grn_token_status instead.
 */
alias grn_tokenizer_status = groonga_d.token.grn_token_status;

/*
 * GRN_TOKENIZER_TOKEN_CONTINUE: the next token is not the last one.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_CONTINUE instead.
 */
enum GRN_TOKENIZER_TOKEN_CONTINUE = groonga_d.token.GRN_TOKEN_CONTINUE;

/*
 * GRN_TOKENIZER_TOKEN_LAST: the next token is the last one.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_LAST instead.
 */
enum GRN_TOKENIZER_TOKEN_LAST = groonga_d.token.GRN_TOKEN_LAST;

/*
 * GRN_TOKENIZER_TOKEN_OVERLAP: meaning is not documented here.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_OVERLAP instead.
 */
enum GRN_TOKENIZER_TOKEN_OVERLAP = groonga_d.token.GRN_TOKEN_OVERLAP;

/*
 * GRN_TOKENIZER_TOKEN_UNMATURED: meaning is not documented here.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_UNMATURED instead.
 */
enum GRN_TOKENIZER_TOKEN_UNMATURED = groonga_d.token.GRN_TOKEN_UNMATURED;

/*
 * GRN_TOKENIZER_TOKEN_REACH_END: meaning is not documented here.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_REACH_END instead.
 */
enum GRN_TOKENIZER_TOKEN_REACH_END = groonga_d.token.GRN_TOKEN_REACH_END;

/*
 * GRN_TOKENIZER_TOKEN_SKIP: the token is skipped.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_SKIP instead.
 */
enum GRN_TOKENIZER_TOKEN_SKIP = groonga_d.token.GRN_TOKEN_SKIP;

/*
 * GRN_TOKENIZER_TOKEN_SKIP_WITH_POSITION: the token and its position are
 * skipped.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_SKIP_WITH_POSITION instead.
 */
enum GRN_TOKENIZER_TOKEN_SKIP_WITH_POSITION = groonga_d.token.GRN_TOKEN_SKIP_WITH_POSITION;

/*
 * GRN_TOKENIZER_TOKEN_FORCE_PREFIX: the token is used for common prefix
 * search.
 *
 * @deprecated since 4.0.8. Use GRN_TOKEN_FORCE_PREFIX instead.
 */
enum GRN_TOKENIZER_TOKEN_FORCE_PREFIX = groonga_d.token.GRN_TOKEN_FORCE_PREFIX;

/*
 * GRN_TOKENIZER_CONTINUE and GRN_TOKENIZER_LAST are deprecated and exist
 * only for backward compatibility. Use GRN_TOKENIZER_TOKEN_CONTINUE and
 * GRN_TOKENIZER_TOKEN_LAST instead.
 */
enum GRN_TOKENIZER_CONTINUE = GRN_TOKENIZER_TOKEN_CONTINUE;
enum GRN_TOKENIZER_LAST = GRN_TOKENIZER_TOKEN_LAST;

/*
  grn_tokenizer_token_push() pushes the next token into `token'. The given
  string is NOT copied, so the caller has to keep the memory behind
  `str_ptr' valid. The grn_tokenizer_token object itself must also be kept
  alive until the request for the next token, or finalization, arrives.
  `status' is a grn_token_status value (see groonga_d.token).
*/

//GRN_PLUGIN_EXPORT
export void grn_tokenizer_token_push(groonga_d.groonga.grn_ctx* ctx, grn_tokenizer_token* token, const (char)* str_ptr, uint str_length, groonga_d.token.grn_token_status status);

/*
  grn_tokenizer_tokenized_delimiter_next() extracts the next token from the
  string described by `str_ptr' and `str_length' and pushes it into `token'.
  The return value is the string that follows the extracted token; it may
  be `NULL' once all tokens have been extracted.

  @deprecated since 8.0.9. It's for old tokenizer next API. Use
  grn_tokenizer_next_by_tokenized_delimiter() for new tokenizer next
  API (grn_tokenizer_next_func).
*/

//GRN_PLUGIN_EXPORT
export const (char)* grn_tokenizer_tokenized_delimiter_next(groonga_d.groonga.grn_ctx* ctx, grn_tokenizer_token* token, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);

/*
  Extracts the next token, using GRN_TOKENIZER_TOKENIZED_DELIMITER_UTF8 as
  the delimiter.

  This is meant to be used from a grn_tokenizer_next_func.

  @since 8.0.9.
*/

//GRN_PLUGIN_EXPORT
export const (char)* grn_tokenizer_next_by_tokenized_delimiter(groonga_d.groonga.grn_ctx* ctx, groonga_d.token.grn_token* token, const (char)* str_ptr, uint str_length, groonga_d.groonga.grn_encoding encoding);

/*
  grn_tokenizer_register() registers a plugin to the database associated
  with `ctx'. The plugin name is given by `plugin_name_ptr' and
  `plugin_name_length'; alphabetic letters ('A'-'Z' and 'a'-'z'), digits
  ('0'-'9') and an underscore ('_') are capable characters. `init', `next'
  and `fin' are the plugin functions: `init' is called to initialize a
  tokenizer for a document or query, `next' is called to extract tokens one
  by one, and `fin' is called to finalize a tokenizer. The function returns
  GRN_SUCCESS on success and an error code on failure. See "groonga.h" for
  details of grn_proc_func and of grn_user_data, which is used as an
  argument of grn_proc_func.

  NOTE(review): whether `grn_proc_func*' is the right level of indirection
  depends on how groonga_d.groonga declares grn_proc_func, which is not
  visible from this file -- TODO confirm.

  @deprecated since 8.0.2. Use grn_tokenizer_create() and
  grn_tokenizer_set_*_func().
*/

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc grn_tokenizer_register(groonga_d.groonga.grn_ctx* ctx, const (char)* plugin_name_ptr, uint plugin_name_length, groonga_d.groonga.grn_proc_func* init, groonga_d.groonga.grn_proc_func* next, groonga_d.groonga.grn_proc_func* fin);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_obj* grn_tokenizer_create(groonga_d.groonga.grn_ctx* ctx, const (char)* name, int name_length);

/*
  Callback types for the tokenizer functions installed with
  grn_tokenizer_set_*_func(). The original C typedef is quoted above each
  alias.
*/

//typedef void* grn_tokenizer_init_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);
alias grn_tokenizer_init_func = extern (C) nothrow @nogc void* function(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query);

//typedef void grn_tokenizer_next_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query, groonga_d.token.grn_token* token, void* user_data);
alias grn_tokenizer_next_func = extern (C) nothrow @nogc void function(groonga_d.groonga.grn_ctx* ctx, groonga_d.tokenizer_query_deprecated.grn_tokenizer_query* query, groonga_d.token.grn_token* token, void* user_data);

//typedef void grn_tokenizer_fin_func(groonga_d.groonga.grn_ctx* ctx, void* user_data);
alias grn_tokenizer_fin_func = extern (C) nothrow @nogc void function(groonga_d.groonga.grn_ctx* ctx, void* user_data);

/*
  Setters that install the init/next/fin callbacks on `tokenizer'.

  NOTE(review): the three aliases above are declared with `function', so
  in D they are already function pointer types; the `grn_tokenizer_*_func*'
  parameters below are therefore pointers to function pointers. The quoted
  C typedefs are plain function types, so the C setters presumably expect
  a single-level function pointer -- TODO confirm against
  <groonga/tokenizer.h> before relying on these declarations.
*/

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc grn_tokenizer_set_init_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.groonga.grn_obj* tokenizer, grn_tokenizer_init_func* init);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc grn_tokenizer_set_next_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.groonga.grn_obj* tokenizer, grn_tokenizer_next_func* next);

//GRN_PLUGIN_EXPORT
export groonga_d.groonga.grn_rc grn_tokenizer_set_fin_func(groonga_d.groonga.grn_ctx* ctx, groonga_d.groonga.grn_obj* tokenizer, grn_tokenizer_fin_func* fin);