diff --git a/docs/encoding.md b/docs/encoding.md new file mode 100644 index 000000000..fd99bdff4 --- /dev/null +++ b/docs/encoding.md @@ -0,0 +1,56 @@ +# RBS File Encoding + +## Best Practice + +**Use UTF-8** for both file encoding and your system locale. + +## Supported Encodings + +The RBS parser supports ASCII-compatible encodings (similar to Ruby's script encoding support). + +**Examples**: UTF-8, US-ASCII, Shift_JIS, EUC-JP, ... + +## Unicode Codepoint Symbols + +String literal types in RBS can contain Unicode codepoint escape sequences (`\uXXXX`). + +When the file encoding is UTF-8, the parser translates these escape sequences into the corresponding Unicode characters: + +```rbs +# In UTF-8 encoded files + +type t = "\u0123" # Translated to the actual Unicode character ģ +type s = "\u3042" # Translated to the actual Unicode character あ +``` + +When the file encoding is not UTF-8, Unicode escape sequences are interpreted literally as the string `\uXXXX`: + +```rbs +# In non-UTF-8 encoded files + +type t = "\u0123" # Remains as the literal string "\u0123" +``` + +## Implementation + +The RBS gem currently doesn't do anything special for file encoding. It relies on Ruby's encoding handling, specifically `Encoding.default_external` and `Encoding.default_internal`. + +`Encoding.default_external` is the encoding Ruby assumes when it reads external resources like files. The Ruby interpreter sets it based on the locale. `Encoding.default_internal` is the encoding Ruby converts the external resources to. The default is `nil` (no conversion). + +When your locale is set to use `UTF-8` encoding, `default_external` is `Encoding::UTF_8`. So the RBS file content read from disk will be UTF-8 encoded. + +### Parsing non-UTF-8 RBS source text + +If you want to work with another encoding, ensure the source string has an ASCII-compatible encoding. 
+ +```ruby +source = '"日本語"' +RBS::Parser.parse_type(source.encode(Encoding::EUC_JP)) # => Parses successfully +RBS::Parser.parse_type(source.encode(Encoding::UTF_32)) # => Returns `nil` since UTF-32 is not ASCII-compatible +``` + +### Specifying file encoding + +Currently, RBS doesn't support specifying file encoding directly. + +You can set `Encoding.default_external` while the gem loads RBS files from storage. diff --git a/ext/rbs_extension/class_constants.c b/ext/rbs_extension/class_constants.c index 5b61043fd..0fbdb1801 100644 --- a/ext/rbs_extension/class_constants.c +++ b/ext/rbs_extension/class_constants.c @@ -7,8 +7,6 @@ #include "rbs_extension.h" -VALUE RBS_Parser; - VALUE RBS; VALUE RBS_AST; VALUE RBS_AST_Declarations; diff --git a/ext/rbs_extension/legacy_location.c b/ext/rbs_extension/legacy_location.c index ba7f30684..20bfb1087 100644 --- a/ext/rbs_extension/legacy_location.c +++ b/ext/rbs_extension/legacy_location.c @@ -33,7 +33,7 @@ void rbs_loc_legacy_alloc_children(rbs_loc *loc, unsigned short cap) { check_children_max(cap); size_t s = RBS_LOC_CHILDREN_SIZE(cap); - loc->children = malloc(s); + loc->children = (rbs_loc_children *) malloc(s); *loc->children = (rbs_loc_children) { .len = 0, @@ -50,7 +50,7 @@ static void check_children_cap(rbs_loc *loc) { if (loc->children->len == loc->children->cap) { check_children_max(loc->children->cap + 1); size_t s = RBS_LOC_CHILDREN_SIZE(++loc->children->cap); - loc->children = realloc(loc->children, s); + loc->children = (rbs_loc_children *) realloc(loc->children, s); } } } @@ -86,12 +86,12 @@ void rbs_loc_free(rbs_loc *loc) { } static void rbs_loc_mark(void *ptr) { - rbs_loc *loc = ptr; + rbs_loc *loc = (rbs_loc *) ptr; rb_gc_mark(loc->buffer); } static size_t rbs_loc_memsize(const void *ptr) { - const rbs_loc *loc = ptr; + const rbs_loc *loc = (const rbs_loc *) ptr; if (loc->children == NULL) { return sizeof(rbs_loc); } else { @@ -117,7 +117,7 @@ static VALUE location_s_allocate(VALUE klass) { } rbs_loc 
*rbs_check_location(VALUE obj) { - return rb_check_typeddata(obj, &location_type); + return (rbs_loc *) rb_check_typeddata(obj, &location_type); } static VALUE location_initialize(VALUE self, VALUE buffer, VALUE start_pos, VALUE end_pos) { diff --git a/ext/rbs_extension/main.c b/ext/rbs_extension/main.c index 7884c5f8a..0bc0ce226 100644 --- a/ext/rbs_extension/main.c +++ b/ext/rbs_extension/main.c @@ -187,18 +187,10 @@ static VALUE parse_method_type_try(VALUE a) { } rbs_method_type_t *method_type = NULL; - rbs_parse_method_type(parser, &method_type); + rbs_parse_method_type(parser, &method_type, RB_TEST(arg->require_eof)); raise_error_if_any(parser, arg->buffer); - if (RB_TEST(arg->require_eof)) { - rbs_parser_advance(parser); - if (parser->current_token.type != pEOF) { - rbs_parser_set_error(parser, parser->current_token, true, "expected a token `%s`", rbs_token_type_str(pEOF)); - raise_error(parser->error, arg->buffer); - } - } - rbs_translation_context_t ctx = rbs_translation_context_create( &parser->constant_pool, arg->buffer, diff --git a/include/rbs/parser.h b/include/rbs/parser.h index f6cf14f5b..4e5dbf2d0 100644 --- a/include/rbs/parser.h +++ b/include/rbs/parser.h @@ -44,7 +44,7 @@ typedef struct rbs_error_t { * An RBS parser is a LL(3) parser. * */ typedef struct { - rbs_lexer_t *rbs_lexer_t; + rbs_lexer_t *lexer; rbs_token_t current_token; rbs_token_t next_token; /* The first lookahead token */ @@ -127,7 +127,7 @@ rbs_ast_comment_t *rbs_parser_get_comment(rbs_parser_t *parser, int subject_line void rbs_parser_set_error(rbs_parser_t *parser, rbs_token_t tok, bool syntax_error, const char *fmt, ...) 
RBS_ATTRIBUTE_FORMAT(4, 5); bool rbs_parse_type(rbs_parser_t *parser, rbs_node_t **type); -bool rbs_parse_method_type(rbs_parser_t *parser, rbs_method_type_t **method_type); +bool rbs_parse_method_type(rbs_parser_t *parser, rbs_method_type_t **method_type, bool require_eof); bool rbs_parse_signature(rbs_parser_t *parser, rbs_signature_t **signature); bool rbs_parse_type_params(rbs_parser_t *parser, bool module_type_params, rbs_node_list_t **params); diff --git a/include/rbs/string.h b/include/rbs/string.h index d07e1d7cb..452d63ac4 100644 --- a/include/rbs/string.h +++ b/include/rbs/string.h @@ -44,6 +44,4 @@ size_t rbs_string_len(const rbs_string_t self); */ bool rbs_string_equal(const rbs_string_t lhs, const rbs_string_t rhs); -unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string); - #endif diff --git a/include/rbs/util/rbs_unescape.h b/include/rbs/util/rbs_unescape.h index cc551cbdd..18e9e28bc 100644 --- a/include/rbs/util/rbs_unescape.h +++ b/include/rbs/util/rbs_unescape.h @@ -4,6 +4,7 @@ #include #include "rbs/util/rbs_allocator.h" #include "rbs/string.h" +#include "rbs/util/rbs_encoding.h" /** * Receives `rbs_parser_t` and `range`, which represents a string token or symbol token, and returns a string VALUE. @@ -18,6 +19,6 @@ * * @returns A new owned string that will be freed when the allocator is freed. * */ -rbs_string_t rbs_unquote_string(rbs_allocator_t *, const rbs_string_t input); +rbs_string_t rbs_unquote_string(rbs_allocator_t *, const rbs_string_t input, const rbs_encoding_t *encoding); #endif // RBS_RBS_UNESCAPE_H diff --git a/src/location.c b/src/location.c index c741537f1..a3f6284a2 100644 --- a/src/location.c +++ b/src/location.c @@ -8,7 +8,7 @@ void rbs_loc_alloc_children(rbs_allocator_t *allocator, rbs_location_t *loc, size_t capacity) { RBS_ASSERT(capacity <= sizeof(rbs_loc_entry_bitmap) * 8, "Capacity %zu is too large. 
Max is %zu", capacity, sizeof(rbs_loc_entry_bitmap) * 8); - loc->children = rbs_allocator_malloc_impl(allocator, RBS_LOC_CHILDREN_SIZE(capacity), rbs_alignof(rbs_loc_children)); + loc->children = (rbs_loc_children *) rbs_allocator_malloc_impl(allocator, RBS_LOC_CHILDREN_SIZE(capacity), rbs_alignof(rbs_loc_children)); loc->children->len = 0; loc->children->required_p = 0; diff --git a/src/parser.c b/src/parser.c index 40b1e0e4b..20ac2645a 100644 --- a/src/parser.c +++ b/src/parser.c @@ -20,12 +20,12 @@ strlen(str) \ ) -#define INTERN_TOKEN(parser, tok) \ - rbs_constant_pool_insert_shared_with_encoding( \ - &parser->constant_pool, \ - (const uint8_t *) rbs_peek_token(parser->rbs_lexer_t, tok), \ - rbs_token_bytes(tok), \ - (void *) parser->rbs_lexer_t->encoding \ +#define INTERN_TOKEN(parser, tok) \ + rbs_constant_pool_insert_shared_with_encoding( \ + &parser->constant_pool, \ + (const uint8_t *) rbs_peek_token(parser->lexer, tok), \ + rbs_token_bytes(tok), \ + parser->lexer->encoding \ ) #define KEYWORD_CASES \ @@ -128,7 +128,7 @@ static bool parse_simple(rbs_parser_t *parser, rbs_node_t **type); static rbs_string_t rbs_parser_peek_current_token(rbs_parser_t *parser) { rbs_range_t rg = parser->current_token.range; - const char *start = parser->rbs_lexer_t->string.start + rg.start.byte_pos; + const char *start = parser->lexer->string.start + rg.start.byte_pos; size_t length = rg.end.byte_pos - rg.start.byte_pos; return rbs_string_new(start, start + length); @@ -189,7 +189,7 @@ static bool parse_type_name(rbs_parser_t *parser, TypeNameKind kind, rbs_range_t .end = parser->current_token.range.end }; rbs_location_t *loc = rbs_location_new(ALLOCATOR(), namespace_range); - rbs_namespace_t *namespace = rbs_namespace_new(ALLOCATOR(), loc, path, absolute); + rbs_namespace_t *ns = rbs_namespace_new(ALLOCATOR(), loc, path, absolute); switch (parser->current_token.type) { case tLIDENT: @@ -213,7 +213,7 @@ success: { rbs_location_t *symbolLoc = 
rbs_location_current_token(parser); rbs_constant_id_t name = INTERN_TOKEN(parser, parser->current_token); rbs_ast_symbol_t *symbol = rbs_ast_symbol_new(ALLOCATOR(), symbolLoc, &parser->constant_pool, name); - *type_name = rbs_type_name_new(ALLOCATOR(), rbs_location_new(ALLOCATOR(), *rg), namespace, symbol); + *type_name = rbs_type_name_new(ALLOCATOR(), rbs_location_new(ALLOCATOR(), *rg), ns, symbol); return true; } @@ -317,7 +317,7 @@ static bool parse_function_param(rbs_parser_t *parser, rbs_types_function_param_ return false; } - rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser)); + rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser), parser->lexer->encoding); rbs_location_t *symbolLoc = rbs_location_current_token(parser); rbs_constant_id_t constant_id = rbs_constant_pool_insert_string(&parser->constant_pool, unquoted_str); rbs_ast_symbol_t *name = rbs_ast_symbol_new(ALLOCATOR(), symbolLoc, &parser->constant_pool, constant_id); @@ -334,9 +334,9 @@ static bool parse_function_param(rbs_parser_t *parser, rbs_types_function_param_ static rbs_constant_id_t intern_token_start_end(rbs_parser_t *parser, rbs_token_t start_token, rbs_token_t end_token) { return rbs_constant_pool_insert_shared_with_encoding( &parser->constant_pool, - (const uint8_t *) rbs_peek_token(parser->rbs_lexer_t, start_token), + (const uint8_t *) rbs_peek_token(parser->lexer, start_token), end_token.range.end.byte_pos - start_token.range.start.byte_pos, - parser->rbs_lexer_t->encoding + parser->lexer->encoding ); } @@ -902,7 +902,7 @@ static bool parse_record_attributes(rbs_parser_t *parser, rbs_hash_t **fields) { */ NODISCARD static bool parse_symbol(rbs_parser_t *parser, rbs_location_t *location, rbs_types_literal_t **symbol) { - size_t offset_bytes = parser->rbs_lexer_t->encoding->char_width((const uint8_t *) ":", (size_t) 1); + size_t offset_bytes = parser->lexer->encoding->char_width((const uint8_t *) 
":", (size_t) 1); size_t bytes = rbs_token_bytes(parser->current_token) - offset_bytes; rbs_ast_symbol_t *literal; @@ -911,7 +911,7 @@ static bool parse_symbol(rbs_parser_t *parser, rbs_location_t *location, rbs_typ case tSYMBOL: { rbs_location_t *symbolLoc = rbs_location_current_token(parser); - char *buffer = rbs_peek_token(parser->rbs_lexer_t, parser->current_token); + char *buffer = rbs_peek_token(parser->lexer, parser->current_token); rbs_constant_id_t constant_id = rbs_constant_pool_insert_shared( &parser->constant_pool, (const uint8_t *) buffer + offset_bytes, @@ -927,7 +927,7 @@ static bool parse_symbol(rbs_parser_t *parser, rbs_location_t *location, rbs_typ rbs_string_t symbol = rbs_string_new(current_token.start + offset_bytes, current_token.end); - rbs_string_t unquoted_symbol = rbs_unquote_string(ALLOCATOR(), symbol); + rbs_string_t unquoted_symbol = rbs_unquote_string(ALLOCATOR(), symbol, parser->lexer->encoding); rbs_constant_id_t constant_id = rbs_constant_pool_insert_string(&parser->constant_pool, unquoted_symbol); @@ -951,9 +951,9 @@ static bool parse_symbol(rbs_parser_t *parser, rbs_location_t *location, rbs_typ */ NODISCARD static bool parse_instance_type(rbs_parser_t *parser, bool parse_alias, rbs_node_t **type) { - TypeNameKind expected_kind = INTERFACE_NAME | CLASS_NAME; + TypeNameKind expected_kind = (TypeNameKind) (INTERFACE_NAME | CLASS_NAME); if (parse_alias) { - expected_kind |= ALIAS_NAME; + expected_kind = (TypeNameKind) (expected_kind | ALIAS_NAME); } rbs_range_t name_range; @@ -1157,7 +1157,7 @@ static bool parse_simple(rbs_parser_t *parser, rbs_node_t **type) { case tDQSTRING: { rbs_location_t *loc = rbs_location_current_token(parser); - rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser)); + rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), rbs_parser_peek_current_token(parser), parser->lexer->encoding); rbs_node_t *literal = (rbs_node_t *) rbs_ast_string_new(ALLOCATOR(), loc, 
unquoted_str); *type = (rbs_node_t *) rbs_types_literal_new(ALLOCATOR(), loc, literal); return true; @@ -1172,7 +1172,7 @@ static bool parse_simple(rbs_parser_t *parser, rbs_node_t **type) { return true; } case tUIDENT: { - const char *name_str = rbs_peek_token(parser->rbs_lexer_t, parser->current_token); + const char *name_str = rbs_peek_token(parser->lexer, parser->current_token); size_t name_len = rbs_token_bytes(parser->current_token); rbs_constant_id_t name = rbs_constant_pool_find(&parser->constant_pool, (const uint8_t *) name_str, name_len); @@ -1452,7 +1452,7 @@ static bool parser_pop_typevar_table(rbs_parser_t *parser) { method_type ::= {} type_params */ // TODO: Should this be NODISCARD? -bool rbs_parse_method_type(rbs_parser_t *parser, rbs_method_type_t **method_type) { +bool rbs_parse_method_type(rbs_parser_t *parser, rbs_method_type_t **method_type, bool require_eof) { rbs_parser_push_typevar_table(parser, false); rbs_range_t rg; @@ -1468,10 +1468,18 @@ bool rbs_parse_method_type(rbs_parser_t *parser, rbs_method_type_t **method_type parse_function_result *result = rbs_allocator_alloc(ALLOCATOR(), parse_function_result); CHECK_PARSE(parse_function(parser, false, &result)); + CHECK_PARSE(parser_pop_typevar_table(parser)); + rg.end = parser->current_token.range.end; type_range.end = rg.end; - CHECK_PARSE(parser_pop_typevar_table(parser)); + if (require_eof) { + rbs_parser_advance(parser); + if (parser->current_token.type != pEOF) { + rbs_parser_set_error(parser, parser->current_token, true, "expected a token `%s`", rbs_token_type_str(pEOF)); + return false; + } + } rbs_location_t *loc = rbs_location_new(ALLOCATOR(), rg); rbs_loc_alloc_children(ALLOCATOR(), loc, 2); @@ -1598,14 +1606,16 @@ static bool parse_annotation(rbs_parser_t *parser, rbs_ast_annotation_t **annota rbs_range_t rg = parser->current_token.range; size_t offset_bytes = - parser->rbs_lexer_t->encoding->char_width((const uint8_t *) "%", (size_t) 1) + - 
parser->rbs_lexer_t->encoding->char_width((const uint8_t *) "a", (size_t) 1); + parser->lexer->encoding->char_width((const uint8_t *) "%", (size_t) 1) + + parser->lexer->encoding->char_width((const uint8_t *) "a", (size_t) 1); rbs_string_t str = rbs_string_new( - parser->rbs_lexer_t->string.start + rg.start.byte_pos + offset_bytes, - parser->rbs_lexer_t->string.end + parser->lexer->string.start + rg.start.byte_pos + offset_bytes, + parser->lexer->string.end ); - unsigned int open_char = rbs_utf8_string_to_codepoint(str); + + // Assumes the input is ASCII compatible + unsigned int open_char = str.start[0]; unsigned int close_char; @@ -1630,8 +1640,8 @@ static bool parse_annotation(rbs_parser_t *parser, rbs_ast_annotation_t **annota return false; } - size_t open_bytes = parser->rbs_lexer_t->encoding->char_width((const uint8_t *) &open_char, (size_t) 1); - size_t close_bytes = parser->rbs_lexer_t->encoding->char_width((const uint8_t *) &close_char, (size_t) 1); + size_t open_bytes = parser->lexer->encoding->char_width((const uint8_t *) &open_char, (size_t) 1); + size_t close_bytes = parser->lexer->encoding->char_width((const uint8_t *) &close_char, (size_t) 1); rbs_string_t current_token = rbs_parser_peek_current_token(parser); size_t total_offset = offset_bytes + open_bytes; @@ -1695,9 +1705,9 @@ static bool parse_method_name(rbs_parser_t *parser, rbs_range_t *range, rbs_ast_ rbs_constant_id_t constant_id = rbs_constant_pool_insert_shared_with_encoding( &parser->constant_pool, - (const uint8_t *) parser->rbs_lexer_t->string.start + range->start.byte_pos, + (const uint8_t *) parser->lexer->string.start + range->start.byte_pos, range->end.byte_pos - range->start.byte_pos, - parser->rbs_lexer_t->encoding + parser->lexer->encoding ); rbs_location_t *symbolLoc = rbs_location_new(ALLOCATOR(), *range); @@ -1718,7 +1728,7 @@ static bool parse_method_name(rbs_parser_t *parser, rbs_range_t *range, rbs_ast_ } case tQIDENT: { rbs_string_t string = 
rbs_parser_peek_current_token(parser); - rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), string); + rbs_string_t unquoted_str = rbs_unquote_string(ALLOCATOR(), string, parser->lexer->encoding); rbs_constant_id_t constant_id = rbs_constant_pool_insert_string(&parser->constant_pool, unquoted_str); rbs_location_t *symbolLoc = rbs_location_current_token(parser); *symbol = rbs_ast_symbol_new(ALLOCATOR(), symbolLoc, &parser->constant_pool, constant_id); @@ -1879,7 +1889,7 @@ static bool parse_member_def(rbs_parser_t *parser, bool instance_only, bool acce case pLBRACKET: case pQUESTION: { rbs_method_type_t *method_type = NULL; - CHECK_PARSE(rbs_parse_method_type(parser, &method_type)); + CHECK_PARSE(rbs_parse_method_type(parser, &method_type, false)); overload_range.end = parser->current_token.range.end; rbs_location_t *loc = rbs_location_new(ALLOCATOR(), overload_range); @@ -2021,7 +2031,7 @@ static bool parse_mixin_member(rbs_parser_t *parser, bool from_interface, rbs_po rbs_type_name_t *name = NULL; CHECK_PARSE(class_instance_name( parser, - from_interface ? INTERFACE_NAME : (INTERFACE_NAME | CLASS_NAME), + from_interface ? 
INTERFACE_NAME : (TypeNameKind) (INTERFACE_NAME | CLASS_NAME), args, &name_range, &args_range, @@ -2486,7 +2496,7 @@ static bool parse_module_self_types(rbs_parser_t *parser, rbs_node_list_t *array rbs_range_t name_range; rbs_type_name_t *module_name = NULL; - CHECK_PARSE(parse_type_name(parser, CLASS_NAME | INTERFACE_NAME, &name_range, &module_name)); + CHECK_PARSE(parse_type_name(parser, (TypeNameKind) (CLASS_NAME | INTERFACE_NAME), &name_range, &module_name)); self_range.end = name_range.end; rbs_node_list_t *args = rbs_node_list_new(ALLOCATOR()); @@ -2949,7 +2959,7 @@ static bool parse_decl(rbs_parser_t *parser, rbs_node_t **decl) { | {} <> (empty -- returns empty namespace) */ NODISCARD -static bool parse_namespace(rbs_parser_t *parser, rbs_range_t *rg, rbs_namespace_t **namespace) { +static bool parse_namespace(rbs_parser_t *parser, rbs_range_t *rg, rbs_namespace_t **out_ns) { bool is_absolute = false; if (parser->next_token.type == pCOLON2) { @@ -2980,7 +2990,7 @@ static bool parse_namespace(rbs_parser_t *parser, rbs_range_t *rg, rbs_namespace } } - *namespace = rbs_namespace_new(ALLOCATOR(), rbs_location_new(ALLOCATOR(), *rg), path, is_absolute); + *out_ns = rbs_namespace_new(ALLOCATOR(), rbs_location_new(ALLOCATOR(), *rg), path, is_absolute); return true; } @@ -2995,8 +3005,8 @@ NODISCARD static bool parse_use_clauses(rbs_parser_t *parser, rbs_node_list_t *clauses) { while (true) { rbs_range_t namespace_range = NULL_RANGE; - rbs_namespace_t *namespace = NULL; - CHECK_PARSE(parse_namespace(parser, &namespace_range, &namespace)); + rbs_namespace_t *ns = NULL; + CHECK_PARSE(parse_namespace(parser, &namespace_range, &ns)); switch (parser->next_token.type) { case tLIDENT: @@ -3010,7 +3020,7 @@ static bool parse_use_clauses(rbs_parser_t *parser, rbs_node_list_t *clauses) { rbs_location_t *symbolLoc = rbs_location_current_token(parser); rbs_ast_symbol_t *symbol = rbs_ast_symbol_new(ALLOCATOR(), symbolLoc, &parser->constant_pool, INTERN_TOKEN(parser, 
parser->current_token)); - rbs_type_name_t *type_name = rbs_type_name_new(ALLOCATOR(), rbs_location_new(ALLOCATOR(), type_name_range), namespace, symbol); + rbs_type_name_t *type_name = rbs_type_name_new(ALLOCATOR(), rbs_location_new(ALLOCATOR(), type_name_range), ns, symbol); rbs_range_t keyword_range = NULL_RANGE; rbs_range_t new_name_range = NULL_RANGE; @@ -3053,7 +3063,7 @@ static bool parse_use_clauses(rbs_parser_t *parser, rbs_node_list_t *clauses) { rbs_loc_add_required_child(loc, INTERN("namespace"), namespace_range); rbs_loc_add_required_child(loc, INTERN("star"), star_range); - rbs_ast_directives_use_wildcard_clause_t *clause = rbs_ast_directives_use_wildcard_clause_new(ALLOCATOR(), loc, namespace); + rbs_ast_directives_use_wildcard_clause_t *clause = rbs_ast_directives_use_wildcard_clause_new(ALLOCATOR(), loc, ns); rbs_node_list_append(clauses, (rbs_node_t *) clause); break; @@ -3100,8 +3110,8 @@ static bool parse_use_directive(rbs_parser_t *parser, rbs_ast_directives_use_t * } static rbs_ast_comment_t *parse_comment_lines(rbs_parser_t *parser, rbs_comment_t *com) { - size_t hash_bytes = parser->rbs_lexer_t->encoding->char_width((const uint8_t *) "#", (size_t) 1); - size_t space_bytes = parser->rbs_lexer_t->encoding->char_width((const uint8_t *) " ", (size_t) 1); + size_t hash_bytes = parser->lexer->encoding->char_width((const uint8_t *) "#", (size_t) 1); + size_t space_bytes = parser->lexer->encoding->char_width((const uint8_t *) " ", (size_t) 1); rbs_buffer_t rbs_buffer; rbs_buffer_init(ALLOCATOR(), &rbs_buffer); @@ -3109,14 +3119,16 @@ static rbs_ast_comment_t *parse_comment_lines(rbs_parser_t *parser, rbs_comment_ for (size_t i = 0; i < com->line_tokens_count; i++) { rbs_token_t tok = com->line_tokens[i]; - const char *comment_start = parser->rbs_lexer_t->string.start + tok.range.start.byte_pos + hash_bytes; + const char *comment_start = parser->lexer->string.start + tok.range.start.byte_pos + hash_bytes; size_t comment_bytes = 
RBS_RANGE_BYTES(tok.range) - hash_bytes; rbs_string_t str = rbs_string_new( comment_start, - parser->rbs_lexer_t->string.end + parser->lexer->string.end ); - unsigned char c = rbs_utf8_string_to_codepoint(str); + + // Assumes the input is ASCII compatible + unsigned char c = (str.start < str.end) ? (unsigned char) str.start[0] : 0; /* 0 when a bare '#' ends the input, as the removed helper returned */ if (c == ' ') { comment_start += space_bytes; @@ -3332,7 +3344,7 @@ void rbs_parser_advance(rbs_parser_t *parser) { break; } - parser->next_token3 = rbs_lexer_next_token(parser->rbs_lexer_t); + parser->next_token3 = rbs_lexer_next_token(parser->lexer); if (parser->next_token3.type == tCOMMENT) { // skip @@ -3424,7 +3436,7 @@ rbs_parser_t *rbs_parser_new(rbs_string_t string, const rbs_encoding_t *encoding rbs_parser_t *parser = rbs_allocator_alloc(allocator, rbs_parser_t); *parser = (rbs_parser_t) { - .rbs_lexer_t = lexer, + .lexer = lexer, .current_token = NullToken, .next_token = NullToken, diff --git a/src/string.c b/src/string.c index cc7de5e98..6081739f9 100644 --- a/src/string.c +++ b/src/string.c @@ -1,59 +1,10 @@ #include "rbs/string.h" -#include "rbs/defines.h" #include #include #include #include -unsigned int rbs_utf8_string_to_codepoint(const rbs_string_t string) { - unsigned int codepoint = 0; - int remaining_bytes = 0; - - const char *s = string.start; - const char *end = string.end; - - if (s >= end) return 0; // End of string - - if (RBS_LIKELY((*s & 0x80) == 0)) { - // Single byte character (0xxxxxxx) - return *s; - } else if ((*s & 0xE0) == 0xC0) { - // Two byte character (110xxxxx 10xxxxxx) - codepoint = *s & 0x1F; - remaining_bytes = 1; - } else if ((*s & 0xF0) == 0xE0) { - // Three byte character (1110xxxx 10xxxxxx 10xxxxxx) - codepoint = *s & 0x0F; - remaining_bytes = 2; - } else if ((*s & 0xF8) == 0xF0) { - // Four byte character (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) - codepoint = *s & 0x07; - remaining_bytes = 3; - } else { - // Invalid UTF-8 sequence - return 0xFFFD; // Unicode replacement character - } - - s++; - while (remaining_bytes > 0 && s < end) { - 
if ((*s & 0xC0) != 0x80) { - // Invalid continuation byte - return 0xFFFD; - } - codepoint = (codepoint << 6) | (*s & 0x3F); - s++; - remaining_bytes--; - } - - if (remaining_bytes > 0) { - // Incomplete sequence - return 0xFFFD; - } - - return codepoint; -} - rbs_string_t rbs_string_new(const char *start, const char *end) { return (rbs_string_t) { .start = start, diff --git a/src/util/rbs_allocator.c b/src/util/rbs_allocator.c index f0be8097f..b13240940 100644 --- a/src/util/rbs_allocator.c +++ b/src/util/rbs_allocator.c @@ -57,7 +57,7 @@ static size_t get_system_page_size(void) { static rbs_allocator_page_t *rbs_allocator_page_new(size_t payload_size) { const size_t page_header_size = sizeof(rbs_allocator_page_t); - rbs_allocator_page_t *page = malloc(page_header_size + payload_size); + rbs_allocator_page_t *page = (rbs_allocator_page_t *) malloc(page_header_size + payload_size); page->size = payload_size; page->used = 0; @@ -65,7 +65,7 @@ static rbs_allocator_page_t *rbs_allocator_page_new(size_t payload_size) { } rbs_allocator_t *rbs_allocator_init(void) { - rbs_allocator_t *allocator = malloc(sizeof(rbs_allocator_t)); + rbs_allocator_t *allocator = (rbs_allocator_t *) malloc(sizeof(rbs_allocator_t)); const size_t system_page_size = get_system_page_size(); diff --git a/src/util/rbs_constant_pool.c b/src/util/rbs_constant_pool.c index cb2b92b9a..b3441772a 100644 --- a/src/util/rbs_constant_pool.c +++ b/src/util/rbs_constant_pool.c @@ -57,8 +57,8 @@ rbs_constant_pool_resize(rbs_constant_pool_t *pool) { void *next = calloc(next_capacity, element_size); if (next == NULL) return false; - rbs_constant_pool_bucket_t *next_buckets = next; - rbs_constant_t *next_constants = (void *) (((char *) next) + next_capacity * sizeof(rbs_constant_pool_bucket_t)); + rbs_constant_pool_bucket_t *next_buckets = (rbs_constant_pool_bucket_t *) next; + rbs_constant_t *next_constants = (rbs_constant_t *) (((char *) next) + next_capacity * sizeof(rbs_constant_pool_bucket_t)); // For each 
bucket in the current constant pool, find the index in the // next constant pool, and insert it. @@ -111,8 +111,8 @@ bool rbs_constant_pool_init(rbs_constant_pool_t *pool, uint32_t capacity) { void *memory = calloc(capacity, element_size); if (memory == NULL) return false; - pool->buckets = memory; - pool->constants = (void *) (((char *) memory) + capacity * sizeof(rbs_constant_pool_bucket_t)); + pool->buckets = (rbs_constant_pool_bucket_t *) memory; + pool->constants = (rbs_constant_t *) (((char *) memory) + capacity * sizeof(rbs_constant_pool_bucket_t)); pool->size = 0; pool->capacity = capacity; return true; diff --git a/src/util/rbs_unescape.c b/src/util/rbs_unescape.c index 909c9c41d..a7352f398 100644 --- a/src/util/rbs_unescape.c +++ b/src/util/rbs_unescape.c @@ -1,4 +1,5 @@ #include "rbs/util/rbs_unescape.h" +#include "rbs/util/rbs_encoding.h" #include #include #include @@ -42,20 +43,44 @@ static int octal_to_int(const char *octal, int length) { return result; } -int rbs_utf8_codelen(unsigned int c) { - if (c <= 0x7F) return 1; - if (c <= 0x7FF) return 2; - if (c <= 0xFFFF) return 3; - if (c <= 0x10FFFF) return 4; - return 1; // Invalid Unicode codepoint, treat as 1 byte +// Fills buf starting at index 'start' with the UTF-8 encoding of 'codepoint'. +// Returns the number of bytes written, or 0 when the output is not changed. 
+// +size_t rbs_utf8_fill_codepoint(char *buf, size_t start, size_t end, unsigned int codepoint) { + if (start + 4 > end) { + return 0; + } + + if (codepoint <= 0x7F) { + buf[start] = codepoint & 0x7F; + return 1; + } else if (codepoint <= 0x7FF) { + buf[start + 0] = 0xC0 | ((codepoint >> 6) & 0x1F); + buf[start + 1] = 0x80 | (codepoint & 0x3F); + return 2; + } else if (codepoint <= 0xFFFF) { + buf[start + 0] = 0xE0 | ((codepoint >> 12) & 0x0F); + buf[start + 1] = 0x80 | ((codepoint >> 6) & 0x3F); + buf[start + 2] = 0x80 | (codepoint & 0x3F); + return 3; + } else if (codepoint <= 0x10FFFF) { + buf[start + 0] = 0xF0 | ((codepoint >> 18) & 0x07); + buf[start + 1] = 0x80 | ((codepoint >> 12) & 0x3F); + buf[start + 2] = 0x80 | ((codepoint >> 6) & 0x3F); + buf[start + 3] = 0x80 | (codepoint & 0x3F); + return 4; + } else { + return 0; + } } -rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t string, bool is_double_quote) { +rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t string, bool is_double_quote, bool is_unicode) { if (!string.start) return RBS_STRING_NULL; size_t len = string.end - string.start; const char *input = string.start; + // The output cannot be longer than the input even after unescaping. char *output = rbs_allocator_alloc_many(allocator, len + 1, char); if (!output) return RBS_STRING_NULL; @@ -79,9 +104,21 @@ rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t stri i += hex_len + 2; } else if (input[i + 1] == 'u' && i + 5 < len) { // Unicode escape - int value = hex_to_int(input + i + 2, 4); - output[j++] = (char) value; - i += 6; + + if (is_unicode) { + // The UTF-8 representation is at most 4 bytes, shorter than the input length. 
+ int value = hex_to_int(input + i + 2, 4); + j += rbs_utf8_fill_codepoint(output, j, len + 1, value); + i += 6; + } else { + // Copy the escape sequence as-is + output[j++] = input[i++]; + output[j++] = input[i++]; + output[j++] = input[i++]; + output[j++] = input[i++]; + output[j++] = input[i++]; + output[j++] = input[i++]; + } } else { // Other escapes int found = 0; @@ -114,18 +151,17 @@ rbs_string_t unescape_string(rbs_allocator_t *allocator, const rbs_string_t stri return rbs_string_new(output, output + j); } -rbs_string_t rbs_unquote_string(rbs_allocator_t *allocator, rbs_string_t input) { - unsigned int first_char = rbs_utf8_string_to_codepoint(input); - size_t byte_length = rbs_string_len(input); +rbs_string_t rbs_unquote_string(rbs_allocator_t *allocator, rbs_string_t input, const rbs_encoding_t *encoding) { + unsigned int first_char = (input.start && input.start < input.end) ? (unsigned char) input.start[0] : 0; /* 0 for NULL or empty input instead of reading past the end */ + + const char *new_start = input.start; + const char *new_end = input.end; - ptrdiff_t start_offset = 0; if (first_char == '"' || first_char == '\'' || first_char == '`') { - int bs = rbs_utf8_codelen(first_char); - start_offset += bs; - byte_length -= 2 * bs; + new_start += 1; + new_end -= (new_end > new_start) ? 1 : 0; /* a lone quote character yields an empty string, never end < start */ } - const char *new_start = input.start + start_offset; - rbs_string_t string = rbs_string_new(new_start, new_start + byte_length); - return unescape_string(allocator, string, first_char == '"'); + rbs_string_t string = rbs_string_new(new_start, new_end); + return unescape_string(allocator, string, first_char == '"', encoding == RBS_ENCODING_UTF_8_ENTRY); } diff --git a/templates/ext/rbs_extension/class_constants.c.erb b/templates/ext/rbs_extension/class_constants.c.erb index 6b131a7bd..0392c9c60 100644 --- a/templates/ext/rbs_extension/class_constants.c.erb +++ b/templates/ext/rbs_extension/class_constants.c.erb @@ -1,7 +1,5 @@ #include "rbs_extension.h" -VALUE RBS_Parser; - VALUE RBS; VALUE RBS_AST; VALUE RBS_AST_Declarations; diff --git a/test/rbs/parser_test.rb b/test/rbs/parser_test.rb index 78f47b296..4cf3256b1 
100644 --- a/test/rbs/parser_test.rb +++ b/test/rbs/parser_test.rb @@ -689,6 +689,27 @@ def test_parse_method_type end end + def test_parse_method_type__require_eof + RBS::Parser.parse_method_type(buffer("-> Foo extra input")).tap do |method_type| + assert_instance_of RBS::MethodType, method_type + assert_equal "-> Foo", method_type.location.source + end + + RBS::Parser.parse_method_type(buffer("-> Foo extra input"), require_eof: false).tap do |method_type| + assert_instance_of RBS::MethodType, method_type + assert_equal "-> Foo", method_type.location.source + end + + assert_raises RBS::ParsingError do + RBS::Parser.parse_method_type(buffer("-> Foo extra input"), require_eof: true) + end.tap do |exn| + assert_equal( + "test.rbs:1:7...1:12: Syntax error: expected a token `pEOF`, token=`extra` (tLIDENT)", + exn.message + ) + end + end + def test_duplicate_keyword RBS::Parser.parse_method_type(buffer("(top foo, foo: top) -> void")).tap do |method_type| assert_equal "top foo, foo: top", method_type.type.param_to_s diff --git a/test/rbs/type_parsing_test.rb b/test/rbs/type_parsing_test.rb index 8195db7e5..2ee195f15 100644 --- a/test/rbs/type_parsing_test.rb +++ b/test/rbs/type_parsing_test.rb @@ -883,4 +883,40 @@ def test_escape_sequences assert_equal "\x00", type.types[2].literal end end + + def test_parse__string_octal_escape + Parser.parse_type('"\100"').yield_self do |type| + assert_equal "\100", type.literal + end + Parser.parse_type('"\400"').yield_self do |type| + assert_equal "\400", type.literal + end + end + + def test_parse__string_hex_escape + Parser.parse_type('"\x10"').yield_self do |type| + assert_equal "\x10", type.literal + end + Parser.parse_type('"\x40"').yield_self do |type| + assert_equal "\x40", type.literal + end + end + + def test_parse__string_unicode_escape + Parser.parse_type('"\u005a"').yield_self do |type| + assert_equal "Z", type.literal + end + Parser.parse_type('"[\u30eb]"').yield_self do |type| + assert_equal "[ル]", type.literal + end + end + + def 
test_parse__string_unicode_escape__non_unicode + Parser.parse_type('"\u005a"'.encode(Encoding::ASCII)).yield_self do |type| + assert_equal "\\u005a", type.literal + end + Parser.parse_type('"[\u30eb]"'.encode(Encoding::Shift_JIS)).yield_self do |type| + assert_equal "[\\u30eb]", type.literal + end + end end