hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit 77613ea2dfba5bb22866968995c36a480861a3b3
parent 635eeda5b00835b94c5b696df140e8bf28c53134
Author: KAAtheWise <KAAtheWise@protonmail.com>
Date:   Mon, 30 Jan 2023 13:17:33 +0000

regex: Convert ranges to use u32 representation of runes

Changed range expressions to cast runes to u32 instead of u8.  This
means the range expressions now support multibyte codepoints.

Also added tests for different alphabets: Cyrillic, Polish, and Thai.
In addition, edited the testing code to use rune length instead of byte
length.

Signed-off-by: Andrey Kolchin <kaathewise@protonmail.com>

Diffstat:
M regex/+test.ha | 30 +++++++++++++++++++++++++++++-
M regex/regex.ha | 16 +++++-----------
2 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/regex/+test.ha b/regex/+test.ha @@ -1,6 +1,7 @@ // License: MPL-2.0 // (c) 2022 Vlad-Stefan Harbuz <vlad@vladh.net> use fmt; +use strings; type matchres = enum { MATCH, NOMATCH, ERROR }; @@ -288,6 +289,30 @@ fn run_findall_case( (`^test[[:upper:]]+$`, "testa", matchres::NOMATCH, 0, -1), (`^test[[:upper:]]+$`, "testA", matchres::MATCH, 0, -1), (`^test[[:xdigit:]]+$`, "testCAFE", matchres::MATCH, 0, -1), + // range expressions + (`[a-z]+`, "onlylatinletters", matchres::MATCH, 0, -1), + (`[x-z]+`, "xyz", matchres::MATCH, 0, -1), + (`[x-z]+`, "wxyz", matchres::MATCH, 1, 4), + (`[a-e]+`, "-abcdefg", matchres::MATCH, 1, 6), + (`[a-z]`, "-1234567890@#$%^&*(!)-+=", matchres::NOMATCH, 0, -1), + (`[0-9]+`, "9246", matchres::MATCH, 0, -1), + // # Cyrillic + (`[а-я]+`, "кирилица", matchres::MATCH, 0, -1), + (`[а-д]`, "е", matchres::NOMATCH, 0, -1), + (`[я-ф]`, "-", matchres::ERROR, 0, -1), + (`[А-Я]+`, "АБВГд", matchres::MATCH, 0, 4), + // because Macedonian uses cyrrilics, the broad range does + // not include special symbols + (`[а-ш]+`, "ѓљњќ", matchres::NOMATCH, 0, -1), + // # Polish Alphabet + (`[a-ż]+`, "polskialfabet", matchres::MATCH, 0, -1), + (`[a-ż]+`, "źśółęćą", matchres::MATCH, 0, -1), + // because Polish alphabet uses Latin with special characters, + // other characters can be accepted + (`[a-ż]+`, "englishspeak", matchres::MATCH, 0, -1), + (`[a-ż]+`, "{|}~", matchres::MATCH, 0, -1), + // # Thai Alphabet + (`[ก-ฮ]+`, "ศอผจข", matchres::MATCH, 0, -1), // [:alpha:] etc. 
plus extra characters (`^test[[:digit:]][[:alpha:]]$`, "test1a", matchres::MATCH, 0, -1), (`^test[[:digit:]][[:alpha:]]$`, "testa1", matchres::NOMATCH, 0, -1), @@ -538,7 +563,10 @@ fn run_findall_case( const should_match = cases[i].2; const start = cases[i].3; const end = if (cases[i].4 == -1) { - yield len(string): int; + // workaround to get the length in codepoints + let runes = strings::runes(string); + defer free(runes); + yield len(runes): int; } else { yield cases[i].4; }; diff --git a/regex/regex.ha b/regex/regex.ha @@ -61,7 +61,7 @@ export type charclass = enum { export type charset = [](charset_lit_item | charset_range_item | charset_class_item), charset_lit_item = rune, - charset_range_item = (u8, u8), + charset_range_item = (u32, u32), charset_class_item = *fn(c: rune) bool; const charclass_map: [](str, *fn(c: rune) bool) = [ @@ -173,13 +173,8 @@ fn handle_bracket( return `No character class after '[:'`: error; }; } else if (is_range) { - const start_enc = utf8::encoderune(r); - assert(len(start_enc) == 1, "Character ranges do not currently support characters larger than one byte"); - const start_b = start_enc[0]; - - const end_enc = utf8::encoderune(range_end as rune); - assert(len(end_enc) == 1, "Character ranges do not currently support characters larger than one byte"); - const end_b = end_enc[0]; + const start_b = r: u32; + const end_b = range_end as rune: u32; if (end_b < start_b) { return `Decending bracket expression range '[z-a]'`: error; @@ -600,9 +595,8 @@ fn run_thread( break; }; case let range: charset_range_item => - const r_enc = utf8::encoderune(r); - assert(len(r_enc) == 1, "Character ranges do not currently support characters larger than one byte"); - const r_b = r_enc[0]; + const r_b = r: u32; + if (r_b >= range.0 && r_b <= range.1) { // Succeeded if positive match // Failed if negative match