# # patch "ChangeLog" # from [7dcbc08486ee7d40e11f95d1a9b4e1a7c9084a30] # to [520f6d02017bcd27e02e240763bfdfbde44d8620] # # patch "transforms.cc" # from [a84ed105809f55ba4097a4559ecc53ec21f9ab17] # to [9630106a94b0a726c54ca4187f36f3311bfa0ece] # # patch "transforms.hh" # from [a6b73afd42a8ffa3ff52368cc3b5c7b8e445b713] # to [230e09c455faf1c7be4a2af700e5da6a6cfc0a4e] # --- ChangeLog +++ ChangeLog @@ -1,3 +1,11 @@ +2005-06-27 Nathaniel Smith + + * transforms.cc (glob_to_regex, globs_to_regex, regexes_to_regex): + Choose "regex" as standard spelling. Clean up code, add code for + handling sets, start improving tests (don't currently pass). + * transforms.hh (glob_to_regex, globs_to_regex, regexes_to_regex): + Prototype. + 2005-06-25 Nathaniel Smith * netsync.cc: Tweak comment. --- transforms.cc +++ transforms.cc @@ -837,38 +837,36 @@ dst += linesep_str; } -// glob_to_regexp converts a sh file glob to a regexp. The regexp should -// be usable by the Boost regexp library. +// glob_to_regex converts a sh file glob to a regex. The regex should +// be usable by the Boost regex library. // // Pattern tranformation: // // - Any character except those described below are copied as they are. // - The backslash (\) escapes the following character. The escaping -// backslash is copied to the regexp along with the following character. -// - * is transformed to .* in the regexp. -// - ? is transformed to . in the regexp. -// - { is transformed to ( in the regexp, unless within [ and ]. -// - } is transformed to ) in the regexp, unless within [ and ]. -// - , is transformed to | in the regexp, if within { and } and not +// backslash is copied to the regex along with the following character. +// - * is transformed to .* in the regex. +// - ? is transformed to . in the regex. +// - { is transformed to ( in the regex, unless within [ and ]. +// - } is transformed to ) in the regex, unless within [ and ]. 
+// - , is transformed to | in the regex, if within { and } and not // within [ and ]. // - ^ is escaped unless it comes directly after an unescaped [. -// - ! is transformed to ^ in the regexp if it comes directly after an +// - ! is transformed to ^ in the regex if it comes directly after an // unescaped [. // - ] directly following an unescaped [ is escaped. -string glob_to_regexp(const string & glob) +void +glob_to_regex(std::string const & glob, std::string & regex) { int in_braces = 0; // counter for levels if {} bool in_brackets = false; // flags if we're inside a [], which // has higher precedence than {}. // Also, [ is accepted inside [] unescaped. bool this_was_opening_bracket = false; - string tmp; - tmp.reserve(glob.size() * 2); + regex.reserve(glob.size() * 2); -#ifdef BUILD_UNIT_TESTS - cerr << "DEBUG[glob_to_regexp]: input = \"" << glob << "\"" << endl; -#endif + L(F("glob_to_regex: input = '%s'\n") % glob); for (string::const_iterator i = glob.begin(); i != glob.end(); ++i) { @@ -880,7 +878,7 @@ if (in_brackets && last_was_opening_bracket && (c == '!' 
|| c == '^')) { - tmp += '^'; + regex += '^'; if (++i == glob.end()) break; c = *i; @@ -888,10 +886,10 @@ if (c == '\\') { - tmp += c; + regex += c; if (++i == glob.end()) break; - tmp += *i; + regex += *i; } else if (in_brackets) { @@ -901,7 +899,7 @@ if (!last_was_opening_bracket) { in_brackets = false; - tmp += c; + regex += c; break; } // Trickling through to the standard character conversion, @@ -910,9 +908,9 @@ default: if (!(isalnum(c) || c == '_')) { - tmp += '\\'; + regex += '\\'; } - tmp += c; + regex += c; break; } } @@ -921,30 +919,30 @@ switch(c) { case '*': - tmp += ".*"; + regex += ".*"; break; case '?': - tmp += '.'; + regex += '.'; break; case '{': in_braces++; - tmp += '('; + regex += '('; break; case '}': N(in_braces != 0, F("trying to end a brace expression in a glob when none is started")); - tmp += ')'; + regex += ')'; in_braces--; break; case '[': in_brackets = true; this_was_opening_bracket = true; - tmp += c; + regex += c; break; case ',': if (in_braces > 0) { - tmp += '|'; + regex += '|'; break; } // Trickling through to default: here, since a comma outside of @@ -952,9 +950,9 @@ default: if (!(isalnum(c) || c == '_')) { - tmp += '\\'; + regex += '\\'; } - tmp += c; + regex += c; break; } } @@ -965,15 +963,43 @@ N(in_braces == 0, F("run-away brace expression in glob")); -#ifdef BUILD_UNIT_TESTS - cerr << "DEBUG[glob_to_regexp]: output = \"" << tmp << "\"" << endl; -#endif + L(F("glob_to_regex: output = '%s'\n") % regex); +} - return tmp; +void +globs_to_regex(std::set const & globs, std::string & regex) +{ + std::set regexes; + for (std::set::const_iterator i = globs.begin(); i != globs.end(); ++i) + { + std::string r; + glob_to_regex(*i, r); + regexes.insert(r); + } + regexes_to_regex(regexes, regex); } +void +regexes_to_regex(std::set const & regexes, std::string & regex) +{ + I(regexes.size() > 0); + if (regexes.size() == 1) + { + regex = *regexes.begin(); + return; + } + bool first = true; + for (std::set::const_iterator i = 
regexes.begin(); i != regexes.end(); ++i) + { + if (!first) + regex += "|"; + regex += "(" + *i + ")"; + } +} + #ifdef BUILD_UNIT_TESTS #include "unit_tests.hh" +#include static void enc_test() @@ -1250,13 +1276,45 @@ check_idna_encoding(); } -static void glob_to_regexp_test() +static void +check_glob_to_regex(std::string const & glob, std::string const & regex) { - BOOST_CHECK(glob_to_regexp("abc,v") == "abc\\,v"); - BOOST_CHECK(glob_to_regexp("foo[12m,]") == "foo[12m\\,]"); + std::string translated_regex; + glob_to_regex(glob, translated_regex); + BOOST_CHECK(translated_regex == regex); +} + +static void +check_glob_match(std::string const & glob, std::string const & line, + bool matchp) +{ + std::string regex; + glob_to_regex(glob, regex); + BOOST_CHECK(boost::regex_match(line, boost::regex(regex, boost::regex::extended)) == matchp); +} + +static void glob_to_regex_test() +{ + // our converter is very conservative about metacharacters, and quotes all + // non-alphanumeric characters + check_glob_to_regex("abc,v", "abc\\,v"); + check_glob_to_regex("foo[12m,]", "foo[12m\\,]"); + check_glob_to_regex("foo{bar,baz*}", "foo(bar|baz.*)"); // A full fledged, use all damn features test... 
- BOOST_CHECK(glob_to_regexp("foo.{bar*,cookie?{haha,hehe[^\\123!,]}}[!]a^b]") - == "foo\\.(bar.*|cookie.(haha|hehe[^\\123\\!\\,]))[^\\]a\\^b]"); + check_glob_to_regex("foo.{bar*,cookie?{haha,hehe[^\\123!,]}}[!]a^b]", + "foo\\.(bar.*|cookie.(haha|hehe[^\\123\\!\\,]))[^\\]a\\^b]"); + + check_glob_match("foobar", "foobar", true); + check_glob_match("foobar", "foobaz", false); + check_glob_match("foo*", "foobar", true); + check_glob_match("foo{bar,baz}", "foobar", true); + check_glob_match("foo{bar,baz}", "foobaz", true); + check_glob_match("foo{bar,baz}", "bazbar", false); + check_glob_match("foo[123]", "foo1", true); + check_glob_match("foo[123]", "foo3", true); + check_glob_match("foo[123]", "foo5", false); + check_glob_match("foo[123]", "foo1a", false); + check_glob_match("foo,bar()", "foo,bar()", false); } void @@ -1270,7 +1328,7 @@ suite->add(BOOST_TEST_CASE(&join_lines_test)); suite->add(BOOST_TEST_CASE(&strip_ws_test)); suite->add(BOOST_TEST_CASE(&encode_test)); - suite->add(BOOST_TEST_CASE(&glob_to_regexp_test)); + suite->add(BOOST_TEST_CASE(&glob_to_regex_test)); } #endif // BUILD_UNIT_TESTS --- transforms.hh +++ transforms.hh @@ -195,6 +195,11 @@ // line-ending conversion void line_end_convert(std::string const & linesep, std::string const & src, std::string & dst); +// glob/regex handling +void glob_to_regex(std::string const & glob, std::string & regex); +void globs_to_regex(std::set const & globs, std::string & regex); +void regexes_to_regex(std::set const & regexes, std::string & regex); + #endif // __TRANSFORMS_HH__