# # add_file "globish.cc" # # add_file "globish.hh" # # patch "ChangeLog" # from [520f6d02017bcd27e02e240763bfdfbde44d8620] # to [50ec104f400f556a46bae530d9cecd9cdd85140d] # # patch "Makefile.am" # from [8f57b338067b43d1d1ae5de118323a6c67f6be7d] # to [78ca8f5e5c7726ed29bf7ebd7db41367e8af60ae] # # patch "globish.cc" # from [] # to [42062864963a68b6d468f784dace238974658f3c] # # patch "globish.hh" # from [] # to [c43704114fc9a8866638686f3899749824c49179] # # patch "transforms.cc" # from [9630106a94b0a726c54ca4187f36f3311bfa0ece] # to [fe0f025921c7a342635e152d4f7c8b93499be401] # # patch "transforms.hh" # from [230e09c455faf1c7be4a2af700e5da6a6cfc0a4e] # to [446f1de66317ee1a5a4ee76fd0887a686c8264b8] # # patch "unit_tests.cc" # from [230b4227d08bed323a2625f593073427bca9fe9b] # to [99e4b00f7e1071461ac6d79372edda008bb5b182] # # patch "unit_tests.hh" # from [b4b20ac190077a08db547eb1dcd79f40c58b2ebc] # to [340f8c9e449facd6744cf6b926367fbd98bbb062] # --- ChangeLog +++ ChangeLog @@ -1,3 +1,10 @@ +2005-06-28 Nathaniel Smith + + * globish.{cc,hh}: New files. + * Makefile.am (MOST_SOURCES): Add them. + * transforms.{cc,hh}: Remove glob-related stuff. + * unit_tests.{cc,hh}: Call globish unit tests. + 2005-06-27 Nathaniel Smith * transforms.cc (glob_to_regex, globs_to_regex, regexes_to_regex): --- Makefile.am +++ Makefile.am @@ -40,6 +40,7 @@ selectors.cc selectors.hh \ annotate.cc annotate.hh \ restrictions.cc restrictions.hh \ + globish.cc globish.hh \ \ cleanup.hh unit_tests.hh interner.hh \ cycle_detector.hh randomfile.hh adler32.hh quick_alloc.hh \ --- globish.cc +++ globish.cc @@ -0,0 +1,210 @@ +// copyright (C) 2005 Richard Levitte +// copyright (C) 2005 nathaniel smith +// all rights reserved. +// licensed to the public under the terms of the GNU GPL (>= 2) +// see the file COPYING for details + +#include "sanity.hh" +#include "globish.hh" + +// this converts a globish pattern to a regex. The regex should be usable by +// the Boost regex library operating in default mode, i.e., it should be a +// valid ECMAscript regex. +// +// Pattern tranformation: +// +// - Any character except those described below are copied as they are. +// - The backslash (\) escapes the following character. The escaping +// backslash is copied to the regex along with the following character. +// - * is transformed to .* in the regex. +// - ? is transformed to . in the regex. +// - { is transformed to ( in the regex +// - } is transformed to ) in the regex +// - , is transformed to | in the regex, if within { and } +// - ^ is escaped unless it comes directly after an unescaped [. +// - ! is transformed to ^ in the regex if it comes directly after an +// unescaped [. +// - ] directly following an unescaped [ is escaped. +static void +maybe_quote(char c, std::string & re) +{ + if (!(isalnum(c) || c == '_')) + { + re += '\\'; + } + re += c; +} + +static void +checked_globish_to_regex(std::string const & glob, std::string & regex) +{ + int in_braces = 0; // counter for levels if {} + + regex.clear(); + regex.reserve(glob.size() * 2); + + L(F("checked_globish_to_regex: input = '%s'\n") % glob); + + for (std::string::const_iterator i = glob.begin(); i != glob.end(); ++i) + { + char c = *i; + + N(in_braces < 5, F("braces nested too deep in pattern '%s'") % glob); + + switch(c) + { + case '*': + regex += ".*"; + break; + case '?': + regex += '.'; + break; + case '{': + in_braces++; + regex += '('; + break; + case '}': + N(in_braces != 0, + F("trying to end a brace expression in a glob when none is started")); + regex += ')'; + in_braces--; + break; + case ',': + if (in_braces > 0) + regex += '|'; + else + maybe_quote(c, regex); + break; + case '\\': + N(++i != glob.end(), F("pattern '%s' ends with backslash") % glob); + maybe_quote(*i, regex); + break; + default: + maybe_quote(c, regex); + break; + } + } + + N(in_braces == 0, + F("run-away brace expression in pattern '%s'") % glob); + + L(F("checked_globish_to_regex: output = '%s'\n") % regex); +} + +void +combine_and_check_globish(std::set const &patterns, utf8 & pattern) +{ + std::string p; + p += '{'; + bool first = true; + for (std::set::const_iterator i = patterns.begin(); i != patterns.end(); ++i) + { + std::string tmp; + // run for the checking it does + checked_globish_to_regex((*i)(), tmp); + if (!first) + p += ','; + first = false; + p += (*i)(); + } + p += '}'; + pattern = utf8(p); +} + +globish_matcher::globish_matcher(utf8 const & include_pat, utf8 const & exclude_pat) +{ + std::string re; + checked_globish_to_regex(include_pat(), re); + r_inc = re; + checked_globish_to_regex(exclude_pat(), re); + r_exc = re; +} + +bool +globish_matcher::operator()(std::string const & s) +{ + // regex_match may throw a std::runtime_error, if the regex turns out to be + // really pathological + return boost::regex_match(s, r_inc) && !boost::regex_match(s, r_exc); +} + + +#ifdef BUILD_UNIT_TESTS +#include "unit_tests.hh" + +static void +checked_globish_to_regex_test() +{ + std::string pat; + + checked_globish_to_regex("*", pat); + BOOST_CHECK(pat == ".*"); + checked_globish_to_regex("?", pat); + BOOST_CHECK(pat == "."); + checked_globish_to_regex("{a,b,c}d", pat); + BOOST_CHECK(pat == "(a|b|c)d"); + checked_globish_to_regex("foo{a,{b,c},?*}d", pat); + BOOST_CHECK(pat == "foo(a|(b|c)|..*)d"); + checked_globish_to_regex("\\a\\b\\|\\{\\*", pat); + BOOST_CHECK(pat == "ab\\|\\{\\*"); + checked_globish_to_regex(".+$^{}", pat); + BOOST_CHECK(pat == "\\.\\+\\$\\^()"); + checked_globish_to_regex(",", pat); + // we're very conservative about metacharacters, and quote all + // non-alphanumerics, hence the backslash + BOOST_CHECK(pat == "\\,"); + + BOOST_CHECK_THROW(checked_globish_to_regex("foo\\", pat), informative_failure); + BOOST_CHECK_THROW(checked_globish_to_regex("{foo", pat), informative_failure); + BOOST_CHECK_THROW(checked_globish_to_regex("{foo,bar{baz,quux}", pat), informative_failure); + BOOST_CHECK_THROW(checked_globish_to_regex("foo}", pat), informative_failure); + BOOST_CHECK_THROW(checked_globish_to_regex("foo,bar{baz,quux}}", pat), informative_failure); + BOOST_CHECK_THROW(checked_globish_to_regex("{{{{{{{{{{a,b},c},d},e},f},g},h},i},j},k}", pat), informative_failure); +} + +static void +combine_and_check_globish_test() +{ + std::set s; + s.insert(utf8("a")); + s.insert(utf8("b")); + s.insert(utf8("c")); + utf8 combined; + combine_and_check_globish(s, combined); + BOOST_CHECK(combined() == "{a,b,c}"); +} + +static void +globish_matcher_test() +{ + { + globish_matcher m(utf8("{a,b}?*\\*|"), utf8("*c*")); + BOOST_CHECK(m("aq*|")); + BOOST_CHECK(m("bq*|")); + BOOST_CHECK(!m("bc*|")); + BOOST_CHECK(!m("bq|")); + BOOST_CHECK(!m("b*|")); + BOOST_CHECK(!m("")); + } + { + globish_matcher m(utf8("{a,\\\\,b*}"), utf8("*c*")); + BOOST_CHECK(m("a")); + BOOST_CHECK(!m("ab")); + BOOST_CHECK(m("\\")); + BOOST_CHECK(!m("\\\\")); + BOOST_CHECK(m("b")); + BOOST_CHECK(m("bfoobar")); + BOOST_CHECK(!m("bfoobarcfoobar")); + } +} + + +void add_globish_tests(test_suite * suite) +{ + I(suite); + suite->add(BOOST_TEST_CASE(&checked_globish_to_regex_test)); + suite->add(BOOST_TEST_CASE(&combine_and_check_globish_test)); + suite->add(BOOST_TEST_CASE(&globish_matcher_test)); +} + +#endif // BUILD_UNIT_TESTS --- globish.hh +++ globish.hh @@ -0,0 +1,44 @@ +#ifndef __GLOBISH_HH__ +#define __GLOBISH_HH__ + +// copyright (C) 2005 nathaniel smith +// all rights reserved. +// licensed to the public under the terms of the GNU GPL (>= 2) +// see the file COPYING for details + +// a sort of glob-like pattern matcher, for use in specifying branch +// collections for netsync. it is important that it not be too expensive to +// match (as opposed to common regex engines, which can be exponential on +// pathological patterns), because we must match branches against untrusted +// patterns when doing netsync. + +// the syntax is: +// most things - match themselves +// * - match 0 or more characters +// ? - match 0 or 1 characters +// \ - match +// {,,...} - match any of the given items +// so like standard globs, except without [] character sets, and with {} +// alternation. + +#include +#include +#include + +#include "vocab.hh" + +void combine_and_check_globish(std::set const &patterns, utf8 & pattern); + +class globish_matcher +{ +public: + // this may throw an informative_failure if a pattern is invalid + globish_matcher(utf8 const & include_pat, utf8 const & exclude_pat); + // this method may throw a std::runtime_error if the pattern is really + // pathological + bool operator()(std::string const & s); +private: + boost::regex r_inc, r_exc; +}; + +#endif --- transforms.cc +++ transforms.cc @@ -837,169 +837,9 @@ dst += linesep_str; } -// glob_to_regex converts a sh file glob to a regex. The regex should -// be usable by the Boost regex library. -// -// Pattern tranformation: -// -// - Any character except those described below are copied as they are. -// - The backslash (\) escapes the following character. The escaping -// backslash is copied to the regex along with the following character. -// - * is transformed to .* in the regex. -// - ? is transformed to . in the regex. -// - { is transformed to ( in the regex, unless within [ and ]. -// - } is transformed to ) in the regex, unless within [ and ]. -// - , is transformed to | in the regex, if within { and } and not -// within [ and ]. -// - ^ is escaped unless it comes directly after an unescaped [. -// - ! is transformed to ^ in the regex if it comes directly after an -// unescaped [. -// - ] directly following an unescaped [ is escaped. -void -glob_to_regex(std::string const & glob, std::string & regex) -{ - int in_braces = 0; // counter for levels if {} - bool in_brackets = false; // flags if we're inside a [], which - // has higher precedence than {}. - // Also, [ is accepted inside [] unescaped. - bool this_was_opening_bracket = false; - regex.reserve(glob.size() * 2); - - L(F("glob_to_regex: input = '%s'\n") % glob); - - for (string::const_iterator i = glob.begin(); i != glob.end(); ++i) - { - char c = *i; - bool last_was_opening_bracket = this_was_opening_bracket; - this_was_opening_bracket = false; - - // Special case ^ and ! at the beginning of a [] expression. - if (in_brackets && last_was_opening_bracket - && (c == '!' || c == '^')) - { - regex += '^'; - if (++i == glob.end()) - break; - c = *i; - } - - if (c == '\\') - { - regex += c; - if (++i == glob.end()) - break; - regex += *i; - } - else if (in_brackets) - { - switch(c) - { - case ']': - if (!last_was_opening_bracket) - { - in_brackets = false; - regex += c; - break; - } - // Trickling through to the standard character conversion, - // because ] as the first character of a set is regarded as - // a normal character. - default: - if (!(isalnum(c) || c == '_')) - { - regex += '\\'; - } - regex += c; - break; - } - } - else - { - switch(c) - { - case '*': - regex += ".*"; - break; - case '?': - regex += '.'; - break; - case '{': - in_braces++; - regex += '('; - break; - case '}': - N(in_braces != 0, - F("trying to end a brace expression in a glob when none is started")); - regex += ')'; - in_braces--; - break; - case '[': - in_brackets = true; - this_was_opening_bracket = true; - regex += c; - break; - case ',': - if (in_braces > 0) - { - regex += '|'; - break; - } - // Trickling through to default: here, since a comma outside of - // brace notation is just a normal character. - default: - if (!(isalnum(c) || c == '_')) - { - regex += '\\'; - } - regex += c; - break; - } - } - } - - N(!in_brackets, - F("run-away bracket expression in glob")); - N(in_braces == 0, - F("run-away brace expression in glob")); - - L(F("glob_to_regex: output = '%s'\n") % regex); -} - -void -globs_to_regex(std::set const & globs, std::string & regex) -{ - std::set regexes; - for (std::set::const_iterator i = globs.begin(); i != globs.end(); ++i) - { - std::string r; - glob_to_regex(*i, r); - regexes.insert(r); - } - regexes_to_regex(regexes, regex); -} - -void -regexes_to_regex(std::set const & regexes, std::string & regex) -{ - I(regexes.size() > 0); - if (regexes.size() == 1) - { - regex = *regexes.begin(); - return; - } - bool first = true; - for (std::set::const_iterator i = regexes.begin(); i != regexes.end(); ++i) - { - if (!first) - regex += "|"; - regex += "(" + *i + ")"; - } -} - #ifdef BUILD_UNIT_TESTS #include "unit_tests.hh" -#include static void enc_test() @@ -1276,47 +1116,6 @@ check_idna_encoding(); } -static void -check_glob_to_regex(std::string const & glob, std::string const & regex) -{ - std::string translated_regex; - glob_to_regex(glob, translated_regex); - BOOST_CHECK(translated_regex == regex); -} - -static void -check_glob_match(std::string const & glob, std::string const & line, - bool matchp) -{ - std::string regex; - glob_to_regex(glob, regex); - BOOST_CHECK(boost::regex_match(line, boost::regex(regex, boost::regex::extended)) == matchp); -} - -static void glob_to_regex_test() -{ - // our converter is very conserative about metacharacters, and quotes all - // non-alphabetic characters - check_glob_to_regex("abc,v", "abc\\,v"); - check_glob_to_regex("foo[12m,]", "foo[12m\\,]"); - check_glob_to_regex("foo{bar,baz*}", "foo(bar|baz.*)"); - // A full fledged, use all damn features test... - check_glob_to_regex("foo.{bar*,cookie?{haha,hehe[^\\123!,]}}[!]a^b]", - "foo\\.(bar.*|cookie.(haha|hehe[^\\123\\!\\,]))[^\\]a\\^b]"); - - check_glob_match("foobar", "foobar", true); - check_glob_match("foobar", "foobaz", false); - check_glob_match("foo*", "foobar", true); - check_glob_match("foo{bar,baz}", "foobar", true); - check_glob_match("foo{bar,baz}", "foobaz", true); - check_glob_match("foo{bar,baz}", "bazbar", false); - check_glob_match("foo[123]", "foo1", true); - check_glob_match("foo[123]", "foo3", true); - check_glob_match("foo[123]", "foo5", false); - check_glob_match("foo[123]", "foo1a", false); - check_glob_match("foo,bar()", "foo,bar()", false); -} - void add_transform_tests(test_suite * suite) { @@ -1328,7 +1127,6 @@ suite->add(BOOST_TEST_CASE(&join_lines_test)); suite->add(BOOST_TEST_CASE(&strip_ws_test)); suite->add(BOOST_TEST_CASE(&encode_test)); - suite->add(BOOST_TEST_CASE(&glob_to_regex_test)); } #endif // BUILD_UNIT_TESTS --- transforms.hh +++ transforms.hh @@ -195,11 +195,4 @@ // line-ending conversion void line_end_convert(std::string const & linesep, std::string const & src, std::string & dst); -// glob/regex handling -void glob_to_regex(std::string const & glob, std::string & regex); -void globs_to_regex(std::set const & globs, std::string & regex); -void regexes_to_regex(std::set const & regexes, std::string & regex); - - - #endif // __TRANSFORMS_HH__ --- unit_tests.cc +++ unit_tests.cc @@ -69,9 +69,11 @@ if (t.empty() || t.find("netcmd") != t.end()) add_netcmd_tests(suite); - if (t.empty() || t.find("netcmd") != t.end()) + if (t.empty() || t.find("path_component") != t.end()) add_path_component_tests(suite); + if (t.empty() || t.find("globish") != t.end()) + add_globish_tests(suite); // all done, add our clean-shutdown-indicator suite->add(BOOST_TEST_CASE(&clean_shutdown_dummy_test)); --- unit_tests.hh +++ unit_tests.hh @@ -32,5 +32,6 @@ void add_packet_tests(test_suite * suite); void add_netcmd_tests(test_suite * suite); void add_path_component_tests(test_suite * suite); +void add_globish_tests(test_suite * suite); #endif