#
# add_file "globish.cc"
#
# add_file "globish.hh"
#
# patch "ChangeLog"
# from [520f6d02017bcd27e02e240763bfdfbde44d8620]
# to [50ec104f400f556a46bae530d9cecd9cdd85140d]
#
# patch "Makefile.am"
# from [8f57b338067b43d1d1ae5de118323a6c67f6be7d]
# to [78ca8f5e5c7726ed29bf7ebd7db41367e8af60ae]
#
# patch "globish.cc"
# from []
# to [42062864963a68b6d468f784dace238974658f3c]
#
# patch "globish.hh"
# from []
# to [c43704114fc9a8866638686f3899749824c49179]
#
# patch "transforms.cc"
# from [9630106a94b0a726c54ca4187f36f3311bfa0ece]
# to [fe0f025921c7a342635e152d4f7c8b93499be401]
#
# patch "transforms.hh"
# from [230e09c455faf1c7be4a2af700e5da6a6cfc0a4e]
# to [446f1de66317ee1a5a4ee76fd0887a686c8264b8]
#
# patch "unit_tests.cc"
# from [230b4227d08bed323a2625f593073427bca9fe9b]
# to [99e4b00f7e1071461ac6d79372edda008bb5b182]
#
# patch "unit_tests.hh"
# from [b4b20ac190077a08db547eb1dcd79f40c58b2ebc]
# to [340f8c9e449facd6744cf6b926367fbd98bbb062]
#
--- ChangeLog
+++ ChangeLog
@@ -1,3 +1,10 @@
+2005-06-28 Nathaniel Smith
+
+ * globish.{cc,hh}: New files.
+ * Makefile.am (MOST_SOURCES): Add them.
+ * transforms.{cc,hh}: Remove glob-related stuff.
+ * unit_tests.{cc,hh}: Call globish unit tests.
+
2005-06-27 Nathaniel Smith
* transforms.cc (glob_to_regex, globs_to_regex, regexes_to_regex):
--- Makefile.am
+++ Makefile.am
@@ -40,6 +40,7 @@
selectors.cc selectors.hh \
annotate.cc annotate.hh \
restrictions.cc restrictions.hh \
+ globish.cc globish.hh \
\
cleanup.hh unit_tests.hh interner.hh \
cycle_detector.hh randomfile.hh adler32.hh quick_alloc.hh \
--- globish.cc
+++ globish.cc
@@ -0,0 +1,210 @@
+// copyright (C) 2005 Richard Levitte
+// copyright (C) 2005 nathaniel smith
+// all rights reserved.
+// licensed to the public under the terms of the GNU GPL (>= 2)
+// see the file COPYING for details
+
+#include "sanity.hh"
+#include "globish.hh"
+
+// this converts a globish pattern to a regex. The regex should be usable by
+// the Boost regex library operating in default mode, i.e., it should be a
+// valid ECMAscript regex.
+//
+// Pattern tranformation:
+//
+// - Any character except those described below are copied as they are.
+// - The backslash (\) escapes the following character. The escaping
+// backslash is copied to the regex along with the following character.
+// - * is transformed to .* in the regex.
+// - ? is transformed to . in the regex.
+// - { is transformed to ( in the regex
+// - } is transformed to ) in the regex
+// - , is transformed to | in the regex, if within { and }
+// - ^ is escaped unless it comes directly after an unescaped [.
+// - ! is transformed to ^ in the regex if it comes directly after an
+// unescaped [.
+// - ] directly following an unescaped [ is escaped.
+static void
+maybe_quote(char c, std::string & re)
+{
+ if (!(isalnum(c) || c == '_'))
+ {
+ re += '\\';
+ }
+ re += c;
+}
+
+static void
+checked_globish_to_regex(std::string const & glob, std::string & regex)
+{
+ int in_braces = 0; // counter for levels if {}
+
+ regex.clear();
+ regex.reserve(glob.size() * 2);
+
+ L(F("checked_globish_to_regex: input = '%s'\n") % glob);
+
+ for (std::string::const_iterator i = glob.begin(); i != glob.end(); ++i)
+ {
+ char c = *i;
+
+ N(in_braces < 5, F("braces nested too deep in pattern '%s'") % glob);
+
+ switch(c)
+ {
+ case '*':
+ regex += ".*";
+ break;
+ case '?':
+ regex += '.';
+ break;
+ case '{':
+ in_braces++;
+ regex += '(';
+ break;
+ case '}':
+ N(in_braces != 0,
+ F("trying to end a brace expression in a glob when none is started"));
+ regex += ')';
+ in_braces--;
+ break;
+ case ',':
+ if (in_braces > 0)
+ regex += '|';
+ else
+ maybe_quote(c, regex);
+ break;
+ case '\\':
+ N(++i != glob.end(), F("pattern '%s' ends with backslash") % glob);
+ maybe_quote(*i, regex);
+ break;
+ default:
+ maybe_quote(c, regex);
+ break;
+ }
+ }
+
+ N(in_braces == 0,
+ F("run-away brace expression in pattern '%s'") % glob);
+
+ L(F("checked_globish_to_regex: output = '%s'\n") % regex);
+}
+
+void
+combine_and_check_globish(std::set const &patterns, utf8 & pattern)
+{
+ std::string p;
+ p += '{';
+ bool first = true;
+ for (std::set::const_iterator i = patterns.begin(); i != patterns.end(); ++i)
+ {
+ std::string tmp;
+ // run for the checking it does
+ checked_globish_to_regex((*i)(), tmp);
+ if (!first)
+ p += ',';
+ first = false;
+ p += (*i)();
+ }
+ p += '}';
+ pattern = utf8(p);
+}
+
+globish_matcher::globish_matcher(utf8 const & include_pat, utf8 const & exclude_pat)
+{
+ std::string re;
+ checked_globish_to_regex(include_pat(), re);
+ r_inc = re;
+ checked_globish_to_regex(exclude_pat(), re);
+ r_exc = re;
+}
+
+bool
+globish_matcher::operator()(std::string const & s)
+{
+ // regex_match may throw a std::runtime_error, if the regex turns out to be
+ // really pathological
+ return boost::regex_match(s, r_inc) && !boost::regex_match(s, r_exc);
+}
+
+
+#ifdef BUILD_UNIT_TESTS
+#include "unit_tests.hh"
+
+static void
+checked_globish_to_regex_test()
+{
+ std::string pat;
+
+ checked_globish_to_regex("*", pat);
+ BOOST_CHECK(pat == ".*");
+ checked_globish_to_regex("?", pat);
+ BOOST_CHECK(pat == ".");
+ checked_globish_to_regex("{a,b,c}d", pat);
+ BOOST_CHECK(pat == "(a|b|c)d");
+ checked_globish_to_regex("foo{a,{b,c},?*}d", pat);
+ BOOST_CHECK(pat == "foo(a|(b|c)|..*)d");
+ checked_globish_to_regex("\\a\\b\\|\\{\\*", pat);
+ BOOST_CHECK(pat == "ab\\|\\{\\*");
+ checked_globish_to_regex(".+$^{}", pat);
+ BOOST_CHECK(pat == "\\.\\+\\$\\^()");
+ checked_globish_to_regex(",", pat);
+ // we're very conservative about metacharacters, and quote all
+ // non-alphanumerics, hence the backslash
+ BOOST_CHECK(pat == "\\,");
+
+ BOOST_CHECK_THROW(checked_globish_to_regex("foo\\", pat), informative_failure);
+ BOOST_CHECK_THROW(checked_globish_to_regex("{foo", pat), informative_failure);
+ BOOST_CHECK_THROW(checked_globish_to_regex("{foo,bar{baz,quux}", pat), informative_failure);
+ BOOST_CHECK_THROW(checked_globish_to_regex("foo}", pat), informative_failure);
+ BOOST_CHECK_THROW(checked_globish_to_regex("foo,bar{baz,quux}}", pat), informative_failure);
+ BOOST_CHECK_THROW(checked_globish_to_regex("{{{{{{{{{{a,b},c},d},e},f},g},h},i},j},k}", pat), informative_failure);
+}
+
+static void
+combine_and_check_globish_test()
+{
+ std::set s;
+ s.insert(utf8("a"));
+ s.insert(utf8("b"));
+ s.insert(utf8("c"));
+ utf8 combined;
+ combine_and_check_globish(s, combined);
+ BOOST_CHECK(combined() == "{a,b,c}");
+}
+
+static void
+globish_matcher_test()
+{
+ {
+ globish_matcher m(utf8("{a,b}?*\\*|"), utf8("*c*"));
+ BOOST_CHECK(m("aq*|"));
+ BOOST_CHECK(m("bq*|"));
+ BOOST_CHECK(!m("bc*|"));
+ BOOST_CHECK(!m("bq|"));
+ BOOST_CHECK(!m("b*|"));
+ BOOST_CHECK(!m(""));
+ }
+ {
+ globish_matcher m(utf8("{a,\\\\,b*}"), utf8("*c*"));
+ BOOST_CHECK(m("a"));
+ BOOST_CHECK(!m("ab"));
+ BOOST_CHECK(m("\\"));
+ BOOST_CHECK(!m("\\\\"));
+ BOOST_CHECK(m("b"));
+ BOOST_CHECK(m("bfoobar"));
+ BOOST_CHECK(!m("bfoobarcfoobar"));
+ }
+}
+
+
+void add_globish_tests(test_suite * suite)
+{
+ I(suite);
+ suite->add(BOOST_TEST_CASE(&checked_globish_to_regex_test));
+ suite->add(BOOST_TEST_CASE(&combine_and_check_globish_test));
+ suite->add(BOOST_TEST_CASE(&globish_matcher_test));
+}
+
+#endif // BUILD_UNIT_TESTS
--- globish.hh
+++ globish.hh
@@ -0,0 +1,44 @@
+#ifndef __GLOBISH_HH__
+#define __GLOBISH_HH__
+
+// copyright (C) 2005 nathaniel smith
+// all rights reserved.
+// licensed to the public under the terms of the GNU GPL (>= 2)
+// see the file COPYING for details
+
+// a sort of glob-like pattern matcher, for use in specifying branch
+// collections for netsync. it is important that it not be too expensive to
+// match (as opposed to common regex engines, which can be exponential on
+// pathological patterns), because we must match branches against untrusted
+// patterns when doing netsync.
+
+// the syntax is:
+// most things - match themselves
+// * - match 0 or more characters
+// ? - match 0 or 1 characters
+// \ - match
+// {,,...} - match any of the given items
+// so like standard globs, except without [] character sets, and with {}
+// alternation.
+
+#include
+#include
+#include
+
+#include "vocab.hh"
+
+void combine_and_check_globish(std::set const &patterns, utf8 & pattern);
+
+class globish_matcher
+{
+public:
+ // this may throw an informative_failure if a pattern is invalid
+ globish_matcher(utf8 const & include_pat, utf8 const & exclude_pat);
+ // this method may throw a std::runtime_error if the pattern is really
+ // pathological
+ bool operator()(std::string const & s);
+private:
+ boost::regex r_inc, r_exc;
+};
+
+#endif
--- transforms.cc
+++ transforms.cc
@@ -837,169 +837,9 @@
dst += linesep_str;
}
-// glob_to_regex converts a sh file glob to a regex. The regex should
-// be usable by the Boost regex library.
-//
-// Pattern tranformation:
-//
-// - Any character except those described below are copied as they are.
-// - The backslash (\) escapes the following character. The escaping
-// backslash is copied to the regex along with the following character.
-// - * is transformed to .* in the regex.
-// - ? is transformed to . in the regex.
-// - { is transformed to ( in the regex, unless within [ and ].
-// - } is transformed to ) in the regex, unless within [ and ].
-// - , is transformed to | in the regex, if within { and } and not
-// within [ and ].
-// - ^ is escaped unless it comes directly after an unescaped [.
-// - ! is transformed to ^ in the regex if it comes directly after an
-// unescaped [.
-// - ] directly following an unescaped [ is escaped.
-void
-glob_to_regex(std::string const & glob, std::string & regex)
-{
- int in_braces = 0; // counter for levels if {}
- bool in_brackets = false; // flags if we're inside a [], which
- // has higher precedence than {}.
- // Also, [ is accepted inside [] unescaped.
- bool this_was_opening_bracket = false;
- regex.reserve(glob.size() * 2);
-
- L(F("glob_to_regex: input = '%s'\n") % glob);
-
- for (string::const_iterator i = glob.begin(); i != glob.end(); ++i)
- {
- char c = *i;
- bool last_was_opening_bracket = this_was_opening_bracket;
- this_was_opening_bracket = false;
-
- // Special case ^ and ! at the beginning of a [] expression.
- if (in_brackets && last_was_opening_bracket
- && (c == '!' || c == '^'))
- {
- regex += '^';
- if (++i == glob.end())
- break;
- c = *i;
- }
-
- if (c == '\\')
- {
- regex += c;
- if (++i == glob.end())
- break;
- regex += *i;
- }
- else if (in_brackets)
- {
- switch(c)
- {
- case ']':
- if (!last_was_opening_bracket)
- {
- in_brackets = false;
- regex += c;
- break;
- }
- // Trickling through to the standard character conversion,
- // because ] as the first character of a set is regarded as
- // a normal character.
- default:
- if (!(isalnum(c) || c == '_'))
- {
- regex += '\\';
- }
- regex += c;
- break;
- }
- }
- else
- {
- switch(c)
- {
- case '*':
- regex += ".*";
- break;
- case '?':
- regex += '.';
- break;
- case '{':
- in_braces++;
- regex += '(';
- break;
- case '}':
- N(in_braces != 0,
- F("trying to end a brace expression in a glob when none is started"));
- regex += ')';
- in_braces--;
- break;
- case '[':
- in_brackets = true;
- this_was_opening_bracket = true;
- regex += c;
- break;
- case ',':
- if (in_braces > 0)
- {
- regex += '|';
- break;
- }
- // Trickling through to default: here, since a comma outside of
- // brace notation is just a normal character.
- default:
- if (!(isalnum(c) || c == '_'))
- {
- regex += '\\';
- }
- regex += c;
- break;
- }
- }
- }
-
- N(!in_brackets,
- F("run-away bracket expression in glob"));
- N(in_braces == 0,
- F("run-away brace expression in glob"));
-
- L(F("glob_to_regex: output = '%s'\n") % regex);
-}
-
-void
-globs_to_regex(std::set const & globs, std::string & regex)
-{
- std::set regexes;
- for (std::set::const_iterator i = globs.begin(); i != globs.end(); ++i)
- {
- std::string r;
- glob_to_regex(*i, r);
- regexes.insert(r);
- }
- regexes_to_regex(regexes, regex);
-}
-
-void
-regexes_to_regex(std::set const & regexes, std::string & regex)
-{
- I(regexes.size() > 0);
- if (regexes.size() == 1)
- {
- regex = *regexes.begin();
- return;
- }
- bool first = true;
- for (std::set::const_iterator i = regexes.begin(); i != regexes.end(); ++i)
- {
- if (!first)
- regex += "|";
- regex += "(" + *i + ")";
- }
-}
-
#ifdef BUILD_UNIT_TESTS
#include "unit_tests.hh"
-#include
static void
enc_test()
@@ -1276,47 +1116,6 @@
check_idna_encoding();
}
-static void
-check_glob_to_regex(std::string const & glob, std::string const & regex)
-{
- std::string translated_regex;
- glob_to_regex(glob, translated_regex);
- BOOST_CHECK(translated_regex == regex);
-}
-
-static void
-check_glob_match(std::string const & glob, std::string const & line,
- bool matchp)
-{
- std::string regex;
- glob_to_regex(glob, regex);
- BOOST_CHECK(boost::regex_match(line, boost::regex(regex, boost::regex::extended)) == matchp);
-}
-
-static void glob_to_regex_test()
-{
- // our converter is very conserative about metacharacters, and quotes all
- // non-alphabetic characters
- check_glob_to_regex("abc,v", "abc\\,v");
- check_glob_to_regex("foo[12m,]", "foo[12m\\,]");
- check_glob_to_regex("foo{bar,baz*}", "foo(bar|baz.*)");
- // A full fledged, use all damn features test...
- check_glob_to_regex("foo.{bar*,cookie?{haha,hehe[^\\123!,]}}[!]a^b]",
- "foo\\.(bar.*|cookie.(haha|hehe[^\\123\\!\\,]))[^\\]a\\^b]");
-
- check_glob_match("foobar", "foobar", true);
- check_glob_match("foobar", "foobaz", false);
- check_glob_match("foo*", "foobar", true);
- check_glob_match("foo{bar,baz}", "foobar", true);
- check_glob_match("foo{bar,baz}", "foobaz", true);
- check_glob_match("foo{bar,baz}", "bazbar", false);
- check_glob_match("foo[123]", "foo1", true);
- check_glob_match("foo[123]", "foo3", true);
- check_glob_match("foo[123]", "foo5", false);
- check_glob_match("foo[123]", "foo1a", false);
- check_glob_match("foo,bar()", "foo,bar()", false);
-}
-
void
add_transform_tests(test_suite * suite)
{
@@ -1328,7 +1127,6 @@
suite->add(BOOST_TEST_CASE(&join_lines_test));
suite->add(BOOST_TEST_CASE(&strip_ws_test));
suite->add(BOOST_TEST_CASE(&encode_test));
- suite->add(BOOST_TEST_CASE(&glob_to_regex_test));
}
#endif // BUILD_UNIT_TESTS
--- transforms.hh
+++ transforms.hh
@@ -195,11 +195,4 @@
// line-ending conversion
void line_end_convert(std::string const & linesep, std::string const & src, std::string & dst);
-// glob/regex handling
-void glob_to_regex(std::string const & glob, std::string & regex);
-void globs_to_regex(std::set const & globs, std::string & regex);
-void regexes_to_regex(std::set const & regexes, std::string & regex);
-
-
-
#endif // __TRANSFORMS_HH__
--- unit_tests.cc
+++ unit_tests.cc
@@ -69,9 +69,11 @@
if (t.empty() || t.find("netcmd") != t.end())
add_netcmd_tests(suite);
- if (t.empty() || t.find("netcmd") != t.end())
+ if (t.empty() || t.find("path_component") != t.end())
add_path_component_tests(suite);
+ if (t.empty() || t.find("globish") != t.end())
+ add_globish_tests(suite);
// all done, add our clean-shutdown-indicator
suite->add(BOOST_TEST_CASE(&clean_shutdown_dummy_test));
--- unit_tests.hh
+++ unit_tests.hh
@@ -32,5 +32,6 @@
void add_packet_tests(test_suite * suite);
void add_netcmd_tests(test_suite * suite);
void add_path_component_tests(test_suite * suite);
+void add_globish_tests(test_suite * suite);
#endif