emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

feature/tree-sitter e171ef933f 10/26: Support compiled queries in treesi


From: Yuan Fu
Subject: feature/tree-sitter e171ef933f 10/26: Support compiled queries in treesit-query-capture
Date: Thu, 16 Jun 2022 14:53:45 -0400 (EDT)

branch: feature/tree-sitter
commit e171ef933feefd67d7f1b3b3693ce730111660e9
Author: Yuan Fu <casouri@gmail.com>
Commit: Yuan Fu <casouri@gmail.com>

    Support compiled queries in treesit-query-capture
    
    Last commit added this new type, this commit adds functionalities.
    treesit.el only has documentation changes.
    
    * lisp/treesit.el (treesit-query-in, treesit-font-lock-settings,
    treesit-defun-query): Update docstring.
    * src/treesit.c (make_ts_query): New function.
    (Ftreesit_query_compile): New function.
    (Ftreesit_query_capture): Remove code that creates a query object and
    instead either use make_ts_query or use the give compiled query.  Free
    the query object conditonally.
    (syms_of_treesit): New symbol.
---
 lisp/treesit.el |  21 ++++++---
 src/treesit.c   | 143 ++++++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 134 insertions(+), 30 deletions(-)

diff --git a/lisp/treesit.el b/lisp/treesit.el
index 78dfcae7e5..09f750f9d5 100644
--- a/lisp/treesit.el
+++ b/lisp/treesit.el
@@ -366,9 +366,12 @@ language symbol, use the root node of the first parser for 
that
 language; if a parser, use the root node of that parser; if a
 node, use that node.
 
-QUERY is either a string query or a sexp query.  See Info node
-`(elisp)Pattern Matching' for how to write a query pattern in either
-string or s-expression form.
+QUERY is either a string query, a sexp query, or a compiled
+query.  See Info node `(elisp)Pattern Matching' for how to write
+a query in either string or s-expression form.  When using
+repeatedly, a compiled query is much faster than a string or sexp
+one, so it is recommend to compile your queries if it will be
+used over and over.
 
 BEG and END, if _both_ non-nil, specifies the range in which the query
 is executed.
@@ -442,8 +445,12 @@ Each SETTING controls one parser (often of different 
language).
 LANGUAGE is the language symbol.  See Info node `(elisp)Language
 Definitions'.
 
-QUERY is either a string query or a sexp query.
-See Info node `(elisp)Pattern Matching' for writing queries.
+QUERY is either a string query, a sexp query, or a compiled
+query.  See Info node `(elisp)Pattern Matching' for how to write
+a query in either string or s-expression form.  When using
+repeatedly, a compiled query is much faster than a string or sexp
+one, so it is recommend to compile your queries if it will be
+used over and over.
 
 Capture names in QUERY should be face names like
 `font-lock-keyword-face'.  The captured node will be fontified
@@ -923,7 +930,9 @@ search failed."
 (defvar-local treesit-defun-query nil
   "A tree-sitter query that matches function/class definitions.
 Capture names don't matter.  This variable is used by navigation
-functions like `treesit-beginning-of-defun'.")
+functions like `treesit-beginning-of-defun'.
+
+It is recommended to use compiled query for this variable.")
 
 (defun treesit-beginning-of-defun (&optional arg)
   "Move backward to the beginning of a defun.
diff --git a/src/treesit.c b/src/treesit.c
index 3c8edc9213..5b344a2ea1 100644
--- a/src/treesit.c
+++ b/src/treesit.c
@@ -88,6 +88,28 @@ along with GNU Emacs.  If not, see 
<https://www.gnu.org/licenses/>.  */
      parser of buffer changes.
    - lisp/emacs-lisp/cl-preloaded.el & data.c & lisp.h for parser and
      node type.
+
+   Because it is pretty slow (comparing to other tree-sitter
+   operations) for tree-sitter to parse the query and produce a query
+   object, it is very wasteful to reparse the query every time
+   treesit-query-capture is called, and it completely kills the
+   performance of querying in a loop for a moderate amount of times
+   (hundreds of queries takes seconds rather than milliseconds to
+   complete).  Therefore we want some caching. We can either use a
+   search.c style transparent caching, or simply expose a new type,
+   compiled-ts-query and let the user to manually compile AOT.  I
+   believe AOT compiling gives users more control, makes the
+   performance stable and easy to understand (compiled -> fast,
+   uncompiled -> slow), and avoids some edge cases transparent cache
+   could have (see below).  So I implemented the AOT compilation.
+
+   Problems a transparent cache could have: Suppose we store cache
+   entries in a fixed-length linked-list, and compare with EQ.  1)
+   One-off query could kick out useful cache.  2) if the user messed
+   up and the query doesn't EQ to the cache anymore, the performance
+   mysteriously drops.  3) what if a user uses so many stuff that the
+   default cache size (20) is not enough and we end up thrashing?
+   These are all imagined scenarios but they are not impossible :-)
  */
 
 /*** Initialization */
@@ -536,6 +558,31 @@ make_ts_node (Lisp_Object parser, TSNode node)
   return make_lisp_ptr (lisp_node, Lisp_Vectorlike);
 }
 
+/* Make a compiled query struct.  Return NULL if error occurs.  QUERY
+   has to be either a cons or a string.  */
+static struct Lisp_TS_Query *
+make_ts_query (Lisp_Object query, const TSLanguage *language,
+              uint32_t *error_offset, TSQueryError *error_type)
+{
+  if (CONSP (query))
+    query = Ftreesit_expand_query (query);
+  char *source = SSDATA (query);
+
+  TSQuery *ts_query = ts_query_new (language, source, strlen (source),
+                                   error_offset, error_type);
+  TSQueryCursor *ts_cursor = ts_query_cursor_new ();
+
+  if (ts_query == NULL)
+    return NULL;
+
+  struct Lisp_TS_Query *lisp_query
+    = ALLOCATE_PLAIN_PSEUDOVECTOR (struct Lisp_TS_Query,
+                                  PVEC_TS_COMPILED_QUERY);
+  lisp_query->query = ts_query;
+  lisp_query->cursor = ts_cursor;
+  return lisp_query;
+}
+
 DEFUN ("treesit-parser-p",
        Ftreesit_parser_p, Streesit_parser_p, 1, 1, 0,
        doc: /* Return t if OBJECT is a tree-sitter parser.  */)
@@ -1467,6 +1514,39 @@ ts_eval_predicates
   return pass;
 }
 
+DEFUN ("treesit-query-compile",
+       Ftreesit_query_compile,
+       Streesit_query_compile, 2, 2, 0,
+       doc: /* Compile QUERY to a compiled query.
+
+Querying a compiled query is much faster than an uncompiled one.
+LANGUAGE is the language this query is for.
+
+Signals treesit-query-error if QUERY is malformed or something
+else goes wrong.  */)
+  (Lisp_Object language, Lisp_Object query)
+{
+  if (!Ftreesit_query_p (query))
+    wrong_type_argument (Qtreesit_query_p, query);
+  CHECK_SYMBOL (language);
+  if (TS_COMPILED_QUERY_P (query))
+    return query;
+
+  TSLanguage *ts_lang = ts_load_language (language, true);
+  uint32_t error_offset;
+  TSQueryError error_type;
+
+  struct Lisp_TS_Query *lisp_query
+    = make_ts_query (query, ts_lang, &error_offset, &error_type);
+
+  if (lisp_query == NULL)
+    xsignal2 (Qtreesit_query_error,
+             build_string (ts_query_error_to_string (error_type)),
+             make_fixnum (error_offset + 1));
+
+  return make_lisp_ptr (lisp_query, Lisp_Vectorlike);
+}
+
 DEFUN ("treesit-query-capture",
        Ftreesit_query_capture,
        Streesit_query_capture, 2, 4, 0,
@@ -1475,9 +1555,11 @@ DEFUN ("treesit-query-capture",
 Return a list of (CAPTURE_NAME . NODE).  CAPTURE_NAME is the name
 assigned to the node in PATTERN.  NODE is the captured node.
 
-QUERY is either a string query or a sexp query.  See Info node
-`(elisp)Pattern Matching' for how to write a query in either string or
-s-expression form.
+QUERY is either a string query, a sexp query, or a compiled query.
+See Info node `(elisp)Pattern Matching' for how to write a query in
+either string or s-expression form.  When using repeatedly, a compiled
+query is much faster than a string or sexp one, so it is recommend to
+compile your queries if it will be used over and over.
 
 BEG and END, if both non-nil, specifies the range in which the query
 is executed.
@@ -1493,10 +1575,9 @@ else goes wrong.  */)
   if (!NILP (end))
     CHECK_INTEGER (end);
 
-  if (CONSP (query))
-    query = Ftreesit_expand_query (query);
-  else
-    CHECK_STRING (query);
+  if (!(TS_COMPILED_QUERY_P (query)
+       || CONSP (query) || STRINGP (query)))
+    wrong_type_argument (Qtreesit_query_p, query);
 
   /* Extract C values from Lisp objects.  */
   TSNode ts_node = XTS_NODE (node)->node;
@@ -1505,25 +1586,34 @@ else goes wrong.  */)
     XTS_PARSER (XTS_NODE (node)->parser)->visible_beg;
   const TSLanguage *lang = ts_parser_language
     (XTS_PARSER (lisp_parser)->parser);
-  char *source = SSDATA (query);
 
   /* Initialize query objects, and execute query.  */
-  uint32_t error_offset;
-  TSQueryError error_type;
-  /* TODO: We could cache the query object, so that repeatedly
-     querying with the same query can reuse the query object.  It also
-     saves us from expanding the sexp query into a string.  I don't
-     know how much time that could save though.  */
-  TSQuery *ts_query = ts_query_new (lang, source, strlen (source),
-                                   &error_offset, &error_type);
-  TSQueryCursor *cursor = ts_query_cursor_new ();
-
-  if (ts_query == NULL)
+  struct Lisp_TS_Query *lisp_query;
+  /* If the lisp query is temporary, we need to free it after use. */
+  bool lisp_query_temp_p;
+  if (TS_COMPILED_QUERY_P (query))
     {
-      xsignal2 (Qtreesit_query_error,
-               build_string (ts_query_error_to_string (error_type)),
-               make_fixnum (error_offset + 1));
+      lisp_query_temp_p = false;
+      lisp_query = XTS_COMPILED_QUERY (query);
     }
+  else
+    {
+      lisp_query_temp_p = true;
+      uint32_t error_offset;
+      TSQueryError error_type;
+      lisp_query = make_ts_query (query, lang,
+                                 &error_offset, &error_type);
+      if (lisp_query == NULL)
+       {
+         xsignal2 (Qtreesit_query_error,
+                   build_string
+                   (ts_query_error_to_string (error_type)),
+                   make_fixnum (error_offset + 1));
+       }
+    }
+  TSQuery *ts_query = lisp_query->query;
+  TSQueryCursor *cursor = lisp_query->cursor;
+
   if (!NILP (beg) && !NILP (end))
     {
       EMACS_INT beg_byte = XFIXNUM (beg);
@@ -1578,8 +1668,11 @@ else goes wrong.  */)
          result = prev_result;
        }
     }
-  ts_query_delete (ts_query);
-  ts_query_cursor_delete (cursor);
+  if (lisp_query_temp_p)
+    {
+      ts_query_delete (ts_query);
+      ts_query_cursor_delete (cursor);
+    }
   return Fnreverse (result);
 }
 
@@ -1592,6 +1685,7 @@ syms_of_treesit (void)
   DEFSYM (Qtreesit_parser_p, "treesit-parser-p");
   DEFSYM (Qtreesit_node_p, "treesit-node-p");
   DEFSYM (Qtreesit_compiled_query_p, "treesit-compiled-query-p");
+  DEFSYM (Qtreesit_query_p, "treesit-query-p");
   DEFSYM (Qnamed, "named");
   DEFSYM (Qmissing, "missing");
   DEFSYM (Qextra, "extra");
@@ -1705,5 +1799,6 @@ dynamic libraries, in that order.  */);
 
   defsubr (&Streesit_expand_pattern);
   defsubr (&Streesit_expand_query);
+  defsubr (&Streesit_query_compile);
   defsubr (&Streesit_query_capture);
 }



reply via email to

[Prev in Thread] Current Thread [Next in Thread]