[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[elpa] externals/elisa e877f8f5c7 44/98: First implementation for web search
From: |
ELPA Syncer |
Subject: |
[elpa] externals/elisa e877f8f5c7 44/98: First implementation for web search |
Date: |
Wed, 17 Jul 2024 18:58:03 -0400 (EDT) |
branch: externals/elisa
commit e877f8f5c759fbcd9df5b63f4b63e4d853d9077c
Author: Sergey Kostyaev <kostyaev.sergey2@wb.ru>
Commit: Sergey Kostyaev <kostyaev.sergey2@wb.ru>
First implementation for web search
---
elisa.el | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 143 insertions(+), 1 deletion(-)
diff --git a/elisa.el b/elisa.el
index d91582b4db..da9e7089c5 100644
--- a/elisa.el
+++ b/elisa.el
@@ -135,6 +135,17 @@ prompt only. User prompt:
:group 'tools
:type 'function)
+(defcustom elisa-web-search-function 'elisa-search-duckduckgo
+ "Function used to search the web.
+It is called with one argument, the prompt, and should return a list of URLs."
+ :group 'tools
+ :type 'function)
+
+(defcustom elisa-web-pages-limit 10
+ "Limit of web pages to parse during web search.
+`elisa-web-search' compares this value against the number of pages it
+has already collected and skips the remaining URLs once the limit is hit."
+ :group 'tools
+ :type 'integer)
+
(defun elisa-sqlite-vss-download-url ()
"Generate sqlite vss download url based on current system."
(cond ((string-equal system-type "darwin")
@@ -269,6 +280,12 @@ FOREIGN KEY(collection_id) REFERENCES collections(rowid)
(string-replace "'" "''")
(string-replace "\\" "\\\\")))
+(defun elisa-sqlite-format-int-list (ids)
+  "Render IDS, a list of integers, as a parenthesized sqlite list.
+For example, the list (1 2 3) becomes the string \"(1, 2, 3)\"."
+  (let ((rendered (mapconcat (lambda (n) (format "%d" n)) ids ", ")))
+    (concat "(" rendered ")")))
+
(defun elisa-parse-info-manual (name)
"Parse info manual with NAME and save index to database."
(with-temp-buffer
@@ -404,7 +421,10 @@ than T, it will be packed into single semantic chunk."
(push current result)
(cl-remove-if
#'string-empty-p
- (mapcar #'string-trim
+ (mapcar (lambda (s)
+ (if s
+ (string-trim s)
+ ""))
(nreverse result)))))
(defun elisa-search-duckduckgo (prompt)
@@ -477,6 +497,128 @@ You can customize `elisa-searxng-url' to use non local
instance."
buffer-name t)
buffer-name)))
+(defun elisa-fts-query (prompt)
+  "Build an fts match expression from the words of PROMPT.
+PROMPT is trimmed and lowercased, hyphens become spaces, every other
+run of characters that are neither alphanumeric nor a space is dropped,
+and the remaining whitespace-separated words are joined with \" OR \"."
+  (let* ((lowered (downcase (string-trim prompt)))
+         (dehyphenated (string-replace "-" " " lowered))
+         (cleaned (replace-regexp-in-string "[^[:alnum:] ]+" "" dehyphenated))
+         (words (string-trim cleaned)))
+    (replace-regexp-in-string "[[:space:]]+" " OR " words)))
+
+(defun elisa-web-search (prompt)
+  "Search the web for PROMPT and start a chat that uses the results.
+
+A collection named after PROMPT is inserted into `elisa-db' unless it
+already exists.  The URLs returned by `elisa-web-search-function' are
+split into chunks with `elisa-extact-webpage-chunks'; every chunk that
+is not yet stored for its URL is saved together with its embedding and
+a full-text index entry.  At most `elisa-web-pages-limit' URLs are
+processed.  The collection is then queried with a hybrid (vector plus
+keyword) search limited to `elisa-limit' rows, each row found is added
+to the ellama context as a webpage quote, and PROMPT is passed to
+`ellama-chat' with `elisa-chat-provider'."
+  (interactive "sAsk elisa with web search: ")
+  (message "searching the web")
+  (sqlite-execute
+   elisa-db
+   (format
+    "insert into collections (name) values ('%s') on conflict do nothing;"
+    (elisa-sqlite-escape prompt)))
+  (let* ((kind-id (caar (sqlite-select
+                         elisa-db "select rowid from kinds where name = 'web';")))
+         (collection-id (caar (sqlite-select
+                               elisa-db
+                               (format
+                                "select rowid from collections where name = '%s';"
+                                (elisa-sqlite-escape prompt)))))
+         (urls (funcall elisa-web-search-function prompt))
+         (collected-pages 0))
+    (mapc
+     (lambda (url)
+       ;; The counter starts at 0, so the comparison must be strict;
+       ;; `<=' would collect one page more than the configured limit.
+       (when (< collected-pages elisa-web-pages-limit)
+         (message "collecting data from %s" url)
+         ;; URLs can contain quote characters, so escape the value before
+         ;; it is spliced into SQL, like the prompt and the chunk text.
+         (let ((path (elisa-sqlite-escape url)))
+           (mapc
+            (lambda (chunk)
+              (let* ((hash (secure-hash 'sha256 chunk))
+                     (select-rowid
+                      (format "select rowid from data where kind_id = %s and collection_id = %s and path = '%s' and hash = '%s';"
+                              kind-id collection-id path hash)))
+                ;; Chunks already stored for this page are skipped before
+                ;; the embedding is requested, so no provider call is
+                ;; wasted on them.
+                (unless (caar (sqlite-select elisa-db select-rowid))
+                  (let ((embedding (llm-embedding elisa-embeddings-provider chunk)))
+                    (sqlite-execute
+                     elisa-db
+                     (format
+                      "insert into data(kind_id, collection_id, path, hash, data) values (%s, %s, '%s', '%s', '%s');"
+                      kind-id collection-id path hash (elisa-sqlite-escape chunk)))
+                    ;; Read back the rowid of the row just inserted; the
+                    ;; embedding and fts rows are keyed by it.
+                    (when-let ((rowid (caar (sqlite-select elisa-db select-rowid))))
+                      (sqlite-execute
+                       elisa-db
+                       (format "insert into data_embeddings(rowid, embedding) values (%s, %s);"
+                               rowid (elisa-vector-to-sqlite embedding)))
+                      (sqlite-execute
+                       elisa-db
+                       (format "insert into data_fts(rowid, data) values (%s, '%s');"
+                               rowid (elisa-sqlite-escape chunk))))))))
+            (elisa-extact-webpage-chunks url)))
+         (cl-incf collected-pages)))
+     urls)
+    (message "searching in collected data")
+    (let* ((rowids (mapcar
+                    #'car
+                    (sqlite-select
+                     elisa-db
+                     (format "select rowid from data where collection_id = %s;"
+                             collection-id))))
+           ;; Both partial searches are restricted to this collection's
+           ;; rowids; their ranks are merged as 1 / (60 + rank).
+           (query (format "WITH
+vector_search AS (
+ SELECT rowid, distance
+ FROM data_embeddings
+ WHERE vss_search(embedding, %s)
+ ORDER BY distance ASC
+ LIMIT 40
+),
+semantic_search AS (
+ SELECT rowid, RANK () OVER (ORDER BY distance ASC) AS rank
+ FROM vector_search
+ WHERE rowid IN %s
+ ORDER BY distance ASC
+ LIMIT 20
+),
+keyword_search AS (
+ SELECT rowid, RANK () OVER (ORDER BY bm25(data_fts) ASC) AS rank
+ FROM data_fts
+ WHERE rowid in %s and data_fts MATCH '%s'
+ ORDER BY bm25(data_fts) ASC
+ LIMIT 20
+),
+hybrid_search AS (
+SELECT
+ COALESCE(semantic_search.rowid, keyword_search.rowid) AS rowid,
+ COALESCE(1.0 / (60 + semantic_search.rank), 0.0) +
+ COALESCE(1.0 / (60 + keyword_search.rank), 0.0) AS score
+FROM semantic_search
+FULL OUTER JOIN keyword_search ON semantic_search.rowid = keyword_search.rowid
+ORDER BY score DESC
+LIMIT %d
+)
+SELECT
+ hybrid_search.rowid,
+ d.path AS url,
+ d.data AS text
+FROM hybrid_search
+LEFT JOIN data d ON hybrid_search.rowid = d.rowid
+;
+"
+                          (elisa-vector-to-sqlite
+                           (llm-embedding elisa-embeddings-provider prompt))
+                          (elisa-sqlite-format-int-list rowids)
+                          (elisa-sqlite-format-int-list rowids)
+                          (elisa-fts-query prompt)
+                          elisa-limit)))
+      ;; (message "query:\n%s" query)
+      (mapc
+       (lambda (row)
+         ;; Each row is (rowid url text); rows missing either the url or
+         ;; the text are skipped.
+         (when-let ((url (cl-second row))
+                    (text (cl-third row)))
+           (ellama-context-add-webpage-quote-noninteractive url url text)))
+       (sqlite-select elisa-db query))))
+  (ellama-chat prompt nil :provider elisa-chat-provider))
+
(defun elisa-get-builtin-manuals ()
"Get builtin manual names list."
(mapcar
- [elpa] externals/elisa 5d62cf3803 24/98: Reopen db after download sqlite extensions, (continued)
- [elpa] externals/elisa 5d62cf3803 24/98: Reopen db after download sqlite extensions, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 9a6dfa678b 25/98: Add info about downloading sqlite extensions into docs, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 7f5a5fff5d 26/98: Fix CI warnings, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 48d96a9716 27/98: Merge pull request #5 from s-kostyaev/fix-load-file, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 8794e14d75 29/98: Improve built-in manuals directory location method, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa c03baded1e 32/98: Bump version, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa a99ed0b234 33/98: Add semantic splitting, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 90a76fc7c2 37/98: Add webpage semantic chunks extraction function, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa d90df5889d 38/98: Add function that return buffer with url content, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 0f94c23a5d 40/98: Add more sqlite tables, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa e877f8f5c7 44/98: First implementation for web search,
ELPA Syncer <=
- [elpa] externals/elisa 5ca66e9f0d 47/98: Fix custom variables, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 273a1d492d 50/98: Add reranker to RAG pipeline, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa ade7ac0af9 52/98: Update info manual parsing, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 8a2c92dc34 54/98: Fix parsing info manuals, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa cecc5cb13f 55/98: Make sync parsing interactive, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 4cad3085fd 57/98: Use more async calls to prevent emacs from blocking, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa ad130b564f 60/98: Add parse file function, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa b419fb2cf2 61/98: Add code for parsing directory as an elisa collection, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa ef06534f46 62/98: Implement incremental parsing, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 0e32d7bb5c 63/98: Add async directory parsing, ELPA Syncer, 2024/07/17