[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[elpa] externals/elisa 9c7004e15c 77/98: Add functionality to parse web pages and add them to collections
From: ELPA Syncer
Subject: [elpa] externals/elisa 9c7004e15c 77/98: Add functionality to parse web pages and add them to collections
Date: Wed, 17 Jul 2024 18:58:06 -0400 (EDT)
branch: externals/elisa
commit 9c7004e15c7f0dc14d62ff1808eadc1a84bc822a
Author: Sergey Kostyaev <kostyaev.sergey2@wb.ru>
Commit: Sergey Kostyaev <kostyaev.sergey2@wb.ru>
Add functionality to parse web pages and add them to collections
This commit adds a new function `elisa--parse-web-page` that parses a
given URL, extracts semantic chunks using a third-party library (e.g.,
BeautifulSoup), calculates a SHA256 hash for each chunk, generates an
embedding using an LLM provider like OpenAI's API, and inserts the
data into the SQLite database associated with the specified
collection.
It also updates `elisa--web-search` to delegate to this new function,
and the new `elisa-add-webpage-to-collection` command uses it to add
web pages to collections.
Furthermore, it adds functions `elisa-create-empty-collection`,
`elisa-add-file-to-collection`, and `elisa-add-webpage-to-collection`
for creating empty collections and adding files or web pages to
existing ones.
---
elisa.el | 122 ++++++++++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 89 insertions(+), 33 deletions(-)
diff --git a/elisa.el b/elisa.el
index 1182c5f343..c87af292a1 100644
--- a/elisa.el
+++ b/elisa.el
@@ -973,6 +973,39 @@ You can customize `elisa-searxng-url' to use non local
instance."
elisa-reranker-limit
elisa-limit))
+(defun elisa--parse-web-page (collection-id url)
+ "Parse URL into collection with COLLECTION-ID."
+ (let ((kind-id (caar (sqlite-select
+ elisa-db "select rowid from kinds where name =
'web';"))))
+ (message "collecting data from %s" url)
+ (mapc
+ (lambda (chunk)
+ (let* ((hash (secure-hash 'sha256 chunk))
+ (embedding (llm-embedding elisa-embeddings-provider chunk))
+ (rowid
+ (if-let ((rowid (caar (sqlite-select
+ elisa-db
+ (format "select rowid from data where
kind_id = %s and collection_id = %s and path = '%s' and hash = '%s';" kind-id
collection-id url hash)))))
+ nil
+ (sqlite-execute
+ elisa-db
+ (format
+ "insert into data(kind_id, collection_id, path, hash, data)
values (%s, %s, '%s', '%s', '%s');"
+ kind-id collection-id url hash (elisa-sqlite-escape chunk)))
+ (caar (sqlite-select
+ elisa-db
+ (format "select rowid from data where kind_id = %s and
collection_id = %s and path = '%s' and hash = '%s';" kind-id collection-id url
hash))))))
+ (when rowid
+ (sqlite-execute
+ elisa-db
+ (format "insert into data_embeddings(rowid, embedding) values (%s,
%s);"
+ rowid (elisa-vector-to-sqlite embedding)))
+ (sqlite-execute
+ elisa-db
+ (format "insert into data_fts(rowid, data) values (%s, '%s');"
+ rowid (elisa-sqlite-escape chunk))))))
+ (elisa-extact-webpage-chunks url))))
+
(defun elisa--web-search (prompt)
"Search the web for PROMPT.
Return sqlite query that extract data for adding to context."
@@ -981,9 +1014,7 @@ Return sqlite query that extract data for adding to
context."
(format
"insert into collections (name) values ('%s') on conflict do nothing;"
(elisa-sqlite-escape prompt)))
- (let* ((kind-id (caar (sqlite-select
- elisa-db "select rowid from kinds where name =
'web';")))
- (collection-id (caar (sqlite-select
+ (let* ((collection-id (caar (sqlite-select
elisa-db
(format
"select rowid from collections where name =
'%s';"
@@ -992,34 +1023,7 @@ Return sqlite query that extract data for adding to
context."
(collected-pages 0))
(mapc (lambda (url)
(when (<= collected-pages elisa-web-pages-limit)
- (message "collecting data from %s" url)
- (mapc
- (lambda (chunk)
- (let* ((hash (secure-hash 'sha256 chunk))
- (embedding (llm-embedding elisa-embeddings-provider
chunk))
- (rowid
- (if-let ((rowid (caar (sqlite-select
- elisa-db
- (format "select rowid from data
where kind_id = %s and collection_id = %s and path = '%s' and hash = '%s';"
kind-id collection-id url hash)))))
- nil
- (sqlite-execute
- elisa-db
- (format
- "insert into data(kind_id, collection_id, path,
hash, data) values (%s, %s, '%s', '%s', '%s');"
- kind-id collection-id url hash
(elisa-sqlite-escape chunk)))
- (caar (sqlite-select
- elisa-db
- (format "select rowid from data where kind_id
= %s and collection_id = %s and path = '%s' and hash = '%s';" kind-id
collection-id url hash))))))
- (when rowid
- (sqlite-execute
- elisa-db
- (format "insert into data_embeddings(rowid, embedding)
values (%s, %s);"
- rowid (elisa-vector-to-sqlite embedding)))
- (sqlite-execute
- elisa-db
- (format "insert into data_fts(rowid, data) values (%s,
'%s');"
- rowid (elisa-sqlite-escape chunk))))))
- (elisa-extact-webpage-chunks url))
+ (elisa--parse-web-page collection-id url)
(cl-incf collected-pages)))
urls)))
@@ -1185,8 +1189,8 @@ Call ON-DONE callback with result as an argument after
FUNC evaluation done."
(elisa--reopen-db)
(when on-done
(funcall on-done res))
- (message "%.40s done."
- (or command func))))))
+ (message "%s done."
+ (or command "async elisa processing"))))))
(defun elisa-extact-webpage-chunks (url)
"Extract semantic chunks for webpage fetched from URL."
@@ -1264,6 +1268,58 @@ It does nothing if buffer file not inside one of
existing collections."
"select name from collections;")))))))
(push col elisa-enabled-collections)))
+;;;###autoload
+(defun elisa-create-empty-collection (&optional collection)
+ "Create new empty COLLECTION."
+ (interactive "sNew collection name: ")
+ (save-window-excursion
+ (sqlite-execute
+ elisa-db
+ (format
+ "insert into collections (name) values ('%s') on conflict do nothing;"
+ (elisa-sqlite-escape collection)))))
+
+;;;###autoload
+(defun elisa-add-file-to-collection (file collection)
+ "Add FILE to COLLECTION."
+ (interactive
+ (list
+ (read-file-name "File: ")
+ (completing-read
+ "Enable collection: "
+ (flatten-tree
+ (sqlite-select
+ elisa-db
+ "select name from collections;")))))
+ (let ((collection-id (caar (sqlite-select
+ elisa-db
+ (format
+ "select rowid from collections where name =
'%s';"
+ (elisa-sqlite-escape collection))))))
+ (elisa--async-do (lambda () (elisa-parse-file collection-id file)))))
+
+;;;###autoload
+(defun elisa-add-webpage-to-collection (url collection)
+ "Add webpage by URL to COLLECTION."
+ (interactive
+ (list
+ (if-let ((url (or (and (fboundp 'thing-at-point) (thing-at-point 'url))
+ (shr-url-at-point nil))))
+ url
+ (read-string "Enter URL you want to summarize: "))
+ (completing-read
+ "Enable collection: "
+ (flatten-tree
+ (sqlite-select
+ elisa-db
+ "select name from collections;")))))
+ (let ((collection-id (caar (sqlite-select
+ elisa-db
+ (format
+ "select rowid from collections where name =
'%s';"
+ (elisa-sqlite-escape collection))))))
+ (elisa--async-do (lambda () (elisa--parse-web-page collection-id url)))))
+
;;;###autoload
(defun elisa-remove-collection (&optional collection)
"Remove COLLECTION."
- [elpa] externals/elisa 80401a0b52 30/98: Search for gzipped builtin manuals, (continued)
- [elpa] externals/elisa 80401a0b52 30/98: Search for gzipped builtin manuals, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa c59d491e18 35/98: Add prompt rewriting with ellama-chain, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 3874a7007c 36/98: Improve semantic split api, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa f2bf34b201 39/98: Improve sqlite escape function, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 45b854ba2d 46/98: Fix linter warning, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa d58e172912 48/98: Make web search async, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa f312d189f2 51/98: Fix one word lines in webpage quotes parsed asyncronously, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 0fc73b4b9a 69/98: Add instruction to elisa-chat, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 4710f87851 72/98: Refactor parsing info manuals, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 86f4ea0afb 74/98: Fix parsing info manuals with infinite loop, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 9c7004e15c 77/98: Add functionality to parse web pages and add them to collections, ELPA Syncer <=
- [elpa] externals/elisa 4063c45908 78/98: Add custom variables documentation, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 0c432dd1c8 81/98: Add commands to documentation, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 29c17f86a7 83/98: Update package description, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa cb69d24b0d 88/98: Improve documentation, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 8d0ad42134 90/98: Small documentation markup fixes, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 55200f8d8b 89/98: Fix readme markup for github rendering, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 21048a5033 91/98: Add melpa stable badge, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa fe93e52d2a 95/98: Review fixes, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa 56bc22a8c3 96/98: Improve package documentation, ELPA Syncer, 2024/07/17
- [elpa] externals/elisa c842f1ce32 98/98: Bump version, ELPA Syncer, 2024/07/17