From: ELPA Syncer
Subject: [elpa] externals/llm 213964f792: Add multimodal support for openai and gemini (#88)
Date: Sat, 9 Nov 2024 09:58:30 -0500 (EST)
branch: externals/llm
commit 213964f792882e72cc5142c448729ffa40da4412
Author: Andrew Swan <wakelin.swan@gmail.com>
Commit: GitHub <noreply@github.com>
Add multimodal support for openai and gemini (#88)
I've added multimodal support for two of the providers (openai &
vertex/gemini) so images can be included as part of the input prompt.
I settled on the following system for doing this:
1. I follow the convention that if the role is user and content is a
list, then it is a multipart message.
2. If it's a string, everything works the same as before; if the role
is assistant and content is a list, then I assume it is a list of
function calls and use the existing code.
3. If content is a list and the role is user, then the interaction can
have both text parts and image parts. Each element is either a string
or the struct `llm-provider-utils-image` that I've added to
`llm-provider-utils.el`.
4. `llm-provider-utils-image` has both a MIME type and a (binary)
string with the actual data.
While doing this, it made sense to add system prompts to vertex. I think
all their newer models support system prompts now, so hopefully this
won't cause any problems.
I think the same system should work for other providers, at least for
Claude, but I don't have any of their API keys to test with.
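
For example, a caller could build and send a multipart prompt like
this (a sketch; `my-provider' and the file name are placeholders, and
the provider should report `image-input' in `llm-capabilities'):

    ;; Read the image as raw bytes and wrap it in the new media struct.
    (let* ((bytes (with-temp-buffer
                    (set-buffer-multibyte nil)
                    (insert-file-contents-literally "cat.png")
                    (buffer-string)))
           (media (make-llm-media :mime-type "image/png" :data bytes))
           (prompt (llm-make-chat-prompt
                    (llm-make-multipart "What is in this picture?" media))))
      (llm-chat my-provider prompt))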
---
llm-gemini.el | 8 +++--
llm-ollama.el | 36 +++++++++++++++-------
llm-openai.el | 27 +++++++++++++---
llm-vertex.el | 30 ++++++++++++++----
llm.el | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++---------
5 files changed, 161 insertions(+), 38 deletions(-)
diff --git a/llm-gemini.el b/llm-gemini.el
index 5c70c406b4..14226d52cd 100644
--- a/llm-gemini.el
+++ b/llm-gemini.el
@@ -99,9 +99,11 @@ If STREAMING-P is non-nil, use the streaming endpoint."
(cl-defmethod llm-capabilities ((provider llm-gemini))
(append
(list 'streaming 'embeddings)
- (let ((model (llm-models-match (llm-gemini-chat-model provider))))
- (when (and model (member 'tool-use (llm-model-capabilities model)))
- (list 'function-calls)))))
+   (when-let ((model (llm-models-match (llm-gemini-chat-model provider)))
+              (capabilities (llm-model-capabilities model)))
+     (append
+      (when (member 'tool-use capabilities) '(function-calls))
+      (seq-intersection capabilities '(image-input audio-input video-input))))))
(provide 'llm-gemini)
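
With this change a caller can feature-test the provider before
attaching media, e.g. (a sketch; `my-key' and `media' are
placeholders, `media' being an `llm-media' object as in the earlier
sketch):

    (let ((provider (make-llm-gemini :key my-key)))
      (if (member 'image-input (llm-capabilities provider))
          (llm-chat provider
                    (llm-make-chat-prompt
                     (llm-make-multipart "Describe this image:" media)))
        (message "Model does not advertise image-input")))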
diff --git a/llm-ollama.el b/llm-ollama.el
index b88666a315..03902a2836 100644
--- a/llm-ollama.el
+++ b/llm-ollama.el
@@ -112,12 +112,25 @@ PROVIDER is the llm-ollama provider."
(let (request-alist messages options)
(setq messages
(mapcar (lambda (interaction)
-                      `(("role" . ,(symbol-name (llm-chat-prompt-interaction-role interaction)))
-                        ("content" . ,(let ((content (llm-chat-prompt-interaction-content interaction)))
-                                        (if (stringp content)
-                                            content
-                                          (json-encode content))))))
+                      (let* ((role (llm-chat-prompt-interaction-role interaction))
+                             (content (llm-chat-prompt-interaction-content interaction))
+                             (content-text "")
+                             (images nil))
+                        (if (stringp content)
+                            (setq content-text content)
+                          (if (eq 'user role)
+                              (dolist (part (llm-multipart-parts content))
+                                (if (llm-media-p part)
+                                    (setq images (append images (list part)))
+                                  (setq content-text (concat content-text part))))
+                            (setq content-text (json-encode content))))
+                        (append
+                         `(("role" . ,(symbol-name role)))
+                         `(("content" . ,content-text))
+                         (when images
+                           `(("images" . ,(mapcar (lambda (img) (base64-encode-string (llm-media-data img) t))
+                                                  images)))))))
(llm-chat-prompt-interactions prompt)))
(when (llm-chat-prompt-context prompt)
(push `(("role" . "system")
@@ -181,11 +194,12 @@ PROVIDER is the llm-ollama provider."
(and embedding-model
                          (member 'embedding (llm-model-capabilities embedding-model)))))
'(embeddings embeddings-batch))
- (when (let ((chat-model (llm-models-match
- (llm-ollama-chat-model provider))))
- (and chat-model
- (member 'tool-use (llm-model-capabilities chat-model))))
- '(function-calls))))
+  (when-let ((chat-model (llm-models-match (llm-ollama-chat-model provider)))
+             (capabilities (llm-model-capabilities chat-model)))
+    (append
+     (when (member 'tool-use capabilities) '(function-calls))
+     (seq-intersection capabilities '(image-input))))))
(provide 'llm-ollama)
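
For reference, the per-message alist the new ollama code assembles for
a multipart user message looks roughly like this (values illustrative,
base64 payload elided):

    (("role" . "user")
     ("content" . "What is in this picture?")
     ("images" . ("iVBORw0KGgoAAAANSUhEUg...")))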
diff --git a/llm-openai.el b/llm-openai.el
index b2fc5c0cd4..63813693fd 100644
--- a/llm-openai.el
+++ b/llm-openai.el
@@ -173,8 +173,24 @@ STREAMING if non-nil, turn on response streaming."
                         (append
                          `(("role" . ,(llm-chat-prompt-interaction-role i)))
                          (when-let ((content (llm-chat-prompt-interaction-content i)))
-                           (if (stringp content) `(("content" . ,content))
-                             (llm-openai-function-call-to-response content)))))))
+                           `(("content"
+                              . ,(pcase content
+                                   ((pred llm-multipart-p)
+                                    (mapcar (lambda (part)
+                                              (if (llm-media-p part)
+                                                  `(("type" . "image_url")
+                                                    ("image_url"
+                                                     . (("url"
+                                                         . ,(concat
+                                                             "data:"
+                                                             (llm-media-mime-type part)
+                                                             ";base64,"
+                                                             (base64-encode-string (llm-media-data part)))))))
+                                                `(("type" . "text")
+                                                  ("text" . ,part))))
+                                            (llm-multipart-parts content)))
+                                   ((pred listp) (llm-openai-function-call-to-response content))
+                                   (_ content)))))))))
(llm-chat-prompt-interactions prompt)))
request-alist)
(push `("model" . ,(or (llm-openai-chat-model provider) "gpt-4o"))
request-alist)
@@ -276,8 +292,11 @@ RESPONSE can be nil if the response is complete."
(cl-defmethod llm-chat-token-limit ((provider llm-openai))
(llm-provider-utils-model-token-limit (llm-openai-chat-model provider)))
-(cl-defmethod llm-capabilities ((_ llm-openai))
- (list 'streaming 'embeddings 'function-calls))
+(cl-defmethod llm-capabilities ((provider llm-openai))
+  (append '(streaming embeddings function-calls)
+          (when-let ((model (llm-models-match (llm-openai-chat-model provider))))
+            (seq-intersection (llm-model-capabilities model)
+                              '(image-input)))))
(cl-defmethod llm-capabilities ((provider llm-openai-compatible))
(append '(streaming)
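
The "content" value the new openai code produces for a multipart
message is a list of typed parts, roughly of this shape (values
illustrative, data URL truncated):

    ((("type" . "text")
      ("text" . "What is in this picture?"))
     (("type" . "image_url")
      ("image_url" . (("url" . "data:image/png;base64,iVBORw0KGgo...")))))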
diff --git a/llm-vertex.el b/llm-vertex.el
index d91a8b5878..d7411a33e9 100644
--- a/llm-vertex.el
+++ b/llm-vertex.el
@@ -176,8 +176,14 @@ the key must be regenerated every hour."
  (llm-provider-extract-function-calls provider (json-read-from-string response)))
(cl-defmethod llm-provider-chat-request ((_ llm-google) prompt _)
-  (llm-provider-utils-combine-to-user-prompt prompt llm-vertex-example-prelude)
+  (llm-provider-utils-combine-to-system-prompt prompt llm-vertex-example-prelude)
(append
+  (let ((first (car (llm-chat-prompt-interactions prompt))))
+    ;; System prompts for vertex only really make sense when they are
+    ;; the first interaction, since they are sent separately
+    (when (eq (llm-chat-prompt-interaction-role first) 'system)
+      `((system_instruction
+         . ((parts . (((text . ,(llm-chat-prompt-interaction-content first))))))))))
`((contents
.
,(mapcar (lambda (interaction)
@@ -203,8 +209,18 @@ the key must be regenerated every hour."
                             (content .
                                      ,(llm-chat-prompt-function-call-result-result fc)))))))))
                         (llm-chat-prompt-interaction-function-call-results interaction))
-                       (llm-chat-prompt-interaction-content interaction))))))
-           (llm-chat-prompt-interactions prompt))))
+                       (if (llm-multipart-p (llm-chat-prompt-interaction-content interaction))
+                           (mapcar (lambda (part)
+                                     (if (llm-media-p part)
+                                         `((inline_data
+                                            . ((mime_type . ,(llm-media-mime-type part))
+                                               (data . ,(base64-encode-string (llm-media-data part) t)))))
+                                       `((text . ,part))))
+                                   (llm-multipart-parts (llm-chat-prompt-interaction-content interaction)))
+                         (llm-chat-prompt-interaction-content interaction)))))))
+           (seq-filter
+            (lambda (interaction) (not (eq 'system (llm-chat-prompt-interaction-role interaction))))
+            (llm-chat-prompt-interactions prompt)))))
(when (llm-chat-prompt-functions prompt)
      ;; Although Gemini claims to be compatible with Open AI's function declaration,
;; it's only somewhat compatible.
@@ -285,9 +301,11 @@ If STREAMING is non-nil, use the URL for the streaming API.
(cl-defmethod llm-capabilities ((provider llm-vertex))
(append
(list 'streaming 'embeddings)
- (let ((model (llm-models-match (llm-vertex-chat-model provider))))
- (when (and model (member 'tool-use (llm-model-capabilities model)))
- (list 'function-calls)))))
+   (when-let ((model (llm-models-match (llm-vertex-chat-model provider)))
+              (capabilities (llm-model-capabilities model)))
+     (append
+      (when (member 'tool-use capabilities) '(function-calls))
+      (seq-intersection capabilities '(image-input audio-input video-input))))))
(provide 'llm-vertex)
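
Put together, the gemini/vertex request body for a prompt with a
system message and one image part now has roughly this shape (a
sketch; values illustrative, payload elided):

    ((system_instruction
      . ((parts . (((text . "Answer briefly."))))))
     (contents
      . (((role . user)
          (parts . (((text . "What is in this picture?"))
                    ((inline_data . ((mime_type . "image/png")
                                     (data . "iVBORw0KGgo..."))))))))))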
diff --git a/llm.el b/llm.el
index 733c619c4b..ca0c92c1ad 100644
--- a/llm.el
+++ b/llm.el
@@ -76,6 +76,9 @@ Use of this directly is deprecated, instead use `llm-make-chat-prompt'.
"This defines a single interaction given as part of a chat prompt.
ROLE can a symbol, of either `user', `assistant', or `function'.
+CONTENT is the content of the interaction. It should be either a
+string, an `llm-multipart' object or a list of function calls.
+
FUNCTION-CALL-RESULTS is a list of structs of type
`llm-chat-prompt-function-call-results', which is only populated
if `role' is `function'. It stores the results of the function
@@ -128,6 +131,63 @@ REQUIRED is whether this is required or not."
type
required)
+(cl-defstruct llm-media
+ "Contains media that can be sent as part of an interaction.
+
+MIME-TYPE is a string containing the mime type of the media. Not all
+MIME types are accepted by all providers.
+
+DATA is a (binary) string containing the data. The string should use
+unibyte encoding.
+
+This should only be used if the `image-input' or `audio-input'
+capability is available, as indicated by `llm-capabilities'."
+ mime-type data)
+
+(defun llm--image-to-media (image)
+ "Convert an IMAGE object to an `llm-media' object."
+ (make-llm-media
+ :mime-type (pcase (image-property image :type)
+ ('svg "image/svg+xml")
+ ('webp "image/webp")
+ ('png "image/png")
+ ('gif "image/gif")
+ ('tiff "image/tiff")
+ ('jpeg "image/jpeg")
+ ('xpm "image/x-xpixmap")
+ ('xbm "image/x-xbitmap"))
+ :data (if-let ((data (image-property image :data))) data
+ (with-temp-buffer
+ (set-buffer-multibyte nil)
+ (insert-file-contents-literally (image-property image :file))
+ (buffer-string)))))
+
+(cl-defstruct llm-multipart
+ "A multipart message that can contain both text and media.
+
+PARTS is a list of the parts of the interaction. Each element
+should be either a string for text, or a `llm-media' object for
+media.
+
+Note that this includes the special case where there are multiple
+text parts and no media parts, although this case is only
+supported by some providers. For example, this can be used to
+send instructions and code blocks separately."
+ parts)
+
+(defun llm-make-multipart (&rest parts)
+ "Create a multipart message from the arguments PARTS.
+
+Each argument should be either a string, image object or an
+`llm-media' object. The arguments are combined into a single
+multipart message."
+ (make-llm-multipart
+ :parts (mapcar (lambda (part)
+ (if (and (fboundp 'imagep) (imagep part))
+ (llm--image-to-media part)
+ part))
+ parts)))
+
(cl-defun llm--log (type &key provider prompt msg)
"Log a MSG of TYPE, given PROVIDER, PROMPT, and MSG.
These are all optional, each one should be the normal meaning of
@@ -168,10 +228,10 @@ This is deprecated, and you should use `llm-make-chat-prompt'
instead."
(llm-make-chat-prompt text))
-(cl-defun llm-make-chat-prompt (text &key context examples functions
+(cl-defun llm-make-chat-prompt (content &key context examples functions
temperature max-tokens
non-standard-params)
- "Create a `llm-chat-prompt' with TEXT sent to the LLM provider.
+ "Create a `llm-chat-prompt' with CONTENT sent to the LLM provider.
This is the most correct and easy way to create an
`llm-chat-prompt', and should suffice for almost all uses.
@@ -185,12 +245,14 @@ populated, a best effort is made to do something reasonable, but
it may not be quite the same on all providers as the prompt
mutating in terms of an actual conversation.
-TEXT is the latest user input to the LLM, the thing to be
-responded to. This is required. This can also be a string, in
-which case it represents the chat history, starting with the
-user's initial chat, followed by the response, and so on. If it
-is a list, it MUST be an odd number, since the presumption is
-that it ends with the user's latest input to the LLM.
+CONTENT is the latest user input to the LLM, the thing to be
+responded to, in form of a string containing text or an
+`llm-multipart' object containing both text and media. This is
+required. This can also be a list, in which case it represents
+the chat history, starting with the user's initial chat, followed
+by the response, and so on. If it is a list, it MUST be an odd
+number, since the presumption is that it ends with the user's
+latest input to the LLM.
CONTEXT is a string given to the LLM as context for the entire
interaction, such as instructions to the LLM on how to reply,
@@ -225,10 +287,10 @@ to be provider specific. Don't use this if you want the prompt
to be used amongst different providers, because it is likely to
cause a request error. The cars of the alist are strings and the
cdrs can be strings or numbers. This is optional."
- (unless text
- (error "TEXT is required"))
- (when (and (listp text) (zerop (mod (length text) 2)))
- (error "TEXT, as a list, must have an odd number of elements"))
+ (unless content
+ (error "CONTENT is required"))
+ (when (and (listp content) (zerop (mod (length content) 2)))
+ (error "CONTENT, as a list, must have an odd number of elements"))
(make-llm-chat-prompt
:context context
:examples examples
@@ -236,7 +298,7 @@ cdrs can be strings or numbers. This is optional."
(make-llm-chat-prompt-interaction
:role (if (zerop (mod i 2)) 'user
'assistant)
:content s))
- (if (listp text) text (list text)))
+ (if (listp content) content (list content)))
:functions functions
:temperature temperature
:max-tokens max-tokens
@@ -614,7 +676,15 @@ This should only be used for logging or debugging."
('user "User")
('system "System")
('assistant "Assistant"))
-                (llm-chat-prompt-interaction-content i)))
+                (let ((content (llm-chat-prompt-interaction-content i)))
+                  (if (llm-multipart-p content)
+                      (mapcar (lambda (part) (if (llm-media-p part)
+                                                 (format "[%s data, %d bytes]"
+                                                         (llm-media-mime-type part)
+                                                         (length (llm-media-data part)))
+                                               part))
+                              (llm-multipart-parts content))
+                    content))))
(llm-chat-prompt-interactions prompt) "\n")
"\n"
(when (llm-chat-prompt-temperature prompt)
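
Since `llm-make-multipart' converts Emacs image objects through
`llm--image-to-media', an image created with `create-image' can also
be passed in directly (the file name is a placeholder):

    ;; The MIME type is inferred from the image's :type property, and
    ;; the file contents are read as unibyte data.
    (llm-make-multipart "Caption this:"
                        (create-image "~/pictures/cat.png"))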