From: ELPA Syncer
Subject: [elpa] externals/llm 213964f792: Add multimodal support for openai and gemini (#88)
Date: Sat, 9 Nov 2024 09:58:30 -0500 (EST)

branch: externals/llm
commit 213964f792882e72cc5142c448729ffa40da4412
Author: Andrew Swan <wakelin.swan@gmail.com>
Commit: GitHub <noreply@github.com>

    Add multimodal support for openai and gemini (#88)
    
    I've added multimodal support for two of the providers (openai &
    vertex/gemini) so images can be included as part of the input prompt.
    
    I settled on the following system for doing this:
    1. I follow the convention that if the role is user and content is a
    list, then it is a multipart message.
    2. If it's a string, everything works the same as before, and if the
    role is assistant and content is a list, then I assume it is a list of
    function calls and use the existing code.
    3. If content is a list and role is user, then the interaction can have
    both text parts and image parts. Each element is either a string or the
    struct `llm-media` that I've added to `llm.el`.
    4. `llm-media` has both a MIME type and a (binary) string with the
    actual data. (A usage sketch follows this list.)
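
    As a usage sketch (hypothetical caller code; `my-provider' stands
    for any provider whose `llm-capabilities' includes `image-input'):

        ;; Build a multipart user message from text plus raw (unibyte)
        ;; image bytes, then send it with `llm-chat'.
        (let* ((bytes (with-temp-buffer
                        (set-buffer-multibyte nil)
                        (insert-file-contents-literally "cat.png")
                        (buffer-string)))
               (prompt (llm-make-chat-prompt
                        (llm-make-multipart
                         "What is in this picture?"
                         (make-llm-media :mime-type "image/png"
                                         :data bytes)))))
          (llm-chat my-provider prompt))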
    
    While doing this, it made sense to add system prompts to vertex. I think
    all their newer models support system prompts now, so hopefully this
    won't cause any problems.
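
    For example (a sketch; `my-gemini' stands for an `llm-vertex' or
    `llm-gemini' provider), the prompt's :context now travels as a
    system prompt in the separate system_instruction field instead of
    being folded into the first user message:

        ;; The :context string becomes a system interaction, which the
        ;; new request code sends as `system_instruction'.
        (llm-chat my-gemini
                  (llm-make-chat-prompt
                   "Summarize the attached report in one paragraph."
                   :context "You are a terse technical assistant."))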
    
    I think the same system should work for other providers, at least for
    Claude, but I don't have any of their API keys to test with.
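
    Callers can feature-test before attaching media; a minimal sketch
    using the capability symbols this change advertises (`provider'
    and `media' are placeholders):

        ;; `llm-capabilities' now reports `image-input' (and, for some
        ;; providers, `audio-input'/`video-input') when the model
        ;; supports it; fall back to plain text otherwise.
        (if (memq 'image-input (llm-capabilities provider))
            (llm-chat provider (llm-make-chat-prompt
                                (llm-make-multipart "Describe:" media)))
          (llm-chat provider (llm-make-chat-prompt "Describe the scene.")))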
---
 llm-gemini.el |  8 +++--
 llm-ollama.el | 36 +++++++++++++++-------
 llm-openai.el | 27 +++++++++++++---
 llm-vertex.el | 30 ++++++++++++++----
 llm.el        | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++---------
 5 files changed, 161 insertions(+), 38 deletions(-)

diff --git a/llm-gemini.el b/llm-gemini.el
index 5c70c406b4..14226d52cd 100644
--- a/llm-gemini.el
+++ b/llm-gemini.el
@@ -99,9 +99,11 @@ If STREAMING-P is non-nil, use the streaming endpoint."
 (cl-defmethod llm-capabilities ((provider llm-gemini))
   (append
    (list 'streaming 'embeddings)
-   (let ((model (llm-models-match (llm-gemini-chat-model provider))))
-     (when (and model (member 'tool-use (llm-model-capabilities model)))
-       (list 'function-calls)))))
+   (when-let ((model (llm-models-match (llm-gemini-chat-model provider)))
+             (capabilities (llm-model-capabilities model)))
+     (append
+      (when (member 'tool-use capabilities) '(function-calls))
+      (seq-intersection capabilities '(image-input audio-input video-input))))))
 
 (provide 'llm-gemini)
 
diff --git a/llm-ollama.el b/llm-ollama.el
index b88666a315..03902a2836 100644
--- a/llm-ollama.el
+++ b/llm-ollama.el
@@ -112,12 +112,25 @@ PROVIDER is the llm-ollama provider."
   (let (request-alist messages options)
     (setq messages
           (mapcar (lambda (interaction)
-                    `(("role" . ,(symbol-name (llm-chat-prompt-interaction-role interaction)))
-                      ("content" . ,(let ((content
-                                           (llm-chat-prompt-interaction-content interaction)))
-                                      (if (stringp content)
-                                          content
-                                        (json-encode content))))))
+                   (let* ((role (llm-chat-prompt-interaction-role interaction))
+                          (content (llm-chat-prompt-interaction-content interaction))
+                          (content-text "")
+                          (images nil))
+                     (if (stringp content)
+                         (setq content-text content)
+                       (if (eq 'user role)
+                           (dolist (part (llm-multipart-parts content))
+                             (if (llm-media-p part)
+                                 (setq images (append images (list part)))
+                               (setq content-text (concat content-text part))))
+                         (setq content-text (json-encode content))))
+                     (append
+                      `(("role" . ,(symbol-name role)))
+                      `(("content" . ,content-text))
+                      (when images
+                        `(("images" .
+                           ,(mapcar (lambda (img) (base64-encode-string (llm-media-data img) t))
+                                    images)))))))
                   (llm-chat-prompt-interactions prompt)))
     (when (llm-chat-prompt-context prompt)
       (push `(("role" . "system")
@@ -181,11 +194,12 @@ PROVIDER is the llm-ollama provider."
                        (and embedding-model
                            (member 'embedding (llm-model-capabilities embedding-model)))))
             '(embeddings embeddings-batch))
-          (when (let ((chat-model (llm-models-match
-                                   (llm-ollama-chat-model provider))))
-                  (and chat-model
-                       (member 'tool-use (llm-model-capabilities chat-model))))
-            '(function-calls))))
+          (when-let ((chat-model (llm-models-match
+                                  (llm-ollama-chat-model provider)))
+                    (capabilities (llm-model-capabilities chat-model)))
+           (append
+            (when (member 'tool-use capabilities) '(function-calls))
+            (seq-intersection capabilities '(image-input))))))
 
 (provide 'llm-ollama)
 
diff --git a/llm-openai.el b/llm-openai.el
index b2fc5c0cd4..63813693fd 100644
--- a/llm-openai.el
+++ b/llm-openai.el
@@ -173,8 +173,24 @@ STREAMING if non-nil, turn on response streaming."
                           (append
                            `(("role" . ,(llm-chat-prompt-interaction-role i)))
                           (when-let ((content (llm-chat-prompt-interaction-content i)))
-                             (if (stringp content) `(("content" . ,content))
-                               (llm-openai-function-call-to-response content)))))
+                            `(("content"
+                               . ,(pcase content
+                                   ((pred llm-multipart-p)
+                                    (mapcar (lambda (part)
+                                                  (if (llm-media-p part)
+                                                      `(("type" . "image_url")
+                                                        ("image_url"
+                                                         . (("url"
+                                                             . ,(concat
+                                                                 "data:"
+                                                                 (llm-media-mime-type part)
+                                                                 ";base64,"
+                                                                 (base64-encode-string (llm-media-data part)))))))
+                                                    `(("type" . "text")
+                                                      ("text" . ,part))))
+                                            (llm-multipart-parts content)))
+                                   ((pred listp) (llm-openai-function-call-to-response content))
+                                   (_ content)))))))))
                      (llm-chat-prompt-interactions prompt)))
           request-alist)
    (push `("model" . ,(or (llm-openai-chat-model provider) "gpt-4o")) request-alist)
@@ -276,8 +292,11 @@ RESPONSE can be nil if the response is complete."
 (cl-defmethod llm-chat-token-limit ((provider llm-openai))
   (llm-provider-utils-model-token-limit (llm-openai-chat-model provider)))
 
-(cl-defmethod llm-capabilities ((_ llm-openai))
-  (list 'streaming 'embeddings 'function-calls))
+(cl-defmethod llm-capabilities ((provider llm-openai))
+  (append '(streaming embeddings function-calls)
+         (when-let ((model (llm-models-match (llm-openai-chat-model provider))))
+           (seq-intersection (llm-model-capabilities model)
+                             '(image-input)))))
 
 (cl-defmethod llm-capabilities ((provider llm-openai-compatible))
   (append '(streaming)
diff --git a/llm-vertex.el b/llm-vertex.el
index d91a8b5878..d7411a33e9 100644
--- a/llm-vertex.el
+++ b/llm-vertex.el
@@ -176,8 +176,14 @@ the key must be regenerated every hour."
  (llm-provider-extract-function-calls provider (json-read-from-string response)))
 
 (cl-defmethod llm-provider-chat-request ((_ llm-google) prompt _)
-  (llm-provider-utils-combine-to-user-prompt prompt llm-vertex-example-prelude)
+  (llm-provider-utils-combine-to-system-prompt prompt llm-vertex-example-prelude)
   (append
+   (let ((first (car (llm-chat-prompt-interactions prompt))))
+     ;; System prompts for vertex only really make sense when they are
+     ;; the first interaction, since they are sent separately
+     (when (eq (llm-chat-prompt-interaction-role first) 'system)
+       `((system_instruction
+         . ((parts . (((text . ,(llm-chat-prompt-interaction-content first))))))))))
    `((contents
       .
       ,(mapcar (lambda (interaction)
@@ -203,8 +209,18 @@ the key must be regenerated every hour."
                                                  (content . ,(llm-chat-prompt-function-call-result-result fc)))))))))
                                         (llm-chat-prompt-interaction-function-call-results interaction))
 
-                               (llm-chat-prompt-interaction-content interaction))))))
-               (llm-chat-prompt-interactions prompt))))
+                               (if (llm-multipart-p (llm-chat-prompt-interaction-content interaction))
+                                  (mapcar (lambda (part)
+                                        (if (llm-media-p part)
+                                            `((inline_data
+                                              . ((mime_type . ,(llm-media-mime-type part))
+                                                 (data . ,(base64-encode-string (llm-media-data part) t)))))
+                                          `((text . ,part))))
+                                          (llm-multipart-parts (llm-chat-prompt-interaction-content interaction)))
+                                (llm-chat-prompt-interaction-content interaction)))))))
+               (seq-filter
+               (lambda (interaction) (not (eq 'system (llm-chat-prompt-interaction-role interaction))))
+               (llm-chat-prompt-interactions prompt)))))
    (when (llm-chat-prompt-functions prompt)
     ;; Although Gemini claims to be compatible with Open AI's function declaration,
      ;; it's only somewhat compatible.
@@ -285,9 +301,11 @@ If STREAMING is non-nil, use the URL for the streaming API."
 (cl-defmethod llm-capabilities ((provider llm-vertex))
   (append
    (list 'streaming 'embeddings)
-   (let ((model (llm-models-match (llm-vertex-chat-model provider))))
-     (when (and model (member 'tool-use (llm-model-capabilities model)))
-       (list 'function-calls)))))
+   (when-let ((model (llm-models-match (llm-vertex-chat-model provider)))
+             (capabilities (llm-model-capabilities model)))
+     (append
+      (when (member 'tool-use capabilities) '(function-calls))
+      (seq-intersection capabilities '(image-input audio-input video-input))))))
 
 (provide 'llm-vertex)
 
diff --git a/llm.el b/llm.el
index 733c619c4b..ca0c92c1ad 100644
--- a/llm.el
+++ b/llm.el
@@ -76,6 +76,9 @@ Use of this directly is deprecated, instead use `llm-make-chat-prompt'."
   "This defines a single interaction given as part of a chat prompt.
 ROLE can a symbol, of either `user', `assistant', or `function'.
 
+CONTENT is the content of the interaction.  It should be either a
+string, an `llm-multipart' object or a list of function calls.
+
 FUNCTION-CALL-RESULTS is a list of structs of type
 `llm-chat-prompt-function-call-results', which is only populated
 if `role' is `function'.  It stores the results of the function
@@ -128,6 +131,63 @@ REQUIRED is whether this is required or not."
   type
   required)
 
+(cl-defstruct llm-media
+  "Contains media that can be sent as part of an interaction.
+
+MIME-TYPE is a string containing the mime type of the media.  Not all
+MIME types are accepted by all providers.
+
+DATA is a (binary) string containing the data.  The string should use
+unibyte encoding.
+
+This should only be used if the `image-input' or `audio-input'
+capability is available, as indicated by `llm-capabilities'."
+  mime-type data)
+
+(defun llm--image-to-media (image)
+  "Convert an IMAGE object to an `llm-media' object."
+  (make-llm-media
+   :mime-type (pcase (image-property image :type)
+               ('svg "image/svg+xml")
+               ('webp "image/webp")
+               ('png "image/png")
+               ('gif "image/gif")
+               ('tiff "image/tiff")
+               ('jpeg "image/jpeg")
+               ('xpm "image/x-xpixmap")
+               ('xbm "image/x-xbitmap"))
+   :data (if-let ((data (image-property image :data))) data
+          (with-temp-buffer
+            (set-buffer-multibyte nil)
+            (insert-file-contents-literally (image-property image :file))
+            (buffer-string)))))
+
+(cl-defstruct llm-multipart
+  "A multipart message that can contain both text and media.
+
+PARTS is a list of the parts of the interaction.  Each element
+should be either a string for text, or a `llm-media' object for
+media.
+
+Note that this includes the special case where there are multiple
+text parts and no media parts, although this case is only
+supported by some providers.  For example, this can be used to
+send instructions and code blocks separately."
+  parts)
+
+(defun llm-make-multipart (&rest parts)
+  "Create a multipart message from the arguments PARTS.
+
+Each argument should be either a string, image object or an
+`llm-media' object.  The arguments are combined into a single
+multipart message."
+  (make-llm-multipart
+   :parts (mapcar (lambda (part)
+                   (if (and (fboundp 'imagep) (imagep part))
+                       (llm--image-to-media part)
+                     part))
+                 parts)))
+
 (cl-defun llm--log (type &key provider prompt msg)
   "Log a MSG of TYPE, given PROVIDER, PROMPT, and MSG.
 These are all optional, each one should be the normal meaning of
@@ -168,10 +228,10 @@ This is deprecated, and you should use `llm-make-chat-prompt'
 instead."
   (llm-make-chat-prompt text))
 
-(cl-defun llm-make-chat-prompt (text &key context examples functions
+(cl-defun llm-make-chat-prompt (content &key context examples functions
                                      temperature max-tokens
                                      non-standard-params)
-  "Create a `llm-chat-prompt' with TEXT sent to the LLM provider.
+  "Create a `llm-chat-prompt' with CONTENT sent to the LLM provider.
 
 This is the most correct and easy way to create an
 `llm-chat-prompt', and should suffice for almost all uses.
@@ -185,12 +245,14 @@ populated, a best effort is made to do something reasonable, but
 it may not be quite the same on all providers as the prompt
 mutating in terms of an actual conversation.
 
-TEXT is the latest user input to the LLM, the thing to be
-responded to.  This is required.  This can also be a string, in
-which case it represents the chat history, starting with the
-user's initial chat, followed by the response, and so on.  If it
-is a list, it MUST be an odd number, since the presumption is
-that it ends with the user's latest input to the LLM.
+CONTENT is the latest user input to the LLM, the thing to be
+responded to, in form of a string containing text or an
+`llm-multipart' object containing both text and media.  This is
+required.  This can also be a list, in which case it represents
+the chat history, starting with the user's initial chat, followed
+by the response, and so on.  If it is a list, it MUST be an odd
+number, since the presumption is that it ends with the user's
+latest input to the LLM.
 
 CONTEXT is a string given to the LLM as context for the entire
 interaction, such as instructions to the LLM on how to reply,
@@ -225,10 +287,10 @@ to be provider specific.  Don't use this if you want the prompt
 to be used amongst different providers, because it is likely to
 cause a request error.  The cars of the alist are strings and the
 cdrs can be strings or numbers.  This is optional."
-  (unless text
-    (error "TEXT is required"))
-  (when (and (listp text) (zerop (mod (length text) 2)))
-    (error "TEXT, as a list, must have an odd number of elements"))
+  (unless content
+    (error "CONTENT is required"))
+  (when (and (listp content) (zerop (mod (length content) 2)))
+    (error "CONTENT, as a list, must have an odd number of elements"))
   (make-llm-chat-prompt
    :context context
    :examples examples
@@ -236,7 +298,7 @@ cdrs can be strings or numbers.  This is optional."
                                     (make-llm-chat-prompt-interaction
                                     :role (if (zerop (mod i 2)) 'user 'assistant)
                                      :content s))
-                                  (if (listp text) text (list text)))
+                                  (if (listp content) content (list content)))
    :functions functions
    :temperature temperature
    :max-tokens max-tokens
@@ -614,7 +676,15 @@ This should only be used for logging or debugging."
                           ('user "User")
                           ('system "System")
                           ('assistant "Assistant"))
-                        (llm-chat-prompt-interaction-content i)))
+                       (let ((content (llm-chat-prompt-interaction-content i)))
+                         (if (llm-multipart-p content)
+                             (mapcar (lambda (part) (if (llm-media-p part)
+                                                        (format "[%s data, %d bytes]"
+                                                                (llm-media-mime-type part)
+                                                                (length (llm-media-data part)))
+                                                      part))
+                                     (llm-multipart-parts content))
+                           content))))
               (llm-chat-prompt-interactions prompt) "\n")
    "\n"
    (when (llm-chat-prompt-temperature prompt)


