Skip to content
130 changes: 130 additions & 0 deletions org-ai-openai-image-query.el
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
;;; org-ai-openai-image-query.el --- Send base64 or URL image and question to OpenAI -*- lexical-binding: t; -*-

;;; Code:

;; TODO handle multiple images https://platform.openai.com/docs/guides/vision/multiple-image-inputs
;; TODO compress images if larger https://platform.openai.com/docs/guides/vision/managing-images, https://platform.openai.com/docs/guides/vision/is-there-a-limit-to-the-size-of-the-image-i-can-upload
;; TODO ensure file is png, jpeg, webp, or non-animated gif https://platform.openai.com/docs/guides/vision/what-type-of-files-can-i-upload
(require 'org-ai-openai)
(require 'org-ai-useful)

(defcustom org-ai-image-query-model "gpt-4o-mini"
  "The model to use for image queries.
Pick one of the listed models, or choose \"Other\" to enter the
name of any other model that should receive the image query."
  :group 'org-ai
  :type '(choice (const :tag "gpt-4o-mini" "gpt-4o-mini")
                 (const :tag "gpt-4o" "gpt-4o")
                 (const :tag "gpt-4-turbo" "gpt-4-turbo")
                 ;; Free-form entry so a model that is not listed above can
                 ;; still be selected through Customize.
                 (string :tag "Other")))

(defcustom org-ai-query-image-file nil
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rksm you probably recognize this from org-ai-on-region-file, just calling attention in case you want me to do this in a different way

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Totally fine :)

"Optional file used to store the `org-ai-query-image' conversations in.
If nil, a buffer with no file backing is used.  If a file name is
given, new conversations are appended to that file.  If a function
is given, it should take no arguments and return the name of the
file to append to."
:group 'org-ai
:type '(choice (const :tag "No file" nil)
(file :tag "File")
(function :tag "Function")))

(defvar org-ai-openai-image-query-endpoint
  "https://api.openai.com/v1/chat/completions"
  "URL of the OpenAI endpoint that image queries are sent to.")

(defun org-ai--send-image-query (image-content question callback)
  "Send IMAGE-CONTENT and QUESTION to the OpenAI API.
IMAGE-CONTENT is an alist describing the image part of the user
message; QUESTION is the text part.  The request is sent
asynchronously to `org-ai-openai-image-query-endpoint' using the
model `org-ai-image-query-model'.

CALLBACK is called with the parsed JSON response when the request
succeeds.  When the transport fails, the API answers with an
`error' object, or the body cannot be parsed, a message is shown
instead and CALLBACK is not called."
  (let ((url-request-method "POST")
        (url-request-extra-headers
         `(("Authorization" . ,(concat "Bearer " (org-ai--openai-get-token)))
           ("Content-Type" . "application/json")))
        ;; The url library needs unibyte request data; without the explicit
        ;; UTF-8 encoding a QUESTION with non-ASCII text breaks the request.
        (url-request-data
         (encode-coding-string
          (json-encode
           `(("model" . ,org-ai-image-query-model)
             ("messages" . [(("role" . "user")
                             ("content" . [((type . "text") (text . ,question))
                                           ,image-content]))])
             ("max_tokens" . ,(or org-ai-default-max-tokens 300))))
          'utf-8)))
    (url-retrieve
     org-ai-openai-image-query-endpoint
     (lambda (status)
       (let ((response-buffer (current-buffer))
             (request-error (plist-get status :error))
             (response nil))
         ;; Parse the body while the response buffer is current, then always
         ;; dispose of that buffer so repeated queries do not pile up buffers.
         (unwind-protect
             ;; The header marker is missing when no HTTP response arrived.
             (when (bound-and-true-p url-http-end-of-headers)
               (goto-char url-http-end-of-headers)
               (setq response
                     (condition-case nil
                         (json-read-from-string
                          ;; The body is raw bytes; decode it so non-ASCII
                          ;; answers come out as proper characters.
                          (decode-coding-string
                           (buffer-substring-no-properties (point) (point-max))
                           'utf-8))
                       (error nil))))
           (kill-buffer response-buffer))
         (let ((api-error (and (consp response) (alist-get 'error response))))
           (cond
            ;; Prefer the API's own explanation over the bare HTTP status.
            (api-error
             (message "OpenAI API error: %s"
                      (or (and (consp api-error) (alist-get 'message api-error))
                          api-error)))
            (request-error
             (message "OpenAI image query failed: %S" request-error))
            (response (funcall callback response))
            (t (message "Failed to get a valid response from OpenAI API.")))))))))

(defun org-ai--send-base64-image-query (base64-image question callback
                                                     &optional mime-type)
  "Send a BASE64-IMAGE and QUESTION to the OpenAI API.
BASE64-IMAGE is the base64 text of the image.  Whitespace in it,
such as the line breaks `base64-encode-region' inserts by default,
is removed before the data URL is built.  Optional MIME-TYPE is
the media type written into the data URL; it defaults to
\"image/jpeg\".
Calls CALLBACK with the response."
  (let ((payload (replace-regexp-in-string "[ \t\r\n]+" "" base64-image)))
    (org-ai--send-image-query
     `((type . "image_url")
       (image_url . (("url" . ,(concat "data:" (or mime-type "image/jpeg")
                                       ";base64," payload)))))
     question callback)))

(defun org-ai--send-url-image-query (image-url question callback)
  "Ask the OpenAI API QUESTION about the image found at IMAGE-URL.
QUESTION and CALLBACK are handed to `org-ai--send-image-query'
together with an image part that points at IMAGE-URL."
  (let ((image-part (list (cons 'type "image_url")
                          (list 'image_url (cons "url" image-url)))))
    (org-ai--send-image-query image-part question callback)))

(defun org-ai--extract-content-from-response (response)
  "Extract content from OpenAI API RESPONSE.
RESPONSE is the parsed JSON reply as an alist.  Return the
`content' of the `message' of the first entry in `choices'.
Signal an error if that content is missing; when RESPONSE carries
an `error' entry, its message is included in the signaled error."
  (let* ((choices (alist-get 'choices response))
         ;; Accept a vector or a list, whichever array type the JSON parser
         ;; was configured to produce.
         (first-choice (and (or (vectorp choices) (consp choices))
                            (> (length choices) 0)
                            (elt choices 0)))
         (reply (alist-get 'message first-choice))
         (content (alist-get 'content reply))
         (api-error (alist-get 'error response)))
    (cond
     (content)
     ;; Surface the API's own explanation rather than a generic failure.
     (api-error
      (error "OpenAI API error: %s"
             (or (and (consp api-error) (alist-get 'message api-error))
                 api-error)))
     (t (error "Content not found in the response")))))

(defun org-ai--handle-openai-response (response)
  "Handle the RESPONSE from OpenAI API.
Extract the answer from RESPONSE, insert it into the output buffer
and display that buffer.  The output buffer visits the file named
by `org-ai-query-image-file' (which is called first when it is a
function); when that variable is nil, the buffer named
\"*org-ai-output*\" is used instead."
  (let* ((content (org-ai--extract-content-from-response response))
         ;; The option may hold nil, a file name, or a function returning a
         ;; file name; reduce it to a file name or nil first.
         (file (if (functionp org-ai-query-image-file)
                   (funcall org-ai-query-image-file)
                 org-ai-query-image-file))
         ;; A file has to be visited; using its name as a plain buffer name
         ;; would never associate the conversation with the file.
         (output-buffer (if file
                            (find-file-noselect file)
                          (get-buffer-create "*org-ai-output*"))))
    ;; Start from the end of the buffer so new conversations are appended.
    (with-current-buffer output-buffer
      (goto-char (point-max)))
    (org-ai-prompt--insert output-buffer content t)
    (pop-to-buffer output-buffer)))

(defun org-ai--get-image-path-or-url ()
"Prompt the user for a non-empty image path or URL.
The URL or file name at point, if there is one, is offered as the
default.  Keep prompting until the input is non-empty, then return
it as a string."
;; Prefer a URL at point over a file name at point as the default.
(let ((default (or (thing-at-point 'url)
(thing-at-point 'filename)))
(input ""))
;; Ask again for as long as nothing usable has been entered.
(while (string-empty-p input)
(setq input
(read-string (if default
(format "Enter image URL or file path (default %s): " default)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consider adding "URL must start with http?s://

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

feels bulky to add more text, and people probably already know that urls must start with http[s]:// in emacs right?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yo!

"Enter image URL or file path: ")
nil 'minibuffer-history default))
;; Empty input: fall back to the default when there is one; otherwise
;; tell the user and go around the loop again.
(if (string-empty-p input)
(if default
(setq input default)
(progn
(message "Input cannot be empty.")
(sit-for 1)))))
input))

(defun org-ai--get-question ()
  "Read a question from the minibuffer, asking again until it is non-empty.
Return the question as a string."
  (let (answer)
    ;; The loop condition does the reading; the body is intentionally empty.
    (while (string-empty-p
            (setq answer (read-string "Question (must be non-empty): "))))
    answer))

;;;###autoload
(defun org-ai-query-image ()
  "Ask the OpenAI API a question about an image.
Prompt for an image URL or local file path, then for a question.
Input starting with http:// or https:// is sent to the API as a
URL; anything else is treated as a local file whose contents are
sent base64 encoded.  Signal a `user-error' if that file cannot
be read.  The response is passed to
`org-ai--handle-openai-response'."
  (interactive)
  (let* ((image-path-or-url (org-ai--get-image-path-or-url))
         (remote-p (string-match-p "\\`https?://" image-path-or-url)))
    ;; Fail before asking for the question, with a readable message instead
    ;; of a low-level file error.
    (unless (or remote-p
                (and (file-readable-p image-path-or-url)
                     (file-regular-p image-path-or-url)))
      (user-error "Cannot read image file: %s" image-path-or-url))
    (let ((question (org-ai--get-question)))
      (if remote-p
          (org-ai--send-url-image-query
           image-path-or-url question #'org-ai--handle-openai-response)
        (let ((base64-image
               (with-temp-buffer
                 ;; Work on raw bytes so the image data is encoded untouched.
                 (set-buffer-multibyte nil)
                 (insert-file-contents-literally image-path-or-url)
                 ;; NO-LINE-BREAK: the text ends up inside a data URL, where
                 ;; the newlines inserted by default do not belong.
                 (base64-encode-region (point-min) (point-max) t)
                 (buffer-string))))
          (org-ai--send-base64-image-query
           base64-image question #'org-ai--handle-openai-response))))))

(provide 'org-ai-openai-image-query)

;;; org-ai-openai-image-query.el ends here
7 changes: 7 additions & 0 deletions org-ai.el
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@
(require 'org-ai-talk)
(require 'org-ai-sd)
(require 'org-ai-oobabooga)
(require 'org-ai-openai-image-query)

;; -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

Expand Down Expand Up @@ -186,12 +187,18 @@ It's designed to \"do the right thing\":

;; -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

(defun org-ai-query-image-and-display ()
  "Ask OpenAI a question about an image and display the result.
This command takes no arguments; it runs `org-ai-query-image'."
  (interactive)
  (org-ai-query-image))

(defvar org-ai-mode-map
  (make-sparse-keymap)
  "Keymap for `org-ai-mode'.")

(let ((map org-ai-mode-map))
(define-key map (kbd "C-c M-a v") 'org-ai-image-variation)
(define-key map (kbd "C-c M-a $") 'org-ai-open-account-usage-page)
(define-key map (kbd "C-c M-a q") 'org-ai-query-image-and-display) ; Bind to C-c M-a q
(define-key map (kbd "C-c M-a SPC") 'org-ai-mark-region-at-point)
(define-key map (kbd "C-c DEL") 'org-ai-kill-region-at-point)
(define-key map (kbd "C-c <backspace>") 'org-ai-kill-region-at-point)
Expand Down