-
Notifications
You must be signed in to change notification settings - Fork 27
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Propagate multipart exceptions and accept custom charsets #56
Changes from 2 commits
460b6c5
580844b
75d5d64
0294985
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,8 +3,11 @@ | |
(:refer-clojure :exclude [get]) | ||
(:require [clojure.java.io :as io] | ||
[clojure.spec.alpha :as s]) | ||
(:import (java.io PipedOutputStream PipedInputStream File) | ||
(java.nio.file Files))) | ||
(:import (java.io ByteArrayInputStream File InputStream SequenceInputStream) | ||
(java.net Socket URI URL) | ||
(java.nio.charset Charset StandardCharsets) | ||
(java.nio.file Files Path) | ||
(java.util Collections))) | ||
|
||
;;; Helpers | ||
|
||
|
@@ -21,7 +24,11 @@ | |
[{:keys [content content-type]}] | ||
(str "Content-Type: " | ||
(cond | ||
content-type content-type | ||
content-type (if (string? content-type) | ||
content-type | ||
(str (:mime-type content-type) | ||
(when-let [charset (:charset content-type)] | ||
(str "; charset=" charset)))) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is an issue today where if you supply a String, we essentially only ever allow UTF-8. This allows the content-type param to be either a string or map which can describe the mime type and charset. I could see an argument for this being another key but this approach more closely aligns with how libraries like Apache HttpClient represent ContentType. |
||
(string? content) "text/plain; charset=UTF-8" | ||
(instance? File content) (or (Files/probeContentType (.toPath ^File content)) | ||
"application/octet-stream") | ||
|
@@ -33,8 +40,30 @@ | |
"Content-Transfer-Encoding: 8bit" | ||
"Content-Transfer-Encoding: binary")) | ||
|
||
(def ^:private line-break (.getBytes "\r\n")) | ||
|
||
(defn- charset-from-content-type | ||
"Parses the charset from a content-type string. Examples: | ||
text/html;charset=utf-8 | ||
text/html;charset=UTF-8 | ||
Text/HTML;Charset=\"utf-8\" | ||
text/html; charset=\"utf-8\" | ||
|
||
See https://www.rfc-editor.org/rfc/rfc7231" | ||
[content-type] | ||
(second (re-matches #".*charset=\s*\"?([^\";]+)\"?.*" content-type))) | ||
|
||
(defn- ^Charset charset-encoding | ||
"Determines the appropriate charset to encode a string with given the supplied content-type." | ||
[{:keys [content-type]}] | ||
(as-> content-type $ | ||
(if (string? $) | ||
(charset-from-content-type $) | ||
(:charset $)) | ||
(or $ StandardCharsets/UTF_8) | ||
(if (instance? String $) | ||
(Charset/forName $) | ||
$))) | ||
|
||
(def ^{:private true :tag String} line-break "\r\n") | ||
|
||
;;; Exposed functions | ||
|
||
|
@@ -55,12 +84,94 @@ | |
(take 30) | ||
(apply str "hatoBoundary"))) | ||
|
||
(defn body | ||
"Returns an InputStream from the multipart inputs. | ||
(defprotocol MultipartParam | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The following protocol defines the possible multiple param content types (can be easily extended). We need to be able to coerce the param into an input stream and determine it's length (if possible). For types that have potentially unknown lengths (such as an InputStream), a client can set it by supplying :content-length in the multipart param map. |
||
(input-stream [content opts]) | ||
(length [content opts])) | ||
|
||
(extend-protocol MultipartParam | ||
String | ||
(input-stream [^String content opts] | ||
(io/input-stream (.getBytes content (charset-encoding opts)))) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need to make a best guess to determine the charset for the string so we look at either the :content-type string/map and parse it. |
||
(length [^String content opts] | ||
(count (.getBytes content (charset-encoding opts)))) | ||
|
||
File | ||
(input-stream [^File content _opts] | ||
(io/input-stream content)) | ||
(length [^File content _opts] | ||
(.length content)) | ||
|
||
Path | ||
(input-stream [^Path content _opts] | ||
(io/input-stream (.toFile content))) | ||
(length [^Path content opts] | ||
(length (.toFile content) opts)) | ||
|
||
InputStream | ||
(input-stream [^InputStream content _opts] | ||
content) | ||
(length [^InputStream _content opts] | ||
(or (:content-length opts) -1)) | ||
|
||
URL | ||
(input-stream [^URL content _opts] | ||
(io/input-stream content)) | ||
(length [^URL content opts] | ||
(if (= "file" (.getProtocol content)) | ||
(length (io/file content) opts) | ||
(or (:content-length opts) -1))) | ||
|
||
URI | ||
(input-stream [^URI content _opts] | ||
(io/input-stream content)) | ||
(length [^URI content opts] | ||
(length (.toURL content) opts)) | ||
|
||
Socket | ||
(input-stream [^InputStream content _opts] | ||
(io/input-stream content)) | ||
(length [^InputStream _content opts] | ||
(or (:content-length opts) -1))) | ||
|
||
;; See https://clojure.atlassian.net/browse/CLJ-1381 for why these are defined separately | ||
|
||
(extend-protocol MultipartParam | ||
(Class/forName "[B") | ||
(input-stream [^bytes content _opts] | ||
(io/input-stream content)) | ||
(length [^bytes content _opts] | ||
(count content))) | ||
|
||
(defn raw-multipart-payload-segments | ||
"Given a collection of multipart parts, return a collection of tuples containing a segment of data | ||
representing the multipart body. Each tuple is the segment's content and options for the specific | ||
part (if applicable). These individual segments may be used to compute the content length or construct | ||
an InputStream. | ||
|
||
By default, a part's :content type must extend the MultipartParam protocol above!" | ||
[parts boundary] | ||
(let [payload-end-signal (.getBytes (str "--" boundary "--" line-break) StandardCharsets/UTF_8)] | ||
(concat | ||
(for [part parts | ||
raw-segment [[(.getBytes (str "--" | ||
boundary | ||
line-break | ||
(content-disposition part) | ||
line-break | ||
(content-type part) | ||
line-break | ||
(content-transfer-encoding part) | ||
line-break | ||
line-break) | ||
StandardCharsets/UTF_8) nil] | ||
[(:content part) (dissoc part :content)] | ||
[(.getBytes line-break StandardCharsets/UTF_8) nil]]] | ||
raw-segment) | ||
[[payload-end-signal nil]]))) | ||
|
||
This is achieved by writing all the inputs to an output stream which is piped | ||
back into an InputStream, hopefully to avoid making a copy of everything (which could | ||
be the case if we read all the bytes and used a ByteArrayOutputStream instead). | ||
(defn body | ||
"Returns an InputStream from the supplied multipart parts. See raw-multipart-payload-segments | ||
for more information. | ||
|
||
Output looks something like: | ||
|
||
|
@@ -72,44 +183,47 @@ | |
Some Content\r | ||
--hatoBoundary....\r | ||
...more components | ||
--hatoBoundary....--\r | ||
" | ||
[ms b] | ||
(let [in-stream (PipedInputStream.) | ||
out-stream (PipedOutputStream. in-stream)] | ||
(.start (Thread. #(do (doseq [m ms | ||
s [(str "--" b) | ||
line-break | ||
(content-disposition m) | ||
line-break | ||
(content-type m) | ||
line-break | ||
(content-transfer-encoding m) | ||
line-break | ||
line-break | ||
(:content m) | ||
line-break]] | ||
(io/copy s out-stream)) | ||
(io/copy (str "--" b "--") out-stream) | ||
(io/copy line-break out-stream) | ||
(.close out-stream)))) | ||
in-stream)) | ||
--hatoBoundary....--\r" | ||
([raw-multipart-payload-segments] | ||
(->> raw-multipart-payload-segments | ||
(mapv (fn [[content opts]] | ||
(input-stream content opts))) | ||
Collections/enumeration | ||
(SequenceInputStream.))) | ||
([ms b] | ||
(body (raw-multipart-payload-segments ms b)))) | ||
|
||
(defn content-length | ||
"Returns the content length from the supplied multipart parts. If any of the parts return a | ||
content length of -1, return -1 indicating that we can't determine the appropriate size of | ||
the multipart body. See raw-multipart-payload-segments for more information." | ||
([raw-multipart-payload-segments] | ||
(reduce | ||
(fn [acc [content opts]] | ||
(let [len (length content opts)] | ||
(if (= -1 len) | ||
(reduced -1) | ||
(+ acc len)))) | ||
0 | ||
raw-multipart-payload-segments)) | ||
([ms b] | ||
(content-length (raw-multipart-payload-segments ms b)))) | ||
|
||
(comment | ||
(def b (boundary)) | ||
(def ms [{:name "title" :content "My Awesome Picture"} | ||
{:name "Content/type" :content "image/jpeg"} | ||
{:name "foo.txt" :part-name "eggplant" :content "Eggplants"} | ||
{:name "file" :content (io/file ".nrepl-port")}]) | ||
|
||
; Create the body | ||
(body ms b) | ||
|
||
; Copy to out for testing | ||
(with-open [xin (io/input-stream *1) | ||
xout (java.io.ByteArrayOutputStream.)] | ||
(io/copy xin xout) | ||
(.toByteArray xout)) | ||
|
||
; Print as string | ||
(String. *1)) | ||
(def b (boundary)) | ||
(def ms [{:name "title" :content "My Awesome Picture"} | ||
{:name "Content/type" :content "image/jpeg"} | ||
{:name "foo.txt" :part-name "eggplant" :content "Eggplants"} | ||
{:name "file" :content (io/file ".nrepl-port")}]) | ||
|
||
; Create the body | ||
(body ms b) | ||
|
||
; Copy to out for testing | ||
(with-open [xin (io/input-stream *1) | ||
xout (java.io.ByteArrayOutputStream.)] | ||
(io/copy xin xout) | ||
(.toByteArray xout)) | ||
|
||
; Print as string | ||
(String. *1)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we are unable to determine the content length, leave the body as an input stream and use transfer encoding chunked. Otherwise, specify the content length so the appropriate content-length header is used.