Skip to content

Commit

Permalink
Fixes #435
Browse files Browse the repository at this point in the history
  • Loading branch information
cnuernber committed Dec 8, 2024
1 parent 6239db1 commit 73db2e1
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 39 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Changelog
# 7.035 - UNRELEASED
* Latest dtype-next (10.124) - contains upgrades to ham-fisted which allow pmap et al. to accept arbitrary executor services.
* Fix for [issue 438](https://github.com/techascent/tech.ml.dataset/issues/438) - keyword dataset names in tribuo.
* Fix for [issue 435](https://github.com/techascent/tech.ml.dataset/issues/435) - `pd-merge` with `:how :outer` must accept empty datasets.



# 7.034
Expand Down
103 changes: 64 additions & 39 deletions src/tech/v3/dataset/join.clj
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,32 @@
(-> (hash-join colname lhs rhs (assoc options :lhs-missing? true))
:left-outer)))

(defn- col-or-data->reader
  "Resolve the join-key data of `ds` described by `tuple-data`.

  `tuple-data` is either a single column name or a sequential collection of
  column names.

  * A sequential collection whose count is not 1 selects those columns from
    `ds` and returns a value reader over them (created with `:copying? true`).
  * A single name, or a one-element sequential collection, resolves to that
    one column.

  When `outer?` is truthy a column absent from `ds` yields an empty vector;
  otherwise the lookup goes through `ds-base/column`.

  The two-argument arity is the strict (non-outer) lookup."
  ([tuple-data ds]
   ;; This arity used to have an empty body and silently returned nil;
   ;; delegate so it behaves like the strict lookup.
   (col-or-data->reader tuple-data ds false))
  ([tuple-data ds outer?]
   (if (and (sequential? tuple-data)
            (not= 1 (count tuple-data)))
     (-> (ds-base/select-columns ds tuple-data)
         (ds-readers/value-reader {:copying? true}))
     (let [colname (if (sequential? tuple-data)
                     (first tuple-data)
                     tuple-data)]
       (if outer?
         ;; Outer merges tolerate a dataset that lacks the join column.
         (get ds colname [])
         ;; Else not having the column is an error
         ;; NOTE(review): presumably raised by ds-base/column - confirm.
         (ds-base/column ds colname))))))

(defn- ensure-sequential
  "Normalise a join-column specification to a sequential collection.

  * nil stays nil, so an absent specification is not turned into the
    one-element list `[nil]`.
  * A sequential value is returned unchanged.
  * Any other value is wrapped in a one-element vector."
  [colname]
  (cond
    (nil? colname)        nil
    (sequential? colname) colname
    :else                 [colname]))

(defn- filter-columns
  "Restrict `collist` to the column names that exist in `ds`.

  Returns nil when `collist` is falsey.  When `outer?` is truthy the result
  is a vector of the entries of `collist` that are column names of `ds`, in
  their original order; otherwise `collist` is returned untouched."
  [ds collist outer?]
  (cond
    (not collist) nil
    outer?        (filterv (set (ds-base/column-names ds)) collist)
    :else         collist))


(defn pd-merge
"Pandas-style [merge](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html).
Expand Down Expand Up @@ -419,15 +445,6 @@ outer-join [8 4]:
([left-ds right-ds options]
(let [lhs-table-name (default-table-name left-ds "left")
rhs-table-name (default-table-name right-ds "right")
col-or-data->reader
(fn [tuple-data ds]
(if (and (sequential? tuple-data)
(not= 1 (count tuple-data)))
(-> (ds-base/select-columns ds tuple-data)
(ds-readers/value-reader {:copying? true}))
(if (sequential? tuple-data)
(ds-base/column ds (first tuple-data))
(ds-base/column ds tuple-data))))
how (get options :how :inner)]
(if (identical? how :cross)
(do
Expand All @@ -453,10 +470,10 @@ outer-join [8 4]:
[lhs-table-name lhs-columns]
[rhs-table-name rhs-columns]))
(update-join-metadata lhs-table-name rhs-table-name))))
(let [left-on (get options :left-on (get options :on))
right-on (get options :right-on (get options :on))
left-on (when left-on (if-not (sequential? left-on) [left-on] left-on))
right-on (when right-on (if-not (sequential? right-on) [right-on] right-on))
(let [left-on (ensure-sequential (get options :left-on (get options :on)))
right-on (ensure-sequential (get options :right-on (get options :on)))

outer? (identical? :outer (get options :how))
on-int (->> (concat left-on right-on)
(filter (set/intersection (set left-on) (set right-on)))
(distinct)
Expand All @@ -465,8 +482,10 @@ outer-join [8 4]:
(== (count left-on) (count right-on))
"Number of left join columns (%d) doesn't equal number of right join columns %d"
(count left-on) (count right-on))
left-join-data (col-or-data->reader left-on left-ds)
right-join-data (col-or-data->reader right-on right-ds)
left-on (filter-columns left-ds left-on outer?)
right-on (filter-columns right-ds right-on outer?)
left-join-data (col-or-data->reader left-on left-ds outer?)
right-join-data (col-or-data->reader right-on right-ds outer?)


{:keys [lhs-indexes rhs-indexes lhs-missing rhs-missing]}
Expand Down Expand Up @@ -524,30 +543,36 @@ outer-join [8 4]:
[rhs-table-name rhs-cols]))
(update-join-metadata lhs-table-name rhs-table-name)))
:outer
(let [n-left-empty (count rhs-missing)
n-right-empty (count lhs-missing)
;;Order is intersection, left-missing, right-missing
lhs-indexes (add-all! (dtype/clone lhs-indexes) lhs-missing)
left-valid (ds-base/select-rows left-ds lhs-indexes)
right-valid (ds-base/select-rows right-ds rhs-indexes)
right-missing (ds-base/select-rows right-ds rhs-missing)
;;For the columns we perhaps joined on
intersection-ds (-> (ds-base/select-columns left-valid on-int)
(ds-base/concat-copying (ds-base/select-columns
right-missing on-int)))
left-full (-> (ds-base/remove-columns left-valid on-int)
(ds-base/extend-with-empty n-left-empty))
right-full (-> (ds-base/remove-columns right-valid on-int)
(ds-base/extend-with-empty n-right-empty)
(ds-base/concat-copying (ds-base/remove-columns
right-missing on-int)))]
(-> (ds-impl/new-dataset
"outer-join"
(nice-column-names
[lhs-table-name (concat (ds-base/columns intersection-ds)
(ds-base/columns left-full))]
[rhs-table-name (ds-base/columns right-full)]))
(update-join-metadata lhs-table-name rhs-table-name))))))))
(cond
(== 0 (ds-base/row-count left-ds))
(vary-meta right-ds assoc :name "outer-join")
(== 0 (ds-base/row-count right-ds))
(vary-meta left-ds assoc :name "outer-join")
:else
(let [n-left-empty (count rhs-missing)
n-right-empty (count lhs-missing)
;;Order is intersection, left-missing, right-missing
lhs-indexes (add-all! (dtype/clone lhs-indexes) lhs-missing)
left-valid (ds-base/select-rows left-ds lhs-indexes)
right-valid (ds-base/select-rows right-ds rhs-indexes)
right-missing (ds-base/select-rows right-ds rhs-missing)
;;For the columns we perhaps joined on
intersection-ds (-> (ds-base/select-columns left-valid on-int)
(ds-base/concat-copying (ds-base/select-columns
right-missing on-int)))
left-full (-> (ds-base/remove-columns left-valid on-int)
(ds-base/extend-with-empty n-left-empty))
right-full (-> (ds-base/remove-columns right-valid on-int)
(ds-base/extend-with-empty n-right-empty)
(ds-base/concat-copying (ds-base/remove-columns
right-missing on-int)))]
(-> (ds-impl/new-dataset
"outer-join"
(nice-column-names
[lhs-table-name (concat (ds-base/columns intersection-ds)
(ds-base/columns left-full))]
[rhs-table-name (ds-base/columns right-full)]))
(update-join-metadata lhs-table-name rhs-table-name)))))))))
([left-ds right-ds]
(pd-merge left-ds right-ds {:on (set/intersection
(set (ds-base/column-names left-ds))
Expand Down
8 changes: 8 additions & 0 deletions test/tech/v3/dataset/join_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -391,3 +391,11 @@
(is (= #{:product :customer}
(set (ds/column-names mm))))))


(deftest pd-merge-issue-435
  ;; Regression test for issue 435: an outer pd-merge where one side is an
  ;; empty dataset.  A dataset is always truthy, so asserting on the bare
  ;; result could only fail by throwing; check the merged contents instead.
  ;; The outer branch returns the non-empty side when the other has no rows.
  (let [populated   (ds/->dataset {:t [0 1] :x [:a :b]})
        opts        {:on :t :how :outer}
        empty-left  (ds-join/pd-merge (ds/empty-dataset) populated opts)
        empty-right (ds-join/pd-merge populated (ds/empty-dataset) opts)]
    (is (= 2 (ds/row-count empty-left)))
    (is (= #{:t :x} (set (ds/column-names empty-left))))
    (is (= 2 (ds/row-count empty-right)))
    (is (= #{:t :x} (set (ds/column-names empty-right))))))

0 comments on commit 73db2e1

Please sign in to comment.