Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Linux 4.1 compat regarding loop device on ZFS #3651

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions config/kernel-kmap-atomic-args.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
dnl #
dnl # 2.6.37 API change
dnl # kmap_atomic changed from assigning hard-coded named slot to using
dnl # push/pop based dynamic allocation.
dnl #
dnl # Defines HAVE_1ARG_KMAP_ATOMIC when kmap_atomic() takes only the
dnl # page argument (kernel >= 2.6.37); otherwise the older two-argument
dnl # form (page, KM_* slot) must be used.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS], [
AC_MSG_CHECKING([whether kmap_atomic wants 1 args])
ZFS_LINUX_TRY_COMPILE([
#include <linux/pagemap.h>
],[
dnl # Compiles only if kmap_atomic() accepts a single argument.
struct page page;
kmap_atomic(&page);
],[
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_1ARG_KMAP_ATOMIC, 1,
[kmap_atomic wants 1 args])
],[
AC_MSG_RESULT(no)
])
])
1 change: 1 addition & 0 deletions config/kernel.m4
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
ZFS_AC_KERNEL_LSEEK_EXECUTE
ZFS_AC_KERNEL_VFS_ITERATE
ZFS_AC_KERNEL_VFS_RW_ITERATE
ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS

AS_IF([test "$LINUX_OBJ" != "$LINUX"], [
KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$LINUX_OBJ"
Expand Down
3 changes: 2 additions & 1 deletion include/linux/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ KERNEL_H = \
$(top_srcdir)/include/linux/xattr_compat.h \
$(top_srcdir)/include/linux/vfs_compat.h \
$(top_srcdir)/include/linux/blkdev_compat.h \
$(top_srcdir)/include/linux/utsname_compat.h
$(top_srcdir)/include/linux/utsname_compat.h \
$(top_srcdir)/include/linux/kmap_compat.h

USER_H =

Expand Down
40 changes: 40 additions & 0 deletions include/linux/kmap_compat.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */

/*
 * Compatibility wrappers for kmap_atomic()/kunmap_atomic().  The KM_*
 * slot argument was removed from the kernel API in 2.6.37; callers use
 * zfs_kmap_atomic()/zfs_kunmap_atomic() with a slot argument that is
 * simply ignored on newer kernels.
 */

#ifndef _ZFS_KMAP_H
#define _ZFS_KMAP_H

#include <linux/highmem.h>

#ifdef HAVE_1ARG_KMAP_ATOMIC
/* 2.6.37 API change */
/* Single-argument API: the km_type parameter is accepted and discarded. */
#define zfs_kmap_atomic(page, km_type) kmap_atomic(page)
#define zfs_kunmap_atomic(addr, km_type) kunmap_atomic(addr)
#else
/* Older kernels require an explicit KM_* slot for the atomic mapping. */
#define zfs_kmap_atomic(page, km_type) kmap_atomic(page, km_type)
#define zfs_kunmap_atomic(addr, km_type) kunmap_atomic(addr, km_type)
#endif

#endif /* _ZFS_KMAP_H */
205 changes: 108 additions & 97 deletions module/zcommon/zfs_uio.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@
* software developed by the University of California, Berkeley, and its
* contributors.
*/
/*
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
*/

/*
* The uio support from OpenSolaris has been added as a short term
Expand All @@ -46,27 +49,25 @@

#include <sys/types.h>
#include <sys/uio_impl.h>
#include <linux/kmap_compat.h>

/*
* Move "n" bytes at byte address "p"; "rw" indicates the direction
* of the move, and the I/O parameters are provided in "uio", which is
* updated to reflect the data which was moved. Returns 0 on success or
* a non-zero errno on failure.
*/
int
uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio)
static int
uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
{
struct iovec *iov;
const struct iovec *iov = uio->uio_iov;
size_t skip = uio->uio_skip;
ulong_t cnt;

ASSERT3U(skip, <, iov->iov_len);

while (n && uio->uio_resid) {
iov = uio->uio_iov;
cnt = MIN(iov->iov_len, n);
if (cnt == 0l) {
uio->uio_iov++;
uio->uio_iovcnt--;
continue;
}
cnt = MIN(iov->iov_len - skip, n);
switch (uio->uio_segflg) {
case UIO_USERSPACE:
case UIO_USERISPACE:
Expand All @@ -75,29 +76,80 @@ uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio)
* iov->iov_base = user data pointer
*/
if (rw == UIO_READ) {
if (copy_to_user(iov->iov_base, p, cnt))
if (copy_to_user(iov->iov_base+skip, p, cnt))
return (EFAULT);
} else {
if (copy_from_user(p, iov->iov_base, cnt))
if (copy_from_user(p, iov->iov_base+skip, cnt))
return (EFAULT);
}
break;
case UIO_SYSSPACE:
if (rw == UIO_READ)
bcopy(p, iov->iov_base, cnt);
bcopy(p, iov->iov_base + skip, cnt);
else
bcopy(iov->iov_base, p, cnt);
bcopy(iov->iov_base + skip, p, cnt);
break;
default:
ASSERT(0);
}
skip += cnt;
if (skip == iov->iov_len) {
skip = 0;
uio->uio_iov = (++iov);
uio->uio_iovcnt--;
}
iov->iov_base += cnt;
iov->iov_len -= cnt;
uio->uio_skip = skip;
uio->uio_resid -= cnt;
uio->uio_loffset += cnt;
p = (caddr_t)p + cnt;
n -= cnt;
}
return (0);
}

static int
uiomove_bvec(void *p, size_t n, enum uio_rw rw, struct uio *uio)
{
const struct bio_vec *bv = uio->uio_bvec;
size_t skip = uio->uio_skip;
ulong_t cnt;

ASSERT3U(skip, <, bv->bv_len);

while (n && uio->uio_resid) {
void *paddr;
cnt = MIN(bv->bv_len - skip, n);

paddr = zfs_kmap_atomic(bv->bv_page, KM_USER1);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is unsafe on older kernels because interrupts must be disabled to prevent another thread from using the CPU's KM_USER1 slot. This is likely okay on newer kernels where the slots are dynamically allocated.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not true, KM_USER* are not used in interrupt context since... well it's "user". So it's perfectly safe.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When CONFIG_PREEMPT is set, the kernel scheduler should be able to preempt us. If the scheduler preempts us and schedules us to another CPU or runs something else on this CPU that uses this, the value will be clobbered. See the comment for __schedule:

http://lxr.free-electrons.com/source/kernel/sched/core.c#L2693

All we need is an IRQ to occur when the scheduler thinks something else should run. As the comment for __schedule() indicates, the code in arch/x86/entry/entry_64.S will call into the scheduler on return from the IRQ and preempt_schedule_irq() will be called. That will call schedule() and something else can have the CPU.

You are correct that there is code in the mainline kernel doing this, but I think those routines will need to be changed for the same reason. The reason why they have not yet been changed is that these events are rare enough that no one has encountered them.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tuxoko It seems that I used the wrong term. It is unsafe to use kmap_atomic in a preemptible context. An interrupt context is definitely unsafe. Also, my statement about it being okay on newer kernels was wrong.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ryao
kmap_atomic does turn off preemption itself.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tuxoko Where does it do that?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ryao
http://lxr.free-electrons.com/source/include/linux/uaccess.h#L16

  7 /*
  8  * These routines enable/disable the pagefault handler in that
  9  * it will not take any locks and go straight to the fixup table.
 10  *
 11  * They have great resemblance to the preempt_disable/enable calls
 12  * and in fact they are identical; this is because currently there is
 13  * no other way to make the pagefault handlers do this. So we do
 14  * disable preemption but we don't necessarily care about that.
 15  */
 16 static inline void pagefault_disable(void)

if (rw == UIO_READ)
bcopy(p, paddr + bv->bv_offset + skip, cnt);
else
bcopy(paddr + bv->bv_offset + skip, p, cnt);
zfs_kunmap_atomic(paddr, KM_USER1);

skip += cnt;
if (skip == bv->bv_len) {
skip = 0;
uio->uio_bvec = (++bv);
uio->uio_iovcnt--;
}
uio->uio_skip = skip;
uio->uio_resid -= cnt;
uio->uio_loffset += cnt;
p = (caddr_t)p + cnt;
n -= cnt;
}
return (0);
}

/*
 * Copy "n" bytes between "p" and the buffers described by "uio",
 * dispatching on uio_segflg: bio_vec-backed uios (UIO_BVEC) are handled
 * by uiomove_bvec(), everything else by uiomove_iov().  Returns 0 on
 * success or a non-zero errno on failure.
 */
int
uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio)
{
	return ((uio->uio_segflg == UIO_BVEC) ?
	    uiomove_bvec(p, n, rw, uio) :
	    uiomove_iov(p, n, rw, uio));
}
EXPORT_SYMBOL(uiomove);

#define fuword8(uptr, vptr) get_user((*vptr), (uptr))
Expand All @@ -111,39 +163,39 @@ EXPORT_SYMBOL(uiomove);
void
uio_prefaultpages(ssize_t n, struct uio *uio)
{
struct iovec *iov;
const struct iovec *iov;
ulong_t cnt, incr;
caddr_t p;
uint8_t tmp;
int iovcnt;
size_t skip = uio->uio_skip;

/* no need to fault in kernel pages */
switch (uio->uio_segflg) {
case UIO_SYSSPACE:
case UIO_BVEC:
return;
case UIO_USERSPACE:
case UIO_USERISPACE:
break;
default:
ASSERT(0);
}

iov = uio->uio_iov;
iovcnt = uio->uio_iovcnt;
ASSERT3U(skip, <, iov->iov_len);

while ((n > 0) && (iovcnt > 0)) {
cnt = MIN(iov->iov_len, n);
if (cnt == 0) {
/* empty iov entry */
iov++;
iovcnt--;
continue;
}
cnt = MIN(iov->iov_len - skip, n);
n -= cnt;
/*
* touch each page in this segment.
*/
p = iov->iov_base;
p = iov->iov_base + skip;
while (cnt) {
switch (uio->uio_segflg) {
case UIO_USERSPACE:
case UIO_USERISPACE:
if (fuword8((uint8_t *) p, &tmp))
return;
break;
case UIO_SYSSPACE:
bcopy(p, &tmp, 1);
break;
}
if (fuword8((uint8_t *) p, &tmp))
return;
incr = MIN(cnt, PAGESIZE);
p += incr;
cnt -= incr;
Expand All @@ -152,18 +204,11 @@ uio_prefaultpages(ssize_t n, struct uio *uio)
* touch the last byte in case it straddles a page.
*/
p--;
switch (uio->uio_segflg) {
case UIO_USERSPACE:
case UIO_USERISPACE:
if (fuword8((uint8_t *) p, &tmp))
return;
break;
case UIO_SYSSPACE:
bcopy(p, &tmp, 1);
break;
}
if (fuword8((uint8_t *) p, &tmp))
return;
iov++;
iovcnt--;
skip = 0;
}
}
EXPORT_SYMBOL(uio_prefaultpages);
Expand All @@ -175,49 +220,13 @@ EXPORT_SYMBOL(uio_prefaultpages);
int
uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes)
{
struct iovec *iov;
ulong_t cnt;
int iovcnt;

iovcnt = uio->uio_iovcnt;
*cbytes = 0;

for (iov = uio->uio_iov; n && iovcnt; iov++, iovcnt--) {
cnt = MIN(iov->iov_len, n);
if (cnt == 0)
continue;

switch (uio->uio_segflg) {

case UIO_USERSPACE:
case UIO_USERISPACE:
/*
* p = kernel data pointer
* iov->iov_base = user data pointer
*/
if (rw == UIO_READ) {
/* UIO_READ = copy data from kernel to user */
if (copy_to_user(iov->iov_base, p, cnt))
return (EFAULT);
} else {
/* UIO_WRITE = copy data from user to kernel */
if (copy_from_user(p, iov->iov_base, cnt))
return (EFAULT);
}
break;
struct uio uio_copy;
int ret;

case UIO_SYSSPACE:
if (rw == UIO_READ)
bcopy(p, iov->iov_base, cnt);
else
bcopy(iov->iov_base, p, cnt);
break;
}
p = (caddr_t)p + cnt;
n -= cnt;
*cbytes += cnt;
}
return (0);
bcopy(uio, &uio_copy, sizeof (struct uio));
ret = uiomove(p, n, rw, &uio_copy);
*cbytes = uio->uio_resid - uio_copy.uio_resid;
return (ret);
}
EXPORT_SYMBOL(uiocopy);

Expand All @@ -229,21 +238,23 @@ uioskip(uio_t *uiop, size_t n)
{
if (n > uiop->uio_resid)
return;
while (n != 0) {
iovec_t *iovp = uiop->uio_iov;
size_t niovb = MIN(iovp->iov_len, n);

if (niovb == 0) {
uiop->uio_skip += n;
if (uiop->uio_segflg != UIO_BVEC) {
while (uiop->uio_skip >= uiop->uio_iov->iov_len) {
uiop->uio_skip -= uiop->uio_iov->iov_len;
uiop->uio_iov++;
uiop->uio_iovcnt--;
continue;
}
iovp->iov_base += niovb;
uiop->uio_loffset += niovb;
iovp->iov_len -= niovb;
uiop->uio_resid -= niovb;
n -= niovb;
} else {
while (uiop->uio_skip >= uiop->uio_bvec->bv_len) {
uiop->uio_skip -= uiop->uio_bvec->bv_len;
uiop->uio_bvec++;
uiop->uio_iovcnt--;
}
}
uiop->uio_loffset += n;
uiop->uio_resid -= n;
}
EXPORT_SYMBOL(uioskip);
#endif /* _KERNEL */
Loading