xfs
[Top] [All Lists]

Re: [PATCH, RFC] xfs: batched discard support

To: Christoph Hellwig <hch@xxxxxxxxxxxxx>, Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>, Paul Mackerras <paulus@xxxxxxxxx>, Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Subject: Re: [PATCH, RFC] xfs: batched discard support
From: Ingo Molnar <mingo@xxxxxxx>
Date: Wed, 19 Aug 2009 22:39:16 +0200
Cc: xfs@xxxxxxxxxxx, linux-fsdevel@xxxxxxxxxxxxxxx, linux-scsi@xxxxxxxxxxxxxxx, linux-kernel@xxxxxxxxxxxxxxx, liml@xxxxxx, jens.axboe@xxxxxxxxxx
In-reply-to: <20090816004705.GA7347@xxxxxxxxxxxxx>
References: <20090816004705.GA7347@xxxxxxxxxxxxx>
User-agent: Mutt/1.5.18 (2008-05-17)
* Christoph Hellwig <hch@xxxxxxxxxxxxx> wrote:

> Given that everyone is so big in the discard discussion 
> I'd like to present what I had started to prepare for 
> XFS.  I didn't plan to send it out until I get my hands 
> onto a TRIM capable device (or at least get time to add 
> support to qemu), and so far it's only been tested in 
> dry-run mode.
> 
> The basic idea is to add an ioctl which walks the free 
> space btrees in each allocation group and simply 
> discard everythin that is free. [...]

A general interface design question: you added a new 
ioctl XFS_IOC_TRIM case. It's a sub-case of an 
ugly-looking demultiplexing xfs_file_ioctl().

What is your threshold for turning something into a 
syscall? When are ioctls acceptable in your opinion?

I'm asking this because we are facing a similar problem 
with perfcounters: we need to extend the ioctl 
functionality there but introducing a new syscall looks 
wasteful.

So i'm torn about the 'syscall versus ioctl' issue, i'd 
like to avoid making interface design mistakes and i'd 
like to solicit some opinions about this. I've attached 
the perfcounters ioctl patch below.

Thanks,

        Ingo

----- Forwarded message from Peter Zijlstra <a.p.zijlstra@xxxxxxxxx> -----

Date: Wed, 19 Aug 2009 11:18:27 +0200
From: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
To: Ingo Molnar <mingo@xxxxxxx>, Paul Mackerras <paulus@xxxxxxxxx>
Subject: [PATCH 4/4][RFC] perf_counter: Allow sharing of output channels
Cc: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>,
        Frederic Weisbecker <fweisbec@xxxxxxxxx>,
        Mike Galbraith <efault@xxxxxx>, linux-kernel@xxxxxxxxxxxxxxx,
        stephane eranian <eranian@xxxxxxxxxxxxxx>,
        Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>

Provide the ability to configure a counter to send its output to
another (already existing) counter's output stream.

[ compile tested only ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
Cc: stephane eranian <eranian@xxxxxxxxxxxxxx>
---
 include/linux/perf_counter.h |    5 ++
 kernel/perf_counter.c        |   83 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 85 insertions(+), 3 deletions(-)

Index: linux-2.6/include/linux/perf_counter.h
===================================================================
--- linux-2.6.orig/include/linux/perf_counter.h
+++ linux-2.6/include/linux/perf_counter.h
@@ -216,6 +216,7 @@ struct perf_counter_attr {
 #define PERF_COUNTER_IOC_REFRESH       _IO ('$', 2)
 #define PERF_COUNTER_IOC_RESET         _IO ('$', 3)
 #define PERF_COUNTER_IOC_PERIOD                _IOW('$', 4, u64)
+#define PERF_COUNTER_IOC_SET_OUTPUT    _IO ('$', 5)
 
 enum perf_counter_ioc_flags {
        PERF_IOC_FLAG_GROUP             = 1U << 0,
@@ -415,6 +416,9 @@ enum perf_callchain_context {
        PERF_CONTEXT_MAX                = (__u64)-4095,
 };
 
+#define PERF_FLAG_FD_NO_GROUP  (1U << 0)
+#define PERF_FLAG_FD_OUTPUT    (1U << 1)
+
 #ifdef __KERNEL__
 /*
  * Kernel-internal data types and definitions:
@@ -536,6 +540,7 @@ struct perf_counter {
        struct list_head                sibling_list;
        int                             nr_siblings;
        struct perf_counter             *group_leader;
+       struct perf_counter             *output;
        const struct pmu                *pmu;
 
        enum perf_counter_active_state  state;
Index: linux-2.6/kernel/perf_counter.c
===================================================================
--- linux-2.6.orig/kernel/perf_counter.c
+++ linux-2.6/kernel/perf_counter.c
@@ -1686,6 +1686,11 @@ static void free_counter(struct perf_cou
                        atomic_dec(&nr_task_counters);
        }
 
+       if (counter->output) {
+               fput(counter->output->filp);
+               counter->output = NULL;
+       }
+
        if (counter->destroy)
                counter->destroy(counter);
 
@@ -1971,6 +1976,8 @@ unlock:
        return ret;
 }
 
+int perf_counter_set_output(struct perf_counter *counter, int output_fd);
+
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
        struct perf_counter *counter = file->private_data;
@@ -1994,6 +2001,9 @@ static long perf_ioctl(struct file *file
        case PERF_COUNTER_IOC_PERIOD:
                return perf_counter_period(counter, (u64 __user *)arg);
 
+       case PERF_COUNTER_IOC_SET_OUTPUT:
+               return perf_counter_set_output(counter, arg);
+
        default:
                return -ENOTTY;
        }
@@ -2264,6 +2274,11 @@ static int perf_mmap(struct file *file, 
 
        WARN_ON_ONCE(counter->ctx->parent_ctx);
        mutex_lock(&counter->mmap_mutex);
+       if (counter->output) {
+               ret = -EINVAL;
+               goto unlock;
+       }
+
        if (atomic_inc_not_zero(&counter->mmap_count)) {
                if (nr_pages != counter->data->nr_pages)
                        ret = -EINVAL;
@@ -2649,6 +2664,7 @@ static int perf_output_begin(struct perf
                             struct perf_counter *counter, unsigned int size,
                             int nmi, int sample)
 {
+       struct perf_counter *output_counter;
        struct perf_mmap_data *data;
        unsigned int offset, head;
        int have_lost;
@@ -2658,13 +2674,17 @@ static int perf_output_begin(struct perf
                u64                      lost;
        } lost_event;
 
+       rcu_read_lock();
        /*
         * For inherited counters we send all the output towards the parent.
         */
        if (counter->parent)
                counter = counter->parent;
 
-       rcu_read_lock();
+       output_counter = rcu_dereference(counter->output);
+       if (output_counter)
+               counter = output_counter;
+
        data = rcu_dereference(counter->data);
        if (!data)
                goto out;
@@ -4214,6 +4234,57 @@ err_size:
        goto out;
 }
 
+int perf_counter_set_output(struct perf_counter *counter, int output_fd)
+{
+       struct perf_counter *output_counter = NULL;
+       struct file *output_file = NULL;
+       struct perf_counter *old_output;
+       int fput_needed = 0;
+       int ret = -EINVAL;
+
+       if (!output_fd)
+               goto set;
+
+       output_file = fget_light(output_fd, &fput_needed);
+       if (!output_file)
+               return -EBADF;
+
+       if (output_file->f_op != &perf_fops)
+               goto out;
+
+       output_counter = output_file->private_data;
+
+       /* Don't chain output fds */
+       if (output_counter->output)
+               goto out;
+
+       /* Don't set an output fd when we already have an output channel */
+       if (counter->data)
+               goto out;
+
+       atomic_long_inc(&output_file->f_count);
+
+set:
+       mutex_lock(&counter->mmap_mutex);
+       old_output = counter->output;
+       rcu_assign_pointer(counter->output, output_counter);
+       mutex_unlock(&counter->mmap_mutex);
+
+       if (old_output) {
+               /*
+                * we need to make sure no existing perf_output_*()
+                * is still referencing this counter.
+                */
+               synchronize_rcu();
+               fput(old_output->filp);
+       }
+
+       ret = 0;
+out:
+       fput_light(output_file, fput_needed);
+       return ret;
+}
+
 /**
  * sys_perf_counter_open - open a performance counter, associate it to a 
task/cpu
  *
@@ -4236,7 +4307,7 @@ SYSCALL_DEFINE5(perf_counter_open,
        int ret;
 
        /* for future expandability... */
-       if (flags)
+       if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
                return -EINVAL;
 
        ret = perf_copy_attr(attr_uptr, &attr);
@@ -4264,7 +4335,7 @@ SYSCALL_DEFINE5(perf_counter_open,
         * Look up the group leader (we will attach this counter to it):
         */
        group_leader = NULL;
-       if (group_fd != -1) {
+       if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
                ret = -EINVAL;
                group_file = fget_light(group_fd, &fput_needed);
                if (!group_file)
@@ -4306,6 +4377,12 @@ SYSCALL_DEFINE5(perf_counter_open,
        if (!counter_file)
                goto err_free_put_context;
 
+       if (flags & PERF_FLAG_FD_OUTPUT) {
+               ret = perf_counter_set_output(counter, group_fd);
+               if (ret)
+                       goto err_free_put_context;
+       }
+
        counter->filp = counter_file;
        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);

-- 

----- End forwarded message -----

<Prev in Thread] Current Thread [Next in Thread>