#include "ruby.h"
#ifdef HAVE_RUBY_IO_H
#  include "ruby/io.h"
#else
#  include "rubyio.h"
#endif
#include <errno.h>
#include <fcntl.h>
#include <assert.h>
#include <stdio.h>
#include <sys/uio.h>
#include <limits.h>
#include <alloca.h>
#include <sys/utsname.h>

/*
 * The Ruby symbol :EAGAIN, returned by IO.trysplice and IO.trytee in place
 * of raising Errno::EAGAIN.  Assigned once in Init_io_splice_ext().
 */
static VALUE sym_EAGAIN;

/*
 * Fallback definitions of the fcntl() commands for reading and changing
 * the pipe capacity.  They are only used when the system headers do not
 * already provide F_GETPIPE_SZ (and F_LINUX_SPECIFIC_BASE).
 */
#ifndef F_LINUX_SPECIFIC_BASE
#  define F_LINUX_SPECIFIC_BASE 1024
#endif

#ifndef F_GETPIPE_SZ
#  define F_SETPIPE_SZ    (F_LINUX_SPECIFIC_BASE + 7)
#  define F_GETPIPE_SZ    (F_LINUX_SPECIFIC_BASE + 8)
#endif

/* when HAVE_RB_IO_T is unset, fall back to OpenFile as the I/O struct type */
#if ! HAVE_RB_IO_T
#  define rb_io_t OpenFile
#endif

/*
 * FPTR_TO_FD(fptr) extracts the integer file descriptor from an rb_io_t:
 * - via fileno() on the stdio stream when GetReadFile is available,
 * - via fileno(fptr->f) when there is no rb_io_t or on Ruby 1.8,
 * - otherwise straight from fptr->fd.
 */
#ifdef GetReadFile
#  define FPTR_TO_FD(fptr) (fileno(GetReadFile(fptr)))
#else
#  if !HAVE_RB_IO_T || (RUBY_VERSION_MAJOR == 1 && RUBY_VERSION_MINOR == 8)
#    define FPTR_TO_FD(fptr) fileno(fptr->f)
#  else
#    define FPTR_TO_FD(fptr) fptr->fd
#  endif
#endif

/*
 * Resolves a Ruby object to an integer file descriptor.
 *
 * A Fixnum is returned as-is (converted with NUM2INT).  A T_FILE object
 * yields the descriptor of its open file.  Anything else is first
 * converted through #to_io (rb_convert_type raises if that is not
 * possible) and then examined again.
 */
static int my_fileno(VALUE io)
{
	rb_io_t *fptr;

	while (TYPE(io) != T_FILE) {
		if (TYPE(io) == T_FIXNUM)
			return NUM2INT(io);

		/* neither an IO nor a Fixnum: ask the object for its IO */
		io = rb_convert_type(io, T_FILE, "IO", "to_io");
	}

	GetOpenFile(io, fptr);
	return FPTR_TO_FD(fptr);
}
#ifndef HAVE_RB_THREAD_BLOCKING_REGION
/*
 * Partial emulation of the Ruby 1.9 rb_thread_blocking_region() API,
 * compiled only when HAVE_RB_THREAD_BLOCKING_REGION is not defined
 * (i.e. under Ruby 1.8).
 */
#  include <rubysig.h>
/* sentinel unblock function; the only value this emulation accepts */
#  define RUBY_UBF_IO ((rb_unblock_function_t *)-1)
typedef void rb_unblock_function_t(void *);
typedef VALUE rb_blocking_function_t(void *);
/*
 * Calls fn(data1) bracketed by TRAP_BEG / TRAP_END (from rubysig.h) and
 * returns whatever fn returned.  ubf must be RUBY_UBF_IO, which is
 * enforced with assert(); data2 is accepted for signature compatibility
 * but never used.
 */
static VALUE
rb_thread_blocking_region(
	rb_blocking_function_t *fn, void *data1,
	rb_unblock_function_t *ubf, void *data2)
{
	VALUE rv;

	assert(RUBY_UBF_IO == ubf && "RUBY_UBF_IO required for emulation");

	/* NOTE(review): TRAP_BEG/TRAP_END must stay paired around the call */
	TRAP_BEG;
	rv = fn(data1);
	TRAP_END;

	return rv;
}
#endif /* ! HAVE_RB_THREAD_BLOCKING_REGION */

/*
 * Fallback String/Array accessors for Ruby headers that do not provide
 * them; each reads the corresponding struct member directly.
 */
#ifndef RSTRING_PTR
#  define RSTRING_PTR(s) (RSTRING(s)->ptr)
#endif
#ifndef RSTRING_LEN
#  define RSTRING_LEN(s) (RSTRING(s)->len)
#endif
#ifndef RARRAY_PTR
#  define RARRAY_PTR(s) (RARRAY(s)->ptr)
#endif
#ifndef RARRAY_LEN
#  define RARRAY_LEN(s) (RARRAY(s)->len)
#endif

/*
 * Runs fn(data) through rb_thread_blocking_region() using RUBY_UBF_IO as
 * the unblock function, and returns fn's result.
 */
static VALUE io_run(rb_blocking_function_t *fn, void *data)
{
	void *ubf_data = NULL; /* RUBY_UBF_IO takes no argument of its own */

	return rb_thread_blocking_region(fn, data, RUBY_UBF_IO, ubf_data);
}

/*
 * Runs fn(data), releasing the GVL only if blocking I/O was requested.
 *
 * When SPLICE_F_NONBLOCK is set in flags, fn is called directly:
 * programmers who explicitly ask for non-blocking I/O want the fastest
 * possible path without resorting to threads, so releasing and then
 * immediately reacquiring the GVL would be a waste of time.  Otherwise
 * the call goes through io_run().
 *
 * Only use this if all file descriptors in data are pipes.
 */
static VALUE nb_io_run(rb_blocking_function_t *fn, void *data, unsigned flags)
{
	int nonblocking = (flags & SPLICE_F_NONBLOCK) != 0;

	return nonblocking ? fn(data) : io_run(fn, data);
}

/*
 * Arguments for a single splice() call, bundled so they can be handed to
 * a blocking function through one void pointer (see nogvl_splice).
 */
struct splice_args {
	int fd_in;	/* descriptor data is taken from */
	off_t *off_in;	/* offset for fd_in, or NULL when nil was given */
	int fd_out;	/* descriptor data is moved to */
	off_t *off_out;	/* offset for fd_out, or NULL when nil was given */
	size_t len;	/* byte count passed to splice() */
	unsigned flags;	/* SPLICE_F_* bitmask */
};

static VALUE nogvl_splice(void *ptr)
{
	struct splice_args *a = ptr;

	return (VALUE)splice(a->fd_in, a->off_in, a->fd_out, a->off_out,
	                     a->len, a->flags);
}

/*
 * Shared implementation of IO.splice and IO.trysplice.
 *
 * Parses (fd_in, off_in, fd_out, off_out, len [, flags]) from argv,
 * ORs dflags into the caller-supplied flags (if any) and performs the
 * splice() through io_run().  Returns the raw splice() result: the
 * number of bytes moved, 0, or a negative value on failure.
 */
static long do_splice(int argc, VALUE *argv, unsigned dflags)
{
	off_t in_pos, out_pos;
	VALUE fd_in, off_in, fd_out, off_out, len, flags;
	struct splice_args args;

	rb_scan_args(argc, argv, "51",
	             &fd_in, &off_in, &fd_out, &off_out, &len, &flags);

	/* a nil offset is passed to splice() as a NULL pointer */
	if (NIL_P(off_in)) {
		args.off_in = NULL;
	} else {
		in_pos = NUM2OFFT(off_in);
		args.off_in = &in_pos;
	}

	if (NIL_P(off_out)) {
		args.off_out = NULL;
	} else {
		out_pos = NUM2OFFT(off_out);
		args.off_out = &out_pos;
	}

	args.fd_in = my_fileno(fd_in);
	args.fd_out = my_fileno(fd_out);
	args.len = (size_t)NUM2ULONG(len);

	args.flags = dflags;
	if (!NIL_P(flags))
		args.flags |= NUM2UINT(flags);

	return (long)io_run(nogvl_splice, &args);
}

/*
 * call-seq:
 *    IO.splice(fd_in, off_in, fd_out, off_out, len) => integer
 *    IO.splice(fd_in, off_in, fd_out, off_out, len, flags) => integer
 *
 * Moves +len+ bytes between +fd_in+ and +fd_out+ using splice(2).
 * At least one of +fd_in+ and +fd_out+ MUST be a pipe.  As of
 * Linux 2.6.31, BOTH may be pipes.
 *
 * +off_in+ and +off_out+, when non-nil, give the offset to use for the
 * non-pipe file descriptor.
 *
 * +flags+ defaults to zero when omitted and may be a bitmask of:
 *
 * IO::Splice::F_MOVE, IO::Splice::F_NONBLOCK, IO::Splice::F_MORE
 *
 * Returns the number of bytes spliced.
 * Raises EOFError when +fd_in+ has reached end of file.
 * Raises Errno::EAGAIN if the IO::Splice::F_NONBLOCK flag is set
 * and the pipe has no data to read from or space to write to.  May
 * also raise Errno::EAGAIN if the non-pipe descriptor has no data
 * to read from or space to write to.
 *
 *     rd, wr = (pipe = IO.pipe).map { |io| io.fileno }
 *     src_io = File.open("/path/to/src")
 *     dst_io = File.open("/path/to/dst", "w")
 *     src, dst = src_io.fileno, dst_io.fileno
 *
 *     nr = IO.splice(src, nil, wr, nil, IO::Splice::PIPE_CAPA, 0)
 *     IO.splice(rd, nil, dst, nil, nr, 0)
 *
 * Because splice never exposes buffers to userspace, it does not take
 * userspace buffering done by Ruby or stdio into account.  It is
 * likewise not subject to encoding/decoding filters under Ruby 1.9.
 *
 * Consider using IO.trysplice if you are using non-blocking I/O on
 * both descriptors as it avoids the cost of raising common Errno::EAGAIN
 * exceptions.
 *
 * See manpage for full documentation:
 * http://kernel.org/doc/man-pages/online/pages/man2/splice.2.html
 */
static VALUE my_splice(int argc, VALUE *argv, VALUE self)
{
	long nr = do_splice(argc, argv, 0);

	if (nr > 0)
		return LONG2NUM(nr);

	/* zero bytes moved means end of file on fd_in */
	if (nr == 0)
		rb_eof_error();

	rb_sys_fail("splice");
	return Qnil; /* not reached */
}

/*
 * call-seq:
 *    IO.trysplice(fd_in, off_in, fd_out, off_out, len) => integer
 *    IO.trysplice(fd_in, off_in, fd_out, off_out, len, flags) => integer
 *
 * Behaves like IO.splice, except that +:EAGAIN+ is returned instead of
 * raising Errno::EAGAIN when either the read or write end would block,
 * and +nil+ is returned instead of raising EOFError when zero bytes
 * were spliced.
 *
 * IO::Splice::F_NONBLOCK is always passed for the pipe descriptor,
 * but this can still block if the non-pipe descriptor is blocking.
 *
 * See IO.splice documentation for more details.
 */
static VALUE trysplice(int argc, VALUE *argv, VALUE self)
{
	long nr = do_splice(argc, argv, SPLICE_F_NONBLOCK);

	if (nr > 0)
		return LONG2NUM(nr);
	if (nr == 0)
		return Qnil;

	/* failure: only EAGAIN is reported in-band */
	if (errno != EAGAIN)
		rb_sys_fail("splice");
	return sym_EAGAIN;
}

/*
 * Arguments for a single tee() call, bundled so they can be handed to a
 * blocking function through one void pointer (see nogvl_tee).
 */
struct tee_args {
	int fd_in;	/* descriptor data is duplicated from */
	int fd_out;	/* descriptor data is duplicated to */
	size_t len;	/* byte count passed to tee() */
	unsigned flags;	/* SPLICE_F_* bitmask */
};

/* runs without GVL */
static VALUE nogvl_tee(void *ptr)
{
	struct tee_args *a = ptr;

	return (VALUE)tee(a->fd_in, a->fd_out, a->len, a->flags);
}

/*
 * Shared implementation of IO.tee and IO.trytee.
 *
 * Parses (fd_in, fd_out, len [, flags]) from argv, ORs dflags into the
 * caller-supplied flags (if any) and performs the tee() through
 * nb_io_run().  Returns the raw tee() result: the number of bytes
 * duplicated, 0, or a negative value on failure.
 */
static long do_tee(int argc, VALUE *argv, unsigned dflags)
{
	VALUE fd_in, fd_out, len, flags;
	struct tee_args args;

	rb_scan_args(argc, argv, "31", &fd_in, &fd_out, &len, &flags);

	args.fd_in = my_fileno(fd_in);
	args.fd_out = my_fileno(fd_out);
	args.len = (size_t)NUM2ULONG(len);

	args.flags = dflags;
	if (!NIL_P(flags))
		args.flags |= NUM2UINT(flags);

	return (long)nb_io_run(nogvl_tee, &args, args.flags);
}

/*
 * call-seq:
 *   IO.tee(fd_in, fd_out, len) => integer
 *   IO.tee(fd_in, fd_out, len, flags) => integer
 *
 * Duplicates up to +len+ bytes of data from +fd_in+ into +fd_out+ using
 * tee(2).  Both +fd_in+ and +fd_out+ must refer to pipe descriptors, and
 * they may not be endpoints of the same pipe.
 *
 * +flags+ may be zero (the default) or IO::Splice::F_NONBLOCK
 * Other IO::Splice flags are currently unimplemented or have no effect.
 *
 * Returns the number of bytes duplicated if successful.
 * Raises EOFError when +fd_in+ is closed and emptied.
 * Raises Errno::EAGAIN when +fd_in+ is empty and/or +fd_out+ is full
 * and +flags+ contains IO::Splice::F_NONBLOCK
 *
 * Consider using IO.trytee if you are using IO::Splice::F_NONBLOCK
 * as it avoids the cost of raising common Errno::EAGAIN exceptions.
 *
 * See manpage for full documentation:
 * http://kernel.org/doc/man-pages/online/pages/man2/tee.2.html
 */
static VALUE my_tee(int argc, VALUE *argv, VALUE self)
{
	long nr = do_tee(argc, argv, 0);

	if (nr > 0)
		return LONG2NUM(nr);

	/* zero bytes duplicated means fd_in is closed and drained */
	if (nr == 0)
		rb_eof_error();

	rb_sys_fail("tee");
	return Qnil; /* not reached */
}

/*
 * call-seq:
 *    IO.trytee(fd_in, fd_out, len) => integer
 *    IO.trytee(fd_in, fd_out, len, flags) => integer
 *
 * Behaves like IO.tee, except that +:EAGAIN+ is returned instead of
 * raising Errno::EAGAIN when either the read or write end would block,
 * and +nil+ is returned instead of raising EOFError when zero bytes
 * were duplicated.
 *
 * IO::Splice::F_NONBLOCK is always added to +flags+.
 *
 * See IO.tee documentation for more details.
 */
static VALUE trytee(int argc, VALUE *argv, VALUE self)
{
	long nr = do_tee(argc, argv, SPLICE_F_NONBLOCK);

	if (nr > 0)
		return LONG2NUM(nr);
	if (nr == 0)
		return Qnil;

	/* failure: only EAGAIN is reported in-band */
	if (errno != EAGAIN)
		rb_sys_fail("tee");
	return sym_EAGAIN;
}

/*
 * Arguments for a single vmsplice() call, bundled so they can be handed
 * to a blocking function through one void pointer (see nogvl_vmsplice).
 * iov and nr_segs are advanced in place by advance_vmsplice_args() after
 * a partial write.
 */
struct vmsplice_args {
	int fd;			/* descriptor to write into */
	struct iovec *iov;	/* first iovec still to be written */
	unsigned long nr_segs;	/* number of iovecs starting at iov */
	unsigned flags;		/* SPLICE_F_* bitmask */
};

static VALUE nogvl_vmsplice(void *ptr)
{
	struct vmsplice_args *a = ptr;

	return (VALUE)vmsplice(a->fd, a->iov, a->nr_segs, a->flags);
}

/*
 * ARY2IOVEC(iov, iovcnt, expect, ary): builds an iovec array from a Ruby
 * Array of Strings.
 *
 *   iov    - lvalue set to the alloca()-allocated iovec array
 *   iovcnt - lvalue set to the number of elements in ary
 *   expect - lvalue set to the total byte length of all strings
 *   ary    - Ruby Array; every element must be a String (Check_Type)
 *
 * Raises ArgumentError if ary has more than IOV_MAX elements.  The
 * iovecs point directly at the string data; nothing is copied.
 *
 * This can't be a function since we use alloca(): the memory must live
 * in the caller's stack frame.
 */
#define ARY2IOVEC(iov,iovcnt,expect,ary) \
do { \
	VALUE *cur; \
	struct iovec *tmp; \
	long n; \
	cur = RARRAY_PTR(ary); \
	n = RARRAY_LEN(ary); \
	if (n > IOV_MAX) \
		rb_raise(rb_eArgError, "array is larger than IOV_MAX"); \
	iov = tmp = alloca(sizeof(struct iovec) * n); \
	expect = 0; \
	iovcnt = n; \
	for (; --n >= 0; tmp++, cur++) { \
		Check_Type(*cur, T_STRING); \
		tmp->iov_base = RSTRING_PTR(*cur); \
		tmp->iov_len = RSTRING_LEN(*cur); \
		expect += tmp->iov_len; \
	} \
} while (0)

/*
 * Adjusts a->iov and a->nr_segs after vmsplice() wrote n bytes, so that
 * the next call resumes with the first byte not yet written.
 *
 * Fully written iovecs are dropped from the front; a partially written
 * one stays at the front with its base and length moved past the bytes
 * already consumed.
 */
static void advance_vmsplice_args(struct vmsplice_args *a, long n)
{
	struct iovec *vec = a->iov;
	unsigned long consumed = 0;

	while (consumed < a->nr_segs && n != 0) {
		size_t written = (size_t)n;

		if (vec->iov_len > written) {
			/* partial iovec: trim it and retry from here */
			vec->iov_base = (char *)vec->iov_base + n;
			vec->iov_len -= written;
			break;
		}

		/* this iovec went out completely, move on to the next */
		n -= vec->iov_len;
		consumed++;
		vec++;
	}

	a->nr_segs -= consumed;
	a->iov = vec;
}

/*
 * call-seq:
 *   IO.vmsplice(fd, string_array) => integer
 *   IO.vmsplice(fd, string_array, flags) => integer
 *   IO.vmsplice(fd, string) => integer
 *   IO.vmsplice(fd, string, flags) => integer
 *
 * Transfers a string or an array of strings into the pipe descriptor
 * given by +fd+.  +fd+ must be the writable end of a pipe.
 *
 * Returns the number of bytes written.  Without IO::Splice::F_NONBLOCK
 * in +flags+ this retries until everything has been written.  If an
 * error (including Errno::EAGAIN with IO::Splice::F_NONBLOCK) occurs
 * after some bytes were already written, the partial byte count is
 * returned instead of raising, so the caller knows how much data is
 * already in the pipe.  The error is raised only when nothing was written.
 *
 * Raises TypeError if the data is neither a String nor an Array, and
 * ArgumentError if the array has more than IOV_MAX elements.
 *
 * This may allow the kernel to avoid data copies in some cases.
 * but is (probably) of limited usefulness in Ruby.  If you have
 * use cases or ideas for making this more useful for Ruby users,
 * please tell us at ruby.io.splice@librelist.com!
 *
 * Also consider the "sendfile" RubyGem or IO.copy_stream in Ruby 1.9
 * if you want to do zero-copy file transfers to pipes or sockets.  As
 * of Linux 2.6.33, sendfile(2) can copy to any output descriptor,
 * not just sockets.
 *
 * See manpage for full documentation:
 * http://kernel.org/doc/man-pages/online/pages/man2/vmsplice.2.html
 */
static VALUE my_vmsplice(int argc, VALUE * argv, VALUE self)
{
	long rv = 0;
	ssize_t left;
	/*
	 * Backing storage for the single-String case.  It must live at
	 * function scope because a.iov keeps pointing at it for the whole
	 * write loop below.
	 */
	struct iovec single_iov;
	struct vmsplice_args a;
	VALUE fd, data, flags;

	rb_scan_args(argc, argv, "21", &fd, &data, &flags);

	switch (TYPE(data)) {
	case T_STRING:
		left = (ssize_t)RSTRING_LEN(data);
		single_iov.iov_base = RSTRING_PTR(data);
		single_iov.iov_len = (size_t)left;
		a.iov = &single_iov;
		a.nr_segs = 1;
		break;
	case T_ARRAY:
		ARY2IOVEC(a.iov, a.nr_segs, left, data);
		break;
	default:
		rb_raise(rb_eTypeError, "wrong argument type %s "
		         "(expected a String or Array of strings)",
		         rb_obj_classname(data));
	}
	a.fd = my_fileno(fd);
	a.flags = NIL_P(flags) ? 0 : NUM2UINT(flags);

	for (;;) {
		long n = (long)nb_io_run(nogvl_vmsplice, &a, a.flags);

		if (n < 0) {
			/*
			 * blocking mode: wait for the pipe to drain and
			 * retry; a failed wait falls through as an error
			 */
			if (errno == EAGAIN &&
			    !(a.flags & SPLICE_F_NONBLOCK) &&
			    rb_io_wait_writable(a.fd))
				continue;

			/*
			 * Something was already written: report that
			 * instead of raising, otherwise the caller could
			 * not tell how much data went out.  The next
			 * write (or close) will hit the error instead.
			 */
			if (rv > 0)
				break;
			rb_sys_fail("vmsplice");
		}

		rv += n;
		left -= n;
		if (left == 0)
			break;
		advance_vmsplice_args(&a, n);
	}

	return LONG2NUM(rv);
}

/*
 * call-seq:
 *   reader, writer = IO.pipe
 *   reader.pipe_size => integer
 *
 * Returns the capacity of the underlying pipe in bytes, as reported by
 * fcntl(F_GETPIPE_SZ).  The default capacity is 65536 bytes since
 * Linux 2.6.11, and 4096 bytes in previous kernels.
 *
 * Since the pipe is a circular buffer in the same kernel, the size
 * of the reader is exactly the same as the size of the writer.
 *
 * Raises a SystemCallError if the fcntl() call fails.
 *
 * This method is only exposed on Linux 2.6.35 or later.
 */
static VALUE pipe_size(VALUE self)
{
	int fd = my_fileno(self);
	int capacity = fcntl(fd, F_GETPIPE_SZ);

	if (capacity >= 0)
		return INT2NUM(capacity);

	rb_sys_fail("fcntl(F_GETPIPE_SZ)");
	return Qnil; /* not reached */
}

/*
 * call-seq:
 *   reader, writer = IO.pipe
 *   reader.pipe_size = integer
 *
 * Sets the capacity of the underlying pipe in bytes via
 * fcntl(F_SETPIPE_SZ) and returns the value that was assigned.
 *
 * This MUST be a power-of-two, or Errno::EINVAL will be raised.
 * Linux will silently increase this to be equal to the page size
 * (4096 bytes on most architectures) if the specified value is
 * less than the size of a page.
 *
 * For users without CAP_SYS_RESOURCE, this raises Errno::EPERM when
 * attempting to specify a value greater than the value in
 * /proc/sys/fs/pipe-max-size.
 *
 * Since the pipe is a circular buffer in the same kernel, the size
 * of the reader is exactly the same as the size of the writer.
 *
 * Raises Errno::EBUSY if the assigned value is less than
 * the currently filled portion of the pipe.
 *
 * If the first attempt fails with ENOMEM, a garbage collection run is
 * triggered and the call is retried once before raising.
 *
 * This method is only exposed on Linux 2.6.35 or later.
 */
static VALUE set_pipe_size(VALUE self, VALUE size)
{
	int fd = my_fileno(self);
	int bytes = NUM2INT(size);

	if (fcntl(fd, F_SETPIPE_SZ, bytes) >= 0)
		return size;

	if (errno == ENOMEM) {
		/* give the GC a chance to free memory, then try once more */
		rb_gc();
		if (fcntl(fd, F_SETPIPE_SZ, bytes) >= 0)
			return size;
	}

	rb_sys_fail("fcntl(F_SETPIPE_SZ)");
	return size; /* not reached */
}

/*
 * Returns non-zero when the dotted kernel release string +release+
 * (e.g. "2.6.35-rc3" or "3.10.0-generic") is numerically at least
 * major.minor.patch.  Components are compared as numbers, not as text,
 * so "2.6.9" is correctly older than "2.6.35".  Missing or unparsable
 * components count as zero; any suffix after the numbers is ignored.
 */
static int linux_release_at_least(const char *release,
                                  unsigned long major,
                                  unsigned long minor,
                                  unsigned long patch)
{
	unsigned long have[3] = { 0, 0, 0 };

	/* fields sscanf() cannot parse simply keep their zero default */
	(void)sscanf(release, "%lu.%lu.%lu", &have[0], &have[1], &have[2]);

	if (have[0] != major)
		return have[0] > major;
	if (have[1] != minor)
		return have[1] > minor;
	return have[2] >= patch;
}

/*
 * Extension entry point: defines the IO::Splice module and its
 * constants, the IO.splice, IO.trysplice, IO.tee, IO.trytee and
 * IO.vmsplice singleton methods, and (when the running kernel is
 * Linux 2.6.35 or later) IO#pipe_size and IO#pipe_size=.
 *
 * Raises a SystemCallError if uname() fails.
 */
void Init_io_splice_ext(void)
{
	VALUE mSplice = rb_define_module_under(rb_cIO, "Splice");
	struct utsname utsname;

	rb_define_singleton_method(rb_cIO, "splice", my_splice, -1);
	rb_define_singleton_method(rb_cIO, "trysplice", trysplice, -1);
	rb_define_singleton_method(rb_cIO, "tee", my_tee, -1);
	rb_define_singleton_method(rb_cIO, "trytee", trytee, -1);
	rb_define_singleton_method(rb_cIO, "vmsplice", my_vmsplice, -1);

	/*
	 * Attempt to move pages instead of copying.  This is only a hint
	 * and support for it was removed in Linux 2.6.21.  It will be
	 * re-added for FUSE filesystems only in Linux 2.6.35.
	 */
	rb_define_const(mSplice, "F_MOVE", UINT2NUM(SPLICE_F_MOVE));

	/*
	 * Do not block on pipe I/O.  This flag only affects the pipe(s)
	 * being spliced from/to and has no effect on the non-pipe
	 * descriptor (which requires non-blocking operation to be set
	 * explicitly).
	 *
	 * The non-blocking flag (O_NONBLOCK) on the pipe descriptors
	 * themselves are ignored by this family of functions, and
	 * using this flag is the only way to get non-blocking operation
	 * out of them.
	 */
	rb_define_const(mSplice, "F_NONBLOCK", UINT2NUM(SPLICE_F_NONBLOCK));

	/*
	 * Indicate that there may be more data coming into the outbound
	 * descriptor.  This can allow the kernel to avoid sending partial
	 * frames from sockets.  Currently only used with splice.
	 */
	rb_define_const(mSplice, "F_MORE", UINT2NUM(SPLICE_F_MORE));

	/*
	 * Only usable by vmsplice.  This flag probably not useful in the
	 * context of Ruby applications which cannot control alignment.
	 */
	rb_define_const(mSplice, "F_GIFT", UINT2NUM(SPLICE_F_GIFT));

	/*
	 * The maximum size of an atomic write to a pipe
	 * POSIX requires this to be at least 512 bytes.
	 * Under Linux, this is 4096 bytes.
	 */
	rb_define_const(mSplice, "PIPE_BUF", UINT2NUM(PIPE_BUF));

	if (uname(&utsname) == -1)
		rb_sys_fail("uname");

	/*
	 * Compare the release numerically; includes 2.6.35-rc[1-6]
	 * because the "-rcN" suffix is ignored.
	 */
	if (linux_release_at_least(utsname.release, 2, 6, 35)) {
		rb_define_method(rb_cIO, "pipe_size", pipe_size, 0);
		rb_define_method(rb_cIO, "pipe_size=", set_pipe_size, 1);

		/*
		 * fcntl() command constant used to return the size of a pipe.
		 * This constant is only defined when running Linux 2.6.35
		 * or later.  For convenience, use IO#pipe_size instead.
		 */
		rb_define_const(mSplice, "F_GETPIPE_SZ",
		                UINT2NUM(F_GETPIPE_SZ));

		/*
		 * fcntl() command constant used to set the size of a pipe.
		 * This constant is only defined when running Linux 2.6.35
		 * or later.  For convenience, use IO#pipe_size= instead.
		 */
		rb_define_const(mSplice, "F_SETPIPE_SZ",
		                UINT2NUM(F_SETPIPE_SZ));
	}

	sym_EAGAIN = ID2SYM(rb_intern("EAGAIN"));
}