/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Chris Wilson <chris@chris-wilson.co.uk>
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "sna.h"
#include "sna_render.h"
#include "sna_render_inline.h"
#include "fb/fbpict.h"

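/* Compile-time kill-switches for the individual source-preparation paths
 * (composite redirection, format conversion, software fixup, extraction);
 * all are enabled by default.
 */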
#define NO_REDIRECT 0
#define NO_CONVERT 0
#define NO_FIXUP 0
#define NO_EXTRACT 0

#define DBG_FORCE_UPLOAD 0
#define DBG_NO_CPU_BO 0

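/* Strip the alpha channel from a picture format while keeping the bpp,
 * type and RGB layout intact.
 */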
#define alphaless(format) PICT_FORMAT(PICT_FORMAT_BPP(format),		\
				      PICT_FORMAT_TYPE(format),		\
				      0,				\
				      PICT_FORMAT_R(format),		\
				      PICT_FORMAT_G(format),		\
				      PICT_FORMAT_B(format))

CARD32
sna_format_for_depth(int depth)
{
	switch (depth) {
	case 1: return PICT_a1;
	case 4: return PICT_x4a4;
	case 8: return PICT_a8;
	case 15: return PICT_x1r5g5b5;
	case 16: return PICT_r5g6b5;
	default: assert(0);
	case 24: return PICT_x8r8g8b8;
#if XORG_VERSION_CURRENT >= XORG_VERSION_NUMERIC(1,6,99,900,0)
	case 30: return PICT_x2r10g10b10;
#endif
	case 32: return PICT_a8r8g8b8;
	}
}

CARD32
sna_render_format_for_depth(int depth)
{
	switch (depth) {
	case 1: return PIXMAN_a1;
	case 4: return PIXMAN_a4;
	case 8: return PIXMAN_a8;
	case 15: return PIXMAN_a1r5g5b5;
	case 16: return PIXMAN_r5g6b5;
	case 30: return PIXMAN_a2r10g10b10;
	default: assert(0);
	case 24:
	case 32: return PIXMAN_a8r8g8b8;
	}
}

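/* The "no render" backend: every hook either falls back to the BLT
 * engine or declines the operation so that the caller takes a software
 * path instead.
 */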
static bool
no_render_composite(struct sna *sna,
		    uint8_t op,
		    PicturePtr src,
		    PicturePtr mask,
		    PicturePtr dst,
		    int16_t src_x, int16_t src_y,
		    int16_t mask_x, int16_t mask_y,
		    int16_t dst_x, int16_t dst_y,
		    int16_t width, int16_t height,
		    unsigned flags,
		    struct sna_composite_op *tmp)
{
	DBG(("%s (op=%d, mask? %d)\n", __FUNCTION__, op, mask != NULL));

	if (mask)
		return false;

	if (!is_gpu(sna, dst->pDrawable, PREFER_GPU_BLT) &&
	    (src->pDrawable == NULL || !is_gpu(sna, src->pDrawable, PREFER_GPU_BLT)))
		return false;

	return sna_blt_composite(sna,
				 op, src, dst,
				 src_x, src_y,
				 dst_x, dst_y,
				 width, height,
				 flags | COMPOSITE_FALLBACK, tmp);
	(void)mask_x;
	(void)mask_y;
}

static bool
no_render_check_composite_spans(struct sna *sna,
				uint8_t op, PicturePtr src, PicturePtr dst,
				int16_t width,  int16_t height, unsigned flags)
{
	return false;
}

static bool
no_render_copy_boxes(struct sna *sna, uint8_t alu,
		     const DrawableRec *src, struct kgem_bo *src_bo, int16_t src_dx, int16_t src_dy,
		     const DrawableRec *dst, struct kgem_bo *dst_bo, int16_t dst_dx, int16_t dst_dy,
		     const BoxRec *box, int n, unsigned flags)
{
	DBG(("%s (n=%d)\n", __FUNCTION__, n));

	if (!sna_blt_compare_depth(src, dst))
		return false;

	return sna_blt_copy_boxes(sna, alu,
				  src_bo, src_dx, src_dy,
				  dst_bo, dst_dx, dst_dy,
				  dst->bitsPerPixel,
				  box, n);
}

static bool
no_render_copy(struct sna *sna, uint8_t alu,
		 PixmapPtr src, struct kgem_bo *src_bo,
		 PixmapPtr dst, struct kgem_bo *dst_bo,
		 struct sna_copy_op *tmp)
{
	DBG(("%s ()\n", __FUNCTION__));

	if (sna_blt_compare_depth(&src->drawable, &dst->drawable) &&
	    sna_blt_copy(sna, alu,
			 src_bo, dst_bo, dst->drawable.bitsPerPixel,
			 tmp))
		return true;

	return false;
}

static bool
no_render_fill_boxes(struct sna *sna,
		     CARD8 op,
		     PictFormat format,
		     const xRenderColor *color,
		     const DrawableRec *dst, struct kgem_bo *dst_bo,
		     const BoxRec *box, int n)
{
	uint8_t alu = GXcopy;
	uint32_t pixel;

	DBG(("%s (op=%d, color=(%04x,%04x,%04x, %04x))\n",
	     __FUNCTION__, op,
	     color->red, color->green, color->blue, color->alpha));

	if (op == PictOpClear) {
		pixel = 0;
		alu = GXclear;
		op = PictOpSrc;
	}

	if (op == PictOpOver) {
		if (color->alpha >= 0xff00)
			op = PictOpSrc;
	}

	if (op != PictOpSrc)
		return false;

	if (alu == GXcopy &&
	    !sna_get_pixel_from_rgba(&pixel,
				     color->red,
				     color->green,
				     color->blue,
				     color->alpha,
				     format))
		return false;

	return sna_blt_fill_boxes(sna, alu,
				  dst_bo, dst->bitsPerPixel,
				  pixel, box, n);
}

static bool
no_render_fill(struct sna *sna, uint8_t alu,
	       PixmapPtr dst, struct kgem_bo *dst_bo,
	       uint32_t color, unsigned flags,
	       struct sna_fill_op *tmp)
{
	DBG(("%s (alu=%d, color=%08x)\n", __FUNCTION__, alu, color));
	return sna_blt_fill(sna, alu,
			    dst_bo, dst->drawable.bitsPerPixel,
			    color,
			    tmp);
}

static bool
no_render_fill_one(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo,
		   uint32_t color,
		   int16_t x1, int16_t y1, int16_t x2, int16_t y2,
		   uint8_t alu)
{
	BoxRec box;

	box.x1 = x1;
	box.y1 = y1;
	box.x2 = x2;
	box.y2 = y2;

	DBG(("%s (alu=%d, color=%08x) (%d,%d), (%d, %d)\n",
	     __FUNCTION__, alu, color, x1, y1, x2, y2));
	return sna_blt_fill_boxes(sna, alu,
				  bo, dst->drawable.bitsPerPixel,
				  color, &box, 1);
}

static bool
no_render_clear(struct sna *sna, PixmapPtr dst, struct kgem_bo *bo)
{
	DBG(("%s: pixmap=%ld %dx%d\n", __FUNCTION__,
	     dst->drawable.serialNumber,
	     dst->drawable.width,
	     dst->drawable.height));
	return sna->render.fill_one(sna, dst, bo, 0,
				    0, 0, dst->drawable.width, dst->drawable.height,
				    GXclear);
}

static void no_render_reset(struct sna *sna)
{
	(void)sna;
}

static void no_render_flush(struct sna *sna)
{
	(void)sna;
}

static void
no_render_context_switch(struct kgem *kgem,
			 int new_mode)
{
	if (!kgem->nbatch)
		return;

	if (kgem_ring_is_idle(kgem, kgem->ring)) {
		DBG(("%s: GPU idle, flushing\n", __FUNCTION__));
		_kgem_submit(kgem);
	}

	(void)new_mode;
}

static void
no_render_fini(struct sna *sna)
{
	(void)sna;
}

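/* Install the BLT-only fallback render backend (reported as "generic"). */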
const char *no_render_init(struct sna *sna)
{
	struct sna_render *render = &sna->render;

	memset(render, 0, sizeof(*render));

	render->prefer_gpu = PREFER_GPU_BLT;

	render->vertices = render->vertex_data;
	render->vertex_size = ARRAY_SIZE(render->vertex_data);

	render->composite = no_render_composite;
	render->check_composite_spans = no_render_check_composite_spans;

	render->copy_boxes = no_render_copy_boxes;
	render->copy = no_render_copy;

	render->fill_boxes = no_render_fill_boxes;
	render->fill = no_render_fill;
	render->fill_one = no_render_fill_one;
	render->clear = no_render_clear;

	render->reset = no_render_reset;
	render->flush = no_render_flush;
	render->fini = no_render_fini;

	sna->kgem.context_switch = no_render_context_switch;
	if (sna->kgem.has_blt)
		sna->kgem.ring = KGEM_BLT;

	sna_vertex_init(sna);
	return "generic";
}

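/* Consider using the pixmap's CPU bo directly as the composite source.
 * Returns NULL when there is no CPU bo, when the sampled box is already
 * up to date on the GPU (or the GPU copy is busy), or when the
 * reuse-count and pitch heuristics suggest the pixmap is better off
 * migrated to the GPU instead.
 */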
static struct kgem_bo *
use_cpu_bo(struct sna *sna, PixmapPtr pixmap, const BoxRec *box, bool blt)
{
	struct sna_pixmap *priv;

	if (DBG_NO_CPU_BO)
		return NULL;

	priv = sna_pixmap(pixmap);
	if (priv == NULL || priv->cpu_bo == NULL) {
		DBG(("%s: no cpu bo\n", __FUNCTION__));
		return NULL;
	}

	if (!blt && priv->cpu_bo->snoop && priv->source_count > SOURCE_BIAS) {
		DBG(("%s: promoting snooped CPU bo due to reuse\n",
		     __FUNCTION__));
		return NULL;
	}

	if (priv->gpu_bo) {
		switch (sna_damage_contains_box(&priv->cpu_damage, box)) {
		case PIXMAN_REGION_OUT:
			DBG(("%s: has GPU bo and no damage to upload\n",
			     __FUNCTION__));
			return NULL;

		case PIXMAN_REGION_IN:
			DBG(("%s: has GPU bo but box is completely on CPU\n",
			     __FUNCTION__));
			break;
		default:
			if (kgem_bo_is_busy(priv->gpu_bo)) {
				DBG(("%s: box is partially damaged on the CPU, and the GPU is busy\n",
				     __FUNCTION__));
				return NULL;
			}
			if (sna_damage_contains_box(&priv->gpu_damage,
						    box) != PIXMAN_REGION_OUT) {
				DBG(("%s: box is damaged on the GPU\n",
				     __FUNCTION__));
				return NULL;
			}
			break;
		}
	}

	if (!blt) {
		int w = box->x2 - box->x1;
		int h = box->y2 - box->y1;

		if (w < pixmap->drawable.width ||
		    h < pixmap->drawable.height ||
		    priv->source_count != SOURCE_BIAS) {
			bool want_tiling;

			if (priv->cpu_bo->pitch >= 4096) {
				DBG(("%s: size=%dx%d, promoting reused (%d) CPU bo due to TLB miss (%dx%d, pitch=%d)\n",
				     __FUNCTION__, w, h, priv->source_count,
				     pixmap->drawable.width,
				     pixmap->drawable.height,
				     priv->cpu_bo->pitch));
				return NULL;
			}

			if (priv->gpu_bo)
				want_tiling = priv->gpu_bo->tiling != I915_TILING_NONE;
			else
				want_tiling = kgem_choose_tiling(&sna->kgem,
								 I915_TILING_Y,
								 pixmap->drawable.width,
								 pixmap->drawable.height,
								 pixmap->drawable.bitsPerPixel) != I915_TILING_NONE;
			if (want_tiling &&
			    priv->source_count*w*h >= (int)pixmap->drawable.width * pixmap->drawable.height) {
				DBG(("%s: pitch (%d) requires tiling\n",
				     __FUNCTION__, priv->cpu_bo->pitch));
				return NULL;
			}
		}
	}

	add_shm_flush(sna, priv);

	DBG(("%s for box=(%d, %d), (%d, %d)\n",
	     __FUNCTION__, box->x1, box->y1, box->x2, box->y2));
	++priv->source_count;
	return priv->cpu_bo;
}

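/* Decide whether the source pixmap should be migrated to the GPU for
 * this operation. Returns the GPU bo on success, or NULL to make the
 * caller fall back to uploading just the sampled box.
 */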
static struct kgem_bo *
move_to_gpu(PixmapPtr pixmap, const BoxRec *box, bool blt)
{
	struct sna_pixmap *priv;
	int count, w, h;
	bool migrate = false;

	if (DBG_FORCE_UPLOAD > 0)
		return NULL;

	priv = sna_pixmap(pixmap);
	if (priv == NULL) {
		DBG(("%s: not migrating unattached pixmap=%ld\n",
		     __FUNCTION__, pixmap->drawable.serialNumber));
		return NULL;
	}

	if (priv->shm)
		blt = true;

	if (priv->gpu_bo) {
		if (priv->cpu_damage &&
		    sna_damage_contains_box(&priv->cpu_damage,
					    box) != PIXMAN_REGION_OUT)
			goto upload;

		return priv->gpu_bo;
	}

	if (priv->cpu_damage == NULL) {
		DBG(("%s: not migrating uninitialised pixmap=%ld\n",
		     __FUNCTION__, pixmap->drawable.serialNumber));
		return NULL;
	}

	if (pixmap->usage_hint) {
		DBG(("%s: not migrating pixmap=%ld due to usage_hint=%d\n",
		     __FUNCTION__,
		     pixmap->drawable.serialNumber,
		     pixmap->usage_hint));
		return NULL;
	}

	if (DBG_FORCE_UPLOAD < 0) {
		if (!sna_pixmap_force_to_gpu(pixmap,
					     blt ? MOVE_READ : MOVE_SOURCE_HINT | MOVE_ASYNC_HINT | MOVE_READ))
			return NULL;

		return priv->gpu_bo;
	}

	w = box->x2 - box->x1;
	h = box->y2 - box->y1;
	if (priv->cpu_bo && !priv->cpu_bo->flush) {
		migrate = true;
	} else if (w == pixmap->drawable.width && h == pixmap->drawable.height) {
		migrate = priv->source_count++ > SOURCE_BIAS;

		DBG(("%s: migrating whole pixmap (%dx%d) for source (%d,%d),(%d,%d), count %d? %d\n",
		     __FUNCTION__,
		     pixmap->drawable.width, pixmap->drawable.height,
		     box->x1, box->y1, box->x2, box->y2, priv->source_count,
		     migrate));
	} else if (kgem_choose_tiling(&to_sna_from_pixmap(pixmap)->kgem,
				      blt ? I915_TILING_X : I915_TILING_Y, w, h,
				      pixmap->drawable.bitsPerPixel) != I915_TILING_NONE) {
		count = priv->source_count++;
		if ((priv->create & KGEM_CAN_CREATE_GPU) == 0)
			count -= SOURCE_BIAS;

		DBG(("%s: migrate box (%d, %d), (%d, %d)? source count=%d, fraction=%d/%d [%d]\n",
		     __FUNCTION__,
		     box->x1, box->y1, box->x2, box->y2,
		     count, w*h,
		     pixmap->drawable.width * pixmap->drawable.height,
		     pixmap->drawable.width * pixmap->drawable.height / (w*h)));

		migrate = count*w*h > pixmap->drawable.width * pixmap->drawable.height;
	}

	if (!migrate)
		return NULL;

upload:
	if (blt) {
		if (!sna_pixmap_move_area_to_gpu(pixmap, box,
						 __MOVE_FORCE | MOVE_READ))
			return NULL;
	} else {
		if (!sna_pixmap_move_to_gpu(pixmap,
					    __MOVE_FORCE | MOVE_ASYNC_HINT | MOVE_SOURCE_HINT | MOVE_READ))
			return NULL;
	}

	return priv->gpu_bo;
}

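/* Upload the sampled box of a CPU-resident pixmap into a temporary GPU
 * buffer. If the whole pixmap is uploaded, the resulting proxy bo is
 * cached on the pixmap for reuse by later operations.
 */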
static struct kgem_bo *upload(struct sna *sna,
			      struct sna_composite_channel *channel,
			      PixmapPtr pixmap,
			      const BoxRec *box)
{
	struct sna_pixmap *priv;
	struct kgem_bo *bo;

	DBG(("%s: box=(%d, %d), (%d, %d), pixmap=%dx%d\n",
	     __FUNCTION__, box->x1, box->y1, box->x2, box->y2, pixmap->drawable.width, pixmap->drawable.height));
	assert(box->x1 >= 0);
	assert(box->y1 >= 0);
	assert(box->x2 <= pixmap->drawable.width);
	assert(box->y2 <= pixmap->drawable.height);

	priv = sna_pixmap(pixmap);
	if (priv) {
		RegionRec region;

		if (priv->cpu_damage == NULL)
			return NULL; /* uninitialised */

		region.extents = *box;
		region.data = NULL;
		if (!sna_drawable_move_region_to_cpu(&pixmap->drawable,
						     &region, MOVE_READ))
			return NULL;

		assert(!priv->mapped);
		if (pixmap->devPrivate.ptr == NULL)
			return NULL; /* uninitialised */
	}

	bo = kgem_upload_source_image(&sna->kgem,
				      pixmap->devPrivate.ptr, box,
				      pixmap->devKind,
				      pixmap->drawable.bitsPerPixel);
	if (channel && bo) {
		channel->width  = box->x2 - box->x1;
		channel->height = box->y2 - box->y1;
		channel->offset[0] -= box->x1;
		channel->offset[1] -= box->y1;

		if (priv &&
		    pixmap->usage_hint == 0 &&
		    channel->width  == pixmap->drawable.width &&
		    channel->height == pixmap->drawable.height) {
			DBG(("%s: adding upload cache to pixmap=%ld\n",
			     __FUNCTION__, pixmap->drawable.serialNumber));
			assert(priv->gpu_damage == NULL);
			assert(priv->gpu_bo == NULL);
			assert(bo->proxy != NULL);
			sna_damage_all(&priv->cpu_damage, pixmap);
			kgem_proxy_bo_attach(bo, &priv->gpu_bo);
		}
	}

	return bo;
}

struct kgem_bo *
__sna_render_pixmap_bo(struct sna *sna,
		       PixmapPtr pixmap,
		       const BoxRec *box,
		       bool blt)
{
	struct kgem_bo *bo;

	bo = use_cpu_bo(sna, pixmap, box, blt);
	if (bo == NULL) {
		bo = move_to_gpu(pixmap, box, blt);
		if (bo == NULL)
			return NULL;
	}

	return bo;
}

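/* Bind a pixmap to a composite channel: reuse an all-damaged GPU or CPU
 * bo if one exists, otherwise compute the sample extents and either
 * migrate them to the GPU or upload them from system memory.
 */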
int
sna_render_pixmap_bo(struct sna *sna,
		     struct sna_composite_channel *channel,
		     PixmapPtr pixmap,
		     int16_t x, int16_t y,
		     int16_t w, int16_t h,
		     int16_t dst_x, int16_t dst_y)
{
	struct sna_pixmap *priv;
	BoxRec box;

	DBG(("%s pixmap=%ld, (%d, %d)x(%d, %d)/(%d, %d)\n",
	     __FUNCTION__, pixmap->drawable.serialNumber,
	     x, y, w,h, pixmap->drawable.width, pixmap->drawable.height));

	channel->width  = pixmap->drawable.width;
	channel->height = pixmap->drawable.height;
	channel->offset[0] = x - dst_x;
	channel->offset[1] = y - dst_y;

	priv = sna_pixmap(pixmap);
	if (priv) {
		if (priv->gpu_bo &&
		    (DAMAGE_IS_ALL(priv->gpu_damage) || !priv->cpu_damage ||
		     priv->gpu_bo->proxy)) {
			DBG(("%s: GPU all damaged\n", __FUNCTION__));
			channel->bo = priv->gpu_bo;
			goto done;
		}

		if (priv->cpu_bo &&
		    (DAMAGE_IS_ALL(priv->cpu_damage) || !priv->gpu_damage) &&
		    !priv->cpu_bo->snoop && priv->cpu_bo->pitch < 4096) {
			DBG(("%s: CPU all damaged\n", __FUNCTION__));
			channel->bo = priv->cpu_bo;
			add_shm_flush(sna, priv);
			goto done;
		}
	}

	/* XXX handle transformed repeat */
	if (w == 0 || h == 0 || channel->transform) {
		box.x1 = box.y1 = 0;
		box.x2 = pixmap->drawable.width;
		box.y2 = pixmap->drawable.height;
	} else {
		box.x1 = x;
		box.y1 = y;
		box.x2 = bound(x, w);
		box.y2 = bound(y, h);

		if (channel->repeat == RepeatNone || channel->repeat == RepeatPad) {
			if (box.x1 < 0)
				box.x1 = 0;
			if (box.y1 < 0)
				box.y1 = 0;
			if (box.x2 > pixmap->drawable.width)
				box.x2 = pixmap->drawable.width;
			if (box.y2 > pixmap->drawable.height)
				box.y2 = pixmap->drawable.height;
		} else {
			if (box.x1 < 0 || box.x2 > pixmap->drawable.width)
				box.x1 = 0, box.x2 = pixmap->drawable.width;
			if (box.y1 < 0 || box.y2 > pixmap->drawable.height)
				box.y1 = 0, box.y2 = pixmap->drawable.height;
		}
	}

	w = box.x2 - box.x1;
	h = box.y2 - box.y1;
	DBG(("%s box=(%d, %d), (%d, %d): (%d, %d)/(%d, %d)\n", __FUNCTION__,
	     box.x1, box.y1, box.x2, box.y2, w, h,
	     pixmap->drawable.width, pixmap->drawable.height));
	if (w <= 0 || h <= 0) {
		DBG(("%s: sample extents outside of texture -> clear\n",
		     __FUNCTION__));
		return 0;
	}

	DBG(("%s: offset=(%d, %d), size=(%d, %d)\n",
	     __FUNCTION__,
	     channel->offset[0], channel->offset[1],
	     pixmap->drawable.width, pixmap->drawable.height));

	channel->bo = __sna_render_pixmap_bo(sna, pixmap, &box, false);
	if (channel->bo == NULL) {
		DBG(("%s: uploading CPU box (%d, %d), (%d, %d)\n",
		     __FUNCTION__, box.x1, box.y1, box.x2, box.y2));
		channel->bo = upload(sna, channel, pixmap, &box);
		if (channel->bo == NULL)
			return 0;
	} else {
done:
		kgem_bo_reference(channel->bo);
	}

	channel->scale[0] = 1.f / channel->width;
	channel->scale[1] = 1.f / channel->height;
	return 1;
}

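/* The sample extents exceed the 3D pipeline limits, so render a
 * scaled-down copy of the source into a temporary GPU pixmap, tile by
 * tile, and point the channel (with an adjusted transform) at that copy.
 */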
static int sna_render_picture_downsample(struct sna *sna,
					 PicturePtr picture,
					 struct sna_composite_channel *channel,
					 const int16_t x, const int16_t y,
					 const int16_t w, const int16_t h,
					 const int16_t dst_x, const int16_t dst_y)
{
	PixmapPtr pixmap = get_drawable_pixmap(picture->pDrawable);
	ScreenPtr screen = pixmap->drawable.pScreen;
	PicturePtr tmp_src, tmp_dst;
	PictFormatPtr format;
	struct sna_pixmap *priv;
	pixman_transform_t t;
	PixmapPtr tmp;
	int width, height, size, max_size;
	int sx, sy, sw, sh;
	int error, ret = 0;
	BoxRec box, b;

	box.x1 = x;
	box.y1 = y;
	box.x2 = bound(x, w);
	box.y2 = bound(y, h);
	if (channel->transform) {
		pixman_vector_t v;

		pixman_transform_bounds(channel->transform, &box);

		v.vector[0] = x << 16;
		v.vector[1] = y << 16;
		v.vector[2] = 1 << 16;
		pixman_transform_point(channel->transform, &v);
	}

	if (channel->repeat == RepeatNone || channel->repeat == RepeatPad) {
		if (box.x1 < 0)
			box.x1 = 0;
		if (box.y1 < 0)
			box.y1 = 0;
		if (box.x2 > pixmap->drawable.width)
			box.x2 = pixmap->drawable.width;
		if (box.y2 > pixmap->drawable.height)
			box.y2 = pixmap->drawable.height;
	} else {
		/* XXX tiled repeats? */
		if (box.x1 < 0 || box.x2 > pixmap->drawable.width)
			box.x1 = 0, box.x2 = pixmap->drawable.width;
		if (box.y1 < 0 || box.y2 > pixmap->drawable.height)
			box.y1 = 0, box.y2 = pixmap->drawable.height;
	}

	sw = box.x2 - box.x1;
	sh = box.y2 - box.y1;

	DBG(("%s: sample (%d, %d), (%d, %d)\n",
	     __FUNCTION__, box.x1, box.y1, box.x2, box.y2));

	sx = (sw + sna->render.max_3d_size - 1) / sna->render.max_3d_size;
	sy = (sh + sna->render.max_3d_size - 1) / sna->render.max_3d_size;

	DBG(("%s: scaling (%d, %d) down by %dx%d\n",
	     __FUNCTION__, sw, sh, sx, sy));

	width  = sw / sx;
	height = sh / sy;

	DBG(("%s: creating temporary GPU bo %dx%d\n",
	     __FUNCTION__, width, height));

	tmp = screen->CreatePixmap(screen,
				   width, height,
				   pixmap->drawable.depth,
				   SNA_CREATE_SCRATCH);
	if (tmp == NULL)
		goto fixup;

	priv = sna_pixmap(tmp);
	assert(priv && priv->gpu_bo);

	if (!sna_pixmap_move_to_gpu(pixmap, MOVE_ASYNC_HINT | MOVE_SOURCE_HINT | MOVE_READ)) {
fixup:
		DBG(("%s: unable to create GPU bo for target or temporary pixmaps\n",
		     __FUNCTION__));
		return sna_render_picture_fixup(sna, picture, channel,
						x, y, w, h,
						dst_x, dst_y);
	}

	format = PictureMatchFormat(screen,
				    pixmap->drawable.depth,
				    picture->format);
	if (format == NULL) {
		DBG(("%s: invalid depth=%d, format=%08x\n",
		     __FUNCTION__, pixmap->drawable.depth, picture->format));
		goto fixup;
	}

	tmp_dst = CreatePicture(0, &tmp->drawable, format, 0, NULL,
				serverClient, &error);
	if (!tmp_dst)
		goto cleanup_tmp;

	tmp_src = CreatePicture(0, &pixmap->drawable, format, 0, NULL,
				serverClient, &error);
	if (!tmp_src)
		goto cleanup_dst;

	tmp_src->repeat = 1;
	tmp_src->repeatType = RepeatPad;
	/* Prefer to use nearest as it helps reduce artefacts from
	 * interpolating and filtering twice.
	 */
	tmp_src->filter = PictFilterNearest;
	memset(&t, 0, sizeof(t));
	t.matrix[0][0] = (sw << 16) / width;
	t.matrix[0][2] = box.x1 << 16;
	t.matrix[1][1] = (sh << 16) / height;
	t.matrix[1][2] = box.y1 << 16;
	t.matrix[2][2] = 1 << 16;
	tmp_src->transform = &t;

	ValidatePicture(tmp_dst);
	ValidatePicture(tmp_src);

	/* Use a small size to accommodate enlargement through tile alignment */
	max_size = sna_max_tile_copy_size(sna, sna_pixmap(pixmap)->gpu_bo, priv->gpu_bo);
	if (max_size == 0)
		goto cleanup_dst;

	size = sna->render.max_3d_size - 4096 / pixmap->drawable.bitsPerPixel;
	while (size * size * 4 > max_size)
		size /= 2;
	DBG(("%s: size=%d (max=%d), scale %dx%d\n",
	     __FUNCTION__, size, max_size, sx, sy));

	sw = size / sx - 2 * sx;
	if (sw < 1)
		sw = 1;
	sh = size / sy - 2 * sy;
	if (sh < 1)
		sh = 1;
	DBG(("%s %d:%d downsampling using %dx%d GPU tiles\n",
	     __FUNCTION__, (width + sw-1)/sw, (height + sh-1)/sh, sw, sh));

	for (b.y1 = 0; b.y1 < height; b.y1 = b.y2) {
		b.y2 = b.y1 + sh;
		if (b.y2 > height)
			b.y2 = height;

		for (b.x1 = 0; b.x1 < width; b.x1 = b.x2) {
			struct sna_composite_op op;

			b.x2 = b.x1 + sw;
			if (b.x2 > width)
				b.x2 = width;

			DBG(("%s: tile (%d, %d), (%d, %d)\n",
			     __FUNCTION__, b.x1, b.y1, b.x2, b.y2));

			memset(&op, 0, sizeof(op));
			if (!sna->render.composite(sna,
						   PictOpSrc,
						   tmp_src, NULL, tmp_dst,
						   b.x1, b.y1,
						   0, 0,
						   b.x1, b.y1,
						   b.x2 - b.x1, b.y2 - b.y1,
						   0, &op))
				goto cleanup_src;

			op.box(sna, &op, &b);
			op.done(sna, &op);
		}
	}

	pixman_transform_invert(&channel->embedded_transform, &t);
	if (channel->transform)
		pixman_transform_multiply(&channel->embedded_transform,
					  &channel->embedded_transform,
					  channel->transform);
	channel->transform = &channel->embedded_transform;

	channel->offset[0] = x - dst_x;
	channel->offset[1] = y - dst_y;
	channel->scale[0] = 1.f/width;
	channel->scale[1] = 1.f/height;
	channel->width  = width;
	channel->height = height;
	channel->bo = kgem_bo_reference(priv->gpu_bo);

	ret = 1;
cleanup_src:
	tmp_src->transform = NULL;
	FreePicture(tmp_src, 0);
cleanup_dst:
	FreePicture(tmp_dst, 0);
cleanup_tmp:
	screen->DestroyPixmap(tmp);
	return ret;
}

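/* Wrap only the sampled rows of a bo in a proxy so that a sample taken
 * from a surface larger than the 3D pipeline limits can still be used
 * directly; the box is first expanded to even tile-row alignment.
 */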
bool
sna_render_pixmap_partial(struct sna *sna,
			  const DrawableRec *draw,
			  struct kgem_bo *bo,
			  struct sna_composite_channel *channel,
			  int16_t x, int16_t y,
			  int16_t w, int16_t h)
{
	BoxRec box;
	int offset;

	DBG(("%s (%d, %d)x(%d, %d), pitch %d, max %d\n",
	     __FUNCTION__, x, y, w, h, bo->pitch, sna->render.max_3d_pitch));

	if (bo->pitch > sna->render.max_3d_pitch) {
		DBG(("%s: pitch too great %d > %d\n", __FUNCTION__, bo->pitch, sna->render.max_3d_pitch));
		return false;
	}

	box.x1 = x;
	box.y1 = y;
	box.x2 = bound(x, w);
	box.y2 = bound(y, h);
	DBG(("%s: unaligned box (%d, %d), (%d, %d)\n",
	     __FUNCTION__, box.x1, box.y1, box.x2, box.y2));

	if (box.x1 < 0)
		box.x1 = 0;
	if (box.y1 < 0)
		box.y1 = 0;

	if (bo->tiling) {
		int tile_width, tile_height, tile_size;

		kgem_get_tile_size(&sna->kgem, bo->tiling, bo->pitch,
				   &tile_width, &tile_height, &tile_size);
		DBG(("%s: tile size for tiling %d: %dx%d, size=%d\n",
		     __FUNCTION__, bo->tiling, tile_width, tile_height, tile_size));

		/* Ensure we align to an even tile row */
		box.y1 = box.y1 & ~(2*tile_height - 1);
		box.y2 = ALIGN(box.y2, 2*tile_height);

		assert(tile_width * 8 >= draw->bitsPerPixel);
		box.x1 = box.x1 & ~(tile_width * 8 / draw->bitsPerPixel - 1);
		box.x2 = ALIGN(box.x2, tile_width * 8 / draw->bitsPerPixel);

		offset = box.x1 * draw->bitsPerPixel / 8 / tile_width * tile_size;
	} else {
		box.y1 = box.y1 & ~1;
		box.y2 = ALIGN(box.y2, 2);

		box.x1 = box.x1 & ~1;
		box.x2 = ALIGN(box.x2, 2);

		offset = box.x1 * draw->bitsPerPixel / 8;
	}

	if (box.x2 > draw->width)
		box.x2 = draw->width;
	if (box.y2 > draw->height)
		box.y2 = draw->height;

	w = box.x2 - box.x1;
	h = box.y2 - box.y1;
	DBG(("%s box=(%d, %d), (%d, %d): (%d, %d)/(%d, %d)\n", __FUNCTION__,
	     box.x1, box.y1, box.x2, box.y2, w, h,
	     draw->width, draw->height));
	if (w <= 0 || h <= 0 ||
	    w > sna->render.max_3d_size ||
	    h > sna->render.max_3d_size) {
		DBG(("%s: box too large (%dx%d) for 3D pipeline (max %d)\n",
		    __FUNCTION__, w, h, sna->render.max_3d_size));
		return false;
	}

	/* How many tiles across are we? */
	channel->bo = kgem_create_proxy(&sna->kgem, bo,
					box.y1 * bo->pitch + offset,
					h * bo->pitch);
	if (channel->bo == NULL) {
		DBG(("%s: failed to create proxy for partial (offset=%d, size=%d)\n",
		     __FUNCTION__, box.y1 * bo->pitch + offset, h * bo->pitch));
		return false;
	}

	channel->bo->pitch = bo->pitch;

	channel->offset[0] = -box.x1;
	channel->offset[1] = -box.y1;
	channel->scale[0] = 1.f/w;
	channel->scale[1] = 1.f/h;
	channel->width  = w;
	channel->height = h;
	return true;
}

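/* As sna_render_pixmap_partial(), but for a Picture source: clamp the
 * (possibly transformed) sample extents, choose between the CPU and GPU
 * bo, and wrap the sampled rows in a proxy, folding the offset into the
 * channel transform when one is present.
 */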
static bool
sna_render_picture_partial(struct sna *sna,
			   PicturePtr picture,
			   struct sna_composite_channel *channel,
			   int16_t x, int16_t y,
			   int16_t w, int16_t h,
			   int16_t dst_x, int16_t dst_y)
{
	struct kgem_bo *bo = NULL;
	PixmapPtr pixmap = get_drawable_pixmap(picture->pDrawable);
	BoxRec box;
	int offset;

	DBG(("%s (%d, %d)x(%d, %d) [dst=(%d, %d)]\n",
	     __FUNCTION__, x, y, w, h, dst_x, dst_y));

	box.x1 = x;
	box.y1 = y;
	box.x2 = bound(x, w);
	box.y2 = bound(y, h);
	if (channel->transform)
		pixman_transform_bounds(channel->transform, &box);

	DBG(("%s sample=(%d, %d), (%d, %d): (%d, %d)/(%d, %d), repeat=%d\n", __FUNCTION__,
	     box.x1, box.y1, box.x2, box.y2, w, h,
	     pixmap->drawable.width, pixmap->drawable.height,
	     channel->repeat));

	if (channel->repeat == RepeatNone || channel->repeat == RepeatPad) {
		if (box.x1 < 0)
			box.x1 = 0;
		if (box.y1 < 0)
			box.y1 = 0;
		if (box.x2 > pixmap->drawable.width)
			box.x2 = pixmap->drawable.width;
		if (box.y2 > pixmap->drawable.height)
			box.y2 = pixmap->drawable.height;
	} else {
		if (box.x1 < 0 || box.x2 > pixmap->drawable.width)
			box.x1 = 0, box.x2 = pixmap->drawable.width;
		if (box.y1 < 0 || box.y2 > pixmap->drawable.height)
			box.y1 = 0, box.y2 = pixmap->drawable.height;
	}

	if (use_cpu_bo(sna, pixmap, &box, false)) {
		bo = sna_pixmap(pixmap)->cpu_bo;
	} else {
		struct sna_pixmap *priv;

		priv = sna_pixmap_force_to_gpu(pixmap,
					       MOVE_READ | MOVE_ASYNC_HINT | MOVE_SOURCE_HINT);
		if (priv == NULL)
			return false;

		bo = priv->gpu_bo;
	}

	if (bo->pitch > sna->render.max_3d_pitch) {
		DBG(("%s: pitch too great %d > %d\n", __FUNCTION__, bo->pitch, sna->render.max_3d_pitch));
		return false;
	}

	if (bo->tiling) {
		int tile_width, tile_height, tile_size;

		kgem_get_tile_size(&sna->kgem, bo->tiling, bo->pitch,
				   &tile_width, &tile_height, &tile_size);

		DBG(("%s: tiling=%d, size=%dx%d, chunk=%d\n",
		     __FUNCTION__, bo->tiling,
		     tile_width, tile_height, tile_size));

		/* Ensure we align to an even tile row */
		box.y1 = box.y1 & ~(2*tile_height - 1);
		box.y2 = ALIGN(box.y2, 2*tile_height);
		if (box.y2 > pixmap->drawable.height)
			box.y2 = pixmap->drawable.height;

		box.x1 = box.x1 & ~(tile_width * 8 / pixmap->drawable.bitsPerPixel - 1);
		box.x2 = ALIGN(box.x2, tile_width * 8 / pixmap->drawable.bitsPerPixel);
		if (box.x2 > pixmap->drawable.width)
			box.x2 = pixmap->drawable.width;

		offset = box.x1 * pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
	} else
		offset = box.x1 * pixmap->drawable.bitsPerPixel / 8;

	w = box.x2 - box.x1;
	h = box.y2 - box.y1;
	DBG(("%s box=(%d, %d), (%d, %d): (%d, %d)/(%d, %d)\n", __FUNCTION__,
	     box.x1, box.y1, box.x2, box.y2, w, h,
	     pixmap->drawable.width, pixmap->drawable.height));
	if (w <= 0 || h <= 0 ||
	    w > sna->render.max_3d_size ||
	    h > sna->render.max_3d_size)
		return false;

	/* How many tiles across are we? */
	channel->bo = kgem_create_proxy(&sna->kgem, bo,
					box.y1 * bo->pitch + offset,
					h * bo->pitch);
	if (channel->bo == NULL)
		return false;

	if (channel->transform) {
		memset(&channel->embedded_transform,
		       0,
		       sizeof(channel->embedded_transform));
		channel->embedded_transform.matrix[0][0] = 1 << 16;
		channel->embedded_transform.matrix[0][2] = -box.x1 << 16;
		channel->embedded_transform.matrix[1][1] = 1 << 16;
		channel->embedded_transform.matrix[1][2] = -box.y1 << 16;
		channel->embedded_transform.matrix[2][2] = 1 << 16;
		pixman_transform_multiply(&channel->embedded_transform,
					  &channel->embedded_transform,
					  channel->transform);
		channel->transform = &channel->embedded_transform;
	} else {
		x -= box.x1;
		y -= box.y1;
	}

	channel->offset[0] = x - dst_x;
	channel->offset[1] = y - dst_y;
	channel->scale[0] = 1.f/w;
	channel->scale[1] = 1.f/h;
	channel->width  = w;
	channel->height = h;
	return true;
}

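/* Extract the sampled portion of an oversized source into something the
 * 3D pipeline can address: try a zero-copy proxy first, downsample if
 * the sample itself is too large, otherwise blit or upload the box into
 * a temporary bo, falling back to a software fixup on failure.
 */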
int
sna_render_picture_extract(struct sna *sna,
			   PicturePtr picture,
			   struct sna_composite_channel *channel,
			   int16_t x, int16_t y,
			   int16_t w, int16_t h,
			   int16_t dst_x, int16_t dst_y)
{
	struct kgem_bo *bo = NULL, *src_bo;
	PixmapPtr pixmap = get_drawable_pixmap(picture->pDrawable);
	int16_t ox, oy, ow, oh;
	BoxRec box;

#if NO_EXTRACT
	return -1;
#endif

	DBG(("%s (%d, %d)x(%d, %d) [dst=(%d, %d)]\n",
	     __FUNCTION__, x, y, w, h, dst_x, dst_y));

	if (w == 0 || h == 0) {
		DBG(("%s: fallback -- unknown bounds\n", __FUNCTION__));
		return -1;
	}

	if (sna_render_picture_partial(sna, picture, channel,
				       x, y, w, h,
				       dst_x, dst_y))
		return 1;

	ow = w;
	oh = h;

	ox = box.x1 = x;
	oy = box.y1 = y;
	box.x2 = bound(x, w);
	box.y2 = bound(y, h);
	if (channel->transform) {
		pixman_vector_t v;

		pixman_transform_bounds(channel->transform, &box);

		v.vector[0] = ox << 16;
		v.vector[1] = oy << 16;
		v.vector[2] =  1 << 16;
		pixman_transform_point(channel->transform, &v);
		ox = v.vector[0] / v.vector[2];
		oy = v.vector[1] / v.vector[2];
	}

	DBG(("%s sample=(%d, %d), (%d, %d): (%d, %d)/(%d, %d), repeat=%d\n", __FUNCTION__,
	     box.x1, box.y1, box.x2, box.y2, w, h,
	     pixmap->drawable.width, pixmap->drawable.height,
	     channel->repeat));

	if (channel->repeat == RepeatNone || channel->repeat == RepeatPad) {
		if (box.x1 < 0)
			box.x1 = 0;
		if (box.y1 < 0)
			box.y1 = 0;
		if (box.x2 > pixmap->drawable.width)
			box.x2 = pixmap->drawable.width;
		if (box.y2 > pixmap->drawable.height)
			box.y2 = pixmap->drawable.height;
	} else {
		/* XXX tiled repeats? */
		if (box.x1 < 0 || box.x2 > pixmap->drawable.width)
			box.x1 = 0, box.x2 = pixmap->drawable.width;
		if (box.y1 < 0 || box.y2 > pixmap->drawable.height)
			box.y1 = 0, box.y2 = pixmap->drawable.height;
	}

	w = box.x2 - box.x1;
	h = box.y2 - box.y1;
	DBG(("%s box=(%d, %d), (%d, %d): (%d, %d)/(%d, %d)\n", __FUNCTION__,
	     box.x1, box.y1, box.x2, box.y2, w, h,
	     pixmap->drawable.width, pixmap->drawable.height));
	if (w <= 0 || h <= 0) {
		DBG(("%s: sample extents outside of texture -> clear\n",
		     __FUNCTION__));
		return 0;
	}

	if (w > sna->render.max_3d_size || h > sna->render.max_3d_size) {
		DBG(("%s: fallback -- sample too large for texture (%d, %d)x(%d, %d)\n",
		     __FUNCTION__, box.x1, box.y1, w, h));
		return sna_render_picture_downsample(sna, picture, channel,
						     x, y, ow, oh,
						     dst_x, dst_y);
	}

	src_bo = use_cpu_bo(sna, pixmap, &box, true);
	if (src_bo == NULL)
		src_bo = move_to_gpu(pixmap, &box, false);
	if (src_bo) {
		bo = kgem_create_2d(&sna->kgem, w, h,
				    pixmap->drawable.bitsPerPixel,
				    kgem_choose_tiling(&sna->kgem,
						       I915_TILING_X, w, h,
						       pixmap->drawable.bitsPerPixel),
				    CREATE_TEMPORARY);
		if (bo) {
			DrawableRec tmp;

			tmp.width  = w;
			tmp.height = h;
			tmp.depth  = pixmap->drawable.depth;
			tmp.bitsPerPixel = pixmap->drawable.bitsPerPixel;

			assert(tmp.width);
			assert(tmp.height);

			if (!sna->render.copy_boxes(sna, GXcopy,
						    &pixmap->drawable, src_bo, 0, 0,
						    &tmp, bo, -box.x1, -box.y1,
						    &box, 1, 0)) {
				kgem_bo_destroy(&sna->kgem, bo);
				bo = NULL;
			}
		}
	} else {
		struct sna_pixmap *priv = sna_pixmap(pixmap);
		if (priv) {
			RegionRec region;

			region.extents = box;
			region.data = NULL;
			if (!sna_drawable_move_region_to_cpu(&pixmap->drawable,
							     &region, MOVE_READ))
				return 0;

			assert(!priv->mapped);
			if (pixmap->devPrivate.ptr == NULL)
				return 0; /* uninitialised */
		}

		bo = kgem_upload_source_image(&sna->kgem,
					      pixmap->devPrivate.ptr,
					      &box,
					      pixmap->devKind,
					      pixmap->drawable.bitsPerPixel);
		if (priv != NULL && bo != NULL &&
		    box.x2 - box.x1 == pixmap->drawable.width &&
		    box.y2 - box.y1 == pixmap->drawable.height) {
			DBG(("%s: adding upload cache to pixmap=%ld\n",
			     __FUNCTION__, pixmap->drawable.serialNumber));
			assert(priv->gpu_damage == NULL);
			assert(priv->gpu_bo == NULL);
			assert(bo->proxy != NULL);
			sna_damage_all(&priv->cpu_damage, pixmap);
			kgem_proxy_bo_attach(bo, &priv->gpu_bo);
		}
	}

	if (bo == NULL) {
		DBG(("%s: falback -- pixmap is not on the GPU\n",
		     __FUNCTION__));
		return sna_render_picture_fixup(sna, picture, channel,
						x, y, ow, oh, dst_x, dst_y);
	}

	if (ox == x && oy == y) {
		x = y = 0;
	} else if (channel->transform) {
		pixman_vector_t v;
		pixman_transform_t m;

		v.vector[0] = (ox - box.x1) << 16;
		v.vector[1] = (oy - box.y1) << 16;
		v.vector[2] = 1 << 16;
		pixman_transform_invert(&m, channel->transform);
		pixman_transform_point(&m, &v);
		x = v.vector[0] / v.vector[2];
		y = v.vector[1] / v.vector[2];
	} else {
		x = ox - box.x1;
		y = oy - box.y1;
	}

	channel->offset[0] = x - dst_x;
	channel->offset[1] = y - dst_y;
	channel->scale[0] = 1.f/w;
	channel->scale[1] = 1.f/h;
	channel->width  = w;
	channel->height = h;
	channel->bo = bo;
	return 1;
}

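/* Evaluate a convolution-filtered source by accumulating one shifted,
 * alpha-weighted pass per kernel tap (PictOpAdd) into a scratch pixmap,
 * which then becomes the channel. Kernels of more than 32 taps are
 * rejected as the quantisation error becomes too large.
 */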
static int
sna_render_picture_convolve(struct sna *sna,
			    PicturePtr picture,
			    struct sna_composite_channel *channel,
			    int16_t x, int16_t y,
			    int16_t w, int16_t h,
			    int16_t dst_x, int16_t dst_y)
{
	ScreenPtr screen = picture->pDrawable->pScreen;
	PixmapPtr pixmap;
	PicturePtr tmp;
	pixman_fixed_t *params = picture->filter_params;
	int x_off = -pixman_fixed_to_int((params[0] - pixman_fixed_1) >> 1);
	int y_off = -pixman_fixed_to_int((params[1] - pixman_fixed_1) >> 1);
	int cw = pixman_fixed_to_int(params[0]);
	int ch = pixman_fixed_to_int(params[1]);
	int i, j, error, depth;
	struct kgem_bo *bo;

	/* Lame multi-pass accumulation implementation of a general convolution
	 * that works everywhere.
	 */
	DBG(("%s: origin=(%d,%d) kernel=%dx%d, size=%dx%d\n",
	     __FUNCTION__, x_off, y_off, cw, ch, w, h));
	if (cw*ch > 32) /* too much loss of precision from quantization! */
		return -1;

	assert(picture->pDrawable);
	assert(picture->filter == PictFilterConvolution);
	assert(w <= sna->render.max_3d_size && h <= sna->render.max_3d_size);

	if (PICT_FORMAT_RGB(picture->format) == 0) {
		channel->pict_format = PIXMAN_a8;
		depth = 8;
	} else {
		channel->pict_format = PIXMAN_a8r8g8b8;
		depth = 32;
	}

	pixmap = screen->CreatePixmap(screen, w, h, depth, SNA_CREATE_SCRATCH);
	if (pixmap == NullPixmap) {
		DBG(("%s: pixmap allocation failed\n", __FUNCTION__));
		return -1;
	}

	tmp = NULL;
	bo = __sna_pixmap_get_bo(pixmap);
	assert(bo);
	if (sna->render.clear(sna, pixmap, bo))
		tmp = CreatePicture(0, &pixmap->drawable,
				PictureMatchFormat(screen, depth, channel->pict_format),
				0, NULL, serverClient, &error);
	screen->DestroyPixmap(pixmap);
	if (tmp == NULL)
		return -1;

	ValidatePicture(tmp);

	picture->filter = PictFilterBilinear;
	params += 2;
	for (j = 0; j < ch; j++) {
		for (i = 0; i < cw; i++) {
			xRenderColor color;
			PicturePtr alpha;

			color.alpha = *params++;
			color.red = color.green = color.blue = 0;
			DBG(("%s: (%d, %d), alpha=%x\n",
			     __FUNCTION__, i,j, color.alpha));

			if (color.alpha <= 0x00ff)
				continue;

			alpha = CreateSolidPicture(0, &color, &error);
			if (alpha) {
				sna_composite(PictOpAdd, picture, alpha, tmp,
					      x-(x_off+i), y-(y_off+j),
					      0, 0,
					      0, 0,
					      w, h);
				FreePicture(alpha, 0);
			}
		}
	}
	picture->filter = PictFilterConvolution;

	channel->height = h;
	channel->width  = w;
	channel->filter = PictFilterNearest;
	channel->repeat = RepeatNone;
	channel->is_affine = true;
	channel->transform = NULL;
	channel->scale[0] = 1.f / w;
	channel->scale[1] = 1.f / h;
	channel->offset[0] = -dst_x;
	channel->offset[1] = -dst_y;
	channel->bo = kgem_bo_reference(bo); /* transfer ownership */
	FreePicture(tmp, 0);

	return 1;
}

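/* Combine a picture with its separate alpha map (source IN alpha) into a
 * temporary a8r8g8b8 pixmap, which then replaces the source channel.
 */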
static bool
sna_render_picture_flatten(struct sna *sna,
			   PicturePtr picture,
			   struct sna_composite_channel *channel,
			   int16_t x, int16_t y,
			   int16_t w, int16_t h,
			   int16_t dst_x, int16_t dst_y)
{
	ScreenPtr screen = picture->pDrawable->pScreen;
	PixmapPtr pixmap;
	PicturePtr tmp, alpha;
	int old_format, error;

	assert(picture->pDrawable);
	assert(picture->alphaMap);
	assert(w <= sna->render.max_3d_size && h <= sna->render.max_3d_size);

	/* XXX shortcut a8? */
	DBG(("%s: %dx%d\n", __FUNCTION__, w, h));

	pixmap = screen->CreatePixmap(screen, w, h, 32, SNA_CREATE_SCRATCH);
	if (pixmap == NullPixmap) {
		DBG(("%s: pixmap allocation failed\n", __FUNCTION__));
		return false;
	}

	assert(__sna_pixmap_get_bo(pixmap));

	tmp = CreatePicture(0, &pixmap->drawable,
			    PictureMatchFormat(screen, 32, PICT_a8r8g8b8),
			    0, NULL, serverClient, &error);
	screen->DestroyPixmap(pixmap);
	if (tmp == NULL)
		return false;

	ValidatePicture(tmp);

	old_format = picture->format;
	picture->format = PICT_FORMAT(PICT_FORMAT_BPP(picture->format),
				      PICT_FORMAT_TYPE(picture->format),
				      0,
				      PICT_FORMAT_R(picture->format),
				      PICT_FORMAT_G(picture->format),
				      PICT_FORMAT_B(picture->format));

	alpha = picture->alphaMap;
	picture->alphaMap = NULL;

	sna_composite(PictOpSrc, picture, alpha, tmp,
		      x, y,
		      x + picture->alphaOrigin.x, y + picture->alphaOrigin.y,
		      0, 0,
		      w, h);

	picture->format = old_format;
	picture->alphaMap = alpha;

	channel->height = h;
	channel->width  = w;
	channel->filter = PictFilterNearest;
	channel->repeat = RepeatNone;
	channel->pict_format = PIXMAN_a8r8g8b8;
	channel->is_affine = true;
	channel->transform = NULL;
	channel->scale[0] = 1.f / w;
	channel->scale[1] = 1.f / h;
	channel->offset[0] = -dst_x;
	channel->offset[1] = -dst_y;
	channel->bo = kgem_bo_reference(__sna_pixmap_get_bo(pixmap));
	FreePicture(tmp, 0);

	return true;
}

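/* Render an oversized gradient at half resolution with pixman into an
 * upload buffer and let the sampler scale it back up; an approximation
 * that is usually acceptable for smooth gradients.
 */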
int
sna_render_picture_approximate_gradient(struct sna *sna,
					PicturePtr picture,
					struct sna_composite_channel *channel,
					int16_t x, int16_t y,
					int16_t w, int16_t h,
					int16_t dst_x, int16_t dst_y)
{
	pixman_image_t *dst, *src;
	pixman_transform_t t;
	int w2 = w/2, h2 = h/2;
	int dx, dy;
	void *ptr;

#if NO_FIXUP
	return -1;
#endif

	DBG(("%s: (%d, %d)x(%d, %d), dst=(%d, %d)\n",
	     __FUNCTION__, x, y, w, h, dst_x, dst_y));

	if (w2 == 0 || h2 == 0) {
		DBG(("%s: fallback - unknown bounds\n", __FUNCTION__));
		return -1;
	}
	if (w2 > sna->render.max_3d_size || h2 > sna->render.max_3d_size) {
		DBG(("%s: fallback - too large (%dx%d)\n", __FUNCTION__, w, h));
		return -1;
	}

	channel->is_opaque = sna_gradient_is_opaque((PictGradient*)picture->pSourcePict);
	channel->pict_format =
		channel->is_opaque ? PIXMAN_x8r8g8b8 : PIXMAN_a8r8g8b8;
	DBG(("%s: gradient is opaque? %d, selecting format %08x\n",
	     __FUNCTION__, channel->is_opaque, channel->pict_format));
	assert(channel->card_format == -1);

	channel->bo = kgem_create_buffer_2d(&sna->kgem,
					    w2, h2, 32,
					    KGEM_BUFFER_WRITE_INPLACE,
					    &ptr);
	if (!channel->bo) {
		DBG(("%s: failed to create upload buffer, using clear\n",
		     __FUNCTION__));
		return 0;
	}

	dst = pixman_image_create_bits(channel->pict_format,
				       w2, h2, ptr, channel->bo->pitch);
	if (!dst) {
		kgem_bo_destroy(&sna->kgem, channel->bo);
		channel->bo = NULL;
		return 0;
	}

	src = image_from_pict(picture, false, &dx, &dy);
	if (src == NULL) {
		pixman_image_unref(dst);
		kgem_bo_destroy(&sna->kgem, channel->bo);
		channel->bo = NULL;
		return 0;
	}
	DBG(("%s: source offset (%d, %d)\n", __FUNCTION__, dx, dy));

	memset(&t, 0, sizeof(t));
	t.matrix[0][0] = (w << 16) / w2;
	t.matrix[0][2] = (x + dx) << 16;
	t.matrix[1][1] = (h << 16) / h2;
	t.matrix[1][2] = (y + dy) << 16;
	t.matrix[2][2] = 1 << 16;
	if (picture->transform)
		pixman_transform_multiply(&t, picture->transform, &t);
	DBG(("%s: applying transform [(%f, %f, %f), (%f, %f, %f), (%f, %f, %f)]\n",
	     __FUNCTION__,
	     pixman_fixed_to_double(t.matrix[0][0]),
	     pixman_fixed_to_double(t.matrix[0][1]),
	     pixman_fixed_to_double(t.matrix[0][2]),
	     pixman_fixed_to_double(t.matrix[1][0]),
	     pixman_fixed_to_double(t.matrix[1][1]),
	     pixman_fixed_to_double(t.matrix[1][2]),
	     pixman_fixed_to_double(t.matrix[2][0]),
	     pixman_fixed_to_double(t.matrix[2][1]),
	     pixman_fixed_to_double(t.matrix[2][2])));
	pixman_image_set_transform(src, &t);

	sna_image_composite(PictOpSrc, src, NULL, dst,
			    0, 0,
			    0, 0,
			    0, 0,
			    w2, h2);
	free_pixman_pict(picture, src);
	pixman_image_unref(dst);

	channel->width  = w2;
	channel->height = h2;

	channel->filter = PictFilterNearest;
	channel->repeat = RepeatNone;
	channel->is_affine = true;

	channel->scale[0] = 1.f/w;
	channel->scale[1] = 1.f/h;
	channel->offset[0] = -dst_x;
	channel->offset[1] = -dst_y;
	channel->transform = NULL;

	return 1;
}

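/* Last-resort software path for sources the GPU cannot sample directly
 * (alpha maps, convolution filters, awkward formats): render the sampled
 * area with pixman into an upload buffer, converting to a8 or a8r8g8b8,
 * and use that as an untransformed channel.
 */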
int
sna_render_picture_fixup(struct sna *sna,
			 PicturePtr picture,
			 struct sna_composite_channel *channel,
			 int16_t x, int16_t y,
			 int16_t w, int16_t h,
			 int16_t dst_x, int16_t dst_y)
{
	pixman_image_t *dst, *src;
	int dx, dy;
	void *ptr;

#if NO_FIXUP
	return -1;
#endif

	DBG(("%s: (%d, %d)x(%d, %d)\n", __FUNCTION__, x, y, w, h));

	if (w == 0 || h == 0) {
		DBG(("%s: fallback - unknown bounds\n", __FUNCTION__));
		return -1;
	}
	if (w > sna->render.max_3d_size || h > sna->render.max_3d_size) {
		DBG(("%s: fallback - too large (%dx%d)\n", __FUNCTION__, w, h));
		return -1;
	}

	if (picture->alphaMap) {
		DBG(("%s: alphamap\n", __FUNCTION__));
		if (is_gpu(sna, picture->pDrawable, PREFER_GPU_RENDER) ||
		    is_gpu(sna, picture->alphaMap->pDrawable, PREFER_GPU_RENDER)) {
			if (sna_render_picture_flatten(sna, picture, channel,
							  x, y, w, h, dst_x, dst_y))
				return 1;
		}

		goto do_fixup;
	}

	if (picture->filter == PictFilterConvolution) {
		DBG(("%s: convolution\n", __FUNCTION__));
		if (is_gpu(sna, picture->pDrawable, PREFER_GPU_RENDER)) {
			return sna_render_picture_convolve(sna, picture, channel,
							   x, y, w, h, dst_x, dst_y);
		}

		goto do_fixup;
	}

do_fixup:
	if (PICT_FORMAT_RGB(picture->format) == 0)
		channel->pict_format = PIXMAN_a8;
	else
		channel->pict_format = PIXMAN_a8r8g8b8;

	if (picture->pDrawable &&
	    !sna_drawable_move_to_cpu(picture->pDrawable, MOVE_READ))
		return 0;

	channel->bo = kgem_create_buffer_2d(&sna->kgem,
					    w, h, PIXMAN_FORMAT_BPP(channel->pict_format),
					    KGEM_BUFFER_WRITE_INPLACE,
					    &ptr);
	if (!channel->bo) {
		DBG(("%s: failed to create upload buffer, using clear\n",
		     __FUNCTION__));
		return 0;
	}

	/* Composite in the original format to preserve idiosyncrasies */
	if (!kgem_buffer_is_inplace(channel->bo) &&
	    (picture->pDrawable == NULL ||
	     alphaless(picture->format) == alphaless(channel->pict_format)))
		dst = pixman_image_create_bits(channel->pict_format,
					       w, h, ptr, channel->bo->pitch);
	else
		dst = pixman_image_create_bits((pixman_format_code_t)picture->format,
					       w, h, NULL, 0);
	if (!dst) {
		kgem_bo_destroy(&sna->kgem, channel->bo);
		return 0;
	}

	src = image_from_pict(picture, false, &dx, &dy);
	if (src == NULL) {
		pixman_image_unref(dst);
		kgem_bo_destroy(&sna->kgem, channel->bo);
		return 0;
	}

	DBG(("%s: compositing tmp=(%d+%d, %d+%d)x(%d, %d)\n",
	     __FUNCTION__, x, dx, y, dy, w, h));
	sna_image_composite(PictOpSrc, src, NULL, dst,
			    x + dx, y + dy,
			    0, 0,
			    0, 0,
			    w, h);
	free_pixman_pict(picture, src);

	/* Then convert to card format */
	if (pixman_image_get_data(dst) != ptr) {
		DBG(("%s: performing post-conversion %08x->%08x (%d, %d)\n",
		     __FUNCTION__,
		     picture->format, channel->pict_format,
		     w, h));

		src = dst;
		dst = pixman_image_create_bits(channel->pict_format,
					       w, h, ptr, channel->bo->pitch);
		if (dst) {
			sna_image_composite(PictOpSrc, src, NULL, dst,
					    0, 0,
					    0, 0,
					    0, 0,
					    w, h);
			pixman_image_unref(src);
		} else {
			memset(ptr, 0, __kgem_buffer_size(channel->bo));
			dst = src;
		}
	}
	pixman_image_unref(dst);

	channel->width  = w;
	channel->height = h;

	channel->filter = PictFilterNearest;
	channel->repeat = RepeatNone;
	channel->is_affine = true;

	channel->scale[0] = 1.f/w;
	channel->scale[1] = 1.f/h;
	channel->offset[0] = -dst_x;
	channel->offset[1] = -dst_y;
	channel->transform = NULL;

	return 1;
}

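/* Convert a source picture whose format the samplers cannot read: if the
 * pixmap is on the GPU, convert there via a scratch pixmap and a
 * PictOpSrc composite; otherwise convert with pixman into an upload
 * buffer, reducing to a8 or a8r8g8b8.
 */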
int
sna_render_picture_convert(struct sna *sna,
			   PicturePtr picture,
			   struct sna_composite_channel *channel,
			   PixmapPtr pixmap,
			   int16_t x, int16_t y,
			   int16_t w, int16_t h,
			   int16_t dst_x, int16_t dst_y,
			   bool fixup_alpha)
{
	BoxRec box;

#if NO_CONVERT
	return -1;
#endif

	if (w != 0 && h != 0) {
		box.x1 = x;
		box.y1 = y;
		box.x2 = bound(x, w);
		box.y2 = bound(y, h);

		if (channel->transform) {
			DBG(("%s: has transform, converting whole surface\n",
			     __FUNCTION__));
			box.x1 = box.y1 = 0;
			box.x2 = pixmap->drawable.width;
			box.y2 = pixmap->drawable.height;
		}

		if (box.x1 < 0)
			box.x1 = 0;
		if (box.y1 < 0)
			box.y1 = 0;
		if (box.x2 > pixmap->drawable.width)
			box.x2 = pixmap->drawable.width;
		if (box.y2 > pixmap->drawable.height)
			box.y2 = pixmap->drawable.height;
	} else {
		DBG(("%s: op no bounds, converting whole surface\n",
		     __FUNCTION__));
		box.x1 = box.y1 = 0;
		box.x2 = pixmap->drawable.width;
		box.y2 = pixmap->drawable.height;
	}

	w = box.x2 - box.x1;
	h = box.y2 - box.y1;

	DBG(("%s: convert (%d, %d)x(%d, %d), source size %dx%d\n",
	     __FUNCTION__, box.x1, box.y1, w, h,
	     pixmap->drawable.width,
	     pixmap->drawable.height));

	if (w <= 0 || h <= 0) {
		DBG(("%s: sample extents lie outside of source, using clear\n",
		     __FUNCTION__));
		return 0;
	}

	if (fixup_alpha && is_gpu(sna, &pixmap->drawable, PREFER_GPU_RENDER)) {
		ScreenPtr screen = pixmap->drawable.pScreen;
		PixmapPtr tmp;
		PicturePtr src, dst;
		int error;

		assert(PICT_FORMAT_BPP(picture->format) == pixmap->drawable.bitsPerPixel);
		channel->pict_format = PICT_FORMAT(PICT_FORMAT_BPP(picture->format),
						   PICT_FORMAT_TYPE(picture->format),
						   PICT_FORMAT_BPP(picture->format) - PIXMAN_FORMAT_DEPTH(picture->format),
						   PICT_FORMAT_R(picture->format),
						   PICT_FORMAT_G(picture->format),
						   PICT_FORMAT_B(picture->format));

		DBG(("%s: converting to %08x from %08x using composite alpha-fixup\n",
		     __FUNCTION__,
		     (unsigned)channel->pict_format,
		     (unsigned)picture->format));

		tmp = screen->CreatePixmap(screen, w, h, pixmap->drawable.bitsPerPixel, SNA_CREATE_SCRATCH);
		if (tmp == NULL)
			return -1;

		assert(__sna_pixmap_get_bo(tmp));

		dst = CreatePicture(0, &tmp->drawable,
				    PictureMatchFormat(screen,
						       pixmap->drawable.bitsPerPixel,
						       channel->pict_format),
				    0, NULL, serverClient, &error);
		if (dst == NULL) {
			screen->DestroyPixmap(tmp);
			return 0;
		}

		src = CreatePicture(0, &pixmap->drawable,
				    PictureMatchFormat(screen,
						       pixmap->drawable.depth,
						       picture->format),
				    0, NULL, serverClient, &error);
		if (src == NULL) {
			FreePicture(dst, 0);
			screen->DestroyPixmap(tmp);
			return 0;
		}

		ValidatePicture(src);
		ValidatePicture(dst);

		sna_composite(PictOpSrc, src, NULL, dst,
			      box.x1, box.y1,
			      0, 0,
			      0, 0,
			      w, h);
		FreePicture(dst, 0);
		FreePicture(src, 0);

		channel->bo = __sna_pixmap_get_bo(tmp);
		kgem_bo_reference(channel->bo);
		screen->DestroyPixmap(tmp);
	} else {
		pixman_image_t *src, *dst;
		void *ptr;

		if (!sna_pixmap_move_to_cpu(pixmap, MOVE_READ))
			return 0;

		src = pixman_image_create_bits((pixman_format_code_t)picture->format,
					       pixmap->drawable.width,
					       pixmap->drawable.height,
					       pixmap->devPrivate.ptr,
					       pixmap->devKind);
		if (!src)
			return 0;

		if (PICT_FORMAT_RGB(picture->format) == 0) {
			channel->pict_format = PIXMAN_a8;
			DBG(("%s: converting to a8 from %08x\n",
			     __FUNCTION__, picture->format));
		} else {
			channel->pict_format = PIXMAN_a8r8g8b8;
			DBG(("%s: converting to a8r8g8b8 from %08x\n",
			     __FUNCTION__, picture->format));
		}

		channel->bo = kgem_create_buffer_2d(&sna->kgem,
						    w, h, PIXMAN_FORMAT_BPP(channel->pict_format),
						    KGEM_BUFFER_WRITE_INPLACE,
						    &ptr);
		if (!channel->bo) {
			pixman_image_unref(src);
			return 0;
		}

		dst = pixman_image_create_bits(channel->pict_format,
					       w, h, ptr, channel->bo->pitch);
		if (!dst) {
			kgem_bo_destroy(&sna->kgem, channel->bo);
			pixman_image_unref(src);
			return 0;
		}

		if (sigtrap_get() == 0) {
			sna_image_composite(PictOpSrc, src, NULL, dst,
					    box.x1, box.y1,
					    0, 0,
					    0, 0,
					    w, h);
			sigtrap_put();
		}
		pixman_image_unref(dst);
		pixman_image_unref(src);
	}

	channel->width  = w;
	channel->height = h;

	channel->scale[0] = 1.f/w;
	channel->scale[1] = 1.f/h;
	channel->offset[0] = x - dst_x - box.x1;
	channel->offset[1] = y - dst_y - box.y1;

	DBG(("%s: offset=(%d, %d), size=(%d, %d)\n",
	     __FUNCTION__,
	     channel->offset[0], channel->offset[1],
	     channel->width, channel->height));
	return 1;
}

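/* The destination is too large (or its pitch too great) for the 3D
 * pipeline, so redirect rendering either to a tile-aligned proxy of the
 * existing bo or to a temporary surface that is copied back by
 * sna_render_composite_redirect_done().
 */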
bool
sna_render_composite_redirect(struct sna *sna,
			      struct sna_composite_op *op,
			      int x, int y, int width, int height,
			      bool partial)
{
	struct sna_composite_redirect *t = &op->redirect;
	int bpp = op->dst.pixmap->drawable.bitsPerPixel;
	struct kgem_bo *bo;

	assert(t->real_bo == NULL);

#if NO_REDIRECT
	return false;
#endif

	DBG(("%s: target too large (%dx%d), copying to temporary %dx%d, max %d / %d\n",
	     __FUNCTION__,
	     op->dst.width, op->dst.height,
	     width, height,
	     sna->render.max_3d_size,
	     sna->render.max_3d_pitch));

	if (!width || !height)
		return false;

	if (width  > sna->render.max_3d_size ||
	    height > sna->render.max_3d_size)
		return false;

	if (op->dst.bo->pitch <= sna->render.max_3d_pitch) {
		BoxRec box;
		int w, h, offset;

		DBG(("%s: dst pitch (%d) fits within render pipeline (%d)\n",
		     __FUNCTION__, op->dst.bo->pitch, sna->render.max_3d_pitch));

		box.x1 = x + op->dst.x;
		box.x2 = bound(box.x1, width);
		box.y1 = y + op->dst.y;
		box.y2 = bound(box.y1, height);

		if (box.x1 < 0)
			box.x1 = 0;
		if (box.y1 < 0)
			box.y1 = 0;

		/* Ensure we align to an even tile row */
		if (op->dst.bo->tiling) {
			int tile_width, tile_height, tile_size;

			kgem_get_tile_size(&sna->kgem, op->dst.bo->tiling, op->dst.bo->pitch,
					   &tile_width, &tile_height, &tile_size);

			box.y1 = box.y1 & ~(2*tile_height - 1);
			box.y2 = ALIGN(box.y2, 2*tile_height);

			box.x1 = box.x1 & ~(tile_width * 8 / op->dst.pixmap->drawable.bitsPerPixel - 1);
			box.x2 = ALIGN(box.x2, tile_width * 8 / op->dst.pixmap->drawable.bitsPerPixel);

			if (box.x1 > sna->render.max_3d_size &&
			    box.x2 <= 2*sna->render.max_3d_size)
				box.x1 = sna->render.max_3d_size;

			if (box.y1 > sna->render.max_3d_size &&
			    box.y2 <= 2*sna->render.max_3d_size)
				box.y1 = sna->render.max_3d_size;

			offset = box.x1 * op->dst.pixmap->drawable.bitsPerPixel / 8 / tile_width * tile_size;
		} else {
			if (sna->kgem.gen < 040) {
				box.y1 = box.y1 & ~3;
				box.y2 = ALIGN(box.y2, 4);

				box.x1 = box.x1 & ~3;
				box.x2 = ALIGN(box.x2, 4);
			} else {
				box.y1 = box.y1 & ~1;
				box.y2 = ALIGN(box.y2, 2);

				box.x1 = box.x1 & ~1;
				box.x2 = ALIGN(box.x2, 2);
			}

			if (box.x1 > sna->render.max_3d_size &&
			    box.x2 <= 2*sna->render.max_3d_size)
				box.x1 = sna->render.max_3d_size;

			if (box.y1 > sna->render.max_3d_size &&
			    box.y2 <= 2*sna->render.max_3d_size)
				box.y1 = sna->render.max_3d_size;

			offset = box.x1 * op->dst.pixmap->drawable.bitsPerPixel / 8;
		}

		if (box.y2 > op->dst.pixmap->drawable.height)
			box.y2 = op->dst.pixmap->drawable.height;

		if (box.x2 > op->dst.pixmap->drawable.width)
			box.x2 = op->dst.pixmap->drawable.width;

		w = box.x2 - box.x1;
		h = box.y2 - box.y1;
		DBG(("%s box=(%d, %d), (%d, %d): (%d, %d)/(%d, %d), max %d\n", __FUNCTION__,
		     box.x1, box.y1, box.x2, box.y2, w, h,
		     op->dst.pixmap->drawable.width,
		     op->dst.pixmap->drawable.height,
		     sna->render.max_3d_size));
		if (w <= sna->render.max_3d_size &&
		    h <= sna->render.max_3d_size) {
			t->box.x2 = t->box.x1 = op->dst.x;
			t->box.y2 = t->box.y1 = op->dst.y;
			t->real_bo = op->dst.bo;
			t->real_damage = op->damage;
			if (op->damage) {
				assert(!DAMAGE_IS_ALL(op->damage));
				t->damage = sna_damage_create();
				op->damage = &t->damage;
			}

			/* How many tiles across are we? */
			op->dst.bo = kgem_create_proxy(&sna->kgem, op->dst.bo,
						       box.y1 * op->dst.bo->pitch + offset,
						       h * op->dst.bo->pitch);
			if (!op->dst.bo) {
				t->real_bo = NULL;
				if (t->damage)
					__sna_damage_destroy(t->damage);
				return false;
			}

			assert(op->dst.bo != t->real_bo);
			op->dst.bo->pitch = t->real_bo->pitch;

			op->dst.x -= box.x1;
			op->dst.y -= box.y1;
			op->dst.width  = w;
			op->dst.height = h;
			return true;
		}
	}

	/* We can process the operation in a single pass,
	 * but the target is too large for the 3D pipeline.
	 * Copy into a smaller surface and replace afterwards.
	 */
	bo = kgem_create_2d(&sna->kgem,
			    width, height, bpp,
			    kgem_choose_tiling(&sna->kgem, I915_TILING_X,
					       width, height, bpp),
			    CREATE_TEMPORARY);
	if (!bo)
		return false;

	t->box.x1 = x + op->dst.x;
	t->box.y1 = y + op->dst.y;
	t->box.x2 = bound(t->box.x1, width);
	t->box.y2 = bound(t->box.y1, height);

	DBG(("%s: original box (%d, %d), (%d, %d)\n",
	     __FUNCTION__, t->box.x1, t->box.y1, t->box.x2, t->box.y2));

	if (partial &&
	    !sna_blt_copy_boxes(sna, GXcopy,
				op->dst.bo, 0, 0,
				bo, -t->box.x1, -t->box.y1,
				bpp, &t->box, 1)) {
		kgem_bo_destroy(&sna->kgem, bo);
		return false;
	}

	t->real_bo = op->dst.bo;
	t->real_damage = op->damage;
	if (op->damage) {
		assert(!DAMAGE_IS_ALL(op->damage));
		t->damage = sna_damage_create();
		op->damage = &t->damage;
	}

	op->dst.bo = bo;
	op->dst.x = -x;
	op->dst.y = -y;
	op->dst.width  = width;
	op->dst.height = height;
	return true;
}

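/* Copy the temporary redirect surface back into the real destination and
 * fold any accumulated damage back into the original damage tracking.
 */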
void
sna_render_composite_redirect_done(struct sna *sna,
				   const struct sna_composite_op *op)
{
	const struct sna_composite_redirect *t = &op->redirect;

	if (t->real_bo) {
		assert(op->dst.bo != t->real_bo);

		if (t->box.x2 > t->box.x1) {
			bool ok;

			DBG(("%s: copying temporary to dst\n", __FUNCTION__));
			ok = sna_blt_copy_boxes(sna, GXcopy,
						op->dst.bo, -t->box.x1, -t->box.y1,
						t->real_bo, 0, 0,
						op->dst.pixmap->drawable.bitsPerPixel,
						&t->box, 1);
			assert(ok);
			(void)ok;
		}
		if (t->damage) {
			DBG(("%s: combining damage (all? %d), offset=(%d, %d)\n",
			     __FUNCTION__, (int)DAMAGE_IS_ALL(t->damage),
			     t->box.x1, t->box.y1));
			sna_damage_combine(t->real_damage,
					   DAMAGE_PTR(t->damage),
					   t->box.x1, t->box.y1);
			__sna_damage_destroy(DAMAGE_PTR(t->damage));
		}

		kgem_bo_destroy(&sna->kgem, op->dst.bo);
	}
}

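/* Copy between overlapping areas of the same bo by bouncing the data
 * through a scratch pixmap sized to the extents.
 */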
static bool
copy_overlap(struct sna *sna, uint8_t alu,
	     const DrawableRec *draw, struct kgem_bo *bo,
	     int16_t src_dx, int16_t src_dy,
	     int16_t dst_dx, int16_t dst_dy,
	     const BoxRec *box, int n, const BoxRec *extents)
{
	ScreenPtr screen = draw->pScreen;
	struct kgem_bo *tmp_bo;
	PixmapPtr tmp;
	bool ret = false;

	if (n == 0)
		return true;

	DBG(("%s: %d x %dx%d src=(%d, %d), dst=(%d, %d)\n",
	     __FUNCTION__, n,
	     extents->x2 - extents->x1,
	     extents->y2 - extents->y1,
	     src_dx, src_dy,
	     dst_dx, dst_dy));

	tmp = screen->CreatePixmap(screen,
				   extents->x2 - extents->x1,
				   extents->y2 - extents->y1,
				   draw->depth,
				   SNA_CREATE_SCRATCH);
	if (tmp == NULL)
		return false;

	tmp_bo = __sna_pixmap_get_bo(tmp);
	assert(tmp_bo);

	ret = (sna->render.copy_boxes(sna, GXcopy,
				      draw, bo, src_dx, src_dy,
				      &tmp->drawable, tmp_bo, -extents->x1, -extents->y1,
				      box, n, 0) &&
	       sna->render.copy_boxes(sna, alu,
				      &tmp->drawable, tmp_bo, -extents->x1, -extents->y1,
				      draw, bo, dst_dx, dst_dy,
				      box, n, 0));

	screen->DestroyPixmap(tmp);
	return ret;
}
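
/* Split an overlapping copy into the non-overlapping portion, which can be
 * copied in place, and the truly overlapping remainder, which is bounced
 * through a temporary pixmap.
 */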
bool
sna_render_copy_boxes__overlap(struct sna *sna, uint8_t alu,
			       const DrawableRec *draw, struct kgem_bo *bo,
			       int16_t src_dx, int16_t src_dy,
			       int16_t dst_dx, int16_t dst_dy,
			       const BoxRec *box, int n, const BoxRec *extents)
{
	bool ret = false;
	RegionRec overlap, non_overlap;
	pixman_region16_t region;
	pixman_box16_t stack_boxes[64], *boxes = stack_boxes;
	int num_boxes, i;

	DBG(("%s: pixmap=%ld, handle=%d, %d x [(%d, %d), (%d, %d)], dst=(%d, %d), src=(%d, %d)\n",
	     __FUNCTION__, draw->serialNumber, bo->handle,
	     n, extents->x1, extents->y1, extents->x2, extents->y2,
	     src_dx, src_dy, dst_dx, dst_dy));

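	/* With a very small offset nearly every box overlaps itself, so skip
	 * the region analysis and bounce the whole copy through a temporary.
	 */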
	if ((dst_dx - src_dx < 4 && src_dx - dst_dx < 4) &&
	    (dst_dy - src_dy < 4 && src_dy - dst_dy < 4))
		return copy_overlap(sna, alu, draw, bo,
				    src_dx, src_dy,
				    dst_dx, dst_dy,
				    box, n, extents);

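	/* Without room to clip the boxes we cannot separate the regions, so
	 * fall back to treating the entire copy as overlapping.
	 */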
	if (n > ARRAY_SIZE(stack_boxes)) {
		boxes = malloc(sizeof(pixman_box16_t) * n);
		if (boxes == NULL)
			return copy_overlap(sna, alu, draw, bo,
					    src_dx, src_dy,
					    dst_dx, dst_dy,
					    box, n, extents);
	}

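	/* Translate each box into destination space and clip it against the
	 * destination extents, dropping any boxes that become empty.
	 */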
	region.extents.x1 = extents->x1 + dst_dx;
	region.extents.x2 = extents->x2 + dst_dx;
	region.extents.y1 = extents->y1 + dst_dy;
	region.extents.y2 = extents->y2 + dst_dy;

	for (i = num_boxes = 0; i < n; i++) {
		boxes[num_boxes].x1 = box[i].x1 + dst_dx;
		if (boxes[num_boxes].x1 < region.extents.x1)
			boxes[num_boxes].x1 = region.extents.x1;

		boxes[num_boxes].y1 = box[i].y1 + dst_dy;
		if (boxes[num_boxes].y1 < region.extents.y1)
			boxes[num_boxes].y1 = region.extents.y1;

		boxes[num_boxes].x2 = box[i].x2 + dst_dx;
		if (boxes[num_boxes].x2 > region.extents.x2)
			boxes[num_boxes].x2 = region.extents.x2;

		boxes[num_boxes].y2 = box[i].y2 + dst_dy;
		if (boxes[num_boxes].y2 > region.extents.y2)
			boxes[num_boxes].y2 = region.extents.y2;

		if (boxes[num_boxes].x2 > boxes[num_boxes].x1 &&
		    boxes[num_boxes].y2 > boxes[num_boxes].y1)
			num_boxes++;
	}

	if (num_boxes == 0) {
		ret = true;
		goto cleanup_boxes;
	}

	if (!pixman_region_init_rects(&region, boxes, num_boxes))
		goto cleanup_boxes;

	overlap.extents.x1 = extents->x1 + src_dx;
	overlap.extents.x2 = extents->x2 + src_dx;
	overlap.extents.y1 = extents->y1 + src_dy;
	overlap.extents.y2 = extents->y2 + src_dy;
	overlap.data = NULL;

	RegionIntersect(&overlap, &overlap, &region);
	DBG(("%s: overlapping extents: (%d, %d), (%d, %d) x %d\n",
	     __FUNCTION__,
	     overlap.extents.x1, overlap.extents.y1,
	     overlap.extents.x2, overlap.extents.y2,
	     region_num_rects(&overlap)));

	RegionNull(&non_overlap);
	RegionSubtract(&non_overlap, &region, &overlap);
	DBG(("%s: non-overlapping extents: (%d, %d), (%d, %d) x %d\n",
	     __FUNCTION__,
	     non_overlap.extents.x1, non_overlap.extents.y1,
	     non_overlap.extents.x2, non_overlap.extents.y2,
	     region_num_rects(&non_overlap)));

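	/* The non-overlapping boxes can be copied directly; they are already
	 * in destination space, so the source is offset by (src - dst).
	 */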
	n = region_num_rects(&non_overlap);
	box = region_rects(&non_overlap);
	if (n && !sna->render.copy_boxes(sna, alu,
					 draw, bo, -dst_dx + src_dx, -dst_dy + src_dy,
					 draw, bo, 0, 0,
					 box, n, COPY_NO_OVERLAP))
		goto cleanup_boxes;

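	/* Whatever remains genuinely overlaps and must go through the
	 * scratch pixmap.
	 */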
	n = region_num_rects(&overlap);
	box = region_rects(&overlap);
	ret = copy_overlap(sna, alu, draw, bo,
			   -dst_dx + src_dx, -dst_dy + src_dy,
			   0, 0,
			   box, n, &overlap.extents);

cleanup_boxes:
	if (boxes != stack_boxes)
		free(boxes);

	return ret;
}

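/* The CPU copy path needs matching tiling on both buffers and mappings that
 * avoid the GTT: a direct CPU mapping of the source, and either a CPU or WC
 * mapping of the destination.
 */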
static bool can_copy_cpu(struct sna *sna,
			 struct kgem_bo *src,
			 struct kgem_bo *dst)
{
	DBG(("%s: tiling=%d:%d, pitch=%d:%d, can_map=%d:%d[%d]\n",
	     __FUNCTION__,
	     src->tiling, dst->tiling,
	     src->pitch, dst->pitch,
	     kgem_bo_can_map__cpu(&sna->kgem, src, false),
	     kgem_bo_can_map__cpu(&sna->kgem, dst, true),
	     sna->kgem.has_wc_mmap));

	if (src->tiling != dst->tiling)
		return false;

	if (!kgem_bo_can_map__cpu(&sna->kgem, src, false))
		return false;

	if (!kgem_bo_can_map__cpu(&sna->kgem, dst, true) &&
	    !sna->kgem.has_wc_mmap)
		return false;

	DBG(("%s -- yes, src handle=%d, dst handle=%d\n", __FUNCTION__, src->handle, dst->handle));
	return true;
}

bool
memcpy_copy_boxes(struct sna *sna, uint8_t op,
		  const DrawableRec *src_draw, struct kgem_bo *src_bo, int16_t sx, int16_t sy,
		  const DrawableRec *dst_draw, struct kgem_bo *dst_bo, int16_t dx, int16_t dy,
		  const BoxRec *box, int n, unsigned flags)
{
	memcpy_box_func detile = NULL;
	void *dst, *src;

	if (op != GXcopy)
		return false;

	if (src_draw->depth != dst_draw->depth)
		return false;

	dst = src = NULL;
	if (can_copy_cpu(sna, src_bo, dst_bo)) {
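		/* A single box covering the whole surface with matching
		 * pitch and offsets can be copied with a straight memcpy
		 * even when tiled; anything else needs the per-box path,
		 * which only understands X tiling with matching x offsets.
		 */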
		if (src_bo->pitch != dst_bo->pitch ||
		    dx != sx || dy != sy || n > 1 ||
		    box->x1 + dx > 0 ||
		    box->y1 + dy > 0 ||
		    box->x2 + dx < dst_draw->width ||
		    box->y2 + dy < dst_draw->height) {
			if (dx != sx) /* not implemented in memcpy yet */
				goto use_gtt;

			switch (dst_bo->tiling) {
			default:
			case I915_TILING_Y:
				goto use_gtt;

			case I915_TILING_X:
				detile = sna->kgem.memcpy_between_tiled_x;
				if (detile == NULL)
					goto use_gtt;
				break;

			case I915_TILING_NONE:
				break;
			}
		}

		if (kgem_bo_can_map__cpu(&sna->kgem, dst_bo, true))
			dst = kgem_bo_map__cpu(&sna->kgem, dst_bo);
		else
			dst = kgem_bo_map__wc(&sna->kgem, dst_bo);
		src = kgem_bo_map__cpu(&sna->kgem, src_bo);
	}

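	/* Fall back to GTT mappings, which present a linear view of tiled
	 * buffers, so a plain memcpy_blt suffices and no detiler is needed.
	 */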
	if (dst == NULL || src == NULL) {
use_gtt:
		dst = kgem_bo_map__gtt(&sna->kgem, dst_bo);
		src = kgem_bo_map__gtt(&sna->kgem, src_bo);
		if (dst == NULL || src == NULL)
			return false;

		kgem_bo_sync__gtt(&sna->kgem, dst_bo);
		kgem_bo_sync__gtt(&sna->kgem, src_bo);

		detile = NULL;
	} else {
		if (dst == dst_bo->map__wc)
			kgem_bo_sync__gtt(&sna->kgem, dst_bo);
		else
			kgem_bo_sync__cpu_full(&sna->kgem, dst_bo, true);
		kgem_bo_sync__cpu_full(&sna->kgem, src_bo, false);
	}

	DBG(("%s: src(%d, %d), dst(%d, %d) x %d\n",
	     __FUNCTION__, sx, sy, dx, dy, n));

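	/* Perform the copies under the sigtrap guard so a fault while
	 * touching the mappings does not take down the server.
	 */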
	if (sigtrap_get() == 0) {
		if (detile) {
			do {
				detile(src, dst, dst_draw->bitsPerPixel,
				       src_bo->pitch, dst_bo->pitch,
				       box->x1 + sx, box->y1 + sy,
				       box->x1 + dx, box->y1 + dy,
				       box->x2 - box->x1, box->y2 - box->y1);
				box++;
			} while (--n);
		} else do {
			memcpy_blt(src, dst, dst_draw->bitsPerPixel,
				   src_bo->pitch, dst_bo->pitch,
				   box->x1 + sx, box->y1 + sy,
				   box->x1 + dx, box->y1 + dy,
				   box->x2 - box->x1, box->y2 - box->y1);
			box++;
		} while (--n);
		sigtrap_put();
	}

	return true;
}

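/* Once the GPU is wedged, route all copies through the CPU and stop
 * advertising any preference for the GPU.
 */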
void
sna_render_mark_wedged(struct sna *sna)
{
	sna->render.copy_boxes = memcpy_copy_boxes;
	sna->render.prefer_gpu = 0;
}