#include <stddef.h>
#include <inttypes.h>

/*
 * "constant time" memcmp.  Time taken depends on the buffer length, of
 * course, but not on the content of the buffers.
 *
 * Just like the ordinary memcmp function, the return value is
 * tri-state: <0, 0, or >0.  However, applications that need a
 * constant-time memory comparison function usually need only a
 * two-state result, signalling only whether the inputs were identical
 * or different, but not signalling which of the inputs was larger.
 * This code could be made significantly faster and simpler if the
 * requirement for a tri-state result were removed.
 *
 * In order to protect against adversaries who can observe timing,
 * cache hits or misses, page faults, etc., and who can use such
 * observations to learn something about the relationship between the
 * contents of the two buffers, we have to perform exactly the same
 * instructions and memory accesses regardless of the contents of the
 * buffers.  We can't stop as soon as we find a difference, we can't
 * take different conditional branches depending on the data, and we
 * can't use different pointers or array indexes depending on the data.
 *
 * Further reading:
 *
 * .Rs
 * .%A Paul C. Kocher
 * .%T Timing Attacks on Implementations of Diffie-Hellman, RSA, DSS, and Other Systems
 * .%D 1996
 * .%J CRYPTO 1996
 * .%P 104-113
 * .%U http://www.cryptography.com/timingattack/paper.html
 * .%U http://www.webcitation.org/query?url=http%3A%2F%2Fwww.cryptography.com%2Ftimingattack%2Fpaper.html&date=2012-10-17
 * .Re
 *
 * .Rs
 * .%A D. Boneh
 * .%A D. Brumley
 * .%T Remote timing attacks are practical
 * .%D August 2003
 * .%J Proceedings of the 12th Usenix Security Symposium, 2003
 * .%U https://crypto.stanford.edu/~dabo/abstracts/ssl-timing.html
 * .%U http://www.webcitation.org/query?url=https%3A%2F%2Fcrypto.stanford.edu%2F%7Edabo%2Fabstracts%2Fssl-timing.html&date=2012-10-17
 * .%U http://www.webcitation.org/query?url=http%3A%2F%2Fcrypto.stanford.edu%2F%7Edabo%2Fpubs%2Fpapers%2Fssl-timing.pdf&date=2012-10-17
 * .Re
 *
 * .Rs
 * .%A Coda Hale
 * .%T A Lesson In Timing Attacks (or, Don't use MessageDigest.isEquals)
 * .%D 13 Aug 2009
 * .%U http://codahale.com/a-lesson-in-timing-attacks/
 * .%U http://www.webcitation.org/query?url=http%3A%2F%2Fcodahale.com%2Fa-lesson-in-timing-attacks%2F&date=2012-10-17
 * .Re
 *
 */

/*
 * A note on portability:
 *
 * We assume that char is exactly 8 bits, the same as uint8_t, and that
 * integer types with exactly 16 bits and exactly 32 bits exist.  (If
 * there is ever a need to change this, then the actual requirement is
 * that we need a type that is at least two bits wider than char, and
 * another type that is at least two bits wider than that, or we need to
 * fake it somehow.)
 *
 * We do not assume any particular size for the plain "int" type, except
 * that it is at least 16 bits, as is guaranteed by the C language
 * standard.
 *
 * We do not assume that signed integer overflow is harmless.  We
 * ensure that signed integer arithmetic never overflows (which would
 * be undefined behaviour), and that no out-of-range value is ever
 * converted to a signed type (which would be implementation-defined
 * behaviour).
 *
 * We rely on the C standard's guarantees regarding the wraparound
 * behaviour of unsigned integer arithmetic, and on the analogous
 * guarantees regarding conversions from signed types to narrower
 * unsigned types.
 *
 * We do not assume that the platform uses two's complement arithmetic.
 */

/*
 * How hard do we have to try to prevent unwanted compiler optimisations?
 *
 * Try compiling with "#define USE_VOLATILE_TEMPORARY 0", and examine
 * the compiler output.  If the only conditional tests in the entire
 * function are to test whether len is zero, then all is well, but try
 * again with different optimisation flags to be sure.  If the compiler
 * emitted code with conditional tests that do anything other than
 * testing whether len is zero, then that's a problem, so try again with
 * "#define USE_VOLATILE_TEMPORARY 1".  If it's still bad, then you are
 * out of luck.
 */
/*
 * Default to a plain (non-volatile) temporary.  The guard lets the
 * value be chosen at build time (for example with a compiler option
 * such as -DUSE_VOLATILE_TEMPORARY=1) without editing this file and
 * without triggering a macro redefinition diagnostic.
 */
#ifndef USE_VOLATILE_TEMPORARY
#define USE_VOLATILE_TEMPORARY 0
#endif

int consttime_memcmp(const void *b1, const void *b2, size_t len)
{
	/*
	 * Compare the first len bytes of b1 and b2 as unsigned bytes.
	 *
	 * Returns 0 if the buffers are identical over len bytes (always
	 * the case when len is 0).  Otherwise returns the difference
	 * (byte from b1) - (byte from b2) at the first position where
	 * the buffers differ, a value in the range -255 to -1 or +1 to
	 * +255.
	 *
	 * Every byte of both buffers is read exactly once, in order,
	 * and the source contains no branch or array index that depends
	 * on the buffer contents; the only loop test is on the length.
	 */
	const uint8_t *lhs = b1;
	const uint8_t *rhs = b2;
	uint16_t acc = 0;	/* latched first difference; 0 until one is seen */
	uint16_t diff;		/* difference of the current pair of bytes */
	uint16_t mask;		/* 0xffff while acc is still 0, else 0x0000 */
	uint32_t biased;
	int32_t centred;
	size_t i;

	/*
	 * Scratch value used while deriving the mask.  Building with a
	 * non-zero USE_VOLATILE_TEMPORARY (tested with "#if") makes it
	 * volatile, which forces the compiler to store and reload the
	 * value instead of reasoning its way to a conditional shortcut.
	 */
#if USE_VOLATILE_TEMPORARY
	volatile uint16_t tmp;
#else
	uint16_t tmp;
#endif

	for (i = 0; i < len; i++) {
		/*
		 * Derive the mask from the low 8 bits of acc, without
		 * branching:
		 *
		 *   low byte of acc           0        or   1 to 255
		 *   ... plus 255              255      or   256 to 510
		 *   ... divided by 256        0        or   1
		 *   ... minus 1               -1       or   0
		 *   ... stored as uint16_t    0xffff   or   0x0000
		 *
		 * The low byte of acc is zero exactly when acc itself
		 * is zero (see below), so the mask is all ones until
		 * the first difference has been recorded, and all zeros
		 * from then on.
		 */
		tmp = ((uint16_t)(uint8_t)acc)+255;
		mask = tmp/256-1;

		/*
		 * Both bytes are in the range 0 to 255, so subtracting
		 * them as signed ints gives -255 to +255 and cannot
		 * overflow.  Converted to uint16_t that becomes 0xff01
		 * to 0xffff for negative differences, 0 for equal
		 * bytes, and 0x0001 to 0x00ff for positive differences.
		 * In every non-zero case the low 8 bits are non-zero
		 * too, which is what makes the mask test above valid.
		 */
		diff = (uint16_t)((int)lhs[i] - (int)rhs[i]);

		/*
		 * While acc is zero the mask lets diff through, so acc
		 * takes on the current difference (possibly still
		 * zero).  Once acc is non-zero the mask is zero, so
		 * nothing more is merged in and acc keeps the first
		 * non-zero difference for the rest of the loop.
		 */
		acc |= (diff & mask);
	}

	/*
	 * acc is now 0 (buffers equal), or 0x0001 to 0x00ff (result
	 * should be +1 to +255), or 0xff01 to 0xffff (result should be
	 * -255 to -1).  On a two's complement machine "(int16_t)acc"
	 * would produce the right answer, but converting a value above
	 * 32767 to a signed 16-bit type is implementation-defined, so
	 * the sign is recovered using only well-defined steps instead.
	 *
	 * Step 1: add 0x8000 in arithmetic at least 32 bits wide.  The
	 * cast to uint32_t is there for platforms where int is narrower
	 * than 32 bits.  Nothing overflows or wraps; the sum is 0x8000,
	 * or 0x8001 to 0x80ff, or 0x17f01 to 0x17fff.
	 */
	biased = (uint32_t)acc + 0x8000;

	/*
	 * Step 2: truncate to uint16_t, which discards the high bits in
	 * a way the C language defines fully.  That gives 0x8000, or
	 * 0x8001 to 0x80ff, or 0x7f01 to 0x7fff.
	 *
	 * Step 3: widen to int32_t (not int, which may be only 16 bits
	 * and so unable to hold 0x8000 and above), then subtract 0x8000
	 * in signed arithmetic.  That yields 0, or +1 to +255, or -255
	 * to -1.
	 */
	centred = (int32_t)(uint16_t)biased - 0x8000;

	/*
	 * Step 4: the value is within -255 to +255, so it fits in an
	 * int of any permitted width.  The cast only spells out the
	 * conversion that the return statement would perform anyway.
	 */
	return (int)centred;
}