Index: ps/trunk/binaries/data/mods/public/gui/credits/texts/programming.json =================================================================== --- ps/trunk/binaries/data/mods/public/gui/credits/texts/programming.json +++ ps/trunk/binaries/data/mods/public/gui/credits/texts/programming.json @@ -242,6 +242,7 @@ {"nick": "usey11"}, {"nick": "vincent_c", "name": "Vincent Cheng"}, {"nick": "vladislavbelov", "name": "Vladislav Belov"}, + {"nick": "voroskoi"}, {"nick": "vts", "name": "Jeroen DR"}, {"nick": "wacko", "name": "Andrew Spiering"}, {"nick": "WhiteTreePaladin", "name": "Brian Ashley"}, Index: ps/trunk/build/premake/extern_libs5.lua =================================================================== --- ps/trunk/build/premake/extern_libs5.lua +++ ps/trunk/build/premake/extern_libs5.lua @@ -463,7 +463,7 @@ add_default_links({ win_names = { "nvtt" }, unix_names = { "nvcore", "nvmath", "nvimage", "nvtt" }, - osx_names = { "nvcore", "nvmath", "nvimage", "nvtt", "squish" }, + osx_names = { "bc6h", "bc7", "nvcore", "nvimage", "nvmath", "nvthread", "nvtt", "squish" }, dbg_suffix = "", -- for performance we always use the release-mode version }) end, Index: ps/trunk/libraries/osx/build-osx-libs.sh =================================================================== --- ps/trunk/libraries/osx/build-osx-libs.sh +++ ps/trunk/libraries/osx/build-osx-libs.sh @@ -1002,7 +1002,7 @@ # Could use CMAKE_OSX_DEPLOYMENT_TARGET and CMAKE_OSX_SYSROOT # but they're not as flexible for cross-compiling - # Disable optional libs that we don't need (avoids some conflicts with MacPorts) + # Disable png support (avoids some conflicts with MacPorts) (cmake .. \ -DCMAKE_LINK_FLAGS="$LDFLAGS" \ -DCMAKE_C_FLAGS="$CFLAGS" \ @@ -1010,19 +1010,13 @@ -DCMAKE_BUILD_TYPE=Release \ -DBINDIR=bin \ -DLIBDIR=lib \ - -DGLUT=0 \ - -DGLEW=0 \ - -DCG=0 \ - -DCUDA=0 \ - -DOPENEXR=0 \ - -DJPEG=0 \ -DPNG=0 \ - -DTIFF=0 \ -G "Unix Makefiles" \ && make clean && make nvtt ${JOBS}) || die "NVTT build failed" popd mkdir -p ../lib + cp build/src/bc*/libbc*.a ../lib/ cp build/src/nv*/libnv*.a ../lib/ cp build/src/nvtt/squish/libsquish.a ../lib/ popd Index: ps/trunk/libraries/source/nvtt/README.txt =================================================================== --- ps/trunk/libraries/source/nvtt/README.txt +++ ps/trunk/libraries/source/nvtt/README.txt @@ -1,21 +1,9 @@ -This is NVTT 2.0.8-1 from http://code.google.com/p/nvidia-texture-tools/ +This is NVTT 2.1.1 from https://github.com/castano/nvidia-texture-tools plus some patches (see patches/): - r1156.patch (from NVTT SVN r1156 - fixes build with libtiff 4.0) - r1157.patch (from NVTT SVN r1157 - fixes build with CUDA 3.0) - r1172.patch (from NVTT SVN r1172 - fixes memory allocator interaction with Valgrind) - r907.patch and r1025.patch (from NVTT SVN - fixes build on FreeBSD) - rpath.patch (fixes .so file search paths for bundled copy) - issue139.patch (fixes http://code.google.com/p/nvidia-texture-tools/issues/detail?id=139) - issue176.patch (partially from http:/code.google.com/p/nvidia-texture-tools/issues/detail?id=176 - fixes build on OpenBSD) - png-api.patch (partially from NVTT SVN r1248 - fixes build with libpng 1.5) + cmake.patch - disables some dependencies cmake-freebsd.patch (fixes build on FreeBSD) - gcc47-unistd.patch (fixes build on GCC 4.7) - cmake-devflags.patch (from https://407191.bugs.gentoo.org/attachment.cgi?id=308589 - allows disabling various dependencies) - cmake-devflags2.patch - allows disabling more dependencies - issue182.patch (fixes 
http://code.google.com/p/nvidia-texture-tools/issues/detail?id=182) - cmake-noqt4.patch (removes unused dependency on Qt4, fixes build on systems without Qt) - arm-fix.patch (from NVTT SVN r1173 - fixes ARM build) issue188.patch (fixes http://code.google.com/p/nvidia-texture-tools/issues/detail?id=188) - clang-cpp11-error.patch (fixes build error on OS X Yosemite with clang, libc++ and c++11) - arm64-fix.patch (backported in http://trac.wildfiregames.com/ticket/3344 from upstream https://github.com/castano/nvidia-texture-tools/commit/58617584d4d2541ff9fcfe23a9a492af86b11efb - fixes ARM64 build) - gcc6-fix.path (fixes a compilation issue where GCC 6 doesn't want to cast a boolean to a pointer anymore) + issue261.patch (fixes https://github.com/castano/nvidia-texture-tools/issues/261) + rpath.patch (fixes .so file search paths for bundled copy) + win-shared-build.patch (adapted from https://github.com/castano/nvidia-texture-tools/pull/285) + musl-build.patch (fixes build on musl linux; contributed by voroskoi, with a part by leper, see https://code.wildfiregames.com/D2491) Index: ps/trunk/libraries/source/nvtt/build.sh =================================================================== --- ps/trunk/libraries/source/nvtt/build.sh +++ ps/trunk/libraries/source/nvtt/build.sh @@ -11,7 +11,7 @@ mkdir -p src/build/ cd src/build/ -cmake .. -DNVTT_SHARED=1 -DCMAKE_BUILD_TYPE=Release -DBINDIR=bin -DLIBDIR=lib -DGLUT=0 -DGLEW=0 -DCG=0 -DCUDA=0 -DOPENEXR=0 -G "Unix Makefiles" +cmake .. -DNVTT_SHARED=1 -DCMAKE_BUILD_TYPE=Release -DBINDIR=bin -DLIBDIR=lib -G "Unix Makefiles" ${MAKE} nvtt ${JOBS} Index: ps/trunk/libraries/source/nvtt/include/nvtt/nvtt.h =================================================================== --- ps/trunk/libraries/source/nvtt/include/nvtt/nvtt.h +++ ps/trunk/libraries/source/nvtt/include/nvtt/nvtt.h @@ -1,308 +1,676 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
- -#ifndef NV_TT_H -#define NV_TT_H - -// Function linkage -#if NVTT_SHARED - -#if defined _WIN32 || defined WIN32 || defined __NT__ || defined __WIN32__ || defined __MINGW32__ -# ifdef NVTT_EXPORTS -# define NVTT_API __declspec(dllexport) -# else -# define NVTT_API __declspec(dllimport) -# endif -#endif - -#if defined __GNUC__ >= 4 -# ifdef NVTT_EXPORTS -# define NVTT_API __attribute__((visibility("default"))) -# endif -#endif - -#endif // NVTT_SHARED - -#if !defined NVTT_API -# define NVTT_API -#endif - -#define NVTT_VERSION 200 - -#define NVTT_DECLARE_PIMPL(Class) \ - private: \ - Class(const Class &); \ - void operator=(const Class &); \ - public: \ - struct Private; \ - Private & m - - -// Public interface. -namespace nvtt -{ - /// Supported compression formats. - enum Format - { - // No compression. - Format_RGB, - Format_RGBA = Format_RGB, - - // DX9 formats. - Format_DXT1, - Format_DXT1a, // DXT1 with binary alpha. - Format_DXT3, - Format_DXT5, - Format_DXT5n, // Compressed HILO: R=1, G=y, B=0, A=x - - // DX10 formats. - Format_BC1 = Format_DXT1, - Format_BC1a = Format_DXT1a, - Format_BC2 = Format_DXT3, - Format_BC3 = Format_DXT5, - Format_BC3n = Format_DXT5n, - Format_BC4, // ATI1 - Format_BC5, // 3DC, ATI2 - }; - - /// Quality modes. - enum Quality - { - Quality_Fastest, - Quality_Normal, - Quality_Production, - Quality_Highest, - }; - - /// Compression options. This class describes the desired compression format and other compression settings. - struct CompressionOptions - { - NVTT_DECLARE_PIMPL(CompressionOptions); - - NVTT_API CompressionOptions(); - NVTT_API ~CompressionOptions(); - - NVTT_API void reset(); - - NVTT_API void setFormat(Format format); - NVTT_API void setQuality(Quality quality); - NVTT_API void setColorWeights(float red, float green, float blue, float alpha = 1.0f); - - NVTT_API void setExternalCompressor(const char * name); - - // Set color mask to describe the RGB/RGBA format. - NVTT_API void setPixelFormat(unsigned int bitcount, unsigned int rmask, unsigned int gmask, unsigned int bmask, unsigned int amask); - - NVTT_API void setQuantization(bool colorDithering, bool alphaDithering, bool binaryAlpha, int alphaThreshold = 127); - }; - - - /// Wrap modes. - enum WrapMode - { - WrapMode_Clamp, - WrapMode_Repeat, - WrapMode_Mirror, - }; - - /// Texture types. - enum TextureType - { - TextureType_2D, - TextureType_Cube, - // TextureType_3D, - }; - - /// Input formats. - enum InputFormat - { - InputFormat_BGRA_8UB, - // InputFormat_RGBE_8UB, - // InputFormat_BGRA_32F, - }; - - /// Mipmap downsampling filters. - enum MipmapFilter - { - MipmapFilter_Box, ///< Box filter is quite good and very fast. - MipmapFilter_Triangle, ///< Triangle filter blurs the results too much, but that might be what you want. - MipmapFilter_Kaiser, ///< Kaiser-windowed Sinc filter is the best downsampling filter. - }; - - /// Color transformation. - enum ColorTransform - { - ColorTransform_None, - ColorTransform_Linear, - }; - - /// Extents rounding mode. - enum RoundMode - { - RoundMode_None, - RoundMode_ToNextPowerOfTwo, - RoundMode_ToNearestPowerOfTwo, - RoundMode_ToPreviousPowerOfTwo, - }; - - /// Alpha mode. - enum AlphaMode - { - AlphaMode_None, - AlphaMode_Transparency, - AlphaMode_Premultiplied, - }; - - /// Input options. Specify format and layout of the input texture. - struct InputOptions - { - NVTT_DECLARE_PIMPL(InputOptions); - - NVTT_API InputOptions(); - NVTT_API ~InputOptions(); - - // Set default options. - NVTT_API void reset(); - - // Setup input layout. 
- NVTT_API void setTextureLayout(TextureType type, int w, int h, int d = 1); - NVTT_API void resetTextureLayout(); - - // Set mipmap data. Copies the data. - NVTT_API bool setMipmapData(const void * data, int w, int h, int d = 1, int face = 0, int mipmap = 0); - - // Describe the format of the input. - NVTT_API void setFormat(InputFormat format); - - // Set the way the input alpha channel is interpreted. - NVTT_API void setAlphaMode(AlphaMode alphaMode); - - // Set gamma settings. - NVTT_API void setGamma(float inputGamma, float outputGamma); - - // Set texture wrappign mode. - NVTT_API void setWrapMode(WrapMode mode); - - // Set mipmapping options. - NVTT_API void setMipmapFilter(MipmapFilter filter); - NVTT_API void setMipmapGeneration(bool enabled, int maxLevel = -1); - NVTT_API void setKaiserParameters(float width, float alpha, float stretch); - - // Set normal map options. - NVTT_API void setNormalMap(bool b); - NVTT_API void setConvertToNormalMap(bool convert); - NVTT_API void setHeightEvaluation(float redScale, float greenScale, float blueScale, float alphaScale); - NVTT_API void setNormalFilter(float sm, float medium, float big, float large); - NVTT_API void setNormalizeMipmaps(bool b); - - // Set color transforms. @@ Not implemented! - NVTT_API void setColorTransform(ColorTransform t); - NVTT_API void setLinearTransform(int channel, float w0, float w1, float w2, float w3); - - // Set resizing options. - NVTT_API void setMaxExtents(int d); - NVTT_API void setRoundMode(RoundMode mode); - }; - - - /// Output handler. - struct OutputHandler - { - virtual ~OutputHandler() {} - - /// Indicate the start of a new compressed image that's part of the final texture. - virtual void beginImage(int size, int width, int height, int depth, int face, int miplevel) = 0; - - /// Output data. Compressed data is output as soon as it's generated to minimize memory allocations. - virtual bool writeData(const void * data, int size) = 0; - }; - - /// Error codes. - enum Error - { - Error_Unknown, - Error_InvalidInput, - Error_UnsupportedFeature, - Error_CudaError, - Error_FileOpen, - Error_FileWrite, - }; - - /// Error handler. - struct ErrorHandler - { - virtual ~ErrorHandler() {} - - // Signal error. - virtual void error(Error e) = 0; - }; - - - /// Output Options. This class holds pointers to the interfaces that are used to report the output of - /// the compressor to the user. - struct OutputOptions - { - NVTT_DECLARE_PIMPL(OutputOptions); - - NVTT_API OutputOptions(); - NVTT_API ~OutputOptions(); - - // Set default options. - NVTT_API void reset(); - - NVTT_API void setFileName(const char * fileName); - - NVTT_API void setOutputHandler(OutputHandler * outputHandler); - NVTT_API void setErrorHandler(ErrorHandler * errorHandler); - NVTT_API void setOutputHeader(bool outputHeader); - }; - - - /// Texture compressor. - struct Compressor - { - NVTT_DECLARE_PIMPL(Compressor); - - NVTT_API Compressor(); - NVTT_API ~Compressor(); - - NVTT_API void enableCudaAcceleration(bool enable); - NVTT_API bool isCudaAccelerationEnabled() const; - - // Main entrypoint of the compression library. - NVTT_API bool process(const InputOptions & inputOptions, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; - - // Estimate the size of compressing the input with the given options. - NVTT_API int estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions) const; - }; - - - // Return string for the given error code. 
- NVTT_API const char * errorString(Error e); - - // Return NVTT version. - NVTT_API unsigned int version(); - -} // nvtt namespace - -#endif // NV_TT_H +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#pragma once +#ifndef NVTT_H +#define NVTT_H + +// Function linkage +#if NVTT_SHARED + +#if defined _WIN32 || defined WIN32 || defined __NT__ || defined __WIN32__ || defined __MINGW32__ +# ifdef NVTT_EXPORTS +# define NVTT_API __declspec(dllexport) +# else +# define NVTT_API __declspec(dllimport) +# endif +#endif + +#if defined __GNUC__ >= 4 +# ifdef NVTT_EXPORTS +# define NVTT_API __attribute__((visibility("default"))) +# endif +#endif + +#endif // NVTT_SHARED + +#if !defined NVTT_API +# define NVTT_API +#endif + +#define NVTT_VERSION 20100 + +#define NVTT_FORBID_COPY(Class) \ + private: \ + Class(const Class &); \ + void operator=(const Class &); \ + public: + +#define NVTT_DECLARE_PIMPL(Class) \ + public: \ + struct Private; \ + Private & m + + +// Public interface. +namespace nvtt +{ + // Forward declarations. + struct Surface; + struct CubeSurface; + + + // Supported block-compression formats. + // @@ I wish I had distinguished between "formats" and compressors. + // That is: + // - 'DXT1' is a format 'DXT1a' and 'DXT1n' are DXT1 compressors. + // - 'DXT3' is a format 'DXT3n' is a DXT3 compressor. + // Having multiple enums for the same ids only creates confusion. Clean this up. + enum Format + { + // No block-compression (linear). + Format_RGB, + Format_RGBA = Format_RGB, + + // DX9 formats. + Format_DXT1, + Format_DXT1a, // DXT1 with binary alpha. + Format_DXT3, + Format_DXT5, + Format_DXT5n, // Compressed HILO: R=1, G=y, B=0, A=x + + // DX10 formats. + Format_BC1 = Format_DXT1, + Format_BC1a = Format_DXT1a, + Format_BC2 = Format_DXT3, + Format_BC3 = Format_DXT5, + Format_BC3n = Format_DXT5n, + Format_BC4, // ATI1 + Format_BC5, // 3DC, ATI2 + + Format_DXT1n, // Not supported. + Format_CTX1, // Not supported. + + Format_BC6, + Format_BC7, + + Format_BC3_RGBM, // + + Format_Count + }; + + // Pixel types. These basically indicate how the output should be interpreted, but do not have any influence over the input. They are only relevant in RGBA mode. + enum PixelType + { + PixelType_UnsignedNorm = 0, + PixelType_SignedNorm = 1, // Not supported yet. + PixelType_UnsignedInt = 2, // Not supported yet. 
+ PixelType_SignedInt = 3, // Not supported yet. + PixelType_Float = 4, + PixelType_UnsignedFloat = 5, + PixelType_SharedExp = 6, // Shared exponent. + }; + + // Quality modes. + enum Quality + { + Quality_Fastest, + Quality_Normal, + Quality_Production, + Quality_Highest, + }; + + // DXT decoder. + enum Decoder + { + Decoder_D3D10, + Decoder_D3D9, + Decoder_NV5x, + //Decoder_RSX, // To take advantage of DXT5 bug. + }; + + + // Compression options. This class describes the desired compression format and other compression settings. + struct CompressionOptions + { + NVTT_FORBID_COPY(CompressionOptions); + NVTT_DECLARE_PIMPL(CompressionOptions); + + NVTT_API CompressionOptions(); + NVTT_API ~CompressionOptions(); + + NVTT_API void reset(); + + NVTT_API void setFormat(Format format); + NVTT_API void setQuality(Quality quality); + NVTT_API void setColorWeights(float red, float green, float blue, float alpha = 1.0f); + + NVTT_API void setExternalCompressor(const char * name); + + // Set color mask to describe the RGB/RGBA format. + NVTT_API void setPixelFormat(unsigned int bitcount, unsigned int rmask, unsigned int gmask, unsigned int bmask, unsigned int amask); + NVTT_API void setPixelFormat(unsigned char rsize, unsigned char gsize, unsigned char bsize, unsigned char asize); + + NVTT_API void setPixelType(PixelType pixelType); + + NVTT_API void setPitchAlignment(int pitchAlignment); + + // @@ I wish this wasn't part of the compression options. Quantization is applied before compression. We don't have compressors with error diffusion. + // @@ These options are only taken into account when using the InputOptions API. + NVTT_API void setQuantization(bool colorDithering, bool alphaDithering, bool binaryAlpha, int alphaThreshold = 127); + + NVTT_API void setTargetDecoder(Decoder decoder); + + // Translate to and from D3D formats. + NVTT_API unsigned int d3d9Format() const; + //NVTT_API bool setD3D9Format(unsigned int format); + //NVTT_API unsigned int dxgiFormat() const; + //NVTT_API bool setDxgiFormat(unsigned int format); + }; + + /* + // DXGI_FORMAT_R16G16_FLOAT + compressionOptions.setPixelType(PixelType_Float); + compressionOptions.setPixelFormat2(16, 16, 0, 0); + + // DXGI_FORMAT_R32G32B32A32_FLOAT + compressionOptions.setPixelType(PixelType_Float); + compressionOptions.setPixelFormat2(32, 32, 32, 32); + */ + + + // Wrap modes. + enum WrapMode + { + WrapMode_Clamp, + WrapMode_Repeat, + WrapMode_Mirror, + }; + + // Texture types. + enum TextureType + { + TextureType_2D, + TextureType_Cube, + TextureType_3D, + TextureType_Array, + }; + + // Input formats. + enum InputFormat + { + InputFormat_BGRA_8UB, // Normalized [0, 1] 8 bit fixed point. + InputFormat_RGBA_16F, // 16 bit floating point. + InputFormat_RGBA_32F, // 32 bit floating point. + InputFormat_R_32F, // Single channel 32 bit floating point. + }; + + // Mipmap downsampling filters. + enum MipmapFilter + { + MipmapFilter_Box, // Box filter is quite good and very fast. + MipmapFilter_Triangle, // Triangle filter blurs the results too much, but that might be what you want. + MipmapFilter_Kaiser, // Kaiser-windowed Sinc filter is the best downsampling filter. + }; + + // Texture resize filters. + enum ResizeFilter + { + ResizeFilter_Box, + ResizeFilter_Triangle, + ResizeFilter_Kaiser, + ResizeFilter_Mitchell, + }; + + // Extents rounding mode. 
+ enum RoundMode + { + RoundMode_None, + RoundMode_ToNextPowerOfTwo, + RoundMode_ToNearestPowerOfTwo, + RoundMode_ToPreviousPowerOfTwo, + RoundMode_ToNextMultipleOfFour, // (New in NVTT 2.1) + RoundMode_ToNearestMultipleOfFour, // (New in NVTT 2.1) + RoundMode_ToPreviousMultipleOfFour, // (New in NVTT 2.1) + }; + + // Alpha mode. + enum AlphaMode + { + AlphaMode_None, + AlphaMode_Transparency, + AlphaMode_Premultiplied, + }; + + // Input options. Specify format and layout of the input texture. (Deprecated in NVTT 2.1) + struct InputOptions + { + NVTT_FORBID_COPY(InputOptions); + NVTT_DECLARE_PIMPL(InputOptions); + + NVTT_API InputOptions(); + NVTT_API ~InputOptions(); + + // Set default options. + NVTT_API void reset(); + + // Setup input layout. + NVTT_API void setTextureLayout(TextureType type, int w, int h, int d = 1, int arraySize = 1); + NVTT_API void resetTextureLayout(); + + // Set mipmap data. Copies the data. + NVTT_API bool setMipmapData(const void * data, int w, int h, int d = 1, int face = 0, int mipmap = 0); + + // Describe the format of the input. + NVTT_API void setFormat(InputFormat format); + + // Set the way the input alpha channel is interpreted. @@ Not implemented! + NVTT_API void setAlphaMode(AlphaMode alphaMode); + + // Set gamma settings. + NVTT_API void setGamma(float inputGamma, float outputGamma); + + // Set texture wrapping mode. + NVTT_API void setWrapMode(WrapMode mode); + + // Set mipmapping options. + NVTT_API void setMipmapFilter(MipmapFilter filter); + NVTT_API void setMipmapGeneration(bool enabled, int maxLevel = -1); + NVTT_API void setKaiserParameters(float width, float alpha, float stretch); + + // Set normal map options. + NVTT_API void setNormalMap(bool b); + NVTT_API void setConvertToNormalMap(bool convert); + NVTT_API void setHeightEvaluation(float redScale, float greenScale, float blueScale, float alphaScale); + NVTT_API void setNormalFilter(float sm, float medium, float big, float large); + NVTT_API void setNormalizeMipmaps(bool b); + + // Set resizing options. + NVTT_API void setMaxExtents(int d); + NVTT_API void setRoundMode(RoundMode mode); + }; + + + // Output handler. + struct OutputHandler + { + virtual ~OutputHandler() {} + + // Indicate the start of a new compressed image that's part of the final texture. + virtual void beginImage(int size, int width, int height, int depth, int face, int miplevel) = 0; + + // Output data. Compressed data is output as soon as it's generated to minimize memory allocations. + virtual bool writeData(const void * data, int size) = 0; + + // Indicate the end of the compressed image. (New in NVTT 2.1) + virtual void endImage() = 0; + }; + + // Error codes. + enum Error + { + Error_Unknown, + Error_InvalidInput, + Error_UnsupportedFeature, + Error_CudaError, + Error_FileOpen, + Error_FileWrite, + Error_UnsupportedOutputFormat, + Error_Count + }; + + // Error handler. + struct ErrorHandler + { + virtual ~ErrorHandler() {} + + // Signal error. + virtual void error(Error e) = 0; + }; + + // Container. + enum Container + { + Container_DDS, + Container_DDS10, + // Container_KTX, // Khronos Texture: http://www.khronos.org/opengles/sdk/tools/KTX/ + // Container_VTF, // Valve Texture Format: http://developer.valvesoftware.com/wiki/Valve_Texture_Format + }; + + + // Output Options. This class holds pointers to the interfaces that are used to report the output of + // the compressor to the user. 
+ struct OutputOptions + { + NVTT_FORBID_COPY(OutputOptions); + NVTT_DECLARE_PIMPL(OutputOptions); + + NVTT_API OutputOptions(); + NVTT_API ~OutputOptions(); + + // Set default options. + NVTT_API void reset(); + + NVTT_API void setFileName(const char * fileName); + NVTT_API void setFileHandle(void * fp); + + NVTT_API void setOutputHandler(OutputHandler * outputHandler); + NVTT_API void setErrorHandler(ErrorHandler * errorHandler); + + NVTT_API void setOutputHeader(bool outputHeader); + NVTT_API void setContainer(Container container); + NVTT_API void setUserVersion(int version); + NVTT_API void setSrgbFlag(bool b); + }; + + // (New in NVTT 2.1) + typedef void Task(void * context, int id); + + // (New in NVTT 2.1) + struct TaskDispatcher + { + virtual ~TaskDispatcher() {} + + virtual void dispatch(Task * task, void * context, int count) = 0; + }; + + // Context. + struct Compressor + { + NVTT_FORBID_COPY(Compressor); + NVTT_DECLARE_PIMPL(Compressor); + + NVTT_API Compressor(); + NVTT_API ~Compressor(); + + // Context settings. + NVTT_API void enableCudaAcceleration(bool enable); + NVTT_API bool isCudaAccelerationEnabled() const; + NVTT_API void setTaskDispatcher(TaskDispatcher * disp); // (New in NVTT 2.1) + + // InputOptions API. + NVTT_API bool process(const InputOptions & inputOptions, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; + NVTT_API int estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions) const; + + // Surface API. (New in NVTT 2.1) + NVTT_API bool outputHeader(const Surface & img, int mipmapCount, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; + NVTT_API bool compress(const Surface & img, int face, int mipmap, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; + NVTT_API int estimateSize(const Surface & img, int mipmapCount, const CompressionOptions & compressionOptions) const; + + // CubeSurface API. (New in NVTT 2.1) + NVTT_API bool outputHeader(const CubeSurface & cube, int mipmapCount, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; + NVTT_API bool compress(const CubeSurface & cube, int mipmap, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; + NVTT_API int estimateSize(const CubeSurface & cube, int mipmapCount, const CompressionOptions & compressionOptions) const; + + // Raw API. (New in NVTT 2.1) + NVTT_API bool outputHeader(TextureType type, int w, int h, int d, int arraySize, int mipmapCount, bool isNormalMap, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; + NVTT_API bool compress(int w, int h, int d, int face, int mipmap, const float * rgba, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; + NVTT_API int estimateSize(int w, int h, int d, int mipmapCount, const CompressionOptions & compressionOptions) const; + }; + + // "Compressor" is deprecated. This should have been called "Context" + typedef Compressor Context; + + // (New in NVTT 2.1) + enum NormalTransform { + NormalTransform_Orthographic, + NormalTransform_Stereographic, + NormalTransform_Paraboloid, + NormalTransform_Quartic + //NormalTransform_DualParaboloid, + }; + + // (New in NVTT 2.1) + enum ToneMapper { + ToneMapper_Linear, + ToneMapper_Reindhart, + ToneMapper_Halo, + ToneMapper_Lightmap, + }; + + + // A surface is one level of a 2D or 3D texture. 
(New in NVTT 2.1) + // @@ It would be nice to add support for texture borders for correct resizing of tiled textures and constrained DXT compression. + struct Surface + { + NVTT_API Surface(); + NVTT_API Surface(const Surface & img); + NVTT_API ~Surface(); + + NVTT_API void operator=(const Surface & img); + + // Texture parameters. + NVTT_API void setWrapMode(WrapMode mode); + NVTT_API void setAlphaMode(AlphaMode alphaMode); + NVTT_API void setNormalMap(bool isNormalMap); + + // Queries. + NVTT_API bool isNull() const; + NVTT_API int width() const; + NVTT_API int height() const; + NVTT_API int depth() const; + NVTT_API TextureType type() const; + NVTT_API WrapMode wrapMode() const; + NVTT_API AlphaMode alphaMode() const; + NVTT_API bool isNormalMap() const; + NVTT_API int countMipmaps() const; + NVTT_API int countMipmaps(int min_size) const; + NVTT_API float alphaTestCoverage(float alphaRef = 0.5, int alpha_channel = 3) const; + NVTT_API float average(int channel, int alpha_channel = -1, float gamma = 2.2f) const; + NVTT_API const float * data() const; + NVTT_API const float * channel(int i) const; + NVTT_API void histogram(int channel, float rangeMin, float rangeMax, int binCount, int * binPtr) const; + NVTT_API void range(int channel, float * rangeMin, float * rangeMax, int alpha_channel = -1, float alpha_ref = 0.f) const; + + // Texture data. + NVTT_API bool load(const char * fileName, bool * hasAlpha = 0); + NVTT_API bool save(const char * fileName, bool hasAlpha = 0, bool hdr = 0) const; + NVTT_API bool setImage(int w, int h, int d); + NVTT_API bool setImage(InputFormat format, int w, int h, int d, const void * data); + NVTT_API bool setImage(InputFormat format, int w, int h, int d, const void * r, const void * g, const void * b, const void * a); + NVTT_API bool setImage2D(Format format, Decoder decoder, int w, int h, const void * data); + + // Resizing methods. + NVTT_API void resize(int w, int h, int d, ResizeFilter filter); + NVTT_API void resize(int w, int h, int d, ResizeFilter filter, float filterWidth, const float * params = 0); + NVTT_API void resize(int maxExtent, RoundMode mode, ResizeFilter filter); + NVTT_API void resize(int maxExtent, RoundMode mode, ResizeFilter filter, float filterWidth, const float * params = 0); + NVTT_API void resize_make_square(int maxExtent, RoundMode roundMode, ResizeFilter filter); + + NVTT_API bool buildNextMipmap(MipmapFilter filter, int min_size = 1); + NVTT_API bool buildNextMipmap(MipmapFilter filter, float filterWidth, const float * params = 0, int min_size = 1); + NVTT_API bool buildNextMipmapSolidColor(const float * const color_components); + NVTT_API void canvasSize(int w, int h, int d); + // associated to resizing: + NVTT_API bool canMakeNextMipmap(int min_size = 1); + + // Color transforms. 
+ NVTT_API void toLinear(float gamma); + NVTT_API void toGamma(float gamma); + NVTT_API void toLinear(int channel, float gamma); + NVTT_API void toGamma(int channel, float gamma); + NVTT_API void toSrgb(); + NVTT_API void toLinearFromSrgb(); + NVTT_API void toXenonSrgb(); + NVTT_API void transform(const float w0[4], const float w1[4], const float w2[4], const float w3[4], const float offset[4]); + NVTT_API void swizzle(int r, int g, int b, int a); + NVTT_API void scaleBias(int channel, float scale, float bias); + NVTT_API void clamp(int channel, float low = 0.0f, float high = 1.0f); + NVTT_API void blend(float r, float g, float b, float a, float t); + NVTT_API void premultiplyAlpha(); + NVTT_API void toGreyScale(float redScale, float greenScale, float blueScale, float alphaScale); + NVTT_API void setBorder(float r, float g, float b, float a); + NVTT_API void fill(float r, float g, float b, float a); + NVTT_API void scaleAlphaToCoverage(float coverage, float alphaRef = 0.5f, int alpha_channel = 3); + NVTT_API void toRGBM(float range = 1.0f, float threshold = 0.25f); + NVTT_API void fromRGBM(float range = 1.0f, float threshold = 0.25f); + NVTT_API void toLM(float range = 1.0f, float threshold = 0.0f); + NVTT_API void toRGBE(int mantissaBits, int exponentBits); + NVTT_API void fromRGBE(int mantissaBits, int exponentBits); + NVTT_API void toYCoCg(); + NVTT_API void blockScaleCoCg(int bits = 5, float threshold = 0.0f); + NVTT_API void fromYCoCg(); + NVTT_API void toLUVW(float range = 1.0f); + NVTT_API void fromLUVW(float range = 1.0f); + NVTT_API void abs(int channel); + NVTT_API void convolve(int channel, int kernelSize, float * kernelData); + NVTT_API void toLogScale(int channel, float base); + NVTT_API void fromLogScale(int channel, float base); + NVTT_API void setAtlasBorder(int w, int h, float r, float g, float b, float a); + + NVTT_API void toneMap(ToneMapper tm, float * parameters); + + //NVTT_API void blockLuminanceScale(float scale); + + // Color quantization. + NVTT_API void binarize(int channel, float threshold, bool dither); + NVTT_API void quantize(int channel, int bits, bool exactEndPoints, bool dither); + + // Normal map transforms. + NVTT_API void toNormalMap(float sm, float medium, float big, float large); + NVTT_API void normalizeNormalMap(); + NVTT_API void transformNormals(NormalTransform xform); + NVTT_API void reconstructNormals(NormalTransform xform); + NVTT_API void toCleanNormalMap(); + NVTT_API void packNormals(float scale = 0.5f, float bias = 0.5f); // [-1,1] -> [ 0,1] + NVTT_API void expandNormals(float scale = 2.0f, float bias = -1.0f); // [ 0,1] -> [-1,1] + NVTT_API Surface createToksvigMap(float power) const; + NVTT_API Surface createCleanMap() const; + + // Geometric transforms. + NVTT_API void flipX(); + NVTT_API void flipY(); + NVTT_API void flipZ(); + NVTT_API Surface createSubImage(int x0, int x1, int y0, int y1, int z0, int z1) const; + + // Copy image data. + NVTT_API bool copyChannel(const Surface & srcImage, int srcChannel); + NVTT_API bool copyChannel(const Surface & srcImage, int srcChannel, int dstChannel); + + NVTT_API bool addChannel(const Surface & img, int srcChannel, int dstChannel, float scale); + + NVTT_API bool copy(const Surface & src, int xsrc, int ysrc, int zsrc, int xsize, int ysize, int zsize, int xdst, int ydst, int zdst); + + + //private: + void detach(); + + struct Private; + Private * m; + }; + + + // Cube layout formats. 
(New in NVTT 2.1) + enum CubeLayout { + CubeLayout_VerticalCross, + CubeLayout_HorizontalCross, + CubeLayout_Column, + CubeLayout_Row, + CubeLayout_LatitudeLongitude + }; + + // (New in NVTT 2.1) + enum EdgeFixup { + EdgeFixup_None, + EdgeFixup_Stretch, + EdgeFixup_Warp, + EdgeFixup_Average, + }; + + // A CubeSurface is one level of a cube map texture. (New in NVTT 2.1) + struct CubeSurface + { + NVTT_API CubeSurface(); + NVTT_API CubeSurface(const CubeSurface & img); + NVTT_API ~CubeSurface(); + + NVTT_API void operator=(const CubeSurface & img); + + // Queries. + NVTT_API bool isNull() const; + NVTT_API int edgeLength() const; + NVTT_API int countMipmaps() const; + + // Texture data. + NVTT_API bool load(const char * fileName, int mipmap); + NVTT_API bool save(const char * fileName) const; + + NVTT_API Surface & face(int face); + NVTT_API const Surface & face(int face) const; + + // Layout conversion. @@ Not implemented. + NVTT_API void fold(const Surface & img, CubeLayout layout); + NVTT_API Surface unfold(CubeLayout layout) const; + + // @@ Angular extent filtering. + + // @@ Add resizing methods. + + // @@ Add edge fixup methods. + + NVTT_API float average(int channel) const; + NVTT_API void range(int channel, float * minimum_ptr, float * maximum_ptr) const; + NVTT_API void clamp(int channel, float low = 0.0f, float high = 1.0f); + + + // Filtering. + NVTT_API CubeSurface irradianceFilter(int size, EdgeFixup fixupMethod) const; + NVTT_API CubeSurface cosinePowerFilter(int size, float cosinePower, EdgeFixup fixupMethod) const; + + NVTT_API CubeSurface fastResample(int size, EdgeFixup fixupMethod) const; + + + /* + NVTT_API void resize(int w, int h, ResizeFilter filter); + NVTT_API void resize(int w, int h, ResizeFilter filter, float filterWidth, const float * params = 0); + NVTT_API void resize(int maxExtent, RoundMode mode, ResizeFilter filter); + NVTT_API void resize(int maxExtent, RoundMode mode, ResizeFilter filter, float filterWidth, const float * params = 0); + NVTT_API bool buildNextMipmap(MipmapFilter filter); + NVTT_API bool buildNextMipmap(MipmapFilter filter, float filterWidth, const float * params = 0); + */ + + // Color transforms. + NVTT_API void toLinear(float gamma); + NVTT_API void toGamma(float gamma); + + //private: + void detach(); + + struct Private; + Private * m; + }; + + + // Return string for the given error code. + NVTT_API const char * errorString(Error e); + + // Return NVTT version. + NVTT_API unsigned int version(); + + // Image comparison and error measurement functions. 
(New in NVTT 2.1) + NVTT_API float rmsError(const Surface & reference, const Surface & img); + NVTT_API float rmsAlphaError(const Surface & reference, const Surface & img); + NVTT_API float cieLabError(const Surface & reference, const Surface & img); + NVTT_API float angularError(const Surface & reference, const Surface & img); + NVTT_API Surface diff(const Surface & reference, const Surface & img, float scale); + + NVTT_API float rmsToneMappedError(const Surface & reference, const Surface & img, float exposure); + + + NVTT_API Surface histogram(const Surface & img, int width, int height); + NVTT_API Surface histogram(const Surface & img, float minRange, float maxRange, int width, int height); + +} // nvtt namespace + +#endif // NVTT_H Index: ps/trunk/libraries/source/nvtt/patches/arm-fix.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/arm-fix.patch +++ ps/trunk/libraries/source/nvtt/patches/arm-fix.patch @@ -1,21 +0,0 @@ -Index: src/src/nvcore/nvcore.h -=================================================================== ---- src/src/nvcore/nvcore.h (revision 13633) -+++ src/src/nvcore/nvcore.h (revision 13634) -@@ -67,6 +67,7 @@ - // NV_CPU_X86 - // NV_CPU_X86_64 - // NV_CPU_PPC -+// NV_CPU_ARM - - #define NV_CPU_STRING POSH_CPU_STRING - -@@ -76,6 +77,8 @@ - # define NV_CPU_X86 1 - #elif defined POSH_CPU_PPC - # define NV_CPU_PPC 1 -+#elif defined POSH_CPU_STRONGARM -+# define NV_CPU_ARM 1 - #else - # error "Unsupported CPU" - #endif Index: ps/trunk/libraries/source/nvtt/patches/arm64-fix.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/arm64-fix.patch +++ ps/trunk/libraries/source/nvtt/patches/arm64-fix.patch @@ -1,67 +0,0 @@ -Patch from http://trac.wildfiregames.com/ticket/3344 -Backport from upstream https://github.com/castano/nvidia-texture-tools/commit/58617584d4d2541ff9fcfe23a9a492af86b11efb - -Index: src/src/nvcore/Debug.cpp -=================================================================== ---- src/src/nvcore/Debug.cpp (revision 16870) -+++ src/src/nvcore/Debug.cpp (working copy) -@@ -232,6 +232,9 @@ - # elif NV_CPU_PPC - ucontext_t * ucp = (ucontext_t *)secret; - return (void *) ucp->uc_mcontext.regs->nip; -+# elif NV_CPU_AARCH64 -+ ucontext_t * ucp = (ucontext_t *)secret; -+ return (void *) ucp->uc_mcontext.pc; - # endif - # endif - -Index: src/src/nvcore/nvcore.h -=================================================================== ---- src/src/nvcore/nvcore.h (revision 16870) -+++ src/src/nvcore/nvcore.h (working copy) -@@ -68,6 +68,7 @@ - // NV_CPU_X86_64 - // NV_CPU_PPC - // NV_CPU_ARM -+// NV_CPU_AARCH64 - - #define NV_CPU_STRING POSH_CPU_STRING - -@@ -79,6 +80,8 @@ - # define NV_CPU_PPC 1 - #elif defined POSH_CPU_STRONGARM - # define NV_CPU_ARM 1 -+#elif defined POSH_CPU_AARCH64 -+# define NV_CPU_AARCH64 1 - #else - # error "Unsupported CPU" - #endif -Index: src/src/nvcore/poshlib/posh.h -=================================================================== ---- src/src/nvcore/poshlib/posh.h (revision 16870) -+++ src/src/nvcore/poshlib/posh.h (working copy) -@@ -485,6 +485,11 @@ - # define POSH_CPU_STRING "ARM" - #endif - -+#if defined __aarch64__ -+# define POSH_CPU_AARCH64 1 -+# define POSH_CPU_STRING "ARM64" -+#endif -+ - #if defined mips || defined __mips__ || defined __MIPS__ || defined _MIPS - # define POSH_CPU_MIPS 1 - # if defined _R5900 -@@ -658,7 +663,7 @@ - ** the MIPS series, so we have to be careful about those. 
- ** ---------------------------------------------------------------------------- - */ --#if defined POSH_CPU_X86 || defined POSH_CPU_AXP || defined POSH_CPU_STRONGARM || defined POSH_OS_WIN32 || defined POSH_OS_WINCE || defined __MIPSEL__ -+#if defined POSH_CPU_X86 || defined POSH_CPU_AXP || defined POSH_CPU_STRONGARM || defined POSH_CPU_AARCH64 || defined POSH_OS_WIN32 || defined POSH_OS_WINCE || defined __MIPSEL__ - # define POSH_ENDIAN_STRING "little" - # define POSH_LITTLE_ENDIAN 1 - #else -Index: libraries/source/spidermonkey/mozjs-31.2.0.rc0.tar.bz2 -=================================================================== -Cannot display: file marked as a binary type. -svn:mime-type = application/octet-stream Index: ps/trunk/libraries/source/nvtt/patches/clang-cpp11-error.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/clang-cpp11-error.patch +++ ps/trunk/libraries/source/nvtt/patches/clang-cpp11-error.patch @@ -1,67 +0,0 @@ -Index: src/src/nvimage/ImageIO.cpp -=================================================================== ---- src/src/nvimage/ImageIO.cpp (revision 16371) -+++ src/src/nvimage/ImageIO.cpp (working copy) -@@ -132,13 +132,13 @@ - { - nvDebugCheck(fileName != NULL); - -- StdInputStream stream(fileName); -- -- if (stream.isError()) { -- return false; -- } -- -- return loadFloat(fileName, stream); -+ StdInputStream stream(fileName); -+ -+ if (stream.isError()) { -+ return NULL; -+ } -+ -+ return loadFloat(fileName, stream); - } - - FloatImage * nv::ImageIO::loadFloat(const char * fileName, Stream & s) -@@ -230,13 +230,13 @@ - case TGA_TYPE_RLE_INDEXED: - rle = true; - // no break is intended! -- case TGA_TYPE_INDEXED: -- if( tga.colormap_type!=1 || tga.colormap_size!=24 || tga.colormap_length>256 ) { -- nvDebug( "*** ImageIO::loadTGA: Error, only 24bit paletted images are supported.\n" ); -- return false; -- } -- pal = true; -- break; -+ case TGA_TYPE_INDEXED: -+ if( tga.colormap_type!=1 || tga.colormap_size!=24 || tga.colormap_length>256 ) { -+ nvDebug( "*** ImageIO::loadTGA: Error, only 24bit paletted images are supported.\n" ); -+ return NULL; -+ } -+ pal = true; -+ break; - - case TGA_TYPE_RLE_RGB: - rle = true; -@@ -251,13 +251,13 @@ - case TGA_TYPE_GREY: - grey = true; - break; -- -- default: -- nvDebug( "*** ImageIO::loadTGA: Error, unsupported image type.\n" ); -- return false; -- } -- -- const uint pixel_size = (tga.pixel_size/8); -+ -+ default: -+ nvDebug( "*** ImageIO::loadTGA: Error, unsupported image type.\n" ); -+ return NULL; -+ } -+ -+ const uint pixel_size = (tga.pixel_size/8); - nvDebugCheck(pixel_size <= 4); - - const uint size = tga.width * tga.height * pixel_size; Index: ps/trunk/libraries/source/nvtt/patches/cmake-build.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/cmake-build.patch +++ ps/trunk/libraries/source/nvtt/patches/cmake-build.patch @@ -0,0 +1,243 @@ +--- + CMakeLists.txt | 16 ++++---- + src/CMakeLists.txt | 82 ++++++++++++++++++------------------- + src/nvcore/CMakeLists.txt | 6 +-- + src/nvimage/CMakeLists.txt | 6 +-- + src/nvmath/CMakeLists.txt | 6 +-- + src/nvthread/CMakeLists.txt | 6 +-- + src/nvtt/CMakeLists.txt | 6 +-- + 7 files changed, 64 insertions(+), 64 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index ab4dcb6..9c80369 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -11,19 +11,19 @@ SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${NV_CMAKE_DIR}") + 
#ENDIF(CMAKE_COMPILER_IS_GNUCC) + set (CMAKE_CXX_STANDARD 11) + +-IF(WIN32) ++#IF(WIN32) + # gnuwin32 paths: +- SET(GNUWIN32_PATH "${NV_SOURCE_DIR}/extern/gnuwin32") +- SET(CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH} "${GNUWIN32_PATH}/include") +- SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} "${GNUWIN32_PATH}/lib") ++ #SET(GNUWIN32_PATH "${NV_SOURCE_DIR}/extern/gnuwin32") ++ #SET(CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH} "${GNUWIN32_PATH}/include") ++ #SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} "${GNUWIN32_PATH}/lib") + + # Set GLUT path: +- SET(GLUT_ROOT_DIR "${NV_SOURCE_DIR}/extern/glut") ++ #SET(GLUT_ROOT_DIR "${NV_SOURCE_DIR}/extern/glut") + + # Set FreeImage path: +- SET(FREEIMAGE_ROOT_DIR "${NV_SOURCE_DIR}/extern/FreeImage") +- +-ENDIF(WIN32) ++ #SET(FREEIMAGE_ROOT_DIR "${NV_SOURCE_DIR}/extern/FreeImage") ++ ++#ENDIF(WIN32) + + INCLUDE(${NV_CMAKE_DIR}/OptimalOptions.cmake) + MESSAGE(STATUS "Setting optimal options") +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index f64b263..ec97402 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -11,13 +11,16 @@ + SUBDIRS(bc6h) + SUBDIRS(bc7) + ++# Make PNG optional (we disable it on macOS) ++SET(PNG TRUE CACHE BOOL "") ++ + # OpenGL +-#INCLUDE(FindOpenGL) +-#IF(OPENGL_FOUND) +-# MESSAGE(STATUS "Looking for OpenGL - found") +-#ELSE(OPENGL_FOUND) +-# MESSAGE(STATUS "Looking for OpenGL - not found") +-#ENDIF(OPENGL_FOUND) ++INCLUDE(FindOpenGL) ++IF(OPENGL_FOUND) ++ MESSAGE(STATUS "Looking for OpenGL - found") ++ELSE(OPENGL_FOUND) ++ MESSAGE(STATUS "Looking for OpenGL - not found") ++ENDIF(OPENGL_FOUND) + + # GLUT + #INCLUDE(FindGLUT) +@@ -28,12 +31,12 @@ + #ENDIF(GLUT_FOUND) + + # DirectX +-#INCLUDE(${NV_CMAKE_DIR}/FindDirectX.cmake) +-#IF(DX10_FOUND) +-# MESSAGE(STATUS "Looking for DirectX - found") +-#ELSE(DX10_FOUND) +-# MESSAGE(STATUS "Looking for DirectX - not found") +-#ENDIF(DX10_FOUND) ++INCLUDE(${NV_CMAKE_DIR}/FindDirectX.cmake) ++IF(DX10_FOUND) ++ MESSAGE(STATUS "Looking for DirectX - found") ++ELSE(DX10_FOUND) ++ MESSAGE(STATUS "Looking for DirectX - not found") ++ENDIF(DX10_FOUND) + + # GLEW + #INCLUDE(${NV_CMAKE_DIR}/FindGLEW.cmake) +@@ -53,18 +56,18 @@ + + # CUDA + #FIND_PACKAGE(CUDA) +-IF(CUDA_FOUND) +- IF(MINGW) +- MESSAGE(STATUS "Looking for CUDA - not supported on MinGW") +- UNSET(CUDA_FOUND) +- ENDIF(MINGW) +- IF(CUDA_FOUND) +- SET(HAVE_CUDA ${CUDA_FOUND} CACHE BOOL "Set to TRUE if CUDA is found, FALSE otherwise") +- MESSAGE(STATUS "Looking for CUDA - found") +- ENDIF(CUDA_FOUND) +-ELSE(CUDA_FOUND) +- MESSAGE(STATUS "Looking for CUDA - not found") +-ENDIF(CUDA_FOUND) ++#IF(CUDA_FOUND) ++# IF(MINGW) ++# MESSAGE(STATUS "Looking for CUDA - not supported on MinGW") ++# UNSET(CUDA_FOUND) ++# ENDIF(MINGW) ++# IF(CUDA_FOUND) ++# SET(HAVE_CUDA ${CUDA_FOUND} CACHE BOOL "Set to TRUE if CUDA is found, FALSE otherwise") ++# MESSAGE(STATUS "Looking for CUDA - found") ++# ENDIF(CUDA_FOUND) ++#ELSE(CUDA_FOUND) ++# MESSAGE(STATUS "Looking for CUDA - not found") ++#ENDIF(CUDA_FOUND) + + # Maya + #INCLUDE(${NV_CMAKE_DIR}/FindMaya.cmake) +@@ -94,13 +97,15 @@ + #ENDIF(JPEG_FOUND) + + # PNG +-#INCLUDE(FindPNG) +-#IF(PNG_FOUND) +-# SET(HAVE_PNG ${PNG_FOUND} CACHE BOOL "Set to TRUE if PNG is found, FALSE otherwise") +-# MESSAGE(STATUS "Looking for PNG - found") +-#ELSE(PNG_FOUND) +-# MESSAGE(STATUS "Looking for PNG - not found") +-#ENDIF(PNG_FOUND) ++IF(PNG) ++ INCLUDE(FindPNG) ++ IF(PNG_FOUND) ++ SET(HAVE_PNG ${PNG_FOUND} CACHE BOOL "Set to TRUE if PNG is found, FALSE otherwise") ++ MESSAGE(STATUS "Looking for PNG - found") ++ 
ELSE(PNG_FOUND) ++ MESSAGE(STATUS "Looking for PNG - not found") ++ ENDIF(PNG_FOUND) ++ENDIF(PNG) + + # TIFF + #SET(TIFF_NAMES libtiff) +@@ -122,15 +127,15 @@ + #ENDIF(OPENEXR_FOUND) + + # OpenMP +-INCLUDE(FindOpenMP) +-IF(OPENMP_FOUND) +- SET(HAVE_OPENMP ${OPENMP_FOUND} CACHE BOOL "Set to TRUE if OpenMP is found, FALSE otherwise") +- MESSAGE(STATUS "Looking for OpenMP - found") +- SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") +- SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +-ELSE(OPENMP_FOUND) +- MESSAGE(STATUS "Looking for OpenMP - not found") +-ENDIF(OPENMP_FOUND) ++#INCLUDE(FindOpenMP) ++#IF(OPENMP_FOUND) ++# SET(HAVE_OPENMP ${OPENMP_FOUND} CACHE BOOL "Set to TRUE if OpenMP is found, FALSE otherwise") ++# MESSAGE(STATUS "Looking for OpenMP - found") ++# SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") ++# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") ++#ELSE(OPENMP_FOUND) ++# MESSAGE(STATUS "Looking for OpenMP - not found") ++#ENDIF(OPENMP_FOUND) + + # Threads + FIND_PACKAGE(Threads REQUIRED) + MESSAGE(STATUS "Use thread library: ${CMAKE_THREAD_LIBS_INIT}") +diff --git a/src/nvcore/CMakeLists.txt b/src/nvcore/CMakeLists.txt +index 3dfcb5d..a0bec38 100644 +--- a/src/nvcore/CMakeLists.txt ++++ b/src/nvcore/CMakeLists.txt +@@ -44,6 +44,6 @@ if (CMAKE_SYSTEM_NAME MATCHES "NetBSD" OR CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + endif() + + INSTALL(TARGETS nvcore +- RUNTIME DESTINATION bin +- LIBRARY DESTINATION lib +- ARCHIVE DESTINATION lib/static) ++ RUNTIME DESTINATION ${BINDIR} ++ LIBRARY DESTINATION ${LIBDIR} ++ ARCHIVE DESTINATION ${LIBDIR}) +diff --git a/src/nvimage/CMakeLists.txt b/src/nvimage/CMakeLists.txt +index dce627d..420d9a6 100644 +--- a/src/nvimage/CMakeLists.txt ++++ b/src/nvimage/CMakeLists.txt +@@ -56,7 +56,7 @@ ENDIF(NVIMAGE_SHARED) + TARGET_LINK_LIBRARIES(nvimage ${LIBS} nvcore posh bc6h bc7 nvmath) + + INSTALL(TARGETS nvimage +- RUNTIME DESTINATION bin +- LIBRARY DESTINATION lib +- ARCHIVE DESTINATION lib/static) ++ RUNTIME DESTINATION ${BINDIR} ++ LIBRARY DESTINATION ${LIBDIR} ++ ARCHIVE DESTINATION ${LIBDIR}) + +diff --git a/src/nvmath/CMakeLists.txt b/src/nvmath/CMakeLists.txt +index abeb05f..e63df63 100644 +--- a/src/nvmath/CMakeLists.txt ++++ b/src/nvmath/CMakeLists.txt +@@ -28,6 +28,6 @@ ENDIF(NVMATH_SHARED) + TARGET_LINK_LIBRARIES(nvmath ${LIBS} nvcore) + + INSTALL(TARGETS nvmath +- RUNTIME DESTINATION bin +- LIBRARY DESTINATION lib +- ARCHIVE DESTINATION lib/static) ++ RUNTIME DESTINATION ${BINDIR} ++ LIBRARY DESTINATION ${LIBDIR} ++ ARCHIVE DESTINATION ${LIBDIR}) +diff --git a/src/nvthread/CMakeLists.txt b/src/nvthread/CMakeLists.txt +index 15dbc4e..a2b3654 100644 +--- a/src/nvthread/CMakeLists.txt ++++ b/src/nvthread/CMakeLists.txt +@@ -23,6 +23,6 @@ ENDIF(NVTHREAD_SHARED) + TARGET_LINK_LIBRARIES(nvthread ${LIBS} nvcore) + + INSTALL(TARGETS nvthread +- RUNTIME DESTINATION bin +- LIBRARY DESTINATION lib +- ARCHIVE DESTINATION lib/static) ++ RUNTIME DESTINATION ${BINDIR} ++ LIBRARY DESTINATION ${LIBDIR} ++ ARCHIVE DESTINATION ${LIBDIR}) +diff --git a/src/nvtt/CMakeLists.txt b/src/nvtt/CMakeLists.txt +index 7923159..df77c86 100644 +--- a/src/nvtt/CMakeLists.txt ++++ b/src/nvtt/CMakeLists.txt +@@ -50,9 +50,9 @@ ENDIF(NVTT_SHARED) + TARGET_LINK_LIBRARIES(nvtt ${LIBS} nvcore nvimage nvthread squish bc6h bc7 nvmath) + + INSTALL(TARGETS nvtt +- RUNTIME DESTINATION bin +- LIBRARY DESTINATION lib +- ARCHIVE DESTINATION lib/static) ++ RUNTIME DESTINATION ${BINDIR} ++ LIBRARY DESTINATION ${LIBDIR} ++ ARCHIVE DESTINATION 
${LIBDIR}) + + INSTALL(FILES nvtt.h DESTINATION include/nvtt) + +-- Index: ps/trunk/libraries/source/nvtt/patches/cmake-devflags.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/cmake-devflags.patch +++ ps/trunk/libraries/source/nvtt/patches/cmake-devflags.patch @@ -1,176 +0,0 @@ -From: hasufell -Date: Wed Apr 11 21:49:58 UTC 2012 -Subject: various cmake fixes - -fix hardcoded install paths for BINDIR and LIBDIR -make cg, cuda, glew, glut and openexr controllable and not automagic - ---- src/nvcore/CMakeLists.txt -+++ src/nvcore/CMakeLists.txt -@@ -42,6 +42,6 @@ - TARGET_LINK_LIBRARIES(nvcore ${LIBS}) - - INSTALL(TARGETS nvcore -- RUNTIME DESTINATION bin -- LIBRARY DESTINATION lib -- ARCHIVE DESTINATION lib/static) -+ RUNTIME DESTINATION ${BINDIR} -+ LIBRARY DESTINATION ${LIBDIR} -+ ARCHIVE DESTINATION ${LIBDIR}) ---- src/nvimage/CMakeLists.txt -+++ src/nvimage/CMakeLists.txt -@@ -62,7 +62,7 @@ - TARGET_LINK_LIBRARIES(nvimage ${LIBS} nvcore nvmath posh) - - INSTALL(TARGETS nvimage -- RUNTIME DESTINATION bin -- LIBRARY DESTINATION lib -- ARCHIVE DESTINATION lib/static) -+ RUNTIME DESTINATION ${BINDIR} -+ LIBRARY DESTINATION ${LIBDIR} -+ ARCHIVE DESTINATION ${LIBDIR}) - ---- src/nvmath/CMakeLists.txt -+++ src/nvmath/CMakeLists.txt -@@ -28,6 +28,6 @@ - TARGET_LINK_LIBRARIES(nvmath ${LIBS} nvcore) - - INSTALL(TARGETS nvmath -- RUNTIME DESTINATION bin -- LIBRARY DESTINATION lib -- ARCHIVE DESTINATION lib/static) -+ RUNTIME DESTINATION ${BINDIR} -+ LIBRARY DESTINATION ${LIBDIR} -+ ARCHIVE DESTINATION ${LIBDIR}) ---- src/nvtt/CMakeLists.txt -+++ src/nvtt/CMakeLists.txt -@@ -53,9 +53,9 @@ - TARGET_LINK_LIBRARIES(nvtt ${LIBS} nvcore nvmath nvimage squish) - - INSTALL(TARGETS nvtt -- RUNTIME DESTINATION bin -- LIBRARY DESTINATION lib -- ARCHIVE DESTINATION lib/static) -+ RUNTIME DESTINATION ${BINDIR} -+ LIBRARY DESTINATION ${LIBDIR} -+ ARCHIVE DESTINATION ${LIBDIR}) - - INSTALL(FILES nvtt.h DESTINATION include/nvtt) - ---- src/CMakeLists.txt -+++ src/CMakeLists.txt -@@ -5,6 +5,13 @@ - SUBDIRS(nvtt) - - INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) -+ -+# initial variables -+SET(GLUT TRUE CACHE BOOL "") -+SET(GLEW TRUE CACHE BOOL "") -+SET(CG TRUE CACHE BOOL "") -+SET(CUDA TRUE CACHE BOOL "") -+SET(OPENEXR TRUE CACHE BOOL "") - - # OpenGL - INCLUDE(FindOpenGL) -@@ -15,13 +22,15 @@ - ENDIF(OPENGL_FOUND) - - # GLUT --INCLUDE(${NV_CMAKE_DIR}/FindGLUT.cmake) --#INCLUDE(FindGLUT) --IF(GLUT_FOUND) -- MESSAGE(STATUS "Looking for GLUT - found") --ELSE(GLUT_FOUND) -- MESSAGE(STATUS "Looking for GLUT - not found") --ENDIF(GLUT_FOUND) -+IF(GLUT) -+ INCLUDE(${NV_CMAKE_DIR}/FindGLUT.cmake) -+ #INCLUDE(FindGLUT) -+ IF(GLUT_FOUND) -+ MESSAGE(STATUS "Looking for GLUT - found") -+ ELSE(GLUT_FOUND) -+ MESSAGE(STATUS "Looking for GLUT - not found") -+ ENDIF(GLUT_FOUND) -+ENDIF(GLUT) - - # DirectX - INCLUDE(${NV_CMAKE_DIR}/FindDirectX.cmake) -@@ -32,29 +41,35 @@ - ENDIF(DX10_FOUND) - - # GLEW --INCLUDE(${NV_CMAKE_DIR}/FindGLEW.cmake) --IF(GLEW_FOUND) -- MESSAGE(STATUS "Looking for GLEW - found") --ELSE(GLEW_FOUND) -- MESSAGE(STATUS "Looking for GLEW - not found") --ENDIF(GLEW_FOUND) -+IF(GLEW) -+ INCLUDE(${NV_CMAKE_DIR}/FindGLEW.cmake) -+ IF(GLEW_FOUND) -+ MESSAGE(STATUS "Looking for GLEW - found") -+ ELSE(GLEW_FOUND) -+ MESSAGE(STATUS "Looking for GLEW - not found") -+ ENDIF(GLEW_FOUND) -+ENDIF(GLEW) - - # Cg --INCLUDE(${NV_CMAKE_DIR}/FindCg.cmake) --IF(CG_FOUND) -- MESSAGE(STATUS "Looking for Cg - found") 
--ELSE(CG_FOUND) -- MESSAGE(STATUS "Looking for Cg - not found") --ENDIF(CG_FOUND) -+IF(CG) -+ INCLUDE(${NV_CMAKE_DIR}/FindCg.cmake) -+ IF(CG_FOUND) -+ MESSAGE(STATUS "Looking for Cg - found") -+ ELSE(CG_FOUND) -+ MESSAGE(STATUS "Looking for Cg - not found") -+ ENDIF(CG_FOUND) -+ENDIF(CG) - - # CUDA --INCLUDE(${NV_CMAKE_DIR}/FindCUDA.cmake) --IF(CUDA_FOUND) -- SET(HAVE_CUDA ${CUDA_FOUND} CACHE BOOL "Set to TRUE if CUDA is found, FALSE otherwise") -- MESSAGE(STATUS "Looking for CUDA - found") --ELSE(CUDA_FOUND) -- MESSAGE(STATUS "Looking for CUDA - not found") --ENDIF(CUDA_FOUND) -+IF(CUDA) -+ INCLUDE(${NV_CMAKE_DIR}/FindCUDA.cmake) -+ IF(CUDA_FOUND) -+ SET(HAVE_CUDA ${CUDA_FOUND} CACHE BOOL "Set to TRUE if CUDA is found, FALSE otherwise") -+ MESSAGE(STATUS "Looking for CUDA - found") -+ ELSE(CUDA_FOUND) -+ MESSAGE(STATUS "Looking for CUDA - not found") -+ ENDIF(CUDA_FOUND) -+ENDIF(CUDA) - - # Maya - INCLUDE(${NV_CMAKE_DIR}/FindMaya.cmake) -@@ -93,13 +108,15 @@ - ENDIF(TIFF_FOUND) - - # OpenEXR --INCLUDE(${NV_CMAKE_DIR}/FindOpenEXR.cmake) --IF(OPENEXR_FOUND) -- SET(HAVE_OPENEXR ${OPENEXR_FOUND} CACHE BOOL "Set to TRUE if OpenEXR is found, FALSE otherwise") -- MESSAGE(STATUS "Looking for OpenEXR - found") --ELSE(OPENEXR_FOUND) -- MESSAGE(STATUS "Looking for OpenEXR - not found") --ENDIF(OPENEXR_FOUND) -+IF(OPENEXR) -+ INCLUDE(${NV_CMAKE_DIR}/FindOpenEXR.cmake) -+ IF(OPENEXR_FOUND) -+ SET(HAVE_OPENEXR ${OPENEXR_FOUND} CACHE BOOL "Set to TRUE if OpenEXR is found, FALSE otherwise") -+ MESSAGE(STATUS "Looking for OpenEXR - found") -+ ELSE(OPENEXR_FOUND) -+ MESSAGE(STATUS "Looking for OpenEXR - not found") -+ ENDIF(OPENEXR_FOUND) -+ENDIF(OPENEXR) - - # Qt - FIND_PACKAGE(Qt4) Index: ps/trunk/libraries/source/nvtt/patches/cmake-devflags2.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/cmake-devflags2.patch +++ ps/trunk/libraries/source/nvtt/patches/cmake-devflags2.patch @@ -1,101 +0,0 @@ -Index: src/CMakeLists.txt -=================================================================== ---- src/CMakeLists.txt (revision 13168) -+++ src/CMakeLists.txt (working copy) -@@ -9,12 +9,15 @@ - # initial variables - SET(GLUT TRUE CACHE BOOL "") - SET(GLEW TRUE CACHE BOOL "") --SET(CG TRUE CACHE BOOL "") --SET(CUDA TRUE CACHE BOOL "") --SET(OPENEXR TRUE CACHE BOOL "") -- --# OpenGL --INCLUDE(FindOpenGL) -+SET(CG TRUE CACHE BOOL "") -+SET(CUDA TRUE CACHE BOOL "") -+SET(OPENEXR TRUE CACHE BOOL "") -+SET(JPEG TRUE CACHE BOOL "") -+SET(PNG TRUE CACHE BOOL "") -+SET(TIFF TRUE CACHE BOOL "") -+ -+# OpenGL -+INCLUDE(FindOpenGL) - IF(OPENGL_FOUND) - MESSAGE(STATUS "Looking for OpenGL - found") - ELSE(OPENGL_FOUND) -@@ -78,37 +81,43 @@ - MESSAGE(STATUS "Looking for Maya - found") - ELSE(MAYA_FOUND) - MESSAGE(STATUS "Looking for Maya - not found") --ENDIF(MAYA_FOUND) -- --# JPEG --INCLUDE(FindJPEG) --IF(JPEG_FOUND) -- SET(HAVE_JPEG ${JPEG_FOUND} CACHE BOOL "Set to TRUE if JPEG is found, FALSE otherwise") -- MESSAGE(STATUS "Looking for JPEG - found") --ELSE(JPEG_FOUND) -- MESSAGE(STATUS "Looking for JPEG - not found") --ENDIF(JPEG_FOUND) -- --# PNG --INCLUDE(FindPNG) --IF(PNG_FOUND) -- SET(HAVE_PNG ${PNG_FOUND} CACHE BOOL "Set to TRUE if PNG is found, FALSE otherwise") -- MESSAGE(STATUS "Looking for PNG - found") --ELSE(PNG_FOUND) -- MESSAGE(STATUS "Looking for PNG - not found") --ENDIF(PNG_FOUND) -- --# TIFF --INCLUDE(FindTIFF) --IF(TIFF_FOUND) -- SET(HAVE_TIFF ${TIFF_FOUND} CACHE BOOL "Set to TRUE if TIFF is found, FALSE otherwise") -- MESSAGE(STATUS 
"Looking for TIFF - found") --ELSE(TIFF_FOUND) -- MESSAGE(STATUS "Looking for TIFF - not found") --ENDIF(TIFF_FOUND) -- --# OpenEXR --IF(OPENEXR) -+ENDIF(MAYA_FOUND) -+ -+# JPEG -+IF(JPEG) -+ INCLUDE(FindJPEG) -+ IF(JPEG_FOUND) -+ SET(HAVE_JPEG ${JPEG_FOUND} CACHE BOOL "Set to TRUE if JPEG is found, FALSE otherwise") -+ MESSAGE(STATUS "Looking for JPEG - found") -+ ELSE(JPEG_FOUND) -+ MESSAGE(STATUS "Looking for JPEG - not found") -+ ENDIF(JPEG_FOUND) -+ENDIF(JPEG) -+ -+# PNG -+IF(PNG) -+ INCLUDE(FindPNG) -+ IF(PNG_FOUND) -+ SET(HAVE_PNG ${PNG_FOUND} CACHE BOOL "Set to TRUE if PNG is found, FALSE otherwise") -+ MESSAGE(STATUS "Looking for PNG - found") -+ ELSE(PNG_FOUND) -+ MESSAGE(STATUS "Looking for PNG - not found") -+ ENDIF(PNG_FOUND) -+ENDIF(PNG) -+ -+# TIFF -+IF(TIFF) -+ INCLUDE(FindTIFF) -+ IF(TIFF_FOUND) -+ SET(HAVE_TIFF ${TIFF_FOUND} CACHE BOOL "Set to TRUE if TIFF is found, FALSE otherwise") -+ MESSAGE(STATUS "Looking for TIFF - found") -+ ELSE(TIFF_FOUND) -+ MESSAGE(STATUS "Looking for TIFF - not found") -+ ENDIF(TIFF_FOUND) -+ENDIF(TIFF) -+ -+# OpenEXR -+IF(OPENEXR) - INCLUDE(${NV_CMAKE_DIR}/FindOpenEXR.cmake) - IF(OPENEXR_FOUND) - SET(HAVE_OPENEXR ${OPENEXR_FOUND} CACHE BOOL "Set to TRUE if OpenEXR is found, FALSE otherwise") Index: ps/trunk/libraries/source/nvtt/patches/cmake-freebsd.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/cmake-freebsd.patch +++ ps/trunk/libraries/source/nvtt/patches/cmake-freebsd.patch @@ -1,15 +1,22 @@ -Index: nvtt/CMakeLists.txt -=================================================================== ---- nvtt/CMakeLists.txt (revision 10975) -+++ nvtt/CMakeLists.txt (working copy) -@@ -44,6 +44,10 @@ +--- + src/nvtt/CMakeLists.txt | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/src/nvtt/CMakeLists.txt b/src/nvtt/CMakeLists.txt +index df77c86..e543807 100644 +--- a/src/nvtt/CMakeLists.txt ++++ b/src/nvtt/CMakeLists.txt +@@ -41,7 +41,11 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + ADD_DEFINITIONS(-DNVTT_EXPORTS) - IF(NVTT_SHARED) -+ IF(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") -+ SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,origin") -+ ENDIF(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") +-IF(NVTT_SHARED) ++IF(NVTT_SHARED) ++ IF(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") ++ SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,origin") ++ ENDIF(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + - ADD_DEFINITIONS(-DNVTT_SHARED=1) - ADD_LIBRARY(nvtt SHARED ${NVTT_SRCS}) + ADD_LIBRARY(nvtt SHARED ${NVTT_SRCS}) ELSE(NVTT_SHARED) + ADD_LIBRARY(nvtt ${NVTT_SRCS}) +-- Index: ps/trunk/libraries/source/nvtt/patches/cmake-noqt4.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/cmake-noqt4.patch +++ ps/trunk/libraries/source/nvtt/patches/cmake-noqt4.patch @@ -1,14 +0,0 @@ -Index: src/src/CMakeLists.txt -=================================================================== ---- src/src/CMakeLists.txt (revision 13170) -+++ src/src/CMakeLists.txt (revision 13635) -@@ -128,7 +128,8 @@ - ENDIF(OPENEXR) - - # Qt --FIND_PACKAGE(Qt4) -+# We don't actually use this and it requires having Qt4 installed, so why is this in here? 
-+#FIND_PACKAGE(Qt4) - - # Threads - FIND_PACKAGE(Threads REQUIRED) Index: ps/trunk/libraries/source/nvtt/patches/gcc47-unistd.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/gcc47-unistd.patch +++ ps/trunk/libraries/source/nvtt/patches/gcc47-unistd.patch @@ -1,23 +0,0 @@ -Index: src/src/nvcore/Debug.cpp -=================================================================== ---- src/src/nvcore/Debug.cpp (revision 11373) -+++ src/src/nvcore/Debug.cpp (working copy) -@@ -27,6 +27,10 @@ - # include - #endif - -+#if NV_OS_LINUX || NV_OS_DARWIN || NV_OS_FREEBSD -+# include // getpid -+#endif -+ - #if NV_OS_LINUX && defined(HAVE_EXECINFO_H) - # include // backtrace - # if NV_CC_GNUC // defined(HAVE_CXXABI_H) -@@ -35,7 +39,6 @@ - #endif - - #if NV_OS_DARWIN || NV_OS_FREEBSD --# include // getpid - # include - # include // sysctl - # include Index: ps/trunk/libraries/source/nvtt/patches/gcc6-fix.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/gcc6-fix.patch +++ ps/trunk/libraries/source/nvtt/patches/gcc6-fix.patch @@ -1,31 +0,0 @@ -Index: libraries/source/nvtt/src/src/nvimage/ImageIO.cpp -=================================================================== ---- libraries/source/nvtt/src/src/nvimage/ImageIO.cpp (revision 18164) -+++ libraries/source/nvtt/src/src/nvimage/ImageIO.cpp (working copy) -@@ -621,7 +621,7 @@ - png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); - if (png_ptr == NULL) { - // nvDebug( "*** LoadPNG: Error allocating read buffer in file '%s'.\n", name ); -- return false; -+ return NULL; - } - - // Allocate/initialize a memory block for the image information -@@ -629,7 +629,7 @@ - if (info_ptr == NULL) { - png_destroy_read_struct(&png_ptr, NULL, NULL); - // nvDebug( "*** LoadPNG: Error allocating image information for '%s'.\n", name ); -- return false; -+ return NULL; - } - - // Set up the error handling -@@ -636,7 +636,7 @@ - if (setjmp(png_jmpbuf(png_ptr))) { - png_destroy_read_struct(&png_ptr, &info_ptr, NULL); - // nvDebug( "*** LoadPNG: Error reading png file '%s'.\n", name ); -- return false; -+ return NULL; - } - - // Set up the I/O functions. 
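(Context for the gcc6-fix.patch deleted above: it replaced `return false;` with `return NULL;` in pointer-returning PNG helpers, because GCC 6 defaults to a newer C++ dialect in which a boolean no longer converts implicitly to a pointer. The patch is dropped with the move to NVTT 2.1.1. A minimal, hypothetical C++ sketch of the pattern and its fix follows; the names are illustrative and not taken from NVTT.)

    // Stand-in for libpng's png_struct; illustrative only.
    struct png_stub { int dummy; };

    png_stub* load_png_stub(bool header_ok)
    {
        if (!header_ok)
            return nullptr; // the old code did `return false;`, which GCC >= 6 rejects;
                            // the removed patch returned NULL instead
        static png_stub s;
        return &s;
    }

    int main() { return load_png_stub(true) != nullptr ? 0 : 1; }
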
Index: ps/trunk/libraries/source/nvtt/patches/issue139.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/issue139.patch +++ ps/trunk/libraries/source/nvtt/patches/issue139.patch @@ -1,55 +0,0 @@ -Index: src/src/nvmath/Vector.h -=================================================================== ---- src/src/nvmath/Vector.h (revision 8311) -+++ src/src/nvmath/Vector.h (working copy) -@@ -68,7 +68,7 @@ - scalar y() const; - scalar z() const; - -- const Vector2 & xy() const; -+ Vector2 xy() const; - - scalar component(uint idx) const; - -@@ -111,8 +111,8 @@ - scalar z() const; - scalar w() const; - -- const Vector2 & xy() const; -- const Vector3 & xyz() const; -+ Vector2 xy() const; -+ Vector3 xyz() const; - - scalar component(uint idx) const; - -@@ -231,9 +231,9 @@ - inline scalar Vector3::y() const { return m_y; } - inline scalar Vector3::z() const { return m_z; } - --inline const Vector2 & Vector3::xy() const -+inline Vector2 Vector3::xy() const - { -- return *(Vector2 *)this; -+ return Vector2(m_x, m_y); - } - - inline scalar Vector3::component(uint idx) const -@@ -332,14 +332,14 @@ - inline scalar Vector4::z() const { return m_z; } - inline scalar Vector4::w() const { return m_w; } - --inline const Vector2 & Vector4::xy() const -+inline Vector2 Vector4::xy() const - { -- return *(Vector2 *)this; -+ return Vector2(m_x, m_y); - } - --inline const Vector3 & Vector4::xyz() const -+inline Vector3 Vector4::xyz() const - { -- return *(Vector3 *)this; -+ return Vector3(m_x, m_y, m_z); - } - - inline scalar Vector4::component(uint idx) const Index: ps/trunk/libraries/source/nvtt/patches/issue176.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/issue176.patch +++ ps/trunk/libraries/source/nvtt/patches/issue176.patch @@ -1,105 +0,0 @@ -Index: src/src/nvcore/nvcore.h -=================================================================== ---- src/src/nvcore/nvcore.h (revision 11943) -+++ src/src/nvcore/nvcore.h (working copy) -@@ -41,6 +41,9 @@ - #elif defined POSH_OS_FREEBSD - # define NV_OS_FREEBSD 1 - # define NV_OS_UNIX 1 -+#elif defined POSH_OS_OPENBSD -+# define NV_OS_OPENBSD 1 -+# define NV_OS_UNIX 1 - #elif defined POSH_OS_CYGWIN32 - # define NV_OS_CYGWIN 1 - #elif defined POSH_OS_MINGW -@@ -178,7 +181,7 @@ - #elif NV_CC_GNUC - # if NV_OS_LINUX - # include "DefsGnucLinux.h" --# elif NV_OS_DARWIN || NV_OS_FREEBSD -+# elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD - # include "DefsGnucDarwin.h" - # elif NV_OS_MINGW - # include "DefsGnucWin32.h" -Index: src/src/nvcore/Debug.cpp -=================================================================== ---- src/src/nvcore/Debug.cpp (revision 11943) -+++ src/src/nvcore/Debug.cpp (working copy) -@@ -27,7 +27,7 @@ - # include - #endif - --#if NV_OS_LINUX || NV_OS_DARWIN || NV_OS_FREEBSD -+#if NV_OS_LINUX || NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD - # include // getpid - #endif - -@@ -38,10 +38,13 @@ - # endif - #endif - --#if NV_OS_DARWIN || NV_OS_FREEBSD -+#if NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD - # include -+# include - # include // sysctl --# include -+# if !NV_OS_OPENBSD -+# include -+# endif - # undef HAVE_EXECINFO_H - # if defined(HAVE_EXECINFO_H) // only after OSX 10.5 - # include // backtrace -@@ -210,6 +213,14 @@ - ucontext_t * ucp = (ucontext_t *)secret; - return (void *)ucp->uc_mcontext.mc_eip; - # endif -+# elif NV_OS_OPENBSD -+# if NV_CPU_X86_64 -+ ucontext_t * ucp = (ucontext_t *)secret; 
-+ return (void *)ucp->sc_rip; -+# elif NV_CPU_X86 -+ ucontext_t * ucp = (ucontext_t *)secret; -+ return (void *)ucp->sc_eip; -+# endif - # else - # if NV_CPU_X86_64 - // #define REG_RIP REG_INDEX(rip) // seems to be 16 -Index: src/src/nvcore/poshlib/posh.h -=================================================================== ---- src/src/nvcore/poshlib/posh.h (revision 11943) -+++ src/src/nvcore/poshlib/posh.h (working copy) -@@ -298,6 +298,11 @@ - # define POSH_OS_STRING "FreeBSD" - #endif - -+#if defined __OpenBSD__ -+# define POSH_OS_OPENBSD 1 -+# define POSH_OS_STRING "OpenBSD" -+#endif -+ - #if defined __CYGWIN32__ - # define POSH_OS_CYGWIN32 1 - # define POSH_OS_STRING "Cygwin" -Index: src/src/nvmath/nvmath.h -=================================================================== ---- src/src/nvmath/nvmath.h (revision 11943) -+++ src/src/nvmath/nvmath.h (working copy) -@@ -115,7 +115,7 @@ - { - #if NV_OS_WIN32 - return _finite(f) != 0; --#elif NV_OS_DARWIN || NV_OS_FREEBSD -+#elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD - return isfinite(f); - #elif NV_OS_LINUX - return finitef(f); -@@ -130,7 +130,7 @@ - { - #if NV_OS_WIN32 - return _isnan(f) != 0; --#elif NV_OS_DARWIN || NV_OS_FREEBSD -+#elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD - return isnan(f); - #elif NV_OS_LINUX - return isnanf(f); Index: ps/trunk/libraries/source/nvtt/patches/issue182.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/issue182.patch +++ ps/trunk/libraries/source/nvtt/patches/issue182.patch @@ -1,18 +0,0 @@ -Index: src/src/nvtt/squish/CMakeLists.txt -=================================================================== ---- src/src/nvtt/squish/CMakeLists.txt (revision 13060) -+++ src/src/nvtt/squish/CMakeLists.txt (working copy) -@@ -22,7 +22,11 @@ - - ADD_LIBRARY(squish STATIC ${SQUISH_SRCS}) - --IF(CMAKE_COMPILER_IS_GNUCXX) -+IF("${CMAKE_CXX_COMPILER}" MATCHES "clang(\\+\\+)?$" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") -+ SET(CMAKE_COMPILER_IS_CLANGXX 1) -+ENDIF() -+ -+IF(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) - SET_TARGET_PROPERTIES(squish PROPERTIES COMPILE_FLAGS -fPIC) --ENDIF(CMAKE_COMPILER_IS_GNUCXX) -+ENDIF(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) - Index: ps/trunk/libraries/source/nvtt/patches/issue188.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/issue188.patch +++ ps/trunk/libraries/source/nvtt/patches/issue188.patch @@ -1,8 +1,12 @@ -Index: src/cmake/OptimalOptions.cmake -=================================================================== ---- src/cmake/OptimalOptions.cmake (revision 13805) -+++ src/cmake/OptimalOptions.cmake (working copy) -@@ -15,7 +15,7 @@ +--- + cmake/OptimalOptions.cmake | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/cmake/OptimalOptions.cmake b/cmake/OptimalOptions.cmake +index ac450c9..4993dd5 100644 +--- a/cmake/OptimalOptions.cmake ++++ b/cmake/OptimalOptions.cmake +@@ -16,7 +16,7 @@ IF(CMAKE_COMPILER_IS_GNUCXX) ENDIF(NV_SYSTEM_PROCESSOR STREQUAL "i686") IF(NV_SYSTEM_PROCESSOR STREQUAL "x86_64") @@ -11,3 +15,4 @@ #SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=athlon64 -msse3") ENDIF(NV_SYSTEM_PROCESSOR STREQUAL "x86_64") +-- Index: ps/trunk/libraries/source/nvtt/patches/issue261.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/issue261.patch +++ ps/trunk/libraries/source/nvtt/patches/issue261.patch @@ 
-0,0 +1,23 @@ + src/nvthread/Atomic.h | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/src/nvthread/Atomic.h b/src/nvthread/Atomic.h +index 657b16763a..3010a5f5f4 100644 +--- a/libraries/source/nvtt/src/src/nvthread/Atomic.h ++++ b/libraries/source/nvtt/src/src/nvthread/Atomic.h +@@ -183,7 +183,6 @@ namespace nv { + + + #elif NV_CC_CLANG && (NV_OS_IOS || NV_OS_DARWIN) +- NV_COMPILER_CHECK(sizeof(uint32) == sizeof(long)); + + //ACS: Use Apple's atomics instead? I don't know if these are better in any way; there are non-barrier versions too. There's no OSAtomicSwap32 tho' + /* +@@ -254,7 +253,6 @@ namespace nv { + + + #elif NV_CC_CLANG && POSH_CPU_STRONGARM +- NV_COMPILER_CHECK(sizeof(uint32) == sizeof(long)); + + inline uint32 atomicIncrement(uint32 * value) + { Index: ps/trunk/libraries/source/nvtt/patches/musl-build.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/musl-build.patch +++ ps/trunk/libraries/source/nvtt/patches/musl-build.patch @@ -0,0 +1,36 @@ +--- + src/nvmath/nvmath.h | 4 +--- + src/nvthread/nvthread.cpp | 2 ++ + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/src/nvmath/nvmath.h b/src/nvmath/nvmath.h +index 439e599575..1f1ff1fcbc 100644 +--- a/src/nvmath/nvmath.h ++++ b/src/nvmath/nvmath.h +@@ -187,10 +187,8 @@ namespace nv + { + #if NV_OS_WIN32 || NV_OS_XBOX + return _isnan(f) != 0; +-#elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_NETBSD || NV_OS_OPENBSD || NV_OS_ORBIS ++#elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_NETBSD || NV_OS_OPENBSD || NV_OS_ORBIS || NV_OS_LINUX + return isnan(f); +-#elif NV_OS_LINUX +- return isnanf(f); + #else + # error "isNan not supported" + #endif +diff --git a/libraries/source/nvtt/src/src/nvthread/nvthread.cpp b/libraries/source/nvtt/src/src/nvthread/nvthread.cpp +index d8564d391b..967b886d0f 100644 +--- a/src/nvthread/nvthread.cpp ++++ b/src/nvthread/nvthread.cpp +@@ -8,7 +8,9 @@ + #include "Win32.h" + #elif NV_OS_UNIX + #include ++#if !NV_OS_LINUX + #include ++#endif + #include + #elif NV_OS_DARWIN + #import +-- Index: ps/trunk/libraries/source/nvtt/patches/png-api.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/png-api.patch +++ ps/trunk/libraries/source/nvtt/patches/png-api.patch @@ -1,13 +0,0 @@ -Index: src/src/nvimage/ImageIO.cpp -=================================================================== ---- src/src/nvimage/ImageIO.cpp (revision 9895) -+++ src/src/nvimage/ImageIO.cpp (working copy) -@@ -603,7 +603,7 @@ - { - nvDebugCheck(png_ptr != NULL); - -- Stream * s = (Stream *)png_ptr->io_ptr; -+ Stream * s = (Stream *)png_get_io_ptr(png_ptr); - s->serialize(data, (int)length); - - if (s->isError()) { Index: ps/trunk/libraries/source/nvtt/patches/r1025.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/r1025.patch +++ ps/trunk/libraries/source/nvtt/patches/r1025.patch @@ -1,16 +0,0 @@ -Index: extern/poshlib/posh.h -=================================================================== ---- extern/poshlib/posh.h (revision 1024) -+++ extern/poshlib/posh.h (revision 1025) -@@ -293,6 +293,11 @@ - # define POSH_OS_STRING "Linux" - #endif - -+#if defined __FreeBSD__ -+# define POSH_OS_FREEBSD 1 -+# define POSH_OS_STRING "FreeBSD" -+#endif -+ - #if defined __CYGWIN32__ - # define POSH_OS_CYGWIN32 1 - # define POSH_OS_STRING "Cygwin" Index: ps/trunk/libraries/source/nvtt/patches/r1156.patch 
=================================================================== --- ps/trunk/libraries/source/nvtt/patches/r1156.patch +++ ps/trunk/libraries/source/nvtt/patches/r1156.patch @@ -1,114 +0,0 @@ -Index: branches/2.0/src/nvcore/nvcore.h -=================================================================== ---- branches/2.0/src/nvcore/nvcore.h (revision 1155) -+++ branches/2.0/src/nvcore/nvcore.h (revision 1156) -@@ -99,6 +99,23 @@ - #define NV_ENDIAN_STRING POSH_ENDIAN_STRING - - -+// Type definitions: -+typedef posh_u8_t uint8; -+typedef posh_i8_t int8; -+ -+typedef posh_u16_t uint16; -+typedef posh_i16_t int16; -+ -+typedef posh_u32_t uint32; -+typedef posh_i32_t int32; -+ -+typedef posh_u64_t uint64; -+typedef posh_i64_t int64; -+ -+// Aliases -+typedef uint32 uint; -+ -+ - // Version string: - #define NV_VERSION_STRING \ - NV_OS_STRING "/" NV_CC_STRING "/" NV_CPU_STRING"/" \ -Index: branches/2.0/src/nvcore/DefsVcWin32.h -=================================================================== ---- branches/2.0/src/nvcore/DefsVcWin32.h (revision 1155) -+++ branches/2.0/src/nvcore/DefsVcWin32.h (revision 1156) -@@ -39,7 +39,7 @@ - #define __FUNC__ __FUNCTION__ - #endif - -- -+/* - // Type definitions - typedef unsigned char uint8; - typedef signed char int8; -@@ -55,8 +55,8 @@ - - // Aliases - typedef uint32 uint; -+*/ - -- - // Unwanted VC++ warnings to disable. - /* - #pragma warning(disable : 4244) // conversion to float, possible loss of data -Index: branches/2.0/src/nvcore/DefsGnucDarwin.h -=================================================================== ---- branches/2.0/src/nvcore/DefsGnucDarwin.h (revision 1155) -+++ branches/2.0/src/nvcore/DefsGnucDarwin.h (revision 1156) -@@ -2,7 +2,7 @@ - #error "Do not include this file directly." - #endif - --#include // uint8_t, int8_t, ... -+//#include // uint8_t, int8_t, ... 
- - // Function linkage - #define DLL_IMPORT -@@ -48,7 +48,7 @@ - - #define restrict __restrict__ - -- -+/* - // Type definitions - typedef uint8_t uint8; - typedef int8_t int8; -@@ -64,3 +64,4 @@ - - // Aliases - typedef uint32 uint; -+*/ -Index: branches/2.0/src/nvcore/DefsGnucLinux.h -=================================================================== ---- branches/2.0/src/nvcore/DefsGnucLinux.h (revision 1155) -+++ branches/2.0/src/nvcore/DefsGnucLinux.h (revision 1156) -@@ -47,7 +47,7 @@ - - #define restrict __restrict__ - -- -+/* - // Type definitions - typedef unsigned char uint8; - typedef signed char int8; -@@ -63,3 +63,4 @@ - - // Aliases - typedef uint32 uint; -+*/ -Index: branches/2.0/src/nvcore/DefsGnucWin32.h -=================================================================== ---- branches/2.0/src/nvcore/DefsGnucWin32.h (revision 1155) -+++ branches/2.0/src/nvcore/DefsGnucWin32.h (revision 1156) -@@ -41,7 +41,7 @@ - - #define restrict __restrict__ - -- -+/* - // Type definitions - typedef unsigned char uint8; - typedef signed char int8; -@@ -57,3 +57,4 @@ - - // Aliases - typedef uint32 uint; -+*/ Index: ps/trunk/libraries/source/nvtt/patches/r1157.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/r1157.patch +++ ps/trunk/libraries/source/nvtt/patches/r1157.patch @@ -1,13 +0,0 @@ -Index: branches/2.0/cmake/FindCUDA.cmake -=================================================================== ---- branches/2.0/cmake/FindCUDA.cmake (revision 1156) -+++ branches/2.0/cmake/FindCUDA.cmake (revision 1157) -@@ -120,7 +120,7 @@ - FOREACH (CUFILE ${ARGN}) - GET_FILENAME_COMPONENT (CUFILE ${CUFILE} ABSOLUTE) - GET_FILENAME_COMPONENT (CFILE ${CUFILE} NAME_WE) -- SET (CFILE ${CMAKE_CURRENT_BINARY_DIR}/${CFILE}.gen.c) -+ SET (CFILE ${CMAKE_CURRENT_BINARY_DIR}/${CFILE}.gen.cpp) - - GET_CUFILE_DEPENDENCIES(CUDEPS ${CUFILE}) - #MESSAGE("${CUDEPS}") Index: ps/trunk/libraries/source/nvtt/patches/r1172.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/r1172.patch +++ ps/trunk/libraries/source/nvtt/patches/r1172.patch @@ -1,589 +0,0 @@ -Index: branches/2.0/src/nvimage/FloatImage.cpp -=================================================================== ---- branches/2.0/src/nvimage/FloatImage.cpp (revision 1171) -+++ branches/2.0/src/nvimage/FloatImage.cpp (revision 1172) -@@ -151,13 +151,13 @@ - m_height = h; - m_componentNum = c; - m_count = w * h * c; -- m_mem = reinterpret_cast(nv::mem::malloc(m_count * sizeof(float))); -+ m_mem = reinterpret_cast(::malloc(m_count * sizeof(float))); - } - - /// Free the image, but don't clear the members. 
- void FloatImage::free() - { -- nv::mem::free( reinterpret_cast(m_mem) ); -+ ::free( reinterpret_cast(m_mem) ); - m_mem = NULL; - } - -Index: branches/2.0/src/nvimage/ImageIO.cpp -=================================================================== ---- branches/2.0/src/nvimage/ImageIO.cpp (revision 1171) -+++ branches/2.0/src/nvimage/ImageIO.cpp (revision 1172) -@@ -954,7 +954,7 @@ - fimage->allocate(spp, width, height); - - int linesize = TIFFScanlineSize(tif); -- tdata_t buf = (::uint8 *)nv::mem::malloc(linesize); -+ tdata_t buf = (::uint8 *)::malloc(linesize); - - for (uint y = 0; y < height; y++) - { -@@ -991,7 +991,7 @@ - } - } - -- nv::mem::free(buf); -+ ::free(buf); - - TIFFClose(tif); - -Index: branches/2.0/src/nvimage/Image.cpp -=================================================================== ---- branches/2.0/src/nvimage/Image.cpp (revision 1171) -+++ branches/2.0/src/nvimage/Image.cpp (revision 1172) -@@ -78,7 +78,7 @@ - - void Image::free() - { -- nv::mem::free(m_data); -+ ::free(m_data); - m_data = NULL; - } - -Index: branches/2.0/src/nvtt/CompressRGB.cpp -=================================================================== ---- branches/2.0/src/nvtt/CompressRGB.cpp (revision 1171) -+++ branches/2.0/src/nvtt/CompressRGB.cpp (revision 1172) -@@ -82,7 +82,7 @@ - // Determine pitch. - uint pitch = computePitch(w, compressionOptions.bitcount, 8); - -- uint8 * dst = (uint8 *)mem::malloc(pitch + 4); -+ uint8 * dst = (uint8 *)::malloc(pitch + 4); - - for (uint y = 0; y < h; y++) - { -@@ -127,6 +127,6 @@ - } - } - -- mem::free(dst); -+ ::free(dst); - } - -Index: branches/2.0/src/nvtt/cuda/CudaCompressDXT.cpp -=================================================================== ---- branches/2.0/src/nvtt/cuda/CudaCompressDXT.cpp (revision 1171) -+++ branches/2.0/src/nvtt/cuda/CudaCompressDXT.cpp (revision 1172) -@@ -137,7 +137,7 @@ - const uint h = (m_image->height() + 3) / 4; - - uint imageSize = w * h * 16 * sizeof(Color32); -- uint * blockLinearImage = (uint *) malloc(imageSize); -+ uint * blockLinearImage = (uint *) ::malloc(imageSize); - convertToBlockLinear(m_image, blockLinearImage); // @@ Do this in parallel with the GPU, or in the GPU! 
- - const uint blockNum = w * h; -@@ -207,14 +207,14 @@ - const uint h = (m_image->height() + 3) / 4; - - uint imageSize = w * h * 16 * sizeof(Color32); -- uint * blockLinearImage = (uint *) malloc(imageSize); -+ uint * blockLinearImage = (uint *) ::malloc(imageSize); - convertToBlockLinear(m_image, blockLinearImage); - - const uint blockNum = w * h; - const uint compressedSize = blockNum * 8; - - AlphaBlockDXT3 * alphaBlocks = NULL; -- alphaBlocks = (AlphaBlockDXT3 *)malloc(min(compressedSize, MAX_BLOCKS * 8U)); -+ alphaBlocks = (AlphaBlockDXT3 *)::malloc(min(compressedSize, MAX_BLOCKS * 8U)); - - setupCompressKernel(compressionOptions.colorWeight.ptr()); - -@@ -298,14 +298,14 @@ - const uint h = (m_image->height() + 3) / 4; - - uint imageSize = w * h * 16 * sizeof(Color32); -- uint * blockLinearImage = (uint *) malloc(imageSize); -+ uint * blockLinearImage = (uint *) ::malloc(imageSize); - convertToBlockLinear(m_image, blockLinearImage); - - const uint blockNum = w * h; - const uint compressedSize = blockNum * 8; - - AlphaBlockDXT5 * alphaBlocks = NULL; -- alphaBlocks = (AlphaBlockDXT5 *)malloc(min(compressedSize, MAX_BLOCKS * 8U)); -+ alphaBlocks = (AlphaBlockDXT5 *)::malloc(min(compressedSize, MAX_BLOCKS * 8U)); - - setupCompressKernel(compressionOptions.colorWeight.ptr()); - -Index: branches/2.0/src/nvcore/StrLib.cpp -=================================================================== ---- branches/2.0/src/nvcore/StrLib.cpp (revision 1171) -+++ branches/2.0/src/nvcore/StrLib.cpp (revision 1172) -@@ -21,17 +21,17 @@ - { - static char * strAlloc(uint size) - { -- return static_cast(mem::malloc(size)); -+ return static_cast(::malloc(size)); - } - - static char * strReAlloc(char * str, uint size) - { -- return static_cast(mem::realloc(str, size)); -+ return static_cast(::realloc(str, size)); - } - - static void strFree(const char * str) - { -- return mem::free(const_cast(str)); -+ return ::free(const_cast(str)); - } - - /*static char * strDup( const char * str ) -Index: branches/2.0/src/nvcore/StrLib.h -=================================================================== ---- branches/2.0/src/nvcore/StrLib.h (revision 1171) -+++ branches/2.0/src/nvcore/StrLib.h (revision 1172) -@@ -294,7 +294,7 @@ - const uint16 count = getRefCount(); - setRefCount(count - 1); - if (count - 1 == 0) { -- mem::free(data - 2); -+ free(data - 2); - data = NULL; - } - } -@@ -323,7 +323,7 @@ - - void allocString(const char * str, int len) - { -- const char * ptr = static_cast(mem::malloc(2 + len + 1)); -+ const char * ptr = static_cast(::malloc(2 + len + 1)); - - setData( ptr ); - setRefCount( 0 ); -Index: branches/2.0/src/nvcore/Memory.cpp -=================================================================== ---- branches/2.0/src/nvcore/Memory.cpp (revision 1171) -+++ branches/2.0/src/nvcore/Memory.cpp (revision 1172) -@@ -1,3 +1,4 @@ -+// This code is in the public domain -- Ignacio Castaño - - #include "Memory.h" - #include "Debug.h" -@@ -2,33 +3,114 @@ - --//#if HAVE_MALLOC_H --//#include --//#endif -- - #include - -+#define USE_EFENCE 0 - -+#if USE_EFENCE -+extern "C" void *EF_malloc(size_t size); -+extern "C" void *EF_realloc(void * oldBuffer, size_t newSize); -+extern "C" void EF_free(void * address); -+#endif -+ - using namespace nv; - --void * nv::mem::malloc(size_t size) -+#if NV_OVERRIDE_ALLOC -+ -+void * malloc(size_t size) - { -- return ::malloc(size); -+#if USE_EFENCE -+ return EF_malloc(size); -+#else -+ return ::malloc(size); -+#endif - } - --void * nv::mem::malloc(size_t size, const char * file, 
int line) -+void * debug_malloc(size_t size, const char * file, int line) - { -- NV_UNUSED(file); -- NV_UNUSED(line); -- return ::malloc(size); -+ NV_UNUSED(file); -+ NV_UNUSED(line); -+#if USE_EFENCE -+ return EF_malloc(size); -+#else -+ return ::malloc(size); -+#endif - } - --void nv::mem::free(const void * ptr) -+void free(void * ptr) - { -- ::free(const_cast(ptr)); -+#if USE_EFENCE -+ return EF_free(const_cast(ptr)); -+#else -+ ::free(const_cast(ptr)); -+#endif - } - --void * nv::mem::realloc(void * ptr, size_t size) -+void * realloc(void * ptr, size_t size) - { -- nvDebugCheck(ptr != NULL || size != 0); // undefined realloc behavior. -- return ::realloc(ptr, size); -+ nvDebugCheck(ptr != NULL || size != 0); // undefined realloc behavior. -+#if USE_EFENCE -+ return EF_realloc(ptr, size); -+#else -+ return ::realloc(ptr, size); -+#endif - } - -+/* No need to override this unless we want line info. -+void * operator new (size_t size) throw() -+{ -+ return malloc(size); -+} -+ -+void operator delete (void *p) throw() -+{ -+ free(p); -+} -+ -+void * operator new [] (size_t size) throw() -+{ -+ return malloc(size); -+} -+ -+void operator delete [] (void * p) throw() -+{ -+ free(p); -+} -+*/ -+ -+#if 0 // Code from Apple: -+void* operator new(std::size_t sz) throw (std::bad_alloc) -+{ -+ void *result = std::malloc (sz == 0 ? 1 : sz); -+ if (result == NULL) -+ throw std::bad_alloc(); -+ gNewCounter++; -+ return result; -+} -+void operator delete(void* p) throw() -+{ -+ if (p == NULL) -+ return; -+ std::free (p); -+ gDeleteCounter++; -+} -+ -+/* These are the 'nothrow' versions of the above operators. -+ The system version will try to call a std::new_handler if they -+ fail, but your overriding versions are not required to do this. */ -+void* operator new(std::size_t sz, const std::nothrow_t&) throw() -+{ -+ try { -+ void * result = ::operator new (sz); // calls our overridden operator new -+ return result; -+ } catch (std::bad_alloc &) { -+ return NULL; -+ } -+} -+void operator delete(void* p, const std::nothrow_t&) throw() -+{ -+ ::operator delete (p); -+} -+ -+#endif // 0 -+ -+ -+#endif // NV_OVERRIDE_ALLOC -Index: branches/2.0/src/nvcore/Containers.h -=================================================================== ---- branches/2.0/src/nvcore/Containers.h (revision 1171) -+++ branches/2.0/src/nvcore/Containers.h (revision 1172) -@@ -16,9 +16,9 @@ - - - // nvcore --#include --#include --#include -+#include "nvcore.h" -+#include "Memory.h" -+#include "Debug.h" - - #include // memmove - #include // for placement new -@@ -589,15 +589,15 @@ - // free the buffer. 
- if( m_buffer_size == 0 ) { - if( m_buffer ) { -- mem::free( m_buffer ); -+ free( m_buffer ); - m_buffer = NULL; - } - } - - // realloc the buffer - else { -- if( m_buffer ) m_buffer = (T *) mem::realloc( m_buffer, sizeof(T) * m_buffer_size ); -- else m_buffer = (T *) mem::malloc( sizeof(T) * m_buffer_size ); -+ if( m_buffer ) m_buffer = (T *) realloc(m_buffer, sizeof(T) * m_buffer_size); -+ else m_buffer = (T *) ::malloc(sizeof(T) * m_buffer_size); - } - } - -@@ -778,7 +778,7 @@ - e->clear(); - } - } -- mem::free(table); -+ free(table); - table = NULL; - entry_count = 0; - size_mask = -1; -@@ -1001,7 +1001,7 @@ - new_size = nextPowerOfTwo(new_size); - - HashMap new_hash; -- new_hash.table = (Entry *) mem::malloc(sizeof(Entry) * new_size); -+ new_hash.table = (Entry *) ::malloc(sizeof(Entry) * new_size); - nvDebugCheck(new_hash.table != NULL); - - new_hash.entry_count = 0; -@@ -1026,7 +1026,7 @@ - } - - // Delete our old data buffer. -- mem::free(table); -+ free(table); - } - - // Steal new_hash's data. -Index: branches/2.0/src/nvcore/Memory.h -=================================================================== ---- branches/2.0/src/nvcore/Memory.h (revision 1171) -+++ branches/2.0/src/nvcore/Memory.h (revision 1172) -@@ -1,186 +1,52 @@ --// This code is in the public domain -- castanyo@yahoo.es -+// This code is in the public domain -- Ignacio Castaño - -+#pragma once - #ifndef NV_CORE_MEMORY_H - #define NV_CORE_MEMORY_H - --#include -+#include "nvcore.h" - - #include // malloc(), realloc() and free() --#include // size_t -+#include // size_t - - #include // new and delete - --// Custom memory allocator --namespace nv --{ -- namespace mem -- { -- NVCORE_API void * malloc(size_t size); -- NVCORE_API void * malloc(size_t size, const char * file, int line); -- -- NVCORE_API void free(const void * ptr); -- NVCORE_API void * realloc(void * ptr, size_t size); -- -- } // mem namespace -- --} // nv namespace -+#define NV_OVERRIDE_ALLOC 0 - -+#if NV_OVERRIDE_ALLOC - --// Override new/delete -- --inline void * operator new (size_t size) throw() --{ -- return nv::mem::malloc(size); -+// Custom memory allocator -+extern "C" { -+ NVCORE_API void * malloc(size_t size); -+ NVCORE_API void * debug_malloc(size_t size, const char * file, int line); -+ NVCORE_API void free(void * ptr); -+ NVCORE_API void * realloc(void * ptr, size_t size); - } - --inline void operator delete (void *p) throw() --{ -- nv::mem::free(p); --} -- --inline void * operator new [] (size_t size) throw() --{ -- return nv::mem::malloc(size); --} -- --inline void operator delete [] (void * p) throw() --{ -- nv::mem::free(p); --} -- - /* - #ifdef _DEBUG - #define new new(__FILE__, __LINE__) --#define malloc(i) malloc(i, __FILE__, __LINE__) -+#define malloc(i) debug_malloc(i, __FILE__, __LINE__) - #endif - */ - --#if 0 --/* -- File: main.cpp -- -- Version: 1.0 -+#endif - -- Abstract: Overrides the C++ 'operator new' and 'operator delete'. -+namespace nv { - -- Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Computer, Inc. -- ("Apple") in consideration of your agreement to the following terms, and your -- use, installation, modification or redistribution of this Apple software -- constitutes acceptance of these terms. If you do not agree with these terms, -- please do not use, install, modify or redistribute this Apple software. -+ // C++ helpers. 
-+ template T * malloc(size_t count) { -+ return (T *)::malloc(sizeof(T) * count); -+ } - -- In consideration of your agreement to abide by the following terms, and subject -- to these terms, Apple grants you a personal, non-exclusive license, under Apple’s -- copyrights in this original Apple software (the "Apple Software"), to use, -- reproduce, modify and redistribute the Apple Software, with or without -- modifications, in source and/or binary forms; provided that if you redistribute -- the Apple Software in its entirety and without modifications, you must retain -- this notice and the following text and disclaimers in all such redistributions of -- the Apple Software. Neither the name, trademarks, service marks or logos of -- Apple Computer, Inc. may be used to endorse or promote products derived from the -- Apple Software without specific prior written permission from Apple. Except as -- expressly stated in this notice, no other rights or licenses, express or implied, -- are granted by Apple herein, including but not limited to any patent rights that -- may be infringed by your derivative works or by other works in which the Apple -- Software may be incorporated. -+ template T * realloc(T * ptr, size_t count) { -+ return (T *)::realloc(ptr, sizeof(T) * count); -+ } - -- The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO -- WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED -- WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR -- PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION ALONE OR IN -- COMBINATION WITH YOUR PRODUCTS. -+ template void free(const T * ptr) { -+ ::free((void *)ptr); -+ } - -- IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR -- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE -- GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -- ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION -- OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT, TORT -- (INCLUDING NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN -- ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+} // nv namespace - -- Copyright © 2006 Apple Computer, Inc., All Rights Reserved --*/ -- --/* This sample shows how to override the C++ global 'new' and 'delete' operators. */ --#include --#include --#include --#include --#include -- --/* Some variables and code to make the example do something. */ --namespace { -- unsigned long long gNewCounter; // number of times 'new' was called -- unsigned long long gDeleteCounter; // number of times 'delete' was called -- -- void printCounters() // print the counters above -- { -- std::cout << "new was called " << gNewCounter << " times and delete was called " << gDeleteCounter << " times\n"; -- } --} -- --/* These are the overridden new and delete routines. -- Most applications will want to override at least these four versions of new/delete if they override any of them. -- -- In Mac OS, it's not necessary to override the array versions of operator new and delete if all -- they would do is call the non-array versions; the C++ standard library, as an extension -- to the C++ standard, does this for you. -- -- Developers should consult the section [lib.support.dynamic] in the C++ standard to see the requirements -- on the generic operators new and delete; the system may expect that your overridden operators meet all these -- requirements. 
-- -- Your operators may be called by the system, even early in start-up before constructors have been executed. */ --void* operator new(std::size_t sz) throw (std::bad_alloc) --{ -- void *result = std::malloc (sz == 0 ? 1 : sz); -- if (result == NULL) -- throw std::bad_alloc(); -- gNewCounter++; -- return result; --} --void operator delete(void* p) throw() --{ -- if (p == NULL) -- return; -- std::free (p); -- gDeleteCounter++; --} -- --/* These are the 'nothrow' versions of the above operators. -- The system version will try to call a std::new_handler if they -- fail, but your overriding versions are not required to do this. */ --void* operator new(std::size_t sz, const std::nothrow_t&) throw() --{ -- try { -- void * result = ::operator new (sz); // calls our overridden operator new -- return result; -- } catch (std::bad_alloc &) { -- return NULL; -- } --} --void operator delete(void* p, const std::nothrow_t&) throw() --{ -- ::operator delete (p); --} -- --/* Bug 4067110 is that if your program has no weak symbols at all, the linker will not set the -- WEAK_DEFINES bit in the Mach-O header and as a result the new and delete operators above won't -- be seen by system libraries. This is mostly a problem for test programs and small examples, -- since almost all real C++ programs complicated enough to override new and delete will have at -- least one weak symbol. However, this is a small example, so: */ --void __attribute__((weak, visibility("default"))) workaroundFor4067110 () { } -- --/* This is a simple test program that causes the runtime library to call new and delete. */ --int main() --{ -- atexit (printCounters); -- try { -- std::locale example("does_not_exist"); -- } catch (std::runtime_error &x) { -- } -- return 0; --} --#endif // 0 -- - #endif // NV_CORE_MEMORY_H Index: ps/trunk/libraries/source/nvtt/patches/r907.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/r907.patch +++ ps/trunk/libraries/source/nvtt/patches/r907.patch @@ -1,73 +0,0 @@ -Index: src/nvcore/nvcore.h -=================================================================== ---- src/nvcore/nvcore.h (revision 906) -+++ src/nvcore/nvcore.h (revision 907) -@@ -38,6 +38,9 @@ - #if defined POSH_OS_LINUX - # define NV_OS_LINUX 1 - # define NV_OS_UNIX 1 -+#elif defined POSH_OS_FREEBSD -+# define NV_OS_FREEBSD 1 -+# define NV_OS_UNIX 1 - #elif defined POSH_OS_CYGWIN32 - # define NV_OS_CYGWIN 1 - #elif defined POSH_OS_MINGW -@@ -161,7 +164,7 @@ - #elif NV_CC_GNUC - # if NV_OS_LINUX - # include "DefsGnucLinux.h" --# elif NV_OS_DARWIN -+# elif NV_OS_DARWIN || NV_OS_FREEBSD - # include "DefsGnucDarwin.h" - # elif NV_OS_MINGW - # include "DefsGnucWin32.h" -Index: src/nvcore/Debug.cpp -=================================================================== ---- src/nvcore/Debug.cpp (revision 906) -+++ src/nvcore/Debug.cpp (revision 907) -@@ -34,7 +34,7 @@ - # endif - #endif - --#if NV_OS_DARWIN -+#if NV_OS_DARWIN || NV_OS_FREEBSD - # include // getpid - # include - # include // sysctl -@@ -199,6 +199,14 @@ - return (void *) ucp->uc_mcontext->ss.eip; - # endif - # endif -+# elif NV_OS_FREEBSD -+# if NV_CPU_X86_64 -+ ucontext_t * ucp = (ucontext_t *)secret; -+ return (void *)ucp->uc_mcontext.mc_rip; -+# elif NV_CPU_X86 -+ ucontext_t * ucp = (ucontext_t *)secret; -+ return (void *)ucp->uc_mcontext.mc_eip; -+# endif - # else - # if NV_CPU_X86_64 - // #define REG_RIP REG_INDEX(rip) // seems to be 16 -Index: src/nvmath/nvmath.h 
-=================================================================== ---- src/nvmath/nvmath.h (revision 906) -+++ src/nvmath/nvmath.h (revision 907) -@@ -115,7 +115,7 @@ - { - #if NV_OS_WIN32 - return _finite(f) != 0; --#elif NV_OS_DARWIN -+#elif NV_OS_DARWIN || NV_OS_FREEBSD - return isfinite(f); - #elif NV_OS_LINUX - return finitef(f); -@@ -130,7 +130,7 @@ - { - #if NV_OS_WIN32 - return _isnan(f) != 0; --#elif NV_OS_DARWIN -+#elif NV_OS_DARWIN || NV_OS_FREEBSD - return isnan(f); - #elif NV_OS_LINUX - return isnanf(f); Index: ps/trunk/libraries/source/nvtt/patches/rpath.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/rpath.patch +++ ps/trunk/libraries/source/nvtt/patches/rpath.patch @@ -1,8 +1,12 @@ -Index: libraries/nvtt/src/CMakeLists.txt -=================================================================== ---- libraries/nvtt/src/CMakeLists.txt (revision 8295) -+++ libraries/nvtt/src/CMakeLists.txt (working copy) -@@ -22,6 +22,10 @@ +--- + CMakeLists.txt | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 9c80369..9e77386 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -42,6 +42,12 @@ IF(NVTT_SHARED) SET(NVIMAGE_SHARED TRUE) ENDIF(NVTT_SHARED) @@ -11,6 +15,8 @@ +SET(CMAKE_INSTALL_RPATH "$ORIGIN") +SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) +SET(CMAKE_INSTALL_NAME_DIR "@executable_path") ++ + ADD_SUBDIRECTORY(extern) ADD_SUBDIRECTORY(src) - +-- Index: ps/trunk/libraries/source/nvtt/patches/win-shared-build.patch =================================================================== --- ps/trunk/libraries/source/nvtt/patches/win-shared-build.patch +++ ps/trunk/libraries/source/nvtt/patches/win-shared-build.patch @@ -0,0 +1,362 @@ +--- + src/nvcore/StrLib.h | 8 ++++---- + src/nvimage/BlockDXT.h | 16 ++++++++-------- + src/nvimage/ColorBlock.h | 6 ++++-- + src/nvimage/DirectDrawSurface.cpp | 4 ++-- + src/nvimage/DirectDrawSurface.h | 4 ++-- + src/nvimage/ErrorMetric.h | 8 ++++---- + src/nvimage/FloatImage.h | 4 ++-- + src/nvimage/NormalMap.h | 8 ++++---- + src/nvmath/Fitting.h | 12 ++++++------ + src/nvmath/Gamma.h | 4 ++-- + src/nvmath/Half.cpp | 10 ++++++++++ + src/nvmath/Half.h | 23 ++++------------------- + src/nvtt/CMakeLists.txt | 1 + + 13 files changed, 53 insertions(+), 55 deletions(-) + +diff --git a/src/nvcore/StrLib.h b/src/nvcore/StrLib.h +index 1d6d13a..1ae8e91 100644 +--- a/src/nvcore/StrLib.h ++++ b/src/nvcore/StrLib.h +@@ -197,11 +197,11 @@ namespace nv + void stripExtension(); + + // statics +- NVCORE_API static char separator(); +- NVCORE_API static const char * fileName(const char *); +- NVCORE_API static const char * extension(const char *); ++ static char separator(); ++ static const char * fileName(const char *); ++ static const char * extension(const char *); + +- NVCORE_API static void translatePath(char * path, char pathSeparator = NV_PATH_SEPARATOR); ++ static void translatePath(char * path, char pathSeparator = NV_PATH_SEPARATOR); + }; + + +diff --git a/src/nvimage/BlockDXT.h b/src/nvimage/BlockDXT.h +index 18a3b65..8ad5bed 100644 +--- a/src/nvimage/BlockDXT.h ++++ b/src/nvimage/BlockDXT.h +@@ -39,7 +39,7 @@ namespace nv + + + /// DXT1 block. +- struct BlockDXT1 ++ struct NVIMAGE_CLASS BlockDXT1 + { + Color16 col0; + Color16 col1; +@@ -105,7 +105,7 @@ namespace nv + + + /// DXT3 block. 
+- struct BlockDXT3 ++ struct NVIMAGE_CLASS BlockDXT3 + { + AlphaBlockDXT3 alpha; + BlockDXT1 color; +@@ -119,7 +119,7 @@ namespace nv + + + /// DXT5 alpha block. +- struct AlphaBlockDXT5 ++ struct NVIMAGE_CLASS AlphaBlockDXT5 + { + union { + struct { +@@ -162,7 +162,7 @@ namespace nv + + + /// DXT5 block. +- struct BlockDXT5 ++ struct NVIMAGE_CLASS BlockDXT5 + { + AlphaBlockDXT5 alpha; + BlockDXT1 color; +@@ -175,7 +175,7 @@ namespace nv + }; + + /// ATI1 block. +- struct BlockATI1 ++ struct NVIMAGE_CLASS BlockATI1 + { + AlphaBlockDXT5 alpha; + +@@ -186,7 +186,7 @@ namespace nv + }; + + /// ATI2 block. +- struct BlockATI2 ++ struct NVIMAGE_CLASS BlockATI2 + { + AlphaBlockDXT5 x; + AlphaBlockDXT5 y; +@@ -217,14 +217,14 @@ namespace nv + }; + + /// BC6 block. +- struct BlockBC6 ++ struct NVIMAGE_CLASS BlockBC6 + { + uint8 data[16]; // Not even going to try to write a union for this thing. + void decodeBlock(Vector3 colors[16]) const; + }; + + /// BC7 block. +- struct BlockBC7 ++ struct NVIMAGE_CLASS BlockBC7 + { + uint8 data[16]; // Not even going to try to write a union for this thing. + void decodeBlock(ColorBlock * block) const; +diff --git a/src/nvimage/ColorBlock.h b/src/nvimage/ColorBlock.h +index 6638f56..d63d5a5 100644 +--- a/src/nvimage/ColorBlock.h ++++ b/src/nvimage/ColorBlock.h +@@ -4,6 +4,8 @@ + #ifndef NV_IMAGE_COLORBLOCK_H + #define NV_IMAGE_COLORBLOCK_H + ++#include "nvimage.h" ++ + #include "nvmath/Color.h" + #include "nvmath/Vector.h" + +@@ -14,7 +16,7 @@ namespace nv + + + /// Uncompressed 4x4 color block. +- struct ColorBlock ++ struct NVIMAGE_CLASS ColorBlock + { + ColorBlock(); + ColorBlock(const uint * linearImage); +@@ -128,7 +130,7 @@ namespace nv + + + /// Uncompressed 4x4 alpha block. +- struct AlphaBlock4x4 ++ struct NVIMAGE_CLASS AlphaBlock4x4 + { + void init(uint8 value); + void init(const ColorBlock & src, uint channel); +diff --git a/src/nvimage/DirectDrawSurface.cpp b/src/nvimage/DirectDrawSurface.cpp +index 2daaea5..ffa132e 100644 +--- a/src/nvimage/DirectDrawSurface.cpp ++++ b/src/nvimage/DirectDrawSurface.cpp +@@ -461,7 +461,7 @@ namespace + + } // namespace + +-uint nv::findD3D9Format(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask) ++NVIMAGE_API uint nv::findD3D9Format(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask) + { + for (int i = 0; i < s_formatCount; i++) + { +@@ -478,7 +478,7 @@ uint nv::findD3D9Format(uint bitcount, uint rmask, uint gmask, uint bmask, uint + return 0; + } + +-uint nv::findDXGIFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask) ++NVIMAGE_API uint nv::findDXGIFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask) + { + for (int i = 0; i < s_formatCount; i++) + { +diff --git a/src/nvimage/DirectDrawSurface.h b/src/nvimage/DirectDrawSurface.h +index d63fdde..6513b14 100644 +--- a/src/nvimage/DirectDrawSurface.h ++++ b/src/nvimage/DirectDrawSurface.h +@@ -263,9 +263,9 @@ namespace nv + DXGI_FORMAT_BC7_UNORM_SRGB = 99, + }; + +- extern uint findD3D9Format(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask); ++ NVIMAGE_API extern uint findD3D9Format(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask); + +- extern uint findDXGIFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask); ++ NVIMAGE_API extern uint findDXGIFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask); + + struct RGBAPixelFormat + { +diff --git a/src/nvimage/ErrorMetric.h b/src/nvimage/ErrorMetric.h +index b875802..df025b5 100644 +--- 
a/src/nvimage/ErrorMetric.h ++++ b/src/nvimage/ErrorMetric.h +@@ -6,10 +6,10 @@ namespace nv + { + class FloatImage; + +- float rmsColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight); +- float rmsAlphaError(const FloatImage * ref, const FloatImage * img); ++ NVIMAGE_API float rmsColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight); ++ NVIMAGE_API float rmsAlphaError(const FloatImage * ref, const FloatImage * img); + +- float cieLabError(const FloatImage * ref, const FloatImage * img); ++ NVIMAGE_API float cieLabError(const FloatImage * ref, const FloatImage * img); + float cieLab94Error(const FloatImage * ref, const FloatImage * img); + float spatialCieLabError(const FloatImage * ref, const FloatImage * img); + +@@ -17,6 +17,6 @@ namespace nv + float averageAlphaError(const FloatImage * ref, const FloatImage * img); + + float averageAngularError(const FloatImage * img0, const FloatImage * img1); +- float rmsAngularError(const FloatImage * img0, const FloatImage * img1); ++ NVIMAGE_API float rmsAngularError(const FloatImage * img0, const FloatImage * img1); + + } // nv namespace +diff --git a/src/nvimage/FloatImage.h b/src/nvimage/FloatImage.h +index 1015aec..104baf0 100644 +--- a/src/nvimage/FloatImage.h ++++ b/src/nvimage/FloatImage.h +@@ -152,7 +152,7 @@ namespace nv + float sampleNearestRepeat(uint c, float x, float y, float z) const; + float sampleNearestMirror(uint c, float x, float y, float z) const; + +- float sampleLinearClamp(uint c, float x, float y) const; ++ NVIMAGE_API float sampleLinearClamp(uint c, float x, float y) const; + float sampleLinearRepeat(uint c, float x, float y) const; + float sampleLinearMirror(uint c, float x, float y) const; + +@@ -162,7 +162,7 @@ namespace nv + //@} + + +- FloatImage* clone() const; ++ NVIMAGE_API FloatImage* clone() const; + + public: + +diff --git a/src/nvimage/NormalMap.h b/src/nvimage/NormalMap.h +index 3f13d42..39a27ad 100644 +--- a/src/nvimage/NormalMap.h ++++ b/src/nvimage/NormalMap.h +@@ -44,12 +44,12 @@ namespace nv + }; + + // @@ These two functions should be deprecated: +- FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter = NormalMapFilter_Sobel3x3); +- FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights); ++ NVIMAGE_API FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter = NormalMapFilter_Sobel3x3); ++ NVIMAGE_API FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights); + +- FloatImage * createNormalMap(const FloatImage * img, FloatImage::WrapMode wm, Vector4::Arg filterWeights); ++ NVIMAGE_API FloatImage * createNormalMap(const FloatImage * img, FloatImage::WrapMode wm, Vector4::Arg filterWeights); + +- void normalizeNormalMap(FloatImage * img); ++ NVIMAGE_API void normalizeNormalMap(FloatImage * img); + + // @@ Add generation of DU/DV maps. 
+ +diff --git a/src/nvmath/Fitting.h b/src/nvmath/Fitting.h +index 7a88cd2..5ffb50a 100644 +--- a/src/nvmath/Fitting.h ++++ b/src/nvmath/Fitting.h +@@ -23,14 +23,14 @@ namespace nv + Vector4 computeCovariance(int n, const Vector4 * points, float * covariance); + Vector4 computeCovariance(int n, const Vector4 * points, const float * weights, const Vector4 & metric, float * covariance); + +- Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points); +- Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points, const float * weights, const Vector3 & metric); ++ NVMATH_API Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points); ++ NVMATH_API Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points, const float * weights, const Vector3 & metric); + +- Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points); +- Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points, const float * weights, const Vector3 & metric); ++ NVMATH_API Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points); ++ NVMATH_API Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points, const float * weights, const Vector3 & metric); + +- Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points); +- Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points, const float * weights, const Vector4 & metric); ++ NVMATH_API Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points); ++ NVMATH_API Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points, const float * weights, const Vector4 & metric); + + Vector3 computePrincipalComponent_SVD(int n, const Vector3 * points); + Vector4 computePrincipalComponent_SVD(int n, const Vector4 * points); +diff --git a/src/nvmath/Gamma.h b/src/nvmath/Gamma.h +index e990a79..f59dd05 100644 +--- a/src/nvmath/Gamma.h ++++ b/src/nvmath/Gamma.h +@@ -30,8 +30,8 @@ + namespace nv { + + // gamma conversion of float array (in-place is allowed) +- void powf_5_11(const float* src, float* dst, int count); +- void powf_11_5(const float* src, float* dst, int count); ++ NVMATH_API void powf_5_11(const float* src, float* dst, int count); ++ NVMATH_API void powf_11_5(const float* src, float* dst, int count); + + } // nv namespace + +diff --git a/src/nvmath/Half.cpp b/src/nvmath/Half.cpp +index 953cc7c..efb4ab8 100644 +--- a/src/nvmath/Half.cpp ++++ b/src/nvmath/Half.cpp +@@ -633,6 +633,16 @@ void nv::half_init_tables() + } + } + ++// Fast half to float conversion based on: ++// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf ++uint32 nv::fast_half_to_float(uint16 h) ++{ ++ // Initialize table if necessary. ++ if (mantissa_table[0] != 0) ++ half_init_tables(); ++ uint exp = h >> 10; ++ return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp]; ++} + + #if 0 + +diff --git a/src/nvmath/Half.h b/src/nvmath/Half.h +index 6f5b8ad..77dff5a 100644 +--- a/src/nvmath/Half.h ++++ b/src/nvmath/Half.h +@@ -6,30 +6,15 @@ + + namespace nv { + +- uint32 half_to_float( uint16 h ); +- uint16 half_from_float( uint32 f ); ++ NVMATH_API uint32 half_to_float( uint16 h ); ++ NVMATH_API uint16 half_from_float( uint32 f ); + + // vin,vout must be 16 byte aligned. count must be a multiple of 8. + // implement a non-SSE version if we need it. 
For now, this naming makes it clear this is only available when SSE2 is + void half_to_float_array_SSE2(const uint16 * vin, float * vout, int count); + +- void half_init_tables(); +- +- extern uint32 mantissa_table[2048]; +- extern uint32 exponent_table[64]; +- extern uint32 offset_table[64]; +- +- // Fast half to float conversion based on: +- // http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf +- inline uint32 fast_half_to_float(uint16 h) +- { +- // Initialize table if necessary. +- if (mantissa_table[0] != 0) +- half_init_tables(); +- uint exp = h >> 10; +- return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp]; +- } +- ++ NVMATH_API void half_init_tables(); ++ NVMATH_API uint32 fast_half_to_float(uint16 h); + + inline uint16 to_half(float c) { + union { float f; uint32 u; } f; +diff --git a/src/nvtt/CMakeLists.txt b/src/nvtt/CMakeLists.txt +index e543807..a0d8aa9 100644 +--- a/src/nvtt/CMakeLists.txt ++++ b/src/nvtt/CMakeLists.txt +@@ -46,6 +46,7 @@ IF(NVTT_SHARED) + SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,origin") + ENDIF(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + ++ ADD_DEFINITIONS(-DNVTT_SHARED=1) + ADD_LIBRARY(nvtt SHARED ${NVTT_SRCS}) + ELSE(NVTT_SHARED) + ADD_LIBRARY(nvtt ${NVTT_SRCS}) +-- Index: ps/trunk/libraries/source/nvtt/src/CMakeLists.txt =================================================================== --- ps/trunk/libraries/source/nvtt/src/CMakeLists.txt +++ ps/trunk/libraries/source/nvtt/src/CMakeLists.txt @@ -1,21 +1,41 @@ -CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0) +CMAKE_MINIMUM_REQUIRED(VERSION 2.8.0) PROJECT(NV) ENABLE_TESTING() SET(NV_CMAKE_DIR "${NV_SOURCE_DIR}/cmake") SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${NV_CMAKE_DIR}") -IF(WIN32) - SET(GNUWIN32 "${NV_SOURCE_DIR}/gnuwin32") - SET(CMAKE_INCLUDE_PATH "${GNUWIN32}/include") - SET(CMAKE_LIBRARY_PATH "${GNUWIN32}/lib") -ENDIF(WIN32) +# GCC check (needs -std:c++11 flag) +#if(CMAKE_COMPILER_IS_GNUCC) +# ADD_DEFINITIONS("-std=c++11") +#ENDIF(CMAKE_COMPILER_IS_GNUCC) +set (CMAKE_CXX_STANDARD 11) + +#IF(WIN32) + # gnuwin32 paths: + #SET(GNUWIN32_PATH "${NV_SOURCE_DIR}/extern/gnuwin32") + #SET(CMAKE_INCLUDE_PATH ${CMAKE_INCLUDE_PATH} "${GNUWIN32_PATH}/include") + #SET(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} "${GNUWIN32_PATH}/lib") + + # Set GLUT path: + #SET(GLUT_ROOT_DIR "${NV_SOURCE_DIR}/extern/glut") + + # Set FreeImage path: + #SET(FREEIMAGE_ROOT_DIR "${NV_SOURCE_DIR}/extern/FreeImage") + +#ENDIF(WIN32) INCLUDE(${NV_CMAKE_DIR}/OptimalOptions.cmake) MESSAGE(STATUS "Setting optimal options") MESSAGE(STATUS " Processor: ${NV_SYSTEM_PROCESSOR}") MESSAGE(STATUS " Compiler Flags: ${CMAKE_CXX_FLAGS}") +IF(CMAKE_BUILD_TYPE MATCHES "debug") + SET(CMAKE_DEBUG_POSTFIX "_d" CACHE STRING "Postfix for debug build libraries.") + ADD_DEFINITIONS(-D_DEBUG=1) +ENDIF() + + IF(NVTT_SHARED) SET(NVCORE_SHARED TRUE) SET(NVMATH_SHARED TRUE) @@ -28,8 +48,41 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) SET(CMAKE_INSTALL_NAME_DIR "@executable_path") +ADD_SUBDIRECTORY(extern) + ADD_SUBDIRECTORY(src) +# These files should only be installed when creating packages. 
+INSTALL(FILES + LICENSE + README.md + DESTINATION share/doc/nvtt) + +# Add packaging support +INCLUDE(InstallRequiredSystemLibraries) + +IF(CMAKE_SYSTEM_NAME STREQUAL "Linux") + SET(CPACK_GENERATOR "TGZ;DEB") +ENDIF(CMAKE_SYSTEM_NAME STREQUAL "Linux") + +SET(CPACK_PACKAGE_NAME "nvidia-texture-tools") +SET(CPACK_PACKAGE_VERSION_MAJOR "2") +SET(CPACK_PACKAGE_VERSION_MINOR "1") +SET(CPACK_PACKAGE_VERSION_PATCH "0") +SET(CPACK_PACKAGE_VERSION "2.1.0") +SET(CPACK_PACKAGE_CONTACT "Ignacio Castaño ") +#SET(CPACK_PACKAGE_VENDOR "NVIDIA Corporation") +SET(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Texture processing tools with support for Direct3D 10 and 11 formats.") + +SET(CPACK_PACKAGE_DESCRIPTION_FILE "${NV_SOURCE_DIR}/README.md") +SET(CPACK_RESOURCE_FILE_LICENSE "${NV_SOURCE_DIR}/LICENSE") + +# NSIS options: IF(WIN32) - ADD_SUBDIRECTORY(gnuwin32) + SET(CPACK_NSIS_DISPLAY_NAME "${CPACK_PACKAGE_VENDOR}\\\\NVIDIA Texture Tools 2.1") + SET(CPACK_PACKAGE_INSTALL_DIRECTORY "${CPACK_PACKAGE_VENDOR}\\\\NVIDIA Texture Tools 2.1") + SET(CPACK_PACKAGE_ICON "${NV_SOURCE_DIR}\\\\project\\\\vc8\\\\nvcompress\\\\nvidia.ico") ENDIF(WIN32) + +INCLUDE(CPack) + Index: ps/trunk/libraries/source/nvtt/src/ChangeLog =================================================================== --- ps/trunk/libraries/source/nvtt/src/ChangeLog +++ ps/trunk/libraries/source/nvtt/src/ChangeLog @@ -1,15 +1,25 @@ -NVIDIA Texture Tools version 2.0.8 - * Fix float to fixed image conversion. Patch provided by Alex Pfaffe. Fixes issue 121. - * ColorBlock::isSingleColor compares only RGB channels. Fixes issue 115. - * Fix cmake build in msvc. Fixes issue 111. - * Better estimate principal component. Fixes issue 120. +NVIDIA Texture Tools version 2.1.1 + * Various fixes. + +NVIDIA Texture Tools version 2.1.0 + * Too many changes to list here. + * CTX1 CUDA compressor. + * DXT1n CUDA compressor. + * Support alpha premultiplication by Charles Nicholson. See issue 30. + * Improved decompressor tool submitted by Amorilia. See issue 41. + * Add support for YCoCg color transform. Fixes issue 18. + * Add support for linear and swizzle transforms. Fixes issue 4. + * Fix loading of EXR files using OpenEXR. + * Use FreeImage as primary image loading library. Fixes issue 31. Reverted. + * Output swizzle codes like AMD's tools. + * Added support for saving PNGs by Frank Richter. Fixes issue 79 and 80. + * Added gnome thumbnailer by Frank Richter. Fixes issue 82. + * Cleanup sources removing files that are not strictly required. NVIDIA Texture Tools version 2.0.7 * Output correct exit codes. Fixes issue 92. * Fix thread-safety errors. Fixes issue 90. * Add SIMD power method. Fixes issue 94. - * Interact better with applications that already use CUDA. - * Faster CPU compression. NVIDIA Texture Tools version 2.0.6 * Fix dll version checking. Index: ps/trunk/libraries/source/nvtt/src/LICENSE =================================================================== --- ps/trunk/libraries/source/nvtt/src/LICENSE +++ ps/trunk/libraries/source/nvtt/src/LICENSE @@ -0,0 +1,25 @@ +NVIDIA Texture Tools is licensed under the MIT license. 
+ +Copyright (c) 2009-2016 Ignacio Castano +Copyright (c) 2007-2009 NVIDIA Corporation + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the "Software"), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. Index: ps/trunk/libraries/source/nvtt/src/NVIDIA_Texture_Tools_LICENSE.txt =================================================================== --- ps/trunk/libraries/source/nvtt/src/NVIDIA_Texture_Tools_LICENSE.txt +++ ps/trunk/libraries/source/nvtt/src/NVIDIA_Texture_Tools_LICENSE.txt @@ -1,24 +0,0 @@ -NVIDIA Texture Tools 2.0 is licensed under the MIT license. - -Copyright (c) 2007 NVIDIA Corporation - -Permission is hereby granted, free of charge, to any person -obtaining a copy of this software and associated documentation -files (the "Software"), to deal in the Software without -restriction, including without limitation the rights to use, -copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the -Software is furnished to do so, subject to the following -conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -OTHER DEALINGS IN THE SOFTWARE. Index: ps/trunk/libraries/source/nvtt/src/NVIDIA_Texture_Tools_README.txt =================================================================== --- ps/trunk/libraries/source/nvtt/src/NVIDIA_Texture_Tools_README.txt +++ ps/trunk/libraries/source/nvtt/src/NVIDIA_Texture_Tools_README.txt @@ -1,167 +0,0 @@ --------------------------------------------------------------------------------- --------------------------------------------------------------------------------- -NVIDIA Texture Tools -README.txt -Version 2.0 --------------------------------------------------------------------------------- --------------------------------------------------------------------------------- - --------------------------------------------------------------------------------- -TABLE OF CONTENTS --------------------------------------------------------------------------------- -I. Instructions -II. Contents -III. Compilation Instructions -IV. 
Using NVIDIA Texture Tools in your own applications -V. Known Issues -VI. Frequently Asked Questions --------------------------------------------------------------------------------- - -I. Introduction --------------------------------------------------------------------------------- - -This is our first alpha release of our new Texture Tools. The main highlights of -this release are support for all DX10 texture formats, higher speed and improved -compression quality. - -In addition to that it also comes with a hardware accelerated compressor that -uses CUDA to compress blocks in parallel on the GPU and runs around 10 times -faster than the CPU counterpart. - -You can obtain CUDA from our developer site at: - -http://developer.nvidia.com/object/cuda.html - -The source code of the Texture Tools is being released under the terms of -the MIT license. - - -II. Contents --------------------------------------------------------------------------------- - -This release contains only the source code of the texture compression library -and an example commandline application that shows its use. - - -III. Compilation Instructions --------------------------------------------------------------------------------- - -The compression library and the example can be compiled with Visual Studio 8 on -Windows using the following solution file: - -project\vc8\nvtt.sln - -On most other platforms you can also use cmake. For more information about -cmake, visit: - -http://www.cmake.org/ - -On unix systems you can use the standard build procedure (assuming cmake is -installed on your system): - -$ ./configure -$ make -$ sudo make install - - -IV. Using NVIDIA Texture Tools --------------------------------------------------------------------------------- - -To use the NVIDIA Texture Tools in your own applications you just have to -include the following header file: - -src/nvimage/nvtt/nvtt.h - -And include the nvtt library in your projects. - -The following file contains a simple example that shows how to use the library: - -src/nvimage/nvtt/compress.cpp - -The usage of the commandline tool is the following: - -$ nvcompress [options] infile [outfile] - -where 'infile' is and TGA, PNG, PSD, DDS or JPG file, 'outfile' is a DDS file -and 'options' is one or more of the following: - -Input options: - -color The input image is a color map (default). - -normal The input image is a normal map. - -tonormal Convert input to normal map. - -clamp Clamp wrapping mode (default). - -repeat Repeat wrapping mode. - -nomips Disable mipmap generation. - -Compression options: - -fast Fast compression. - -nocuda Do not use cuda compressor. - -rgb RGBA format - -bc1 BC1 format (DXT1) - -bc2 BC2 format (DXT3) - -bc3 BC3 format (DXT5) - -bc3n BC3 normal map format (DXT5n/RXGB) - -bc4 BC4 format (ATI1) - -bc5 BC5 format (3Dc/ATI2) - -In order to run the compiled example on a PC that doesn't have Microsoft Visual -Studio 2003 installed, you will have to install the Microsoft Visual Studio 2003 -redistributable package that you can download at: - -http://go.microsoft.com/fwlink/?linkid=65127&clcid=0x409 - - -V. Known Issues --------------------------------------------------------------------------------- - -None so far. Please send suggestions and bug reports to: - -TextureTools@nvidia.com - -or report them at: - -http://code.google.com/p/nvidia-texture-tools/issues/list - - -VI. Frequently Asked Questions --------------------------------------------------------------------------------- - -- Do the NVIDIA Texture Tools work on OSX? 
-It currently compiles and runs properly, but it has not been tested extensively. -In particular there may be endiannes errors in the code. - - -- Do the NVIDIA Texture Tools work on Linux? -Yes. - - -- Do the NVIDIA Texture Tools work on Vista? -Yes, but note that CUDA is not supported on Vista yet, so the tool is not hardware -accelerated. - - -- Is CUDA required? -No. The Visual Studio solution file contains a configuration that allows you -to compile the texture tools without CUDA support. The cmake scripts automatically -detect the CUDA installation and use it only when available. - - -- Where can I get CUDA? -http://developer.nvidia.com/object/cuda.html - - -- Why is feature XYZ not supported? -In order to keep the code small and reduce maintenance costs we have limited the -features available in our new texture tools. We also have open sourced the code, so -that people can modify it and add their own favourite features. - - -- Can I use the NVIDIA Texture Tools in my commercial application? -Yes, the NVIDIA Texture Tools are licensed under the MIT license. - - -- Can I use the NVIDIA Texture Tools in my GPL application? -Yes, the MIT license is compatible with the GPL and LGPL licenses. - - - Index: ps/trunk/libraries/source/nvtt/src/README.md =================================================================== --- ps/trunk/libraries/source/nvtt/src/README.md +++ ps/trunk/libraries/source/nvtt/src/README.md @@ -0,0 +1,46 @@ +NVIDIA Texture Tools +==================== + +The NVIDIA Texture Tools is a collection of image processing and texture +manipulation tools, designed to be integrated in game tools and asset +processing pipelines. + +The primary features of the library are mipmap and normal map generation, format +conversion and DXT compression. + + +### How to build (Windows) + +Open `project/vc12/thekla.sln` using Visual Studio. + +Solutions for previous versions are also available, but they may not be up to date. + + +### How to build (Linux/OSX) + +Use [cmake](http://www.cmake.org/) and the provided configure script: + +```bash +$ ./configure +$ make +$ sudo make install +``` + + +### Using NVIDIA Texture Tools + +To use the NVIDIA Texture Tools in your own applications you just have to +include the following header file: + +src/nvimage/nvtt/nvtt.h + +And include the nvtt library in your projects. 
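+A minimal compression call with this API might look roughly like the sketch below. The type, enum and method names follow the upstream nvtt 2.x documentation and are an illustration only; check them against the bundled include/nvtt/nvtt.h.
+
+```cpp
+// Minimal sketch, assuming the documented nvtt 2.x C++ API; verify the names
+// against the nvtt.h shipped with this library before relying on them.
+#include <nvtt/nvtt.h>
+
+bool compressToDds(const void* bgraPixels, int width, int height)
+{
+    nvtt::InputOptions input;
+    input.setTextureLayout(nvtt::TextureType_2D, width, height);
+    input.setMipmapData(bgraPixels, width, height); // 8-bit BGRA by default
+
+    nvtt::CompressionOptions compression;
+    compression.setFormat(nvtt::Format_DXT1);       // i.e. BC1
+
+    nvtt::OutputOptions output;
+    output.setFileName("output.dds");
+
+    nvtt::Compressor compressor;
+    return compressor.process(input, compression, output);
+}
+```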
+ +The following file contains a simple example that shows how to use the library: + +src/nvimage/nvtt/compress.cpp + +Detailed documentation of the API can be found at: + +http://code.google.com/p/nvidia-texture-tools/wiki/ApiDocumentation + Index: ps/trunk/libraries/source/nvtt/src/VERSION =================================================================== --- ps/trunk/libraries/source/nvtt/src/VERSION +++ ps/trunk/libraries/source/nvtt/src/VERSION @@ -1 +1 @@ -2.0.8 +2.1.1 Index: ps/trunk/libraries/source/nvtt/src/cmake/DetermineProcessor.cmake =================================================================== --- ps/trunk/libraries/source/nvtt/src/cmake/DetermineProcessor.cmake +++ ps/trunk/libraries/source/nvtt/src/cmake/DetermineProcessor.cmake @@ -5,24 +5,68 @@ IF(UNIX) FIND_PROGRAM(CMAKE_UNAME uname /bin /usr/bin /usr/local/bin ) IF(CMAKE_UNAME) - EXEC_PROGRAM(uname ARGS -p OUTPUT_VARIABLE NV_SYSTEM_PROCESSOR RETURN_VALUE val) + #EXEC_PROGRAM(uname ARGS -p OUTPUT_VARIABLE NV_SYSTEM_PROCESSOR RETURN_VALUE val) - IF("${val}" GREATER 0 OR NV_SYSTEM_PROCESSOR STREQUAL "unknown") + #IF("${val}" GREATER 0 OR NV_SYSTEM_PROCESSOR STREQUAL "unknown") EXEC_PROGRAM(uname ARGS -m OUTPUT_VARIABLE NV_SYSTEM_PROCESSOR RETURN_VALUE val) - ENDIF("${val}" GREATER 0 OR NV_SYSTEM_PROCESSOR STREQUAL "unknown") + #ENDIF("${val}" GREATER 0 OR NV_SYSTEM_PROCESSOR STREQUAL "unknown") + + IF(NV_SYSTEM_PROCESSOR STREQUAL "Power Macintosh") + SET(NV_SYSTEM_PROCESSOR "powerpc") + ENDIF(NV_SYSTEM_PROCESSOR STREQUAL "Power Macintosh") # processor may have double quote in the name, and that needs to be removed STRING(REGEX REPLACE "\"" "" NV_SYSTEM_PROCESSOR "${NV_SYSTEM_PROCESSOR}") STRING(REGEX REPLACE "/" "_" NV_SYSTEM_PROCESSOR "${NV_SYSTEM_PROCESSOR}") ENDIF(CMAKE_UNAME) - # Get extended processor information with: - # `cat /proc/cpuinfo` +#~ # Get extended processor information from /proc/cpuinfo +#~ IF(EXISTS "/proc/cpuinfo") + +#~ FILE(READ /proc/cpuinfo PROC_CPUINFO) + +#~ SET(VENDOR_ID_RX "vendor_id[ \t]*:[ \t]*([a-zA-Z]+)\n") +#~ STRING(REGEX MATCH "${VENDOR_ID_RX}" VENDOR_ID "${PROC_CPUINFO}") +#~ STRING(REGEX REPLACE "${VENDOR_ID_RX}" "\\1" VENDOR_ID "${VENDOR_ID}") + +#~ SET(CPU_FAMILY_RX "cpu family[ \t]*:[ \t]*([0-9]+)") +#~ STRING(REGEX MATCH "${CPU_FAMILY_RX}" CPU_FAMILY "${PROC_CPUINFO}") +#~ STRING(REGEX REPLACE "${CPU_FAMILY_RX}" "\\1" CPU_FAMILY "${CPU_FAMILY}") + +#~ SET(MODEL_RX "model[ \t]*:[ \t]*([0-9]+)") +#~ STRING(REGEX MATCH "${MODEL_RX}" MODEL "${PROC_CPUINFO}") +#~ STRING(REGEX REPLACE "${MODEL_RX}" "\\1" MODEL "${MODEL}") + +#~ SET(FLAGS_RX "flags[ \t]*:[ \t]*([a-zA-Z0-9 _]+)\n") +#~ STRING(REGEX MATCH "${FLAGS_RX}" FLAGS "${PROC_CPUINFO}") +#~ STRING(REGEX REPLACE "${FLAGS_RX}" "\\1" FLAGS "${FLAGS}") + +#~ # Debug output. +#~ IF(LINUX_CPUINFO) +#~ MESSAGE(STATUS "LinuxCPUInfo.cmake:") +#~ MESSAGE(STATUS "VENDOR_ID : ${VENDOR_ID}") +#~ MESSAGE(STATUS "CPU_FAMILY : ${CPU_FAMILY}") +#~ MESSAGE(STATUS "MODEL : ${MODEL}") +#~ MESSAGE(STATUS "FLAGS : ${FLAGS}") +#~ ENDIF(LINUX_CPUINFO) + +#~ ENDIF(EXISTS "/proc/cpuinfo") + +#~ # Information on how to decode CPU_FAMILY and MODEL: +#~ # http://balusc.xs4all.nl/srv/har-cpu-int-pm.php ELSE(UNIX) + IF(WIN32) - SET (NV_SYSTEM_PROCESSOR "$ENV{PROCESSOR_ARCHITECTURE}") + # It's not OK to trust $ENV{PROCESSOR_ARCHITECTURE}: its value depends on the type of executable being run, + # so a 32-bit cmake (the default binary distribution) will always say "x86" regardless of the actual target. 
+ IF (CMAKE_SIZEOF_VOID_P EQUAL 8) + SET (NV_SYSTEM_PROCESSOR "x86_64") + ELSE(CMAKE_SIZEOF_VOID_P EQUAL 8) + SET (NV_SYSTEM_PROCESSOR "x86") + ENDIF(CMAKE_SIZEOF_VOID_P EQUAL 8) ENDIF(WIN32) + ENDIF(UNIX) Index: ps/trunk/libraries/source/nvtt/src/cmake/FindCUDA.cmake =================================================================== --- ps/trunk/libraries/source/nvtt/src/cmake/FindCUDA.cmake +++ ps/trunk/libraries/source/nvtt/src/cmake/FindCUDA.cmake @@ -1,142 +0,0 @@ -# -# Try to find CUDA compiler, runtime libraries, and include path. -# Once done this will define -# -# CUDA_FOUND -# CUDA_INCLUDE_PATH -# CUDA_RUNTIME_LIBRARY -# CUDA_COMPILER -# -# It will also define the following macro: -# -# WRAP_CUDA -# - -IF (WIN32) - FIND_PROGRAM (CUDA_COMPILER nvcc.exe - $ENV{CUDA_BIN_PATH} - DOC "The CUDA Compiler") -ELSE(WIN32) - FIND_PROGRAM (CUDA_COMPILER nvcc - $ENV{CUDA_BIN_PATH} - /usr/local/cuda/bin - DOC "The CUDA Compiler") -ENDIF(WIN32) - -IF (CUDA_COMPILER) - GET_FILENAME_COMPONENT (CUDA_COMPILER_DIR ${CUDA_COMPILER} PATH) - GET_FILENAME_COMPONENT (CUDA_COMPILER_SUPER_DIR ${CUDA_COMPILER_DIR} PATH) -ELSE (CUDA_COMPILER) - SET (CUDA_COMPILER_DIR .) - SET (CUDA_COMPILER_SUPER_DIR ..) -ENDIF (CUDA_COMPILER) - -FIND_PATH (CUDA_INCLUDE_PATH cuda_runtime.h - $ENV{CUDA_INC_PATH} - ${CUDA_COMPILER_SUPER_DIR}/include - ${CUDA_COMPILER_DIR} - DOC "The directory where CUDA headers reside") - -FIND_LIBRARY (CUDA_RUNTIME_LIBRARY - NAMES cudart - PATHS - $ENV{CUDA_LIB_PATH} - ${CUDA_COMPILER_SUPER_DIR}/lib - ${CUDA_COMPILER_DIR} - DOC "The CUDA runtime library") - -IF (CUDA_INCLUDE_PATH AND CUDA_RUNTIME_LIBRARY) - SET (CUDA_FOUND TRUE) -ELSE (CUDA_INCLUDE_PATH AND CUDA_RUNTIME_LIBRARY) - SET (CUDA_FOUND FALSE) -ENDIF (CUDA_INCLUDE_PATH AND CUDA_RUNTIME_LIBRARY) - -SET (CUDA_LIBRARIES ${CUDA_RUNTIME_LIBRARY}) - -MARK_AS_ADVANCED (CUDA_FOUND CUDA_COMPILER CUDA_RUNTIME_LIBRARY) - - -#SET(CUDA_OPTIONS "-ncfe") -SET(CUDA_OPTIONS "--host-compilation=C") - -IF (CUDA_EMULATION) - SET (CUDA_OPTIONS "${CUDA_OPTIONS} -deviceemu") -ENDIF (CUDA_EMULATION) - - -# Get include directories. -MACRO(GET_CUDA_INC_DIRS _cuda_INC_DIRS) - SET(${_cuda_INC_DIRS}) - GET_DIRECTORY_PROPERTY(_inc_DIRS INCLUDE_DIRECTORIES) - - FOREACH(_current ${_inc_DIRS}) - SET(${_cuda_INC_DIRS} ${${_cuda_INC_DIRS}} "-I" ${_current}) - ENDFOREACH(_current ${_inc_DIRS}) - - SET(${_cuda_INC_DIRS} ${${_cuda_INC_DIRS}} "-I" ${CUDA_INCLUDE_PATH}) - -# IF (CMAKE_SYTEM_INCLUDE_PATH) -# SET(${_cuda_INC_DIRS} ${${_cuda_INC_DIRS}} "-I" ${CMAKE_SYSTEM_INCLUDE_PATH}) -# ENDIF (CMAKE_SYTEM_INCLUDE_PATH) -# IF (CMAKE_INCLUDE_PATH) -# SET(${_cuda_INC_DIRS} ${${_cuda_INC_DIRS}} "-I" ${CMAKE_INCLUDE_PATH}) -# ENDIF (CMAKE_INCLUDE_PATH) - -ENDMACRO(GET_CUDA_INC_DIRS) - - -# Get file dependencies. 
-MACRO (GET_CUFILE_DEPENDENCIES dependencies file) - GET_FILENAME_COMPONENT(filepath ${file} PATH) - - # parse file for dependencies - FILE(READ "${file}" CONTENTS) - #STRING(REGEX MATCHALL "#[ \t]*include[ \t]+[<\"][^>\"]*" DEPS "${CONTENTS}") - STRING(REGEX MATCHALL "#[ \t]*include[ \t]+\"[^\"]*" DEPS "${CONTENTS}") - - SET(${dependencies}) - - FOREACH(DEP ${DEPS}) - STRING(REGEX REPLACE "#[ \t]*include[ \t]+\"" "" DEP "${DEP}") - - FIND_PATH(PATH_OF_${DEP} ${DEP} - ${filepath}) - - IF(NOT ${PATH_OF_${DEP}} STREQUAL PATH_OF_${DEP}-NOTFOUND) - #MESSAGE("${file} : ${PATH_OF_${DEP}}/${DEP}") - SET(${dependencies} ${${dependencies}} ${PATH_OF_${DEP}}/${DEP}) - ENDIF(NOT ${PATH_OF_${DEP}} STREQUAL PATH_OF_${DEP}-NOTFOUND) - - ENDFOREACH(DEP) - -ENDMACRO (GET_CUFILE_DEPENDENCIES) - - -# WRAP_CUDA(outfile ...) -MACRO (WRAP_CUDA outfiles) - GET_CUDA_INC_DIRS(cuda_includes) - #MESSAGE(${cuda_includes}) - - FOREACH (CUFILE ${ARGN}) - GET_FILENAME_COMPONENT (CUFILE ${CUFILE} ABSOLUTE) - GET_FILENAME_COMPONENT (CFILE ${CUFILE} NAME_WE) - SET (CFILE ${CMAKE_CURRENT_BINARY_DIR}/${CFILE}.gen.cpp) - - GET_CUFILE_DEPENDENCIES(CUDEPS ${CUFILE}) - #MESSAGE("${CUDEPS}") - - ADD_CUSTOM_COMMAND ( - OUTPUT ${CFILE} - COMMAND ${CUDA_COMPILER} - ARGS -cuda ${cuda_includes} ${CUDA_OPTIONS} -o ${CFILE} ${CUFILE} - MAIN_DEPENDENCY ${CUFILE} - DEPENDS ${CUDEPS}) - - #MACRO_ADD_FILE_DEPENDENCIES(${CUFILE} ${CFILE}) - - SET (${outfiles} ${${outfiles}} ${CFILE}) - ENDFOREACH (CUFILE) - - SET_SOURCE_FILES_PROPERTIES(${outfiles} PROPERTIES GENERATED 1) - -ENDMACRO (WRAP_CUDA) Index: ps/trunk/libraries/source/nvtt/src/cmake/FindCg.cmake =================================================================== --- ps/trunk/libraries/source/nvtt/src/cmake/FindCg.cmake +++ ps/trunk/libraries/source/nvtt/src/cmake/FindCg.cmake @@ -1,129 +1,172 @@ -# -# Try to find NVIDIA's Cg compiler, runtime libraries, and include path. -# Once done this will define -# -# CG_FOUND =system has NVIDIA Cg and it can be used. -# CG_INCLUDE_PATH = directory where cg.h resides -# CG_LIBRARY = full path to libCg.so (Cg.DLL on win32) -# CG_GL_LIBRARY = full path to libCgGL.so (CgGL.dll on win32) -# CG_COMPILER = full path to cgc (cgc.exe on win32) -# - -# On OSX default to using the framework version of Cg. - -IF (APPLE) - INCLUDE(${CMAKE_ROOT}/Modules/CMakeFindFrameworks.cmake) - SET(CG_FRAMEWORK_INCLUDES) - CMAKE_FIND_FRAMEWORKS(Cg) - IF (Cg_FRAMEWORKS) - FOREACH(dir ${Cg_FRAMEWORKS}) - SET(CG_FRAMEWORK_INCLUDES ${CG_FRAMEWORK_INCLUDES} - ${dir}/Headers ${dir}/PrivateHeaders) - ENDFOREACH(dir) - - # Find the include dir - FIND_PATH(CG_INCLUDE_PATH cg.h - ${CG_FRAMEWORK_INCLUDES} - ) - - # Since we are using Cg framework, we must link to it. - # Note, we use weak linking, so that it works even when Cg is not available. - SET(CG_LIBRARY "-weak_framework Cg" CACHE STRING "Cg library") - SET(CG_GL_LIBRARY "-weak_framework Cg" CACHE STRING "Cg GL library") - ENDIF (Cg_FRAMEWORKS) - FIND_PROGRAM(CG_COMPILER cgc - /usr/bin - /usr/local/bin - DOC "The Cg compiler" - ) -ELSE (APPLE) - IF (WIN32) - FIND_PROGRAM( CG_COMPILER cgc - $ENV{CG_BIN_PATH} - $ENV{PROGRAMFILES}/NVIDIA\ Corporation/Cg/bin - $ENV{PROGRAMFILES}/Cg - ${PROJECT_SOURCE_DIR}/../Cg - DOC "The Cg Compiler" - ) - IF (CG_COMPILER) - GET_FILENAME_COMPONENT(CG_COMPILER_DIR ${CG_COMPILER} PATH) - GET_FILENAME_COMPONENT(CG_COMPILER_SUPER_DIR ${CG_COMPILER_DIR} PATH) - ELSE (CG_COMPILER) - SET (CG_COMPILER_DIR .) - SET (CG_COMPILER_SUPER_DIR ..) 
- ENDIF (CG_COMPILER) - FIND_PATH( CG_INCLUDE_PATH Cg/cg.h - $ENV{CG_INC_PATH} - $ENV{PROGRAMFILES}/NVIDIA\ Corporation/Cg/include - $ENV{PROGRAMFILES}/Cg - ${PROJECT_SOURCE_DIR}/../Cg - ${CG_COMPILER_SUPER_DIR}/include - ${CG_COMPILER_DIR} - DOC "The directory where Cg/cg.h resides" - ) - FIND_LIBRARY( CG_LIBRARY - NAMES Cg - PATHS - $ENV{CG_LIB_PATH} - $ENV{PROGRAMFILES}/NVIDIA\ Corporation/Cg/lib - $ENV{PROGRAMFILES}/Cg - ${PROJECT_SOURCE_DIR}/../Cg - ${CG_COMPILER_SUPER_DIR}/lib - ${CG_COMPILER_DIR} - DOC "The Cg runtime library" - ) - FIND_LIBRARY( CG_GL_LIBRARY - NAMES CgGL - PATHS - $ENV{PROGRAMFILES}/NVIDIA\ Corporation/Cg/lib - $ENV{PROGRAMFILES}/Cg - ${PROJECT_SOURCE_DIR}/../Cg - ${CG_COMPILER_SUPER_DIR}/lib - ${CG_COMPILER_DIR} - DOC "The Cg runtime library" - ) - ELSE (WIN32) - FIND_PROGRAM( CG_COMPILER cgc - /usr/bin - /usr/local/bin - DOC "The Cg Compiler" - ) - GET_FILENAME_COMPONENT(CG_COMPILER_DIR "${CG_COMPILER}" PATH) - GET_FILENAME_COMPONENT(CG_COMPILER_SUPER_DIR "${CG_COMPILER_DIR}" PATH) - FIND_PATH( CG_INCLUDE_PATH Cg/cg.h - /usr/include - /usr/local/include - ${CG_COMPILER_SUPER_DIR}/include - DOC "The directory where Cg/cg.h resides" - ) - FIND_LIBRARY( CG_LIBRARY Cg - PATHS - /usr/lib64 - /usr/lib - /usr/local/lib64 - /usr/local/lib - ${CG_COMPILER_SUPER_DIR}/lib64 - ${CG_COMPILER_SUPER_DIR}/lib - DOC "The Cg runtime library" - ) - SET(CG_LIBRARY ${CG_LIBRARY} -lpthread) - FIND_LIBRARY( CG_GL_LIBRARY CgGL - PATHS - /usr/lib64 - /usr/lib - /usr/local/lib64 - /usr/local/lib - ${CG_COMPILER_SUPER_DIR}/lib64 - ${CG_COMPILER_SUPER_DIR}/lib - DOC "The Cg runtime library" - ) - ENDIF (WIN32) -ENDIF (APPLE) - -IF (CG_INCLUDE_PATH) - SET( CG_FOUND 1 CACHE STRING "Set to 1 if CG is found, 0 otherwise") -ELSE (CG_INCLUDE_PATH) - SET( CG_FOUND 0 CACHE STRING "Set to 1 if CG is found, 0 otherwise") -ENDIF (CG_INCLUDE_PATH) - -MARK_AS_ADVANCED( CG_FOUND ) +# +# Try to find NVIDIA's Cg compiler, runtime libraries, and include path. +# Once done this will define +# +# CG_FOUND =system has NVIDIA Cg and it can be used. +# CG_INCLUDE_DIR = directory where cg.h resides +# CG_LIBRARY = full path to libCg.so (Cg.DLL on win32) +# CG_GL_LIBRARY = full path to libCgGL.so (CgGL.dll on win32) +# CG_COMPILER = full path to cgc (cgc.exe on win32) +# + +# On OSX default to using the framework version of Cg. +IF (APPLE) + INCLUDE(${CMAKE_ROOT}/Modules/CMakeFindFrameworks.cmake) + SET(CG_FRAMEWORK_INCLUDES) + CMAKE_FIND_FRAMEWORKS(Cg) + IF (Cg_FRAMEWORKS) + FOREACH(dir ${Cg_FRAMEWORKS}) + SET(CG_FRAMEWORK_INCLUDES ${CG_FRAMEWORK_INCLUDES} + ${dir}/Headers ${dir}/PrivateHeaders) + ENDFOREACH(dir) + + # Find the include dir + FIND_PATH(CG_INCLUDE_DIR cg.h + ${CG_FRAMEWORK_INCLUDES} + ) + + # Since we are using Cg framework, we must link to it. + # Note, we use weak linking, so that it works even when Cg is not available. 
+ SET(CG_LIBRARY "-weak_framework Cg" CACHE STRING "Cg library") + SET(CG_GL_LIBRARY "-weak_framework Cg" CACHE STRING "Cg GL library") + ENDIF (Cg_FRAMEWORKS) + FIND_PROGRAM(CG_COMPILER cgc + /usr/bin + /usr/local/bin + DOC "The Cg compiler" + ) +ELSE (APPLE) + IF (WIN32) + + # When compiling 64-bit programs, the binaries and libs are in bin.x64 and lib.x64 directories, + + # This will have only effect for 64bit versions of cmake, when running the default 32bit version + # both ProgramFiles and ProgramFiles(x86) point to the same place in Win64 + SET(PFx86_VARNAME "ProgramFiles(x86)") + SET(PFx86 $ENV{${PFx86_VARNAME}}) + + # Let's play safe in case we are cross compiling to 64 bit: for cgc it doesn't really matter + FIND_PROGRAM( CG_COMPILER cgc + $ENV{CG_BIN64_PATH} + $ENV{CG_BIN_PATH} + $ENV{PROGRAMFILES}/NVIDIA\ Corporation/Cg/bin + $ENV{PFx86}/NVIDIA\ Corporation/Cg/bin + $ENV{PROGRAMFILES}/Cg + ${PROJECT_SOURCE_DIR}/../Cg + DOC "The Cg Compiler" + ) + + IF (CG_COMPILER) + GET_FILENAME_COMPONENT(CG_COMPILER_DIR ${CG_COMPILER} PATH) + GET_FILENAME_COMPONENT(CG_COMPILER_SUPER_DIR ${CG_COMPILER_DIR} PATH) + ELSE (CG_COMPILER) + SET (CG_COMPILER_DIR .) + SET (CG_COMPILER_SUPER_DIR ..) + ENDIF (CG_COMPILER) + FIND_PATH( CG_INCLUDE_DIR Cg/cg.h + $ENV{CG_INC_PATH} + $ENV{PROGRAMFILES}/NVIDIA\ Corporation/Cg/include + $ENV{PROGRAMFILES}/Cg + ${PROJECT_SOURCE_DIR}/../Cg + ${CG_COMPILER_SUPER_DIR}/include + ${CG_COMPILER_DIR} + DOC "The directory where Cg/cg.h resides" + ) + + IF (NV_SYSTEM_PROCESSOR STREQUAL "x86_64") + FIND_LIBRARY( CG_LIBRARY + NAMES Cg + PATHS + $ENV{CG_LIB64_PATH} + $ENV{PROGRAMFILES}/NVIDIA\ Corporation/Cg/lib.x64 + $ENV{PFx86}/NVIDIA\ Corporation/Cg/lib.x64 + $ENV{PROGRAMFILES}/Cg + $ENV{PFx86}/Cg + ${PROJECT_SOURCE_DIR}/../Cg + ${CG_COMPILER_SUPER_DIR}/lib.x64 + ${CG_COMPILER_DIR} + DOC "The Cg runtime library (64-bit)" + ) + FIND_LIBRARY( CG_GL_LIBRARY + NAMES CgGL + PATHS + $ENV{CG_LIB64_PATH} + $ENV{PROGRAMFILES}/NVIDIA\ Corporation/Cg/lib.x64 + $ENV{PFx86}/NVIDIA\ Corporation/Cg/lib.x64 + $ENV{PROGRAMFILES}/Cg + $ENV{PFx86}/Cg + ${PROJECT_SOURCE_DIR}/../Cg + ${CG_COMPILER_SUPER_DIR}/lib.x64 + ${CG_COMPILER_DIR} + DOC "The Cg GL runtime library (64-bit)" + ) + ELSE(NV_SYSTEM_PROCESSOR STREQUAL "x86_64") + FIND_LIBRARY( CG_LIBRARY + NAMES Cg + PATHS + $ENV{CG_LIB_PATH} + $ENV{PROGRAMFILES}/NVIDIA\ Corporation/Cg/lib + $ENV{PROGRAMFILES}/Cg + ${PROJECT_SOURCE_DIR}/../Cg + ${CG_COMPILER_SUPER_DIR}/lib + ${CG_COMPILER_DIR} + DOC "The Cg runtime library" + ) + FIND_LIBRARY( CG_GL_LIBRARY + NAMES CgGL + PATHS + $ENV{CG_LIB_PATH} + $ENV{PROGRAMFILES}/NVIDIA\ Corporation/Cg/lib + $ENV{PROGRAMFILES}/Cg + ${PROJECT_SOURCE_DIR}/../Cg + ${CG_COMPILER_SUPER_DIR}/lib + ${CG_COMPILER_DIR} + DOC "The Cg GL runtime library" + ) + ENDIF(NV_SYSTEM_PROCESSOR STREQUAL "x86_64") + + ELSE (WIN32) + FIND_PROGRAM( CG_COMPILER cgc + /usr/bin + /usr/local/bin + DOC "The Cg Compiler" + ) + GET_FILENAME_COMPONENT(CG_COMPILER_DIR "${CG_COMPILER}" PATH) + GET_FILENAME_COMPONENT(CG_COMPILER_SUPER_DIR "${CG_COMPILER_DIR}" PATH) + FIND_PATH( CG_INCLUDE_DIR Cg/cg.h + /usr/include + /usr/local/include + ${CG_COMPILER_SUPER_DIR}/include + DOC "The directory where Cg/cg.h resides" + ) + FIND_LIBRARY( CG_LIBRARY Cg + PATHS + /usr/lib64 + /usr/lib + /usr/local/lib64 + /usr/local/lib + ${CG_COMPILER_SUPER_DIR}/lib64 + ${CG_COMPILER_SUPER_DIR}/lib + DOC "The Cg runtime library" + ) + SET(CG_LIBRARY ${CG_LIBRARY} -lpthread) + FIND_LIBRARY( CG_GL_LIBRARY CgGL + PATHS + /usr/lib64 + /usr/lib + /usr/local/lib64 
+ /usr/local/lib + ${CG_COMPILER_SUPER_DIR}/lib64 + ${CG_COMPILER_SUPER_DIR}/lib + DOC "The Cg runtime library" + ) + ENDIF (WIN32) +ENDIF (APPLE) + +IF (CG_INCLUDE_DIR) + SET( CG_FOUND 1 CACHE STRING "Set to 1 if CG is found, 0 otherwise") +ELSE (CG_INCLUDE_DIR) + SET( CG_FOUND 0 CACHE STRING "Set to 1 if CG is found, 0 otherwise") +ENDIF (CG_INCLUDE_DIR) + +MARK_AS_ADVANCED( CG_FOUND ) Index: ps/trunk/libraries/source/nvtt/src/cmake/FindFreeImage.cmake =================================================================== --- ps/trunk/libraries/source/nvtt/src/cmake/FindFreeImage.cmake +++ ps/trunk/libraries/source/nvtt/src/cmake/FindFreeImage.cmake @@ -0,0 +1,53 @@ +# +# Try to find the FreeImage library and include path. +# Once done this will define +# +# FREEIMAGE_FOUND +# FREEIMAGE_INCLUDE_PATH +# FREEIMAGE_LIBRARY +# + +IF (WIN32) + FIND_PATH( FREEIMAGE_INCLUDE_PATH FreeImage.h + ${FREEIMAGE_ROOT_DIR}/include + ${FREEIMAGE_ROOT_DIR} + DOC "The directory where FreeImage.h resides") + FIND_LIBRARY( FREEIMAGE_LIBRARY + NAMES FreeImage freeimage + PATHS + ${FREEIMAGE_ROOT_DIR}/lib + ${FREEIMAGE_ROOT_DIR} + DOC "The FreeImage library") +ELSE (WIN32) + FIND_PATH( FREEIMAGE_INCLUDE_PATH FreeImage.h + /usr/include + /usr/local/include + /sw/include + /opt/local/include + DOC "The directory where FreeImage.h resides") + FIND_LIBRARY( FREEIMAGE_LIBRARY + NAMES FreeImage freeimage + PATHS + /usr/lib64 + /usr/lib + /usr/local/lib64 + /usr/local/lib + /sw/lib + /opt/local/lib + DOC "The FreeImage library") +ENDIF (WIN32) + +SET(FREEIMAGE_LIBRARIES ${FREEIMAGE_LIBRARY}) + +IF (FREEIMAGE_INCLUDE_PATH AND FREEIMAGE_LIBRARY) + SET( FREEIMAGE_FOUND TRUE CACHE BOOL "Set to TRUE if FreeImage is found, FALSE otherwise") +ELSE (FREEIMAGE_INCLUDE_PATH AND FREEIMAGE_LIBRARY) + SET( FREEIMAGE_FOUND FALSE CACHE BOOL "Set to TRUE if FreeImage is found, FALSE otherwise") +ENDIF (FREEIMAGE_INCLUDE_PATH AND FREEIMAGE_LIBRARY) + +MARK_AS_ADVANCED( + FREEIMAGE_FOUND + FREEIMAGE_LIBRARY + FREEIMAGE_LIBRARIES + FREEIMAGE_INCLUDE_PATH) + Index: ps/trunk/libraries/source/nvtt/src/cmake/FindGLEW.cmake =================================================================== --- ps/trunk/libraries/source/nvtt/src/cmake/FindGLEW.cmake +++ ps/trunk/libraries/source/nvtt/src/cmake/FindGLEW.cmake @@ -10,24 +10,28 @@ IF (WIN32) FIND_PATH( GLEW_INCLUDE_PATH GL/glew.h $ENV{PROGRAMFILES}/GLEW/include - ${PROJECT_SOURCE_DIR}/src/nvgl/glew/include + ${GLEW_ROOT_DIR}/include DOC "The directory where GL/glew.h resides") - FIND_LIBRARY( GLEW_LIBRARY - NAMES glew GLEW glew32 glew32s - PATHS - $ENV{PROGRAMFILES}/GLEW/lib - ${PROJECT_SOURCE_DIR}/src/nvgl/glew/bin - ${PROJECT_SOURCE_DIR}/src/nvgl/glew/lib - DOC "The GLEW library") + + FIND_LIBRARY( GLEW_LIBRARY + NAMES glew GLEW glew32 glew32s + PATHS + $ENV{PROGRAMFILES}/GLEW/lib + ${PROJECT_SOURCE_DIR}/src/nvgl/glew/bin + ${PROJECT_SOURCE_DIR}/src/nvgl/glew/lib + DOC "The GLEW library") ELSE (WIN32) FIND_PATH( GLEW_INCLUDE_PATH GL/glew.h /usr/include /usr/local/include /sw/include /opt/local/include + ${GLEW_ROOT_DIR}/include DOC "The directory where GL/glew.h resides") + + # Prefer the static library. 
FIND_LIBRARY( GLEW_LIBRARY - NAMES GLEW glew + NAMES libGLEW.a GLEW PATHS /usr/lib64 /usr/lib @@ -35,13 +39,12 @@ /usr/local/lib /sw/lib /opt/local/lib + ${GLEW_ROOT_DIR}/lib DOC "The GLEW library") ENDIF (WIN32) -IF (GLEW_INCLUDE_PATH) - SET( GLEW_FOUND 1 CACHE STRING "Set to 1 if GLEW is found, 0 otherwise") -ELSE (GLEW_INCLUDE_PATH) - SET( GLEW_FOUND 0 CACHE STRING "Set to 1 if GLEW is found, 0 otherwise") -ENDIF (GLEW_INCLUDE_PATH) - -MARK_AS_ADVANCED( GLEW_FOUND ) +SET(GLEW_FOUND "NO") +IF (GLEW_INCLUDE_PATH AND GLEW_LIBRARY) + SET(GLEW_LIBRARIES ${GLEW_LIBRARY}) + SET(GLEW_FOUND "YES") +ENDIF (GLEW_INCLUDE_PATH AND GLEW_LIBRARY) Index: ps/trunk/libraries/source/nvtt/src/cmake/FindGLUT.cmake =================================================================== --- ps/trunk/libraries/source/nvtt/src/cmake/FindGLUT.cmake +++ ps/trunk/libraries/source/nvtt/src/cmake/FindGLUT.cmake @@ -1,127 +0,0 @@ -# - try to find glut library and include files -# GLUT_INCLUDE_DIR, where to find GL/glut.h, etc. -# GLUT_LIBRARIES, the libraries to link against -# GLUT_FOUND, If false, do not try to use GLUT. -# Also defined, but not for general use are: -# GLUT_glut_LIBRARY = the full path to the glut library. -# GLUT_Xmu_LIBRARY = the full path to the Xmu library. -# GLUT_Xi_LIBRARY = the full path to the Xi Library. - -IF (WIN32) - - IF(CYGWIN) - - FIND_PATH( GLUT_INCLUDE_DIR GL/glut.h - /usr/include - ) - - FIND_LIBRARY( GLUT_glut_LIBRARY glut32 - ${OPENGL_LIBRARY_DIR} - /usr/lib - /usr/lib/w32api - /usr/local/lib - /usr/X11R6/lib - ) - - - ELSE(CYGWIN) - -# FIND_PATH( GLUT_INCLUDE_DIR GL/glut.h -# ${GLUT_ROOT_PATH}/include -# ) - -# FIND_LIBRARY( GLUT_glut_LIBRARY glut32 -# ${GLUT_ROOT_PATH}/lib -# ${OPENGL_LIBRARY_DIR} -# ) - - FIND_PATH( GLUT_INCLUDE_DIR GL/glut.h - ${GLUT_ROOT_PATH}/include - ${PROJECT_SOURCE_DIR}/src/nvgl/glut/include - DOC "The directory where GL/glut.h resides") - FIND_LIBRARY( GLUT_glut_LIBRARY - NAMES glut GLUT glut32 glut32s - PATHS - ${GLUT_ROOT_PATH}/lib - ${PROJECT_SOURCE_DIR}/src/nvgl/glut/bin - ${PROJECT_SOURCE_DIR}/src/nvgl/glut/lib - ${OPENGL_LIBRARY_DIR} - DOC "The GLUT library") - - ENDIF(CYGWIN) - -ELSE (WIN32) - - IF (APPLE) -# These values for Apple could probably do with improvement. - FIND_PATH( GLUT_INCLUDE_DIR glut.h - /System/Library/Frameworks/GLUT.framework/Versions/A/Headers - ${OPENGL_LIBRARY_DIR} - ) - SET(GLUT_glut_LIBRARY "-framework Glut" CACHE STRING "GLUT library for OSX") - SET(GLUT_cocoa_LIBRARY "-framework Cocoa" CACHE STRING "Cocoa framework for OSX") - ELSE (APPLE) - - FIND_PATH( GLUT_INCLUDE_DIR GL/glut.h - /usr/include - /usr/include/GL - /usr/local/include - /usr/openwin/share/include - /usr/openwin/include - /usr/X11R6/include - /usr/include/X11 - /opt/graphics/OpenGL/include - /opt/graphics/OpenGL/contrib/libglut - ) - - FIND_LIBRARY( GLUT_glut_LIBRARY glut - /usr/lib - /usr/local/lib - /usr/openwin/lib - /usr/X11R6/lib - ) - - FIND_LIBRARY( GLUT_Xi_LIBRARY Xi - /usr/lib - /usr/local/lib - /usr/openwin/lib - /usr/X11R6/lib - ) - - FIND_LIBRARY( GLUT_Xmu_LIBRARY Xmu - /usr/lib - /usr/local/lib - /usr/openwin/lib - /usr/X11R6/lib - ) - - ENDIF (APPLE) - -ENDIF (WIN32) - -SET( GLUT_FOUND "NO" ) -IF(GLUT_INCLUDE_DIR) - IF(GLUT_glut_LIBRARY) - # Is -lXi and -lXmu required on all platforms that have it? - # If not, we need some way to figure out what platform we are on. 
- SET( GLUT_LIBRARIES - ${GLUT_glut_LIBRARY} - ${GLUT_Xmu_LIBRARY} - ${GLUT_Xi_LIBRARY} - ${GLUT_cocoa_LIBRARY} - ) - SET( GLUT_FOUND "YES" ) - -#The following deprecated settings are for backwards compatibility with CMake1.4 - SET (GLUT_LIBRARY ${GLUT_LIBRARIES}) - SET (GLUT_INCLUDE_PATH ${GLUT_INCLUDE_DIR}) - - ENDIF(GLUT_glut_LIBRARY) -ENDIF(GLUT_INCLUDE_DIR) - -MARK_AS_ADVANCED( - GLUT_INCLUDE_DIR - GLUT_glut_LIBRARY - GLUT_Xmu_LIBRARY - GLUT_Xi_LIBRARY -) Index: ps/trunk/libraries/source/nvtt/src/cmake/OptimalOptions.cmake =================================================================== --- ps/trunk/libraries/source/nvtt/src/cmake/OptimalOptions.cmake +++ ps/trunk/libraries/source/nvtt/src/cmake/OptimalOptions.cmake @@ -9,9 +9,10 @@ ENDIF(NV_SYSTEM_PROCESSOR STREQUAL "i586") IF(NV_SYSTEM_PROCESSOR STREQUAL "i686") - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=i686") + #SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=i686") #SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpmath=sse -mtune=i686 -msse3") #SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=pentium4") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=prescott") ENDIF(NV_SYSTEM_PROCESSOR STREQUAL "i686") IF(NV_SYSTEM_PROCESSOR STREQUAL "x86_64") @@ -20,12 +21,30 @@ ENDIF(NV_SYSTEM_PROCESSOR STREQUAL "x86_64") IF(NV_SYSTEM_PROCESSOR STREQUAL "powerpc") - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=powerpc -maltivec -mabi=altivec -mpowerpc-gfxopt") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=powerpc -faltivec -maltivec -mabi=altivec -mpowerpc-gfxopt") # ibook G4: - #SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=7450 -mtune=7450 -maltivec -mabi=altivec -mpowerpc-gfxopt") + #SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=7450 -mtune=7450 -faltivec -maltivec -mabi=altivec -mpowerpc-gfxopt") + + # G5 + #SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=G5 -faltivec -maltivec -mabi=altivec -mpowerpc-gfxopt") + ENDIF(NV_SYSTEM_PROCESSOR STREQUAL "powerpc") +# IF(DARWIN) +# SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mmacosx-version-min=10.5 -isysroot /Developer/SDKs/MacOSX10.5.sdk") +# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmacosx-version-min=10.5 -isysroot /Developer/SDKs/MacOSX10.5.sdk") +# ENDIF(DARWIN) + IF(APPLE) + SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -arch i586 -arch x86_64 -msse3 -mmacosx-version-min=10.5") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -arch i586 -arch x86_64 -msse3 -mmacosx-version-min=10.5") + ENDIF(APPLE) + + IF(CMAKE_BUILD_TYPE STREQUAL "debug") + ADD_DEFINITIONS(-D_DEBUG) + ENDIF(CMAKE_BUILD_TYPE STREQUAL "debug") + + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") ENDIF(CMAKE_COMPILER_IS_GNUCXX) IF(MSVC) Index: ps/trunk/libraries/source/nvtt/src/configure =================================================================== --- ps/trunk/libraries/source/nvtt/src/configure +++ ps/trunk/libraries/source/nvtt/src/configure @@ -18,7 +18,7 @@ help=false -build="Debug" # release +build="debug" # release prefix=/usr/local # Parse the args @@ -26,9 +26,8 @@ do case $i in --help ) help=true ;; - --debug ) build="Debug" ;; - --release ) build="Release" ;; - --prefix=* ) prefix="${i#--prefix=}" ;; + --debug ) build="debug" ;; + --release ) build="release" ;; --prefix=* ) prefix="${i#--prefix=}" ;; * ) echo "Unrecognised argument $i" ;; esac @@ -51,9 +50,9 @@ echo "-- Configuring nvidia-texture-tools "`cat VERSION` -mkdir -p ./build -cd ./build -$CMAKE .. -DNVTT_SHARED=1 -DCMAKE_BUILD_TYPE=$build -DCMAKE_INSTALL_PREFIX=$prefix -G "Unix Makefiles" || exit 1 +mkdir -p ./build-$build +cd ./build-$build +$CMAKE .. 
-DNVTT_SHARED=0 -DCMAKE_BUILD_TYPE=$build -DCMAKE_INSTALL_PREFIX=$prefix -G "Unix Makefiles" || exit 1 cd .. echo "" @@ -62,11 +61,15 @@ cat > Makefile << EOF all: - @make --no-print-directory -C build/ + @+make --no-print-directory -C build-$build/ install: - @make install --no-print-directory -C build/ + @+make install --no-print-directory -C build-$build/ +package: + @+make package --no-print-directory -C build-$build/ +test: + @+make test --no-print-directory -C build-$build/ clean: - @make clean --no-print-directory -C build/ + @+make clean --no-print-directory -C build-$build/ distclean: - @rm -Rf build/ + @rm -Rf build-$build/ EOF Index: ps/trunk/libraries/source/nvtt/src/extern/CMakeLists.txt =================================================================== --- ps/trunk/libraries/source/nvtt/src/extern/CMakeLists.txt +++ ps/trunk/libraries/source/nvtt/src/extern/CMakeLists.txt @@ -0,0 +1,13 @@ + +#IF(WIN32) + #ADD_SUBDIRECTORY(gnuwin32) +#ENDIF(WIN32) + +ADD_SUBDIRECTORY(poshlib) + +#ADD_SUBDIRECTORY(EtcLib) +#ADD_SUBDIRECTORY(rg_etc1_v104) +#ADD_SUBDIRECTORY(etcpack) + +#ADD_SUBDIRECTORY(butteraugli) + Index: ps/trunk/libraries/source/nvtt/src/extern/poshlib/CMakeLists.txt =================================================================== --- ps/trunk/libraries/source/nvtt/src/extern/poshlib/CMakeLists.txt +++ ps/trunk/libraries/source/nvtt/src/extern/poshlib/CMakeLists.txt @@ -0,0 +1,7 @@ + +SET(POSHLIB_SRCS + posh.c + posh.h) + +ADD_LIBRARY(posh STATIC ${POSHLIB_SRCS}) + Index: ps/trunk/libraries/source/nvtt/src/extern/poshlib/posh.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/extern/poshlib/posh.h +++ ps/trunk/libraries/source/nvtt/src/extern/poshlib/posh.h @@ -0,0 +1,1034 @@ +/** +@file posh.h +@author Brian Hook +@version 1.3.001 + +Header file for POSH, the Portable Open Source Harness project. + +NOTE: Unlike most header files, this one is designed to be included +multiple times, which is why it does not have the @#ifndef/@#define +preamble. + +POSH relies on environment specified preprocessor symbols in order +to infer as much as possible about the target OS/architecture and +the host compiler capabilities. + +NOTE: POSH is simple and focused. It attempts to provide basic +functionality and information, but it does NOT attempt to emulate +missing functionality. I am also not willing to make POSH dirty +and hackish to support truly ancient and/or outmoded and/or bizarre +technologies such as non-ANSI compilers, systems with non-IEEE +floating point formats, segmented 16-bit operating systems, etc. + +Please refer to the accompanying HTML documentation or visit +http://www.poshlib.org for more information on how to use POSH. + +LICENSE: + +Copyright (c) 2004, Brian Hook +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * The names of this package'ss contributors contributors may not + be used to endorse or promote products derived from this + software without specific prior written permission. 
+ + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +REVISION: + +I've been lax about revision histories, so this starts at, um, 1.3.001. +Sorry for any inconveniences. + +1.3.001 - 2/23/2006 - Incorporated fix for bug reported by Bill Cary, + where I was not detecting Visual Studio + compilation on x86-64 systems. Added check for + _M_X64 which should fix that. + +*/ +/* +I have yet to find an authoritative reference on preprocessor +symbols, but so far this is what I've gleaned: + +GNU GCC/G++: + - __GNUC__: GNU C version + - __GNUG__: GNU C++ compiler + - __sun__ : on Sun platforms + - __svr4__: on Solaris and other SysV R4 platforms + - __mips__: on MIPS processor platforms + - __sparc_v9__: on Sparc 64-bit CPUs + - __sparcv9: 64-bit Solaris + - __MIPSEL__: mips processor, compiled for little endian + - __MIPSEB__: mips processor, compiled for big endian + - _R5900: MIPS/Sony/Toshiba R5900 (PS2) + - mc68000: 68K + - m68000: 68K + - m68k: 68K + - __palmos__: PalmOS + +Intel C/C++ Compiler: + - __ECC : compiler version, IA64 only + - __EDG__ + - __ELF__ + - __GXX_ABI_VERSION + - __i386 : IA-32 only + - __i386__ : IA-32 only + - i386 : IA-32 only + - __ia64 : IA-64 only + - __ia64__ : IA-64 only + - ia64 : IA-64 only + - __ICC : IA-32 only + - __INTEL_COMPILER : IA-32 or IA-64, newer versions only + +Apple's C/C++ Compiler for OS X: + - __APPLE_CC__ + - __APPLE__ + - __BIG_ENDIAN__ + - __APPLE__ + - __ppc__ + - __MACH__ + +DJGPP: + - __MSDOS__ + - __unix__ + - __unix + - __GNUC__ + - __GO32 + - DJGPP + - __i386, __i386, i386 + +Cray's C compiler: + - _ADDR64: if 64-bit pointers + - _UNICOS: + - __unix: + +SGI's CC compiler predefines the following (and more) with -ansi: + - __sgi + - __unix + - __host_mips + - _SYSTYPE_SVR4 + - __mips + - _MIPSEB + - anyone know if there is a predefined symbol for the compiler?! 
+ +MinGW: + - as GnuC but also defines _WIN32, __WIN32, WIN32, _X86_, __i386, __i386__, and several others + - __MINGW32__ + +Cygwin: + - as Gnu C, but also + - __unix__ + - __CYGWIN32__ + +Microsoft Visual Studio predefines the following: + - _MSC_VER + - _WIN32: on Win32 + - _M_IX6 (on x86 systems) + - _M_X64: on x86-64 systems + - _M_ALPHA (on DEC AXP systems) + - _SH3: WinCE, Hitachi SH-3 + - _MIPS: WinCE, MIPS + - _ARM: WinCE, ARM + +Sun's C Compiler: + - sun and _sun + - unix and _unix + - sparc and _sparc (SPARC systems only) + - i386 and _i386 (x86 systems only) + - __SVR4 (Solaris only) + - __sparcv9: 64-bit solaris + - __SUNPRO_C + - _LP64: defined in 64-bit LP64 mode, but only if is included + +Borland C/C++ predefines the following: + - __BORLANDC__: + +DEC/Compaq C/C++ on Alpha: + - __alpha + - __arch64__ + - __unix__ (on Tru64 Unix) + - __osf__ + - __DECC + - __DECCXX (C++ compilation) + - __DECC_VER + - __DECCXX_VER + +IBM's AIX compiler: + - __64BIT__ if 64-bit mode + - _AIX + - __IBMC__: C compiler version + - __IBMCPP__: C++ compiler version + - _LONG_LONG: compiler allows long long + +Watcom: + - __WATCOMC__ + - __DOS__ : if targeting DOS + - __386__ : if 32-bit support + - __WIN32__ : if targetin 32-bit Windows + +HP-UX C/C++ Compiler: + - __hpux + - __unix + - __hppa (on PA-RISC) + - __LP64__: if compiled in 64-bit mode + +Metrowerks: + - __MWERKS__ + - __powerpc__ + - _powerc + - __MC68K__ + - macintosh when compiling for MacOS + - __INTEL__ for x86 targets + - __POWERPC__ + +LLVM: + - __llvm__ + - __clang__ +*/ + +/* +** ---------------------------------------------------------------------------- +** Include optionally +** ---------------------------------------------------------------------------- +*/ +#ifdef POSH_USE_LIMITS_H +# include +#endif + +/* +** ---------------------------------------------------------------------------- +** Determine compilation environment +** ---------------------------------------------------------------------------- +*/ +#if defined __ECC || defined __ICC || defined __INTEL_COMPILER +# define POSH_COMPILER_STRING "Intel C/C++" +# define POSH_COMPILER_INTEL 1 +#endif + +#if ( defined __host_mips || defined __sgi ) && !defined __GNUC__ +# define POSH_COMPILER_STRING "MIPSpro C/C++" +# define POSH_COMPILER_MIPSPRO 1 +#endif + +#if defined __hpux && !defined __GNUC__ +# define POSH_COMPILER_STRING "HP-UX CC" +# define POSH_COMPILER_HPCC 1 +#endif + +#if defined __clang__ +# define POSH_COMPILER_STRING "Clang" +# define POSH_COMPILER_CLANG 1 +#endif + +#if defined __GNUC__ && !defined __clang__ +# define POSH_COMPILER_STRING "Gnu GCC" +# define POSH_COMPILER_GCC 1 +#endif + +#if defined __APPLE_CC__ + /* we don't define the compiler string here, let it be GNU */ +# define POSH_COMPILER_APPLECC 1 +#endif + +#if defined __IBMC__ || defined __IBMCPP__ +# define POSH_COMPILER_STRING "IBM C/C++" +# define POSH_COMPILER_IBM 1 +#endif + +#if defined _MSC_VER +# define POSH_COMPILER_STRING "Microsoft Visual C++" +# define POSH_COMPILER_MSVC 1 +#endif + +#if defined __SUNPRO_C +# define POSH_COMPILER_STRING "Sun Pro" +# define POSH_COMPILER_SUN 1 +#endif + +#if defined __BORLANDC__ +# define POSH_COMPILER_STRING "Borland C/C++" +# define POSH_COMPILER_BORLAND 1 +#endif + +#if defined __MWERKS__ +# define POSH_COMPILER_STRING "MetroWerks CodeWarrior" +# define POSH_COMPILER_METROWERKS 1 +#endif + +#if defined __DECC || defined __DECCXX +# define POSH_COMPILER_STRING "Compaq/DEC C/C++" +# define POSH_COMPILER_DEC 1 +#endif + +#if defined __WATCOMC__ 
+# define POSH_COMPILER_STRING "Watcom C/C++" +# define POSH_COMPILER_WATCOM 1 +#endif + +#if !defined POSH_COMPILER_STRING +# define POSH_COMPILER_STRING "Unknown compiler" +#endif + +/* +** ---------------------------------------------------------------------------- +** Determine target operating system +** ---------------------------------------------------------------------------- +*/ +#if defined linux || defined __linux__ +# define POSH_OS_LINUX 1 +# define POSH_OS_STRING "Linux" +#endif + +#if defined __FreeBSD__ +# define POSH_OS_FREEBSD 1 +# define POSH_OS_STRING "FreeBSD" +#endif + +#if defined __NetBSD__ +# define POSH_OS_NETBSD 1 +# define POSH_OS_STRING "NetBSD" +#endif + +#if defined __OpenBSD__ +# define POSH_OS_OPENBSD 1 +# define POSH_OS_STRING "OpenBSD" +#endif + +#if defined __CYGWIN32__ +# define POSH_OS_CYGWIN32 1 +# define POSH_OS_STRING "Cygwin" +#endif + +#if defined GEKKO +# define POSH_OS_GAMECUBE +# define __powerpc__ +# define POSH_OS_STRING "GameCube" +#endif + +#if defined __MINGW32__ +# define POSH_OS_MINGW 1 +# define POSH_OS_STRING "MinGW" +#endif + +#if defined GO32 && defined DJGPP && defined __MSDOS__ +# define POSH_OS_GO32 1 +# define POSH_OS_STRING "GO32/MS-DOS" +#endif + +/* NOTE: make sure you use /bt=DOS if compiling for 32-bit DOS, + otherwise Watcom assumes host=target */ +#if defined __WATCOMC__ && defined __386__ && defined __DOS__ +# define POSH_OS_DOS32 1 +# define POSH_OS_STRING "DOS/32-bit" +#endif + +#if defined _UNICOS +# define POSH_OS_UNICOS 1 +# define POSH_OS_STRING "UNICOS" +#endif + +#if ( defined __MWERKS__ && defined __powerc && !defined macintosh ) || defined __APPLE_CC__ || defined macosx +# define POSH_OS_OSX 1 +# define POSH_OS_STRING "MacOS X" +#endif + +#if defined __sun__ || defined sun || defined __sun || defined __solaris__ +# if defined __SVR4 || defined __svr4__ || defined __solaris__ +# define POSH_OS_STRING "Solaris" +# define POSH_OS_SOLARIS 1 +# endif +# if !defined POSH_OS_STRING +# define POSH_OS_STRING "SunOS" +# define POSH_OS_SUNOS 1 +# endif +#endif + +#if defined __sgi__ || defined sgi || defined __sgi +# define POSH_OS_IRIX 1 +# define POSH_OS_STRING "Irix" +#endif + +#if defined __hpux__ || defined __hpux +# define POSH_OS_HPUX 1 +# define POSH_OS_STRING "HP-UX" +#endif + +#if defined _AIX +# define POSH_OS_AIX 1 +# define POSH_OS_STRING "AIX" +#endif + +#if ( defined __alpha && defined __osf__ ) +# define POSH_OS_TRU64 1 +# define POSH_OS_STRING "Tru64" +#endif + +#if defined __BEOS__ || defined __beos__ +# define POSH_OS_BEOS 1 +# define POSH_OS_STRING "BeOS" +#endif + +#if defined amiga || defined amigados || defined AMIGA || defined _AMIGA +# define POSH_OS_AMIGA 1 +# define POSH_OS_STRING "Amiga" +#endif + +#if defined __unix__ +# define POSH_OS_UNIX 1 +# if !defined POSH_OS_STRING +# define POSH_OS_STRING "Unix-like(generic)" +# endif +#endif + +#if defined _WIN32_WCE +# define POSH_OS_WINCE 1 +# define POSH_OS_STRING "Windows CE" +#endif + +#if defined _XBOX || defined _XBOX_VER +# define POSH_OS_XBOX 1 +# define POSH_OS_STRING "XBOX" +#endif + +#if defined _WIN32 || defined WIN32 || defined __NT__ || defined __WIN32__ +# define POSH_OS_WIN32 1 +# if !defined POSH_OS_XBOX +# if defined _WIN64 +# define POSH_OS_WIN64 1 +# define POSH_OS_STRING "Win64" +# else +# if !defined POSH_OS_STRING +# define POSH_OS_STRING "Win32" +# endif +# endif +# endif +#endif + +#if defined __palmos__ +# define POSH_OS_PALM 1 +# define POSH_OS_STRING "PalmOS" +#endif + +#if defined THINK_C || defined macintosh +# define 
POSH_OS_MACOS 1 +# define POSH_OS_STRING "MacOS" +#endif + +/* +** ----------------------------------------------------------------------------- +** Determine target CPU +** ----------------------------------------------------------------------------- +*/ + +#if defined GEKKO +# define POSH_CPU_PPC750 1 +# define POSH_CPU_STRING "IBM PowerPC 750 (NGC)" +#endif + +#if defined mc68000 || defined m68k || defined __MC68K__ || defined m68000 +# define POSH_CPU_68K 1 +# define POSH_CPU_STRING "MC68000" +#endif + +#if defined __PPC__ || defined __POWERPC__ || defined powerpc || defined _POWER || defined __ppc__ || defined __powerpc__ || defined _M_PPC +# define POSH_CPU_PPC 1 +# if !defined POSH_CPU_STRING +# if defined __powerpc64__ +# define POSH_CPU_PPC64 1 +# define POSH_CPU_STRING "PowerPC64" +# else +# define POSH_CPU_STRING "PowerPC" +# endif +# endif +#endif + +#if defined _CRAYT3E || defined _CRAYMPP +# define POSH_CPU_CRAYT3E 1 /* target processor is a DEC Alpha 21164 used in a Cray T3E*/ +# define POSH_CPU_STRING "Cray T3E (Alpha 21164)" +#endif + +#if defined CRAY || defined _CRAY && !defined _CRAYT3E +# error Non-AXP Cray systems not supported +#endif + +#if defined _SH3 +# define POSH_CPU_SH3 1 +# define POSH_CPU_STRING "Hitachi SH-3" +#endif + +#if defined __sh4__ || defined __SH4__ +# define POSH_CPU_SH3 1 +# define POSH_CPU_SH4 1 +# define POSH_CPU_STRING "Hitachi SH-4" +#endif + +#if defined __sparc__ || defined __sparc +# if defined __arch64__ || defined __sparcv9 || defined __sparc_v9__ +# define POSH_CPU_SPARC64 1 +# define POSH_CPU_STRING "Sparc/64" +# else +# define POSH_CPU_STRING "Sparc/32" +# endif +# define POSH_CPU_SPARC 1 +#endif + +#if defined ARM || defined __arm__ || defined _ARM +# define POSH_CPU_STRONGARM 1 +# define POSH_CPU_STRING "ARM" +#endif + +#if defined __aarch64__ +# define POSH_CPU_AARCH64 1 +# define POSH_CPU_STRING "ARM64" +#endif + +#if defined mips || defined __mips__ || defined __MIPS__ || defined _MIPS +# define POSH_CPU_MIPS 1 +# if defined _R5900 +# define POSH_CPU_STRING "MIPS R5900 (PS2)" +# else +# define POSH_CPU_STRING "MIPS" +# endif +#endif + +#if defined __ia64 || defined _M_IA64 || defined __ia64__ +# define POSH_CPU_IA64 1 +# define POSH_CPU_STRING "IA64" +#endif + +#if defined __X86__ || defined __i386__ || defined i386 || defined _M_IX86 || defined __386__ || defined __x86_64__ || defined _M_X64 +# define POSH_CPU_X86 1 +# if defined __x86_64__ || defined _M_X64 +# define POSH_CPU_X86_64 1 +# endif +# if defined POSH_CPU_X86_64 +# define POSH_CPU_STRING "AMD x86-64" +# else +# define POSH_CPU_STRING "Intel 386+" +# endif +#endif + +#if defined __alpha || defined alpha || defined _M_ALPHA || defined __alpha__ +# define POSH_CPU_AXP 1 +# define POSH_CPU_STRING "AXP" +#endif + +#if defined __hppa || defined hppa +# define POSH_CPU_HPPA 1 +# define POSH_CPU_STRING "PA-RISC" +#endif + +#if !defined POSH_CPU_STRING +# error POSH cannot determine target CPU +# define POSH_CPU_STRING "Unknown" /* this is here for Doxygen's benefit */ +#endif + +/* +** ----------------------------------------------------------------------------- +** Attempt to autodetect building for embedded on Sony PS2 +** ----------------------------------------------------------------------------- +*/ +#if !defined POSH_OS_STRING +# if !defined FORCE_DOXYGEN +# define POSH_OS_EMBEDDED 1 +# endif +# if defined _R5900 +# define POSH_OS_STRING "Sony PS2(embedded)" +# else +# define POSH_OS_STRING "Embedded/Unknown" +# endif +#endif + +/* +** 
--------------------------------------------------------------------------- +** Handle cdecl, stdcall, fastcall, etc. +** --------------------------------------------------------------------------- +*/ +#if defined POSH_CPU_X86 && !defined POSH_CPU_X86_64 +# if defined __GNUC__ +# define POSH_CDECL __attribute__((cdecl)) +# define POSH_STDCALL __attribute__((stdcall)) +# define POSH_FASTCALL __attribute__((fastcall)) +# elif ( defined _MSC_VER || defined __WATCOMC__ || defined __BORLANDC__ || defined __MWERKS__ ) +# define POSH_CDECL __cdecl +# define POSH_STDCALL __stdcall +# define POSH_FASTCALL __fastcall +# endif +#else +# define POSH_CDECL +# define POSH_STDCALL +# define POSH_FASTCALL +#endif + +/* +** --------------------------------------------------------------------------- +** Define POSH_IMPORTEXPORT signature based on POSH_DLL and POSH_BUILDING_LIB +** --------------------------------------------------------------------------- +*/ + +/* +** We undefine this so that multiple inclusions will work +*/ +#if defined POSH_IMPORTEXPORT +# undef POSH_IMPORTEXPORT +#endif + +#if defined POSH_DLL +# if defined POSH_OS_WIN32 +# if defined _MSC_VER +# if ( _MSC_VER >= 800 ) +# if defined POSH_BUILDING_LIB +# define POSH_IMPORTEXPORT __declspec( dllexport ) +# else +# define POSH_IMPORTEXPORT __declspec( dllimport ) +# endif +# else +# if defined POSH_BUILDING_LIB +# define POSH_IMPORTEXPORT __export +# else +# define POSH_IMPORTEXPORT +# endif +# endif +# endif /* defined _MSC_VER */ +# if defined __BORLANDC__ +# if ( __BORLANDC__ >= 0x500 ) +# if defined POSH_BUILDING_LIB +# define POSH_IMPORTEXPORT __declspec( dllexport ) +# else +# define POSH_IMPORTEXPORT __declspec( dllimport ) +# endif +# else +# if defined POSH_BUILDING_LIB +# define POSH_IMPORTEXPORT __export +# else +# define POSH_IMPORTEXPORT +# endif +# endif +# endif /* defined __BORLANDC__ */ + /* for all other compilers, we're just making a blanket assumption */ +# if defined __GNUC__ || defined __WATCOMC__ || defined __MWERKS__ +# if defined POSH_BUILDING_LIB +# define POSH_IMPORTEXPORT __declspec( dllexport ) +# else +# define POSH_IMPORTEXPORT __declspec( dllimport ) +# endif +# endif /* all other compilers */ +# if !defined POSH_IMPORTEXPORT +# error Building DLLs not supported on this compiler (poshlib@poshlib.org if you know how) +# endif +# endif /* defined POSH_OS_WIN32 */ +#endif + +/* On pretty much everything else, we can thankfully just ignore this */ +#if !defined POSH_IMPORTEXPORT +# define POSH_IMPORTEXPORT +#endif + +#if defined FORCE_DOXYGEN +# define POSH_DLL +# define POSH_BUILDING_LIB +# undef POSH_DLL +# undef POSH_BUILDING_LIB +#endif + +/* +** ---------------------------------------------------------------------------- +** (Re)define POSH_PUBLIC_API export signature +** ---------------------------------------------------------------------------- +*/ +#ifdef POSH_PUBLIC_API +# undef POSH_PUBLIC_API +#endif + +#if ( ( defined _MSC_VER ) && ( _MSC_VER < 800 ) ) || ( defined __BORLANDC__ && ( __BORLANDC__ < 0x500 ) ) +# define POSH_PUBLIC_API(rtype) extern rtype POSH_IMPORTEXPORT +#else +# define POSH_PUBLIC_API(rtype) extern POSH_IMPORTEXPORT rtype +#endif + +/* +** ---------------------------------------------------------------------------- +** Try to infer endianess. Basically we just go through the CPUs we know are +** little endian, and assume anything that isn't one of those is big endian. +** As a sanity check, we also do this with operating systems we know are +** little endian, such as Windows. 
Some processors are bi-endian, such as +** the MIPS series, so we have to be careful about those. +** ---------------------------------------------------------------------------- +*/ +#if defined POSH_CPU_X86 || defined POSH_CPU_AXP || defined POSH_CPU_STRONGARM || defined POSH_CPU_AARCH64 || defined POSH_OS_WIN32 || defined POSH_OS_WINCE || defined __MIPSEL__ || defined __ORDER_LITTLE_ENDIAN__ +# define POSH_ENDIAN_STRING "little" +# define POSH_LITTLE_ENDIAN 1 +#else +# define POSH_ENDIAN_STRING "big" +# define POSH_BIG_ENDIAN 1 +#endif + +#if defined FORCE_DOXYGEN +# define POSH_LITTLE_ENDIAN +#endif + +/* +** ---------------------------------------------------------------------------- +** Cross-platform compile time assertion macro +** ---------------------------------------------------------------------------- +*/ +#define POSH_COMPILE_TIME_ASSERT(name, x) typedef int _POSH_dummy_ ## name[(x) ? 1 : -1 ] + +/* +** ---------------------------------------------------------------------------- +** 64-bit Integer +** +** We don't require 64-bit support, nor do we emulate its functionality, we +** simply export it if it's available. Since we can't count on +** for 64-bit support, we ignore the POSH_USE_LIMITS_H directive. +** ---------------------------------------------------------------------------- +*/ +#if defined ( __LP64__ ) || defined ( __powerpc64__ ) || defined POSH_CPU_SPARC64 +# define POSH_64BIT_INTEGER 1 +typedef long posh_i64_t; +typedef unsigned long posh_u64_t; +# define POSH_I64( x ) ((posh_i64_t)x) +# define POSH_U64( x ) ((posh_u64_t)x) +# define POSH_I64_PRINTF_PREFIX "l" +#elif defined _MSC_VER || defined __BORLANDC__ || defined __WATCOMC__ || ( defined __alpha && defined __DECC ) +# define POSH_64BIT_INTEGER 1 +typedef __int64 posh_i64_t; +typedef unsigned __int64 posh_u64_t; +# define POSH_I64( x ) ((posh_i64_t)(x##i64)) +# define POSH_U64( x ) ((posh_u64_t)(x##ui64)) +# define POSH_I64_PRINTF_PREFIX "I64" +#elif defined __GNUC__ || defined __MWERKS__ || defined __SUNPRO_C || defined __SUNPRO_CC || defined __APPLE_CC__ || defined POSH_OS_IRIX || defined _LONG_LONG || defined _CRAYC +# define POSH_64BIT_INTEGER 1 +typedef long long posh_i64_t; +typedef unsigned long long posh_u64_t; +# define POSH_U64( x ) ((posh_u64_t)(x##LL)) +# define POSH_I64( x ) ((posh_i64_t)(x##LL)) +# define POSH_I64_PRINTF_PREFIX "ll" +#endif + +/* hack */ +/*#ifdef __MINGW32__ +#undef POSH_I64 +#undef POSH_U64 +#undef POSH_I64_PRINTF_PREFIX +#define POSH_I64( x ) ((posh_i64_t)x) +#define POSH_U64( x ) ((posh_u64_t)x) +#define POSH_I64_PRINTF_PREFIX "I64" +#endif*/ + +#ifdef FORCE_DOXYGEN +typedef long long posh_i64_t; +typedef unsigned long posh_u64_t; +# define POSH_64BIT_INTEGER +# define POSH_I64_PRINTF_PREFIX +# define POSH_I64(x) +# define POSH_U64(x) +#endif + +/** Minimum value for a 64-bit signed integer */ +#define POSH_I64_MIN POSH_I64(0x8000000000000000) +/** Maximum value for a 64-bit signed integer */ +#define POSH_I64_MAX POSH_I64(0x7FFFFFFFFFFFFFFF) +/** Minimum value for a 64-bit unsigned integer */ +#define POSH_U64_MIN POSH_U64(0) +/** Maximum value for a 64-bit unsigned integer */ +#define POSH_U64_MAX POSH_U64(0xFFFFFFFFFFFFFFFF) + +/* ---------------------------------------------------------------------------- +** Basic Sized Types +** +** These types are expected to be EXACTLY sized so you can use them for +** serialization. 
+** ---------------------------------------------------------------------------- +*/ +#define POSH_FALSE 0 +#define POSH_TRUE 1 + +typedef int posh_bool_t; +typedef unsigned char posh_byte_t; + +/* NOTE: These assume that CHAR_BIT is 8!! */ +typedef unsigned char posh_u8_t; +typedef signed char posh_i8_t; + +#if defined POSH_USE_LIMITS_H +# if CHAR_BITS > 8 +# error This machine uses 9-bit characters. This is a warning, you can comment this out now. +# endif /* CHAR_BITS > 8 */ + +/* 16-bit */ +# if ( USHRT_MAX == 65535 ) + typedef unsigned short posh_u16_t; + typedef short posh_i16_t; +# else + /* Yes, in theory there could still be a 16-bit character type and shorts are + 32-bits in size...if you find such an architecture, let me know =P */ +# error No 16-bit type found +# endif + +/* 32-bit */ +# if ( INT_MAX == 2147483647 ) + typedef unsigned posh_u32_t; + typedef int posh_i32_t; +# elif ( LONG_MAX == 2147483647 ) + typedef unsigned long posh_u32_t; + typedef long posh_i32_t; +# else + error No 32-bit type found +# endif + +#else /* POSH_USE_LIMITS_H */ + + typedef unsigned short posh_u16_t; + typedef short posh_i16_t; + +# if !defined POSH_OS_PALM + typedef unsigned posh_u32_t; + typedef int posh_i32_t; +# else + typedef unsigned long posh_u32_t; + typedef long posh_i32_t; +# endif +#endif + +/** Minimum value for a byte */ +#define POSH_BYTE_MIN 0 +/** Maximum value for an 8-bit unsigned value */ +#define POSH_BYTE_MAX 255 +/** Minimum value for a byte */ +#define POSH_I16_MIN ( ( posh_i16_t ) 0x8000 ) +/** Maximum value for a 16-bit signed value */ +#define POSH_I16_MAX ( ( posh_i16_t ) 0x7FFF ) +/** Minimum value for a 16-bit unsigned value */ +#define POSH_U16_MIN 0 +/** Maximum value for a 16-bit unsigned value */ +#define POSH_U16_MAX ( ( posh_u16_t ) 0xFFFF ) +/** Minimum value for a 32-bit signed value */ +#define POSH_I32_MIN ( ( posh_i32_t ) 0x80000000 ) +/** Maximum value for a 32-bit signed value */ +#define POSH_I32_MAX ( ( posh_i32_t ) 0x7FFFFFFF ) +/** Minimum value for a 32-bit unsigned value */ +#define POSH_U32_MIN 0 +/** Maximum value for a 32-bit unsigned value */ +#define POSH_U32_MAX ( ( posh_u32_t ) 0xFFFFFFFF ) + +/* +** ---------------------------------------------------------------------------- +** Sanity checks on expected sizes +** ---------------------------------------------------------------------------- +*/ +#if !defined FORCE_DOXYGEN + +POSH_COMPILE_TIME_ASSERT(posh_byte_t, sizeof(posh_byte_t) == 1); +POSH_COMPILE_TIME_ASSERT(posh_u8_t, sizeof(posh_u8_t) == 1); +POSH_COMPILE_TIME_ASSERT(posh_i8_t, sizeof(posh_i8_t) == 1); +POSH_COMPILE_TIME_ASSERT(posh_u16_t, sizeof(posh_u16_t) == 2); +POSH_COMPILE_TIME_ASSERT(posh_i16_t, sizeof(posh_i16_t) == 2); +POSH_COMPILE_TIME_ASSERT(posh_u32_t, sizeof(posh_u32_t) == 4); +POSH_COMPILE_TIME_ASSERT(posh_i32_t, sizeof(posh_i32_t) == 4); + +#if !defined POSH_NO_FLOAT + POSH_COMPILE_TIME_ASSERT(posh_testfloat_t, sizeof(float)==4 ); + POSH_COMPILE_TIME_ASSERT(posh_testdouble_t, sizeof(double)==8); +#endif + +#if defined POSH_64BIT_INTEGER + POSH_COMPILE_TIME_ASSERT(posh_u64_t, sizeof(posh_u64_t) == 8); + POSH_COMPILE_TIME_ASSERT(posh_i64_t, sizeof(posh_i64_t) == 8); +#endif + +#endif + +/* +** ---------------------------------------------------------------------------- +** 64-bit pointer support +** ---------------------------------------------------------------------------- +*/ +#if defined POSH_CPU_AXP && ( defined POSH_OS_TRU64 || defined POSH_OS_LINUX ) +# define POSH_64BIT_POINTER 1 +#endif + +#if defined 
POSH_CPU_X86_64 && defined POSH_OS_LINUX +# define POSH_64BIT_POINTER 1 +#endif + +#if defined POSH_CPU_SPARC64 || defined POSH_OS_WIN64 || defined __64BIT__ || defined __LP64 || defined _LP64 || defined __LP64__ || defined _ADDR64 || defined _CRAYC +# define POSH_64BIT_POINTER 1 +#endif + +#if defined POSH_64BIT_POINTER + POSH_COMPILE_TIME_ASSERT( posh_64bit_pointer, sizeof( void * ) == 8 ); +#elif !defined FORCE_DOXYGEN +/* if this assertion is hit then you're on a system that either has 64-bit + addressing and we didn't catch it, or you're on a system with 16-bit + pointers. In the latter case, POSH doesn't actually care, we're just + triggering this assertion to make sure you're aware of the situation, + so feel free to delete it. + + If this assertion is triggered on a known 32 or 64-bit platform, + please let us know (poshlib@poshlib.org) */ + POSH_COMPILE_TIME_ASSERT( posh_32bit_pointer, sizeof( void * ) == 4 ); +#endif + +#if defined FORCE_DOXYGEN +# define POSH_64BIT_POINTER +#endif + +/* +** ---------------------------------------------------------------------------- +** POSH Utility Functions +** +** These are optional POSH utility functions that are not required if you don't +** need anything except static checking of your host and target environment. +** +** These functions are NOT wrapped with POSH_PUBLIC_API because I didn't want +** to enforce their export if your own library is only using them internally. +** ---------------------------------------------------------------------------- +*/ +#ifdef __cplusplus +extern "C" { +#endif + +const char *POSH_GetArchString( void ); + +#if !defined POSH_NO_FLOAT + +posh_u32_t POSH_LittleFloatBits( float f ); +posh_u32_t POSH_BigFloatBits( float f ); +float POSH_FloatFromLittleBits( posh_u32_t bits ); +float POSH_FloatFromBigBits( posh_u32_t bits ); + +void POSH_DoubleBits( double d, posh_byte_t dst[ 8 ] ); +double POSH_DoubleFromBits( const posh_byte_t src[ 8 ] ); + +/* unimplemented +float *POSH_WriteFloatToLittle( void *dst, float f ); +float *POSH_WriteFloatToBig( void *dst, float f ); +float POSH_ReadFloatFromLittle( const void *src ); +float POSH_ReadFloatFromBig( const void *src ); + +double *POSH_WriteDoubleToLittle( void *dst, double d ); +double *POSH_WriteDoubleToBig( void *dst, double d ); +double POSH_ReadDoubleFromLittle( const void *src ); +double POSH_ReadDoubleFromBig( const void *src ); +*/ +#endif /* !defined POSH_NO_FLOAT */ + +#if defined FORCE_DOXYGEN +# define POSH_NO_FLOAT +# undef POSH_NO_FLOAT +#endif + +extern posh_u16_t POSH_SwapU16( posh_u16_t u ); +extern posh_i16_t POSH_SwapI16( posh_i16_t u ); +extern posh_u32_t POSH_SwapU32( posh_u32_t u ); +extern posh_i32_t POSH_SwapI32( posh_i32_t u ); + +#if defined POSH_64BIT_INTEGER + +extern posh_u64_t POSH_SwapU64( posh_u64_t u ); +extern posh_i64_t POSH_SwapI64( posh_i64_t u ); + +#endif /*POSH_64BIT_INTEGER */ + +extern posh_u16_t *POSH_WriteU16ToLittle( void *dst, posh_u16_t value ); +extern posh_i16_t *POSH_WriteI16ToLittle( void *dst, posh_i16_t value ); +extern posh_u32_t *POSH_WriteU32ToLittle( void *dst, posh_u32_t value ); +extern posh_i32_t *POSH_WriteI32ToLittle( void *dst, posh_i32_t value ); + +extern posh_u16_t *POSH_WriteU16ToBig( void *dst, posh_u16_t value ); +extern posh_i16_t *POSH_WriteI16ToBig( void *dst, posh_i16_t value ); +extern posh_u32_t *POSH_WriteU32ToBig( void *dst, posh_u32_t value ); +extern posh_i32_t *POSH_WriteI32ToBig( void *dst, posh_i32_t value ); + +extern posh_u16_t POSH_ReadU16FromLittle( const void *src ); +extern 
posh_i16_t POSH_ReadI16FromLittle( const void *src ); +extern posh_u32_t POSH_ReadU32FromLittle( const void *src ); +extern posh_i32_t POSH_ReadI32FromLittle( const void *src ); + +extern posh_u16_t POSH_ReadU16FromBig( const void *src ); +extern posh_i16_t POSH_ReadI16FromBig( const void *src ); +extern posh_u32_t POSH_ReadU32FromBig( const void *src ); +extern posh_i32_t POSH_ReadI32FromBig( const void *src ); + +#if defined POSH_64BIT_INTEGER +extern posh_u64_t *POSH_WriteU64ToLittle( void *dst, posh_u64_t value ); +extern posh_i64_t *POSH_WriteI64ToLittle( void *dst, posh_i64_t value ); +extern posh_u64_t *POSH_WriteU64ToBig( void *dst, posh_u64_t value ); +extern posh_i64_t *POSH_WriteI64ToBig( void *dst, posh_i64_t value ); + +extern posh_u64_t POSH_ReadU64FromLittle( const void *src ); +extern posh_i64_t POSH_ReadI64FromLittle( const void *src ); +extern posh_u64_t POSH_ReadU64FromBig( const void *src ); +extern posh_i64_t POSH_ReadI64FromBig( const void *src ); +#endif /* POSH_64BIT_INTEGER */ + +#if defined POSH_LITTLE_ENDIAN + +# define POSH_LittleU16(x) (x) +# define POSH_LittleU32(x) (x) +# define POSH_LittleI16(x) (x) +# define POSH_LittleI32(x) (x) +# if defined POSH_64BIT_INTEGER +# define POSH_LittleU64(x) (x) +# define POSH_LittleI64(x) (x) +# endif /* defined POSH_64BIT_INTEGER */ + +# define POSH_BigU16(x) POSH_SwapU16(x) +# define POSH_BigU32(x) POSH_SwapU32(x) +# define POSH_BigI16(x) POSH_SwapI16(x) +# define POSH_BigI32(x) POSH_SwapI32(x) +# if defined POSH_64BIT_INTEGER +# define POSH_BigU64(x) POSH_SwapU64(x) +# define POSH_BigI64(x) POSH_SwapI64(x) +# endif /* defined POSH_64BIT_INTEGER */ + +#else + +# define POSH_BigU16(x) (x) +# define POSH_BigU32(x) (x) +# define POSH_BigI16(x) (x) +# define POSH_BigI32(x) (x) + +# if defined POSH_64BIT_INTEGER +# define POSH_BigU64(x) (x) +# define POSH_BigI64(x) (x) +# endif /* POSH_64BIT_INTEGER */ + +# define POSH_LittleU16(x) POSH_SwapU16(x) +# define POSH_LittleU32(x) POSH_SwapU32(x) +# define POSH_LittleI16(x) POSH_SwapI16(x) +# define POSH_LittleI32(x) POSH_SwapI32(x) + +# if defined POSH_64BIT_INTEGER +# define POSH_LittleU64(x) POSH_SwapU64(x) +# define POSH_LittleI64(x) POSH_SwapI64(x) +# endif /* POSH_64BIT_INTEGER */ + +#endif + +#ifdef __cplusplus +} +#endif Index: ps/trunk/libraries/source/nvtt/src/extern/poshlib/posh.c =================================================================== --- ps/trunk/libraries/source/nvtt/src/extern/poshlib/posh.c +++ ps/trunk/libraries/source/nvtt/src/extern/poshlib/posh.c @@ -0,0 +1,1006 @@ +/* +LICENSE: + +Copyright (c) 2004, Brian Hook +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * The names of this package'ss contributors contributors may not + be used to endorse or promote products derived from this + software without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/** + @file posh.c + @author Brian Hook + @date 2002 + @brief Portable Open Source Harness primary source file +*/ +#include "posh.h" + +#if !defined FORCE_DOXYGEN + +#if !defined POSH_NO_FLOAT +# define POSH_FLOAT_STRING "enabled" +#else +# define POSH_FLOAT_STRING "disabled" +#endif + +#if defined POSH_64BIT_INTEGER +# define POSH_64BIT_INTEGER_STRING "yes" +#else +# define POSH_64BIT_INTEGER_STRING "no" +#endif + +#if defined POSH_64BIT_POINTER +# define POSH_POINTER_STRING "64-bits" +#else +# define POSH_POINTER_STRING "32-bits" +#endif + +#if defined POSH_LITTLE_ENDIAN +# define IS_BIG_ENDIAN 0 + +# define NATIVE16 POSH_LittleU16 +# define NATIVE32 POSH_LittleU32 +# define NATIVE64 POSH_LittleU64 +# define FOREIGN16 POSH_BigU16 +# define FOREIGN32 POSH_BigU32 +# define FOREIGN64 POSH_BigU64 +#else +# define IS_BIG_ENDIAN 1 + +# define NATIVE16 POSH_BigU16 +# define NATIVE32 POSH_BigU32 +# define NATIVE64 POSH_BigU64 +# define FOREIGN16 POSH_LittleU16 +# define FOREIGN32 POSH_LittleU32 +# define FOREIGN64 POSH_LittleU64 +#endif /* POSH_LITTLE_ENDIAN */ + +static +int +s_testBigEndian( void ) +{ + union + { + posh_byte_t c[ 4 ]; + posh_u32_t i; + } u; + + u.i= 1; + + if ( u.c[ 0 ] == 1 ) + { + return 0; + } + return 1; +} + +static +const char * +s_testSerialization( void ) +{ + posh_byte_t serbuf[ 8 ]; + posh_u16_t tmp16; + posh_u32_t tmp32; + + /* 16-bit serialization */ + POSH_WriteU16ToLittle( serbuf, 0xABCD ); + if ( ( tmp16 = POSH_ReadU16FromLittle( serbuf ) ) != 0xABCD ) + { + return "*ERROR: failed little-endian 16-bit serialization test"; + } + + POSH_WriteU16ToBig( serbuf, 0xABCD ); + if ( ( tmp16 = POSH_ReadU16FromBig( serbuf ) ) != 0xABCD ) + { + return "*ERROR: failed big-endian 16-bit serialization test"; + } + + /* 32-bit serialization */ + POSH_WriteU32ToLittle( serbuf, 0xABCD1234L ); + if ( ( tmp32 = POSH_ReadU32FromLittle( serbuf ) ) != 0xABCD1234 ) + { + return "*ERROR: failed little-endian 32-bit serialization test"; + } + + POSH_WriteU32ToBig( serbuf, 0xABCD1234L ); + if ( ( tmp32 = POSH_ReadU32FromBig( serbuf ) ) != 0xABCD1234 ) + { + return "*ERROR: failed big-endian 32-bit serialization test"; + } + +#if defined POSH_64BIT_INTEGER + { +#define REF64 POSH_U64(0xFEDCBA9876543210) + + posh_u64_t tmp64; + + POSH_WriteU64ToLittle( serbuf, REF64 ); + + if ( ( tmp64 = POSH_ReadU64FromLittle( serbuf ) ) != REF64 ) + { + return "*ERROR: failed little-endian 64-bit serialization test"; + } + + POSH_WriteU64ToBig( serbuf, REF64 ); + + if ( ( tmp64 = POSH_ReadU64FromBig( serbuf ) ) != REF64 ) + { + return "*ERROR: failed big-endian 64-bit serialization test"; + } + } +#endif + + return 0; +} + +#if !defined POSH_NO_FLOAT +static +const char * +s_testFloatingPoint( void ) +{ + float fRef = 10.0f/30.0f; + double dRef = 10.0/30.0; + posh_byte_t dbuf[ 8 ]; + float fTmp; + double dTmp; + + fTmp = POSH_FloatFromLittleBits( POSH_LittleFloatBits( fRef ) ); + + if ( fTmp != fRef ) + { + return "*ERROR: POSH little endian floating point conversion failed. 
Please report this to poshlib@poshlib.org!\n"; + } + + fTmp = POSH_FloatFromBigBits( POSH_BigFloatBits( fRef ) ); + if ( fTmp != fRef ) + { + return "*ERROR: POSH big endian floating point conversion failed. Please report this to poshlib@poshlib.org!\n"; + } + + POSH_DoubleBits( dRef, dbuf ); + + dTmp = POSH_DoubleFromBits( dbuf ); + + if ( dTmp != dRef ) + { + return "*ERROR: POSH double precision floating point serialization failed. Please report this to poshlib@poshlib.org!\n"; + } + + return 0; +} +#endif /* !defined POSH_NO_FLOAT */ + +static +const char * +s_testEndianess( void ) +{ + /* check endianess */ + if ( s_testBigEndian() != IS_BIG_ENDIAN ) + { + return "*ERROR: POSH compile time endianess does not match run-time endianess verification. Please report this to poshlib@poshlib.org!\n"; + } + + /* make sure our endian swap routines work */ + if ( ( NATIVE32( 0x11223344L ) != 0x11223344L ) || + ( FOREIGN32( 0x11223344L ) != 0x44332211L ) || + ( NATIVE16( 0x1234 ) != 0x1234 ) || + ( FOREIGN16( 0x1234 ) != 0x3412 ) ) + { + return "*ERROR: POSH endianess macro selection failed. Please report this to poshlib@poshlib.org!\n"; + } + + /* test serialization routines */ + + return 0; +} +#endif /* !defined FORCE_DOXYGEN */ + +/** + Returns a string describing this platform's basic attributes. + + POSH_GetArchString() reports on an architecture's statically determined + attributes. In addition, it will perform run-time verification checks + to make sure the various platform specific functions work. If an error + occurs, please contact me at poshlib@poshlib.org so we can try to resolve + what the specific failure case is. + @returns a string describing this platform on success, or a string in the + form "*ERROR: [text]" on failure. You can simply check to see if + the first character returned is '*' to verify an error condition. +*/ +const char * +POSH_GetArchString( void ) +{ + const char *err; + const char *s = "OS:.............."POSH_OS_STRING"\n" + "CPU:............."POSH_CPU_STRING"\n" + "endian:.........."POSH_ENDIAN_STRING"\n" + "ptr size:........"POSH_POINTER_STRING"\n" + "64-bit ints......"POSH_64BIT_INTEGER_STRING"\n" + "floating point..."POSH_FLOAT_STRING"\n" + "compiler........."POSH_COMPILER_STRING"\n"; + + /* test endianess */ + err = s_testEndianess(); + + if ( err != 0 ) + { + return err; + } + + /* test serialization */ + err = s_testSerialization(); + + if ( err != 0 ) + { + return err; + } + +#if !defined POSH_NO_FLOAT + /* check that our floating point support is correct */ + err = s_testFloatingPoint(); + + if ( err != 0 ) + { + return err; + } + +#endif + + return s; +} + +/* ---------------------------------------------------------------------------*/ +/* BYTE SWAPPING SUPPORT */ +/* ---------------------------------------------------------------------------*/ +/** + * Byte swaps a 16-bit unsigned value + * + @ingroup ByteSwapFunctions + @param v [in] unsigned 16-bit input value to swap + @returns a byte swapped version of v + */ +posh_u16_t +POSH_SwapU16( posh_u16_t v ) +{ + posh_u16_t swapped; + + swapped = v << 8; + swapped |= v >> 8; + + return swapped; +} + +/** + * Byte swaps a 16-bit signed value + * + @ingroup ByteSwapFunctions + @param v [in] signed 16-bit input value to swap + @returns a byte swapped version of v + @remarks This just calls back to the unsigned version, since byte swapping + is independent of sign. However, we still provide this function to + avoid signed/unsigned mismatch compiler warnings. 
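+
+   Example (an illustrative sketch, not from the upstream sources):
+
+     posh_u16_t le = 0x1234;
+     posh_u16_t be = POSH_SwapU16( le );   // be == 0x3412 regardless of host byte order
+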
+ */ +posh_i16_t +POSH_SwapI16( posh_i16_t v ) +{ + return ( posh_i16_t ) POSH_SwapU16( v ); +} + +/** + * Byte swaps a 32-bit unsigned value + * + @ingroup ByteSwapFunctions + @param v [in] unsigned 32-bit input value to swap + @returns a byte swapped version of v + */ +posh_u32_t +POSH_SwapU32( posh_u32_t v ) +{ + posh_u32_t swapped; + + swapped = ( v & 0xFF ) << 24; + swapped |= ( v & 0xFF00 ) << 8; + swapped |= ( v >> 8 ) & 0xFF00; + swapped |= ( v >> 24 ); + + return swapped; +} + +/** + * Byte swaps a 32-bit signed value + * + @ingroup ByteSwapFunctions + @param v [in] signed 32-bit input value to swap + @returns a byte swapped version of v + @remarks This just calls back to the unsigned version, since byte swapping + is independent of sign. However, we still provide this function to + avoid signed/unsigned mismatch compiler warnings. + */ +posh_i32_t +POSH_SwapI32( posh_i32_t v ) +{ + return ( posh_i32_t ) POSH_SwapU32( ( posh_u32_t ) v ); +} + +#if defined POSH_64BIT_INTEGER +/** + * Byte swaps a 64-bit unsigned value + + @param v [in] a 64-bit input value to swap + @ingroup SixtyFourBit + @returns a byte swapped version of v +*/ +posh_u64_t +POSH_SwapU64( posh_u64_t v ) +{ + posh_byte_t tmp; + union { + posh_byte_t bytes[ 8 ]; + posh_u64_t u64; + } u; + + u.u64 = v; + + tmp = u.bytes[ 0 ]; u.bytes[ 0 ] = u.bytes[ 7 ]; u.bytes[ 7 ] = tmp; + tmp = u.bytes[ 1 ]; u.bytes[ 1 ] = u.bytes[ 6 ]; u.bytes[ 6 ] = tmp; + tmp = u.bytes[ 2 ]; u.bytes[ 2 ] = u.bytes[ 5 ]; u.bytes[ 5 ] = tmp; + tmp = u.bytes[ 3 ]; u.bytes[ 3 ] = u.bytes[ 4 ]; u.bytes[ 4 ] = tmp; + + return u.u64; +} + +/** + * Byte swaps a 64-bit signed value + + @param v [in] a 64-bit input value to swap + @ingroup SixtyFourBit + @returns a byte swapped version of v +*/ +posh_i64_t +POSH_SwapI64( posh_i64_t v ) +{ + return ( posh_i64_t ) POSH_SwapU64( ( posh_u64_t ) v ); +} + +#endif /* defined POSH_64BIT_INTEGER */ + +/* ---------------------------------------------------------------------------*/ +/* IN-MEMORY SERIALIZATION */ +/* ---------------------------------------------------------------------------*/ + +/** + * Writes an unsigned 16-bit value to a little endian buffer + + @ingroup MemoryBuffer + @param dst [out] pointer to the destination buffer, may not be NULL. Alignment doesn't matter. + @param value [in] host-endian unsigned 16-bit value + @returns a pointer to the location two bytes after dst + @remarks does no validation of the inputs +*/ +posh_u16_t * +POSH_WriteU16ToLittle( void *dst, posh_u16_t value ) +{ + posh_u16_t *p16 = ( posh_u16_t * ) dst; + posh_byte_t *p = ( posh_byte_t * ) dst; + + p[ 0 ] = value & 0xFF; + p[ 1 ] = ( value & 0xFF00) >> 8; + + return p16 + 1; +} + +/** + * Writes a signed 16-bit value to a little endian buffer + + @ingroup MemoryBuffer + @param dst [out] pointer to the destination buffer, may not be NULL + @param value [in] host-endian signed 16-bit value + @returns a pointer to the location two bytes after dst + @remarks does no validation of the inputs. This simply calls + POSH_WriteU16ToLittle() with appropriate casting. 
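+
+   Example (an illustrative sketch, not from the upstream sources):
+
+     posh_byte_t buf[ 2 ];
+     POSH_WriteU16ToLittle( buf, 0xABCD );
+     // buf[ 0 ] == 0xCD and buf[ 1 ] == 0xAB on any host
+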
+*/ +posh_i16_t * +POSH_WriteI16ToLittle( void *dst, posh_i16_t value ) +{ + return ( posh_i16_t * ) POSH_WriteU16ToLittle( dst, ( posh_u16_t ) value ); +} + +/** + * Writes an unsigned 32-bit value to a little endian buffer + + @ingroup MemoryBuffer + @param dst [out] pointer to the destination buffer, may not be NULL + @param value [in] host-endian signed 32-bit value + @returns a pointer to the location four bytes after dst + @remarks does no validation of the inputs. +*/ +posh_u32_t * +POSH_WriteU32ToLittle( void *dst, posh_u32_t value ) +{ + posh_u32_t *p32 = ( posh_u32_t * ) dst; + posh_byte_t *p = ( posh_byte_t * ) dst; + + p[ 0 ] = ( value & 0xFF ); + p[ 1 ] = ( value & 0xFF00 ) >> 8; + p[ 2 ] = ( value & 0xFF0000 ) >> 16; + p[ 3 ] = ( value & 0xFF000000 ) >> 24; + + return p32 + 1; +} + +/** + * Writes a signed 32-bit value to a little endian buffer + + @ingroup MemoryBuffer + @param dst [out] pointer to the destination buffer, may not be NULL + @param value [in] host-endian signed 32-bit value + @returns a pointer to the location four bytes after dst + @remarks does no validation of the inputs. This simply calls + POSH_WriteU32ToLittle() with appropriate casting. +*/ +posh_i32_t * +POSH_WriteI32ToLittle( void *dst, posh_i32_t value ) +{ + return ( posh_i32_t * ) POSH_WriteU32ToLittle( dst, ( posh_u32_t ) value ); +} + +/** + * Writes an unsigned 16-bit value to a big endian buffer + + @ingroup MemoryBuffer + @param dst [out] pointer to the destination buffer, may not be NULL + @param value [in] host-endian unsigned 16-bit value + @returns a pointer to the location two bytes after dst + @remarks does no validation of the inputs +*/ +posh_u16_t * +POSH_WriteU16ToBig( void *dst, posh_u16_t value ) +{ + posh_u16_t *p16 = ( posh_u16_t * ) dst; + posh_byte_t *p = ( posh_byte_t * ) dst; + + p[ 1 ] = ( value & 0xFF ); + p[ 0 ] = ( value & 0xFF00 ) >> 8; + + return p16 + 1; +} + +/** + * Writes a signed 16-bit value to a big endian buffer + + @ingroup MemoryBuffer + @param dst [out] pointer to the destination buffer, may not be NULL + @param value [in] host-endian signed 16-bit value + @returns a pointer to the location two bytes after dst + @remarks does no validation of the inputs. This simply calls + POSH_WriteU16ToLittle() with appropriate casting. +*/ +posh_i16_t * +POSH_WriteI16ToBig( void *dst, posh_i16_t value ) +{ + return ( posh_i16_t * ) POSH_WriteU16ToBig( dst, ( posh_u16_t ) value ); +} + +/** + * Writes an unsigned 32-bit value to a big endian buffer + + @ingroup MemoryBuffer + @param dst [out] pointer to the destination buffer, may not be NULL + @param value [in] host-endian unsigned 32-bit value + @returns a pointer to the location four bytes after dst + @remarks does no validation of the inputs. +*/ +posh_u32_t * +POSH_WriteU32ToBig( void *dst, posh_u32_t value ) +{ + posh_u32_t *p32 = ( posh_u32_t * ) dst; + posh_byte_t *p = ( posh_byte_t * ) dst; + + p[ 3 ] = ( value & 0xFF ); + p[ 2 ] = ( value & 0xFF00 ) >> 8; + p[ 1 ] = ( value & 0xFF0000 ) >> 16; + p[ 0 ] = ( value & 0xFF000000 ) >> 24; + + return p32 + 1; +} + +/** + * Writes a signed 32-bit value to a big endian buffer + + @ingroup MemoryBuffer + @param dst [out] pointer to the destination buffer, may not be NULL + @param value [in] host-endian signed 32-bit value + @returns a pointer to the location four bytes after dst + @remarks does no validation of the inputs. This simply calls + POSH_WriteU32ToBig() with appropriate casting. 
+*/ +posh_i32_t * +POSH_WriteI32ToBig( void *dst, posh_i32_t value ) +{ + return ( posh_i32_t * ) POSH_WriteU32ToBig( dst, ( posh_u32_t ) value ); +} + +#if defined POSH_64BIT_INTEGER +/** + * Writes an unsigned 64-bit value to a little-endian buffer + + @ingroup SixtyFourBit + @param dst [out] pointer to the destination buffer, may not be NULL + @param value [in] host-endian unsigned 64-bit value + @returns a pointer to the location eight bytes after dst + @remarks does no validation of the inputs. +*/ +posh_u64_t * +POSH_WriteU64ToLittle( void *dst, posh_u64_t value ) +{ + posh_u64_t *p64 = ( posh_u64_t * ) dst; + posh_byte_t *p = ( posh_byte_t * ) dst; + int i; + + for ( i = 0; i < 8; i++, value >>= 8 ) + { + p[ i ] = ( posh_byte_t ) ( value & 0xFF ); + } + + return p64 + 1; +} + +/** + * Writes a signed 64-bit value to a little-endian buffer + + @ingroup SixtyFourBit + @param dst [out] pointer to the destination buffer, may not be NULL + @param value [in] host-endian unsigned 64-bit value + @returns a pointer to the location eight bytes after dst + @remarks does no validation of the inputs. +*/ +posh_i64_t * +POSH_WriteI64ToLittle( void *dst, posh_i64_t value ) +{ + return ( posh_i64_t * ) POSH_WriteU64ToLittle( dst, ( posh_u64_t ) value ); +} + +/** + * Writes an unsigned 64-bit value to a big-endian buffer + + @ingroup SixtyFourBit + @param dst [out] pointer to the destination buffer, may not be NULL + @param value [in] host-endian unsigned 64-bit value + @returns a pointer to the location eight bytes after dst + @remarks does no validation of the inputs. +*/ +posh_u64_t * +POSH_WriteU64ToBig( void *dst, posh_u64_t value ) +{ + posh_u64_t *p64 = ( posh_u64_t * ) dst; + posh_byte_t *p = ( posh_byte_t * ) dst; + int i; + + for ( i = 0; i < 8; i++, value >>= 8 ) + { + p[ 7-i ] = ( posh_byte_t ) ( value & 0xFF ); + } + + return p64 + 8; +} + +/** + * Writes a signed 64-bit value to a big-endian buffer + + @ingroup SixtyFourBit + @param dst [out] pointer to the destination buffer, may not be NULL + @param value [in] host-endian signed 64-bit value + @returns a pointer to the location eight bytes after dst + @remarks does no validation of the inputs. 
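+
+   Example (an illustrative sketch, not from the upstream sources):
+
+     posh_byte_t buf[ 8 ];
+     POSH_WriteU64ToBig( buf, POSH_U64( 0x0102030405060708 ) );
+     // buf[ 0 ] == 0x01 (MSB) ... buf[ 7 ] == 0x08 (LSB)
+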
+*/ +posh_i64_t * +POSH_WriteI64ToBig( void *dst, posh_i64_t value ) +{ + return ( posh_i64_t * ) POSH_WriteU64ToBig( dst, ( posh_u64_t ) value ); +} + +#endif /* POSH_64BIT_INTEGER */ + +/* ---------------------------------------------------------------------------*/ +/* IN-MEMORY DESERIALIZATION */ +/* ---------------------------------------------------------------------------*/ + +/** + * Reads an unsigned 16-bit value from a little-endian buffer + @ingroup MemoryBuffer + @param src [in] source buffer + @returns host-endian unsigned 16-bit value +*/ +posh_u16_t +POSH_ReadU16FromLittle( const void *src ) +{ + posh_u16_t v = 0; + posh_byte_t *p = ( posh_byte_t * ) src; + + v |= p[ 0 ]; + v |= ( ( posh_u16_t ) p[ 1 ] ) << 8; + + return v; +} + +/** + * Reads a signed 16-bit value from a little-endian buffer + @ingroup MemoryBuffer + @param src [in] source buffer + @returns host-endian signed 16-bit value +*/ +posh_i16_t +POSH_ReadI16FromLittle( const void *src ) +{ + return ( posh_i16_t ) POSH_ReadU16FromLittle( src ); +} + +/** + * Reads an unsigned 32-bit value from a little-endian buffer + @ingroup MemoryBuffer + @param src [in] source buffer + @returns host-endian unsigned 32-bit value +*/ +posh_u32_t +POSH_ReadU32FromLittle( const void *src ) +{ + posh_u32_t v = 0; + posh_byte_t *p = ( posh_byte_t * ) src; + + v |= p[ 0 ]; + v |= ( ( posh_u32_t ) p[ 1 ] ) << 8; + v |= ( ( posh_u32_t ) p[ 2 ] ) << 16; + v |= ( ( posh_u32_t ) p[ 3 ] ) << 24; + + return v; +} + +/** + * Reads a signed 32-bit value from a little-endian buffer + @ingroup MemoryBuffer + @param src [in] source buffer + @returns host-endian signed 32-bit value +*/ +posh_i32_t +POSH_ReadI32FromLittle( const void *src ) +{ + return ( posh_i32_t ) POSH_ReadU32FromLittle( src ); +} + + +/** + * Reads an unsigned 16-bit value from a big-endian buffer + @ingroup MemoryBuffer + @param src [in] source buffer + @returns host-endian unsigned 16-bit value +*/ +posh_u16_t +POSH_ReadU16FromBig( const void *src ) +{ + posh_u16_t v = 0; + posh_byte_t *p = ( posh_byte_t * ) src; + + v |= p[ 1 ]; + v |= ( ( posh_u16_t ) p[ 0 ] ) << 8; + + return v; +} + +/** + * Reads a signed 16-bit value from a big-endian buffer + @ingroup MemoryBuffer + @param src [in] source buffer + @returns host-endian signed 16-bit value +*/ +posh_i16_t +POSH_ReadI16FromBig( const void *src ) +{ + return ( posh_i16_t ) POSH_ReadU16FromBig( src ); +} + +/** + * Reads an unsigned 32-bit value from a big-endian buffer + @ingroup MemoryBuffer + @param src [in] source buffer + @returns host-endian unsigned 32-bit value +*/ +posh_u32_t +POSH_ReadU32FromBig( const void *src ) +{ + posh_u32_t v = 0; + posh_byte_t *p = ( posh_byte_t * ) src; + + v |= p[ 3 ]; + v |= ( ( posh_u32_t ) p[ 2 ] ) << 8; + v |= ( ( posh_u32_t ) p[ 1 ] ) << 16; + v |= ( ( posh_u32_t ) p[ 0 ] ) << 24; + + return v; +} + +/** + * Reads a signed 32-bit value from a big-endian buffer + @ingroup MemoryBuffer + @param src [in] source buffer + @returns host-endian signed 32-bit value +*/ +posh_i32_t +POSH_ReadI32FromBig( const void *src ) +{ + return POSH_BigI32( (*(const posh_i32_t*)src ) ); +} + +#if defined POSH_64BIT_INTEGER + +/** + * Reads an unsigned 64-bit value from a little-endian buffer + @param src [in] source buffer + @returns host-endian unsigned 32-bit value +*/ +posh_u64_t +POSH_ReadU64FromLittle( const void *src ) +{ + posh_u64_t v = 0; + posh_byte_t *p = ( posh_byte_t * ) src; + int i; + + for ( i = 0; i < 8; i++ ) + { + v |= ( ( posh_u64_t ) p[ i ] ) << (i*8); + } + + return v; +} + +/** + * 
Reads a signed 64-bit value from a little-endian buffer + @param src [in] source buffer + @returns host-endian signed 32-bit value +*/ +posh_i64_t +POSH_ReadI64FromLittle( const void *src ) +{ + return ( posh_i64_t ) POSH_ReadU64FromLittle( src ); +} + +/** + * Reads an unsigned 64-bit value from a big-endian buffer + @param src [in] source buffer + @returns host-endian unsigned 32-bit value +*/ +posh_u64_t +POSH_ReadU64FromBig( const void *src ) +{ + posh_u64_t v = 0; + posh_byte_t *p = ( posh_byte_t * ) src; + int i; + + for ( i = 0; i < 8; i++ ) + { + v |= ( ( posh_u64_t ) p[ 7-i ] ) << (i*8); + } + + return v; +} + +/** + * Reads an signed 64-bit value from a big-endian buffer + @param src [in] source buffer + @returns host-endian signed 32-bit value +*/ +posh_i64_t +POSH_ReadI64FromBig( const void *src ) +{ + return ( posh_i64_t ) POSH_ReadU64FromBig( src ); +} + +#endif /* POSH_64BIT_INTEGER */ + +/* ---------------------------------------------------------------------------*/ +/* FLOATING POINT SUPPORT */ +/* ---------------------------------------------------------------------------*/ + +#if !defined POSH_NO_FLOAT + +/** @ingroup FloatingPoint + @param[in] f floating point value + @returns a little-endian bit representation of f + */ +posh_u32_t +POSH_LittleFloatBits( float f ) +{ + union + { + float f32; + posh_u32_t u32; + } u; + + u.f32 = f; + + return POSH_LittleU32( u.u32 ); +} + +/** + * Extracts raw big-endian bits from a 32-bit floating point value + * + @ingroup FloatingPoint + @param f [in] floating point value + @returns a big-endian bit representation of f + */ +posh_u32_t +POSH_BigFloatBits( float f ) +{ + union + { + float f32; + posh_u32_t u32; + } u; + + u.f32 = f; + + return POSH_BigU32( u.u32 ); +} + +/** + * Extracts raw, little-endian bit representation from a 64-bit double. + * + @param d [in] 64-bit double precision value + @param dst [out] 8-byte storage buffer + @ingroup FloatingPoint + @returns the raw bits used to represent the value 'd', in the form dst[0]=LSB + */ +void +POSH_DoubleBits( double d, posh_byte_t dst[ 8 ] ) +{ + union + { + double d64; + posh_byte_t bytes[ 8 ]; + } u; + + u.d64 = d; + +#if defined POSH_LITTLE_ENDIAN + dst[ 0 ] = u.bytes[ 0 ]; + dst[ 1 ] = u.bytes[ 1 ]; + dst[ 2 ] = u.bytes[ 2 ]; + dst[ 3 ] = u.bytes[ 3 ]; + dst[ 4 ] = u.bytes[ 4 ]; + dst[ 5 ] = u.bytes[ 5 ]; + dst[ 6 ] = u.bytes[ 6 ]; + dst[ 7 ] = u.bytes[ 7 ]; +#else + dst[ 0 ] = u.bytes[ 7 ]; + dst[ 1 ] = u.bytes[ 6 ]; + dst[ 2 ] = u.bytes[ 5 ]; + dst[ 3 ] = u.bytes[ 4 ]; + dst[ 4 ] = u.bytes[ 3 ]; + dst[ 5 ] = u.bytes[ 2 ]; + dst[ 6 ] = u.bytes[ 1 ]; + dst[ 7 ] = u.bytes[ 0 ]; +#endif +} + +/** + * Creates a double-precision, 64-bit floating point value from a set of raw, + * little-endian bits + + @ingroup FloatingPoint + @param src [in] little-endian byte representation of 64-bit double precision + floating point value + @returns double precision floating point representation of the raw bits + @remarks No error checking is performed, so there are no guarantees that the + result is a valid number, nor is there any check to ensure that src is + non-NULL. BE CAREFUL USING THIS. 
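+
+   Example (an illustrative sketch, not from the upstream sources; assumes
+   IEEE 754 doubles):
+
+     posh_byte_t buf[ 8 ];
+     POSH_DoubleBits( 3.25, buf );             // buf[ 0 ] holds the LSB
+     double d = POSH_DoubleFromBits( buf );    // d == 3.25 again
+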
+ */ +double +POSH_DoubleFromBits( const posh_byte_t src[ 8 ] ) +{ + union + { + double d64; + posh_byte_t bytes[ 8 ]; + } u; + +#if defined POSH_LITTLE_ENDIAN + u.bytes[ 0 ] = src[ 0 ]; + u.bytes[ 1 ] = src[ 1 ]; + u.bytes[ 2 ] = src[ 2 ]; + u.bytes[ 3 ] = src[ 3 ]; + u.bytes[ 4 ] = src[ 4 ]; + u.bytes[ 5 ] = src[ 5 ]; + u.bytes[ 6 ] = src[ 6 ]; + u.bytes[ 7 ] = src[ 7 ]; +#else + u.bytes[ 0 ] = src[ 7 ]; + u.bytes[ 1 ] = src[ 6 ]; + u.bytes[ 2 ] = src[ 5 ]; + u.bytes[ 3 ] = src[ 4 ]; + u.bytes[ 4 ] = src[ 3 ]; + u.bytes[ 5 ] = src[ 2 ]; + u.bytes[ 6 ] = src[ 1 ]; + u.bytes[ 7 ] = src[ 0 ]; +#endif + + return u.d64; +} + +/** + * Creates a floating point number from little endian bits + * + @ingroup FloatingPoint + @param bits [in] raw floating point bits in little-endian form + @returns a floating point number based on the given bit representation + @remarks No error checking is performed, so there are no guarantees that the + result is a valid number. BE CAREFUL USING THIS. + */ +float +POSH_FloatFromLittleBits( posh_u32_t bits ) +{ + union + { + float f32; + posh_u32_t u32; + } u; + + u.u32 = bits; +#if defined POSH_BIG_ENDIAN + u.u32 = POSH_SwapU32( u.u32 ); +#endif + + return u.f32; +} + +/** + * Creates a floating point number from big-endian bits + * + @ingroup FloatingPoint + @param bits [in] raw floating point bits in big-endian form + @returns a floating point number based on the given bit representation + @remarks No error checking is performed, so there are no guarantees that the + result is a valid number. BE CAREFUL USING THIS. + */ +float +POSH_FloatFromBigBits( posh_u32_t bits ) +{ + union + { + float f32; + posh_u32_t u32; + } u; + + u.u32 = bits; +#if defined POSH_LITTLE_ENDIAN + u.u32 = POSH_SwapU32( u.u32 ); +#endif + + return u.f32; +} + +#endif /* !defined POSH_NO_FLOAT */ Index: ps/trunk/libraries/source/nvtt/src/extern/stb/stb_dxt.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/extern/stb/stb_dxt.h +++ ps/trunk/libraries/source/nvtt/src/extern/stb/stb_dxt.h @@ -0,0 +1,624 @@ +// stb_dxt.h - v1.04 - DXT1/DXT5 compressor - public domain +// original by fabian "ryg" giesen - ported to C by stb +// use '#define STB_DXT_IMPLEMENTATION' before including to create the implementation +// +// USAGE: +// call stb_compress_dxt_block() for every block (you must pad) +// source should be a 4x4 block of RGBA data in row-major order; +// A is ignored if you specify alpha=0; you can turn on dithering +// and "high quality" using mode. +// +// version history: +// v1.04 - (ryg) default to no rounding bias for lerped colors (as per S3TC/DX10 spec); +// single color match fix (allow for inexact color interpolation); +// optimal DXT5 index finder; "high quality" mode that runs multiple refinement steps. +// v1.03 - (stb) endianness support +// v1.02 - (stb) fix alpha encoding bug +// v1.01 - (stb) fix bug converting to RGB that messed up quality, thanks ryg & cbloom +// v1.00 - (stb) first release + +#ifndef STB_INCLUDE_STB_DXT_H +#define STB_INCLUDE_STB_DXT_H + +// compression mode (bitflags) +#define STB_DXT_NORMAL 0 +#define STB_DXT_DITHER 1 // use dithering. dubious win. never use for normal maps and the like! +#define STB_DXT_HIGHQUAL 2 // high quality mode, does two refinement steps instead of 1. ~30-40% slower. + +void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src, int alpha, int mode); +#define STB_COMPRESS_DXT_BLOCK + +#ifdef STB_DXT_IMPLEMENTATION + +// configuration options for DXT encoder. 
set them in the project/makefile or just define +// them at the top. + +// STB_DXT_USE_ROUNDING_BIAS +// use a rounding bias during color interpolation. this is closer to what "ideal" +// interpolation would do but doesn't match the S3TC/DX10 spec. old versions (pre-1.03) +// implicitly had this turned on. +// +// in case you're targeting a specific type of hardware (e.g. console programmers): +// NVidia and Intel GPUs (as of 2010) as well as DX9 ref use DXT decoders that are closer +// to STB_DXT_USE_ROUNDING_BIAS. AMD/ATI, S3 and DX10 ref are closer to rounding with no bias. +// you also see "(a*5 + b*3) / 8" on some old GPU designs. +// #define STB_DXT_USE_ROUNDING_BIAS + +#include +#include +#include // memset + +static unsigned char stb__Expand5[32]; +static unsigned char stb__Expand6[64]; +static unsigned char stb__OMatch5[256][2]; +static unsigned char stb__OMatch6[256][2]; +static unsigned char stb__QuantRBTab[256+16]; +static unsigned char stb__QuantGTab[256+16]; + +static int stb__Mul8Bit(int a, int b) +{ + int t = a*b + 128; + return (t + (t >> 8)) >> 8; +} + +static void stb__From16Bit(unsigned char *out, unsigned short v) +{ + int rv = (v & 0xf800) >> 11; + int gv = (v & 0x07e0) >> 5; + int bv = (v & 0x001f) >> 0; + + out[0] = stb__Expand5[rv]; + out[1] = stb__Expand6[gv]; + out[2] = stb__Expand5[bv]; + out[3] = 0; +} + +static unsigned short stb__As16Bit(int r, int g, int b) +{ + return (stb__Mul8Bit(r,31) << 11) + (stb__Mul8Bit(g,63) << 5) + stb__Mul8Bit(b,31); +} + +// linear interpolation at 1/3 point between a and b, using desired rounding type +static int stb__Lerp13(int a, int b) +{ +#ifdef STB_DXT_USE_ROUNDING_BIAS + // with rounding bias + return a + stb__Mul8Bit(b-a, 0x55); +#else + // without rounding bias + // replace "/ 3" by "* 0xaaab) >> 17" if your compiler sucks or you really need every ounce of speed. + return (2*a + b) / 3; +#endif +} + +// lerp RGB color +static void stb__Lerp13RGB(unsigned char *out, unsigned char *p1, unsigned char *p2) +{ + out[0] = stb__Lerp13(p1[0], p2[0]); + out[1] = stb__Lerp13(p1[1], p2[1]); + out[2] = stb__Lerp13(p1[2], p2[2]); +} + +/****************************************************************************/ + +// compute table to reproduce constant colors as accurately as possible +static void stb__PrepareOptTable(unsigned char *Table,const unsigned char *expand,int size) +{ + int i,mn,mx; + for (i=0;i<256;i++) { + int bestErr = 256; + for (mn=0;mn> 4)]; + ep1[0] = bp[ 0] - dp[ 0]; + dp[ 4] = quant[bp[ 4] + ((7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]) >> 4)]; + ep1[1] = bp[ 4] - dp[ 4]; + dp[ 8] = quant[bp[ 8] + ((7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]) >> 4)]; + ep1[2] = bp[ 8] - dp[ 8]; + dp[12] = quant[bp[12] + ((7*ep1[2] + 5*ep2[3] + ep2[2]) >> 4)]; + ep1[3] = bp[12] - dp[12]; + bp += 16; + dp += 16; + et = ep1, ep1 = ep2, ep2 = et; // swap + } + } +} + +// The color matching function +static unsigned int stb__MatchColorsBlock(unsigned char *block, unsigned char *color,int dither) +{ + unsigned int mask = 0; + int dirr = color[0*4+0] - color[1*4+0]; + int dirg = color[0*4+1] - color[1*4+1]; + int dirb = color[0*4+2] - color[1*4+2]; + int dots[16]; + int stops[4]; + int i; + int c0Point, halfPoint, c3Point; + + for(i=0;i<16;i++) + dots[i] = block[i*4+0]*dirr + block[i*4+1]*dirg + block[i*4+2]*dirb; + + for(i=0;i<4;i++) + stops[i] = color[i*4+0]*dirr + color[i*4+1]*dirg + color[i*4+2]*dirb; + + // think of the colors as arranged on a line; project point onto that line, then choose + // next color out of available ones. 
we compute the crossover points for "best color in top + // half"/"best in bottom half" and then the same inside that subinterval. + // + // relying on this 1d approximation isn't always optimal in terms of euclidean distance, + // but it's very close and a lot faster. + // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html + + c0Point = (stops[1] + stops[3]) >> 1; + halfPoint = (stops[3] + stops[2]) >> 1; + c3Point = (stops[2] + stops[0]) >> 1; + + if(!dither) { + // the version without dithering is straightforward + for (i=15;i>=0;i--) { + int dot = dots[i]; + mask <<= 2; + + if(dot < halfPoint) + mask |= (dot < c0Point) ? 1 : 3; + else + mask |= (dot < c3Point) ? 2 : 0; + } + } else { + // with floyd-steinberg dithering + int err[8],*ep1 = err,*ep2 = err+4; + int *dp = dots, y; + + c0Point <<= 4; + halfPoint <<= 4; + c3Point <<= 4; + for(i=0;i<8;i++) + err[i] = 0; + + for(y=0;y<4;y++) + { + int dot,lmask,step; + + dot = (dp[0] << 4) + (3*ep2[1] + 5*ep2[0]); + if(dot < halfPoint) + step = (dot < c0Point) ? 1 : 3; + else + step = (dot < c3Point) ? 2 : 0; + ep1[0] = dp[0] - stops[step]; + lmask = step; + + dot = (dp[1] << 4) + (7*ep1[0] + 3*ep2[2] + 5*ep2[1] + ep2[0]); + if(dot < halfPoint) + step = (dot < c0Point) ? 1 : 3; + else + step = (dot < c3Point) ? 2 : 0; + ep1[1] = dp[1] - stops[step]; + lmask |= step<<2; + + dot = (dp[2] << 4) + (7*ep1[1] + 3*ep2[3] + 5*ep2[2] + ep2[1]); + if(dot < halfPoint) + step = (dot < c0Point) ? 1 : 3; + else + step = (dot < c3Point) ? 2 : 0; + ep1[2] = dp[2] - stops[step]; + lmask |= step<<4; + + dot = (dp[3] << 4) + (7*ep1[2] + 5*ep2[3] + ep2[2]); + if(dot < halfPoint) + step = (dot < c0Point) ? 1 : 3; + else + step = (dot < c3Point) ? 2 : 0; + ep1[3] = dp[3] - stops[step]; + lmask |= step<<6; + + dp += 4; + mask |= lmask << (y*8); + { int *et = ep1; ep1 = ep2; ep2 = et; } // swap + } + } + + return mask; +} + +// The color optimization function. (Clever code, part 1) +static void stb__OptimizeColorsBlock(unsigned char *block, unsigned short *pmax16, unsigned short *pmin16) +{ + int mind = 0x7fffffff,maxd = -0x7fffffff; + unsigned char *minp, *maxp; + double magn; + int v_r,v_g,v_b; + static const int nIterPower = 4; + float covf[6],vfr,vfg,vfb; + + // determine color distribution + int cov[6]; + int mu[3],min[3],max[3]; + int ch,i,iter; + + for(ch=0;ch<3;ch++) + { + const unsigned char *bp = ((const unsigned char *) block) + ch; + int muv,minv,maxv; + + muv = minv = maxv = bp[0]; + for(i=4;i<64;i+=4) + { + muv += bp[i]; + if (bp[i] < minv) minv = bp[i]; + else if (bp[i] > maxv) maxv = bp[i]; + } + + mu[ch] = (muv + 8) >> 4; + min[ch] = minv; + max[ch] = maxv; + } + + // determine covariance matrix + for (i=0;i<6;i++) + cov[i] = 0; + + for (i=0;i<16;i++) + { + int r = block[i*4+0] - mu[0]; + int g = block[i*4+1] - mu[1]; + int b = block[i*4+2] - mu[2]; + + cov[0] += r*r; + cov[1] += r*g; + cov[2] += r*b; + cov[3] += g*g; + cov[4] += g*b; + cov[5] += b*b; + } + + // convert covariance matrix to float, find principal axis via power iter + for(i=0;i<6;i++) + covf[i] = cov[i] / 255.0f; + + vfr = (float) (max[0] - min[0]); + vfg = (float) (max[1] - min[1]); + vfb = (float) (max[2] - min[2]); + + for(iter=0;iter magn) magn = fabs(vfg); + if (fabs(vfb) > magn) magn = fabs(vfb); + + if(magn < 4.0f) { // too small, default to luminance + v_r = 299; // JPEG YCbCr luma coefs, scaled by 1000. 
+ v_g = 587; + v_b = 114; + } else { + magn = 512.0 / magn; + v_r = (int) (vfr * magn); + v_g = (int) (vfg * magn); + v_b = (int) (vfb * magn); + } + + // Pick colors at extreme points + for(i=0;i<16;i++) + { + int dot = block[i*4+0]*v_r + block[i*4+1]*v_g + block[i*4+2]*v_b; + + if (dot < mind) { + mind = dot; + minp = block+i*4; + } + + if (dot > maxd) { + maxd = dot; + maxp = block+i*4; + } + } + + *pmax16 = stb__As16Bit(maxp[0],maxp[1],maxp[2]); + *pmin16 = stb__As16Bit(minp[0],minp[1],minp[2]); +} + +static int stb__sclamp(float y, int p0, int p1) +{ + int x = (int) y; + if (x < p0) return p0; + if (x > p1) return p1; + return x; +} + +// The refinement function. (Clever code, part 2) +// Tries to optimize colors to suit block contents better. +// (By solving a least squares system via normal equations+Cramer's rule) +static int stb__RefineBlock(unsigned char *block, unsigned short *pmax16, unsigned short *pmin16, unsigned int mask) +{ + static const int w1Tab[4] = { 3,0,2,1 }; + static const int prods[4] = { 0x090000,0x000900,0x040102,0x010402 }; + // ^some magic to save a lot of multiplies in the accumulating loop... + // (precomputed products of weights for least squares system, accumulated inside one 32-bit register) + + float frb,fg; + unsigned short oldMin, oldMax, min16, max16; + int i, akku = 0, xx,xy,yy; + int At1_r,At1_g,At1_b; + int At2_r,At2_g,At2_b; + unsigned int cm = mask; + + oldMin = *pmin16; + oldMax = *pmax16; + + if((mask ^ (mask<<2)) < 4) // all pixels have the same index? + { + // yes, linear system would be singular; solve using optimal + // single-color match on average color + int r = 8, g = 8, b = 8; + for (i=0;i<16;++i) { + r += block[i*4+0]; + g += block[i*4+1]; + b += block[i*4+2]; + } + + r >>= 4; g >>= 4; b >>= 4; + + max16 = (stb__OMatch5[r][0]<<11) | (stb__OMatch6[g][0]<<5) | stb__OMatch5[b][0]; + min16 = (stb__OMatch5[r][1]<<11) | (stb__OMatch6[g][1]<<5) | stb__OMatch5[b][1]; + } else { + At1_r = At1_g = At1_b = 0; + At2_r = At2_g = At2_b = 0; + for (i=0;i<16;++i,cm>>=2) { + int step = cm&3; + int w1 = w1Tab[step]; + int r = block[i*4+0]; + int g = block[i*4+1]; + int b = block[i*4+2]; + + akku += prods[step]; + At1_r += w1*r; + At1_g += w1*g; + At1_b += w1*b; + At2_r += r; + At2_g += g; + At2_b += b; + } + + At2_r = 3*At2_r - At1_r; + At2_g = 3*At2_g - At1_g; + At2_b = 3*At2_b - At1_b; + + // extract solutions and decide solvability + xx = akku >> 16; + yy = (akku >> 8) & 0xff; + xy = (akku >> 0) & 0xff; + + frb = 3.0f * 31.0f / 255.0f / (xx*yy - xy*xy); + fg = frb * 63.0f / 31.0f; + + // solve. + max16 = stb__sclamp((At1_r*yy - At2_r*xy)*frb+0.5f,0,31) << 11; + max16 |= stb__sclamp((At1_g*yy - At2_g*xy)*fg +0.5f,0,63) << 5; + max16 |= stb__sclamp((At1_b*yy - At2_b*xy)*frb+0.5f,0,31) << 0; + + min16 = stb__sclamp((At2_r*xx - At1_r*xy)*frb+0.5f,0,31) << 11; + min16 |= stb__sclamp((At2_g*xx - At1_g*xy)*fg +0.5f,0,63) << 5; + min16 |= stb__sclamp((At2_b*xx - At1_b*xy)*frb+0.5f,0,31) << 0; + } + + *pmin16 = min16; + *pmax16 = max16; + return oldMin != min16 || oldMax != max16; +} + +// Color block compression +static void stb__CompressColorBlock(unsigned char *dest, unsigned char *block, int mode) +{ + unsigned int mask; + int i; + int dither; + int refinecount; + unsigned short max16, min16; + unsigned char dblock[16*4],color[4*4]; + + dither = mode & STB_DXT_DITHER; + refinecount = (mode & STB_DXT_HIGHQUAL) ? 
2 : 1; + + // check if block is constant + for (i=1;i<16;i++) + if (((unsigned int *) block)[i] != ((unsigned int *) block)[0]) + break; + + if(i == 16) { // constant color + int r = block[0], g = block[1], b = block[2]; + mask = 0xaaaaaaaa; + max16 = (stb__OMatch5[r][0]<<11) | (stb__OMatch6[g][0]<<5) | stb__OMatch5[b][0]; + min16 = (stb__OMatch5[r][1]<<11) | (stb__OMatch6[g][1]<<5) | stb__OMatch5[b][1]; + } else { + // first step: compute dithered version for PCA if desired + if(dither) + stb__DitherBlock(dblock,block); + + // second step: pca+map along principal axis + stb__OptimizeColorsBlock(dither ? dblock : block,&max16,&min16); + if (max16 != min16) { + stb__EvalColors(color,max16,min16); + mask = stb__MatchColorsBlock(block,color,dither); + } else + mask = 0; + + // third step: refine (multiple times if requested) + for (i=0;i> 8); + dest[2] = (unsigned char) (min16); + dest[3] = (unsigned char) (min16 >> 8); + dest[4] = (unsigned char) (mask); + dest[5] = (unsigned char) (mask >> 8); + dest[6] = (unsigned char) (mask >> 16); + dest[7] = (unsigned char) (mask >> 24); +} + +// Alpha block compression (this is easy for a change) +static void stb__CompressAlphaBlock(unsigned char *dest,unsigned char *src,int mode) +{ + int i,dist,bias,dist4,dist2,bits,mask; + + // find min/max color + int mn,mx; + mn = mx = src[3]; + + for (i=1;i<16;i++) + { + if (src[i*4+3] < mn) mn = src[i*4+3]; + else if (src[i*4+3] > mx) mx = src[i*4+3]; + } + + // encode them + ((unsigned char *)dest)[0] = mx; + ((unsigned char *)dest)[1] = mn; + dest += 2; + + // determine bias and emit color indices + // given the choice of mx/mn, these indices are optimal: + // http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/ + dist = mx-mn; + dist4 = dist*4; + dist2 = dist*2; + bias = (dist < 8) ? (dist - 1) : (dist/2 + 2); + bias -= mn * 7; + bits = 0,mask=0; + + for (i=0;i<16;i++) { + int a = src[i*4+3]*7 + bias; + int ind,t; + + // select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max). + t = (a >= dist4) ? -1 : 0; ind = t & 4; a -= dist4 & t; + t = (a >= dist2) ? -1 : 0; ind += t & 2; a -= dist2 & t; + ind += (a >= dist); + + // turn linear scale into DXT index (0/1 are extremal pts) + ind = -ind & 7; + ind ^= (2 > ind); + + // write index + mask |= ind << bits; + if((bits += 3) >= 8) { + *dest++ = mask; + mask >>= 8; + bits -= 8; + } + } +} + +static void stb__InitDXT() +{ + int i; + for(i=0;i<32;i++) + stb__Expand5[i] = (i<<3)|(i>>2); + + for(i=0;i<64;i++) + stb__Expand6[i] = (i<<2)|(i>>4); + + for(i=0;i<256+16;i++) + { + int v = i-8 < 0 ? 0 : i-8 > 255 ? 
255 : i-8; + stb__QuantRBTab[i] = stb__Expand5[stb__Mul8Bit(v,31)]; + stb__QuantGTab[i] = stb__Expand6[stb__Mul8Bit(v,63)]; + } + + stb__PrepareOptTable(&stb__OMatch5[0][0],stb__Expand5,32); + stb__PrepareOptTable(&stb__OMatch6[0][0],stb__Expand6,64); +} + +void stb_compress_dxt_block(unsigned char *dest, const unsigned char *src, int alpha, int mode) +{ + static int init=1; + if (init) { + stb__InitDXT(); + init=0; + } + + if (alpha) { + stb__CompressAlphaBlock(dest,(unsigned char*) src,mode); + dest += 8; + } + + stb__CompressColorBlock(dest,(unsigned char*) src,mode); +} +#endif // STB_DXT_IMPLEMENTATION + +#endif // STB_INCLUDE_STB_DXT_H Index: ps/trunk/libraries/source/nvtt/src/extern/stb/stb_image.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/extern/stb/stb_image.h +++ ps/trunk/libraries/source/nvtt/src/extern/stb/stb_image.h @@ -0,0 +1,4954 @@ +/* stbi-1.29 - public domain JPEG/PNG reader - http://nothings.org/stb_image.c + when you control the images you're loading + no warranty implied; use at your own risk + + QUICK NOTES: + Primarily of interest to game developers and other people who can + avoid problematic images and only need the trivial interface + + JPEG baseline (no JPEG progressive) + PNG 8-bit only + + TGA (not sure what subset, if a subset) + BMP non-1bpp, non-RLE + PSD (composited view only, no extra channels) + + GIF (*comp always reports as 4-channel) + HDR (radiance rgbE format) + PIC (Softimage PIC) + + - decoded from memory or through stdio FILE (define STBI_NO_STDIO to remove code) + - supports installable dequantizing-IDCT, YCbCr-to-RGB conversion (define STBI_SIMD) + + Latest revisions: + 1.29 (2010-08-16) various warning fixes from Aurelien Pocheville + 1.28 (2010-08-01) fix bug in GIF palette transparency (SpartanJ) + 1.27 (2010-08-01) cast-to-uint8 to fix warnings (Laurent Gomila) + allow trailing 0s at end of image data (Laurent Gomila) + 1.26 (2010-07-24) fix bug in file buffering for PNG reported by SpartanJ + 1.25 (2010-07-17) refix trans_data warning (Won Chun) + 1.24 (2010-07-12) perf improvements reading from files + minor perf improvements for jpeg + deprecated type-specific functions in hope of feedback + attempt to fix trans_data warning (Won Chun) + 1.23 fixed bug in iPhone support + 1.22 (2010-07-10) removed image *writing* support to stb_image_write.h + stbi_info support from Jetro Lauha + GIF support from Jean-Marc Lienher + iPhone PNG-extensions from James Brown + warning-fixes from Nicolas Schulz and Janez Zemva + 1.21 fix use of 'uint8' in header (reported by jon blow) + 1.20 added support for Softimage PIC, by Tom Seddon + + See end of file for full revision history. 
+ + TODO: + stbi_info support for BMP,PSD,HDR,PIC + rewrite stbi_info and load_file variations to share file handling code + (current system allows individual functions to be called directly, + since each does all the work, but I doubt anyone uses this in practice) + + + ============================ Contributors ========================= + + Image formats Optimizations & bugfixes + Sean Barrett (jpeg, png, bmp) Fabian "ryg" Giesen + Nicolas Schulz (hdr, psd) + Jonathan Dummer (tga) Bug fixes & warning fixes + Jean-Marc Lienher (gif) Marc LeBlanc + Tom Seddon (pic) Christpher Lloyd + Thatcher Ulrich (psd) Dave Moore + Won Chun + the Horde3D community + Extensions, features Janez Zemva + Jetro Lauha (stbi_info) Jonathan Blow + James "moose2000" Brown (iPhone PNG) Laurent Gomila + Aruelien Pocheville + + If your name should be here but isn't, let Sean know. + +*/ + +#ifndef STBI_INCLUDE_STB_IMAGE_H +#define STBI_INCLUDE_STB_IMAGE_H + +// To get a header file for this, either cut and paste the header, +// or create stb_image.h, #define STBI_HEADER_FILE_ONLY, and +// then include stb_image.c from it. + +//// begin header file //////////////////////////////////////////////////// +// +// Limitations: +// - no jpeg progressive support +// - non-HDR formats support 8-bit samples only (jpeg, png) +// - no delayed line count (jpeg) -- IJG doesn't support either +// - no 1-bit BMP +// - GIF always returns *comp=4 +// +// Basic usage (see HDR discussion below): +// int x,y,n; +// unsigned char *data = stbi_load(filename, &x, &y, &n, 0); +// // ... process data if not NULL ... +// // ... x = width, y = height, n = # 8-bit components per pixel ... +// // ... replace '0' with '1'..'4' to force that many components per pixel +// stbi_image_free(data) +// +// Standard parameters: +// int *x -- outputs image width in pixels +// int *y -- outputs image height in pixels +// int *comp -- outputs # of image components in image file +// int req_comp -- if non-zero, # of image components requested in result +// +// The return value from an image loader is an 'unsigned char *' which points +// to the pixel data. The pixel data consists of *y scanlines of *x pixels, +// with each pixel consisting of N interleaved 8-bit components; the first +// pixel pointed to is top-left-most in the image. There is no padding between +// image scanlines or between pixels, regardless of format. The number of +// components N is 'req_comp' if req_comp is non-zero, or *comp otherwise. +// If req_comp is non-zero, *comp has the number of components that _would_ +// have been output otherwise. E.g. if you set req_comp to 4, you will always +// get RGBA output, but you can check *comp to easily see if it's opaque. +// +// An output image with N components has the following components interleaved +// in this order in each pixel: +// +// N=#comp components +// 1 grey +// 2 grey, alpha +// 3 red, green, blue +// 4 red, green, blue, alpha +// +// If image loading fails for any reason, the return value will be NULL, +// and *x, *y, *comp will be unchanged. The function stbi_failure_reason() +// can be queried for an extremely brief, end-user unfriendly explanation +// of why the load failed. Define STBI_NO_FAILURE_STRINGS to avoid +// compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly +// more user-friendly ones. +// +// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized. 
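+//
+// For example (an illustrative sketch, not part of the upstream header; the
+// file name is hypothetical), forcing four components always yields RGBA:
+//
+//    int w, h, n;
+//    unsigned char *rgba = stbi_load("texture.png", &w, &h, &n, 4);
+//    if (rgba) {
+//       // w*h*4 bytes of pixels, top-left first; n still reports the
+//       // file's native component count
+//       stbi_image_free(rgba);
+//    }
+//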
+// +// =========================================================================== +// +// iPhone PNG support: +// +// By default we convert iphone-formatted PNGs back to RGB; nominally they +// would silently load as BGR, except the existing code should have just +// failed on such iPhone PNGs. But you can disable this conversion by +// by calling stbi_convert_iphone_png_to_rgb(0), in which case +// you will always just get the native iphone "format" through. +// +// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per +// pixel to remove any premultiplied alpha *only* if the image file explicitly +// says there's premultiplied data (currently only happens in iPhone images, +// and only if iPhone convert-to-rgb processing is on). +// +// =========================================================================== +// +// HDR image support (disable by defining STBI_NO_HDR) +// +// stb_image now supports loading HDR images in general, and currently +// the Radiance .HDR file format, although the support is provided +// generically. You can still load any file through the existing interface; +// if you attempt to load an HDR file, it will be automatically remapped to +// LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1; +// both of these constants can be reconfigured through this interface: +// +// stbi_hdr_to_ldr_gamma(2.2f); +// stbi_hdr_to_ldr_scale(1.0f); +// +// (note, do not use _inverse_ constants; stbi_image will invert them +// appropriately). +// +// Additionally, there is a new, parallel interface for loading files as +// (linear) floats to preserve the full dynamic range: +// +// float *data = stbi_loadf(filename, &x, &y, &n, 0); +// +// If you load LDR images through this interface, those images will +// be promoted to floating point values, run through the inverse of +// constants corresponding to the above: +// +// stbi_ldr_to_hdr_scale(1.0f); +// stbi_ldr_to_hdr_gamma(2.2f); +// +// Finally, given a filename (or an open file or memory block--see header +// file for details) containing image data, you can query for the "most +// appropriate" interface to use (that is, whether the image is HDR or +// not), using: +// +// stbi_is_hdr(char *filename); + +#ifndef STBI_NO_STDIO +#include +#endif + +#define STBI_VERSION 1 + +enum +{ + STBI_default = 0, // only used for req_comp + + STBI_grey = 1, + STBI_grey_alpha = 2, + STBI_rgb = 3, + STBI_rgb_alpha = 4 +}; + +typedef unsigned char stbi_uc; + +#ifdef __cplusplus +extern "C" { +#endif + +// PRIMARY API - works on images of any type + +// load image by filename, open file, or memory buffer +extern stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); + +#ifndef STBI_NO_STDIO +extern stbi_uc *stbi_load (char const *filename, int *x, int *y, int *comp, int req_comp); +extern stbi_uc *stbi_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); +// for stbi_load_from_file, file pointer is left pointing immediately after image +#endif + +#ifndef STBI_NO_HDR + extern float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); + + #ifndef STBI_NO_STDIO + extern float *stbi_loadf (char const *filename, int *x, int *y, int *comp, int req_comp); + extern float *stbi_loadf_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); + #endif + + extern void stbi_hdr_to_ldr_gamma(float gamma); + extern void stbi_hdr_to_ldr_scale(float scale); + + extern void stbi_ldr_to_hdr_gamma(float gamma); + extern 
void stbi_ldr_to_hdr_scale(float scale); +#endif // STBI_NO_HDR + +// get a VERY brief reason for failure +// NOT THREADSAFE +extern const char *stbi_failure_reason (void); + +// free the loaded image -- this is just free() +extern void stbi_image_free (void *retval_from_stbi_load); + +// get image dimensions & components without fully decoding +extern int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp); +extern int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len); + +#ifndef STBI_NO_STDIO +extern int stbi_info (char const *filename, int *x, int *y, int *comp); +extern int stbi_info_from_file (FILE *f, int *x, int *y, int *comp); + +extern int stbi_is_hdr (char const *filename); +extern int stbi_is_hdr_from_file(FILE *f); +#endif + +// for image formats that explicitly notate that they have premultiplied alpha, +// we just return the colors as stored in the file. set this flag to force +// unpremultiplication. results are undefined if the unpremultiply overflow. +extern void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply); + +// indicate whether we should process iphone images back to canonical format, +// or just pass them through "as-is" +extern void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert); + + +// ZLIB client - used by PNG, available for other purposes + +extern char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen); +extern char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen); +extern int stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); + +extern char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen); +extern int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); + +// define new loaders +typedef struct +{ + int (*test_memory)(stbi_uc const *buffer, int len); + stbi_uc * (*load_from_memory)(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); + #ifndef STBI_NO_STDIO + int (*test_file)(FILE *f); + stbi_uc * (*load_from_file)(FILE *f, int *x, int *y, int *comp, int req_comp); + #endif +} stbi_loader; + +// register a loader by filling out the above structure (you must define ALL functions) +// returns 1 if added or already added, 0 if not added (too many loaders) +// NOT THREADSAFE +extern int stbi_register_loader(stbi_loader *loader); + +// define faster low-level operations (typically SIMD support) +#ifdef STBI_SIMD +typedef void (*stbi_idct_8x8)(stbi_uc *out, int out_stride, short data[64], unsigned short *dequantize); +// compute an integer IDCT on "input" +// input[x] = data[x] * dequantize[x] +// write results to 'out': 64 samples, each run of 8 spaced by 'out_stride' +// CLAMP results to 0..255 +typedef void (*stbi_YCbCr_to_RGB_run)(stbi_uc *output, stbi_uc const *y, stbi_uc const *cb, stbi_uc const *cr, int count, int step); +// compute a conversion from YCbCr to RGB +// 'count' pixels +// write pixels to 'output'; each pixel is 'step' bytes (either 3 or 4; if 4, write '255' as 4th), order R,G,B +// y: Y input channel +// cb: Cb input channel; scale/biased to be 0..255 +// cr: Cr input channel; scale/biased to be 0..255 + +extern void stbi_install_idct(stbi_idct_8x8 func); +extern void stbi_install_YCbCr_to_RGB(stbi_YCbCr_to_RGB_run func); +#endif // STBI_SIMD + + + + +// TYPE-SPECIFIC ACCESS + +#ifdef STBI_TYPE_SPECIFIC_FUNCTIONS + +// is it a jpeg? 
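// Illustrative sketch (not from the upstream header): minimal use of the
// primary API declared above. The file name and the forced-RGBA request are
// arbitrary choices for the example.
//
//    int x, y, comp;
//    stbi_uc *pixels = stbi_load("example.png", &x, &y, &comp, STBI_rgb_alpha);
//    if (!pixels) {
//       fprintf(stderr, "load failed: %s\n", stbi_failure_reason());
//    } else {
//       // with req_comp == STBI_rgb_alpha the buffer holds x*y*4 bytes;
//       // comp still reports how many channels the file itself stored
//       stbi_image_free(pixels);
//    }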
+extern int stbi_jpeg_test_memory (stbi_uc const *buffer, int len); +extern stbi_uc *stbi_jpeg_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); +extern int stbi_jpeg_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp); + +#ifndef STBI_NO_STDIO +extern stbi_uc *stbi_jpeg_load (char const *filename, int *x, int *y, int *comp, int req_comp); +extern int stbi_jpeg_test_file (FILE *f); +extern stbi_uc *stbi_jpeg_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); + +extern int stbi_jpeg_info (char const *filename, int *x, int *y, int *comp); +extern int stbi_jpeg_info_from_file (FILE *f, int *x, int *y, int *comp); +#endif + +// is it a png? +extern int stbi_png_test_memory (stbi_uc const *buffer, int len); +extern stbi_uc *stbi_png_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); +extern int stbi_png_info_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp); + +#ifndef STBI_NO_STDIO +extern stbi_uc *stbi_png_load (char const *filename, int *x, int *y, int *comp, int req_comp); +extern int stbi_png_info (char const *filename, int *x, int *y, int *comp); +extern int stbi_png_test_file (FILE *f); +extern stbi_uc *stbi_png_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); +extern int stbi_png_info_from_file (FILE *f, int *x, int *y, int *comp); +#endif + +// is it a bmp? +extern int stbi_bmp_test_memory (stbi_uc const *buffer, int len); + +extern stbi_uc *stbi_bmp_load (char const *filename, int *x, int *y, int *comp, int req_comp); +extern stbi_uc *stbi_bmp_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); +#ifndef STBI_NO_STDIO +extern int stbi_bmp_test_file (FILE *f); +extern stbi_uc *stbi_bmp_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); +#endif + +// is it a tga? +extern int stbi_tga_test_memory (stbi_uc const *buffer, int len); + +extern stbi_uc *stbi_tga_load (char const *filename, int *x, int *y, int *comp, int req_comp); +extern stbi_uc *stbi_tga_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); +#ifndef STBI_NO_STDIO +extern int stbi_tga_test_file (FILE *f); +extern stbi_uc *stbi_tga_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); +#endif + +// is it a psd? +extern int stbi_psd_test_memory (stbi_uc const *buffer, int len); + +extern stbi_uc *stbi_psd_load (char const *filename, int *x, int *y, int *comp, int req_comp); +extern stbi_uc *stbi_psd_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); +#ifndef STBI_NO_STDIO +extern int stbi_psd_test_file (FILE *f); +extern stbi_uc *stbi_psd_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); +#endif + +// is it an hdr? +extern int stbi_hdr_test_memory (stbi_uc const *buffer, int len); + +extern float * stbi_hdr_load (char const *filename, int *x, int *y, int *comp, int req_comp); +extern float * stbi_hdr_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); +#ifndef STBI_NO_STDIO +extern int stbi_hdr_test_file (FILE *f); +extern float * stbi_hdr_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); +#endif + +// is it a pic? 
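// Illustrative sketch (not from the upstream header): hooking a custom format
// into stbi_load() through the stbi_loader extension point declared earlier.
// The my_* functions are hypothetical placeholders; all of them must be
// provided, per the registration comment above.
//
//    static stbi_loader my_loader = {
//       my_test_memory, my_load_from_memory,
//    #ifndef STBI_NO_STDIO
//       my_test_file, my_load_from_file,
//    #endif
//    };
//
//    // once at startup; returns 0 only when the loader table is full
//    stbi_register_loader(&my_loader);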
+extern int      stbi_pic_test_memory      (stbi_uc const *buffer, int len);
+
+extern stbi_uc *stbi_pic_load             (char const *filename, int *x, int *y, int *comp, int req_comp);
+extern stbi_uc *stbi_pic_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp);
+#ifndef STBI_NO_STDIO
+extern int      stbi_pic_test_file        (FILE *f);
+extern stbi_uc *stbi_pic_load_from_file   (FILE *f, int *x, int *y, int *comp, int req_comp);
+#endif
+
+// is it a gif?
+extern int      stbi_gif_test_memory      (stbi_uc const *buffer, int len);
+
+extern stbi_uc *stbi_gif_load             (char const *filename, int *x, int *y, int *comp, int req_comp);
+extern stbi_uc *stbi_gif_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp);
+extern int      stbi_gif_info_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp);
+
+#ifndef STBI_NO_STDIO
+extern int      stbi_gif_test_file        (FILE *f);
+extern stbi_uc *stbi_gif_load_from_file   (FILE *f, int *x, int *y, int *comp, int req_comp);
+extern int      stbi_gif_info             (char const *filename, int *x, int *y, int *comp);
+extern int      stbi_gif_info_from_file   (FILE *f, int *x, int *y, int *comp);
+#endif
+
+#endif//STBI_TYPE_SPECIFIC_FUNCTIONS
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+//
+//
+////   end header file   /////////////////////////////////////////////////////
+#endif // STBI_INCLUDE_STB_IMAGE_H
+
+#ifndef STBI_HEADER_FILE_ONLY
+
+#ifndef STBI_NO_HDR
+#include <math.h>   // ldexp
+#include <string.h> // strcmp
+#endif
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif
+#include <stdlib.h>
+#include <memory.h>
+#include <assert.h>
+#include <stdarg.h>
+
+#ifndef _MSC_VER
+  #ifdef __cplusplus
+  #define __forceinline inline
+  #else
+  #define __forceinline
+  #endif
+#endif
+
+
+// implementation:
+typedef unsigned char  uint8;
+typedef unsigned short uint16;
+typedef   signed short  int16;
+typedef unsigned int   uint32;
+typedef   signed int    int32;
+typedef unsigned int   uint;
+
+// should produce compiler error if size is wrong
+typedef unsigned char validate_uint32[sizeof(uint32)==4 ? 1 : -1];
+
+#if defined(STBI_NO_STDIO) && !defined(STBI_NO_WRITE)
+#define STBI_NO_WRITE
+#endif
+
+#define STBI_NOTUSED(v)  v=v
+
+#ifdef _MSC_VER
+#define STBI_HAS_LRTOL
+#endif
+
+#ifdef STBI_HAS_LRTOL
+  #define stbi_lrot(x,y)  _lrotl(x,y)
+#else
+  #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Generic API that works on all image types
+//
+
+// deprecated functions
+
+// is it a jpeg?
+extern int      stbi_jpeg_test_memory     (stbi_uc const *buffer, int len);
+extern stbi_uc *stbi_jpeg_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp);
+extern int      stbi_jpeg_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp);
+
+#ifndef STBI_NO_STDIO
+extern stbi_uc *stbi_jpeg_load            (char const *filename, int *x, int *y, int *comp, int req_comp);
+extern int      stbi_jpeg_test_file       (FILE *f);
+extern stbi_uc *stbi_jpeg_load_from_file  (FILE *f, int *x, int *y, int *comp, int req_comp);
+
+extern int      stbi_jpeg_info            (char const *filename, int *x, int *y, int *comp);
+extern int      stbi_jpeg_info_from_file  (FILE *f, int *x, int *y, int *comp);
+#endif
+
+// is it a png?
+extern int stbi_png_test_memory (stbi_uc const *buffer, int len); +extern stbi_uc *stbi_png_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); +extern int stbi_png_info_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp); + +#ifndef STBI_NO_STDIO +extern stbi_uc *stbi_png_load (char const *filename, int *x, int *y, int *comp, int req_comp); +extern int stbi_png_info (char const *filename, int *x, int *y, int *comp); +extern int stbi_png_test_file (FILE *f); +extern stbi_uc *stbi_png_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); +extern int stbi_png_info_from_file (FILE *f, int *x, int *y, int *comp); +#endif + +// is it a bmp? +extern int stbi_bmp_test_memory (stbi_uc const *buffer, int len); + +extern stbi_uc *stbi_bmp_load (char const *filename, int *x, int *y, int *comp, int req_comp); +extern stbi_uc *stbi_bmp_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); +#ifndef STBI_NO_STDIO +extern int stbi_bmp_test_file (FILE *f); +extern stbi_uc *stbi_bmp_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); +#endif + +// is it a tga? +extern int stbi_tga_test_memory (stbi_uc const *buffer, int len); + +extern stbi_uc *stbi_tga_load (char const *filename, int *x, int *y, int *comp, int req_comp); +extern stbi_uc *stbi_tga_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); +#ifndef STBI_NO_STDIO +extern int stbi_tga_test_file (FILE *f); +extern stbi_uc *stbi_tga_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); +#endif + +// is it a psd? +extern int stbi_psd_test_memory (stbi_uc const *buffer, int len); + +extern stbi_uc *stbi_psd_load (char const *filename, int *x, int *y, int *comp, int req_comp); +extern stbi_uc *stbi_psd_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); +#ifndef STBI_NO_STDIO +extern int stbi_psd_test_file (FILE *f); +extern stbi_uc *stbi_psd_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); +#endif + +// is it an hdr? +extern int stbi_hdr_test_memory (stbi_uc const *buffer, int len); + +extern float * stbi_hdr_load (char const *filename, int *x, int *y, int *comp, int req_comp); +extern float * stbi_hdr_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); +#ifndef STBI_NO_STDIO +extern int stbi_hdr_test_file (FILE *f); +extern float * stbi_hdr_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); +#endif + +// is it a pic? +extern int stbi_pic_test_memory (stbi_uc const *buffer, int len); + +extern stbi_uc *stbi_pic_load (char const *filename, int *x, int *y, int *comp, int req_comp); +extern stbi_uc *stbi_pic_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); +#ifndef STBI_NO_STDIO +extern int stbi_pic_test_file (FILE *f); +extern stbi_uc *stbi_pic_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); +#endif + +// is it a gif? 
+extern int stbi_gif_test_memory (stbi_uc const *buffer, int len); + +extern stbi_uc *stbi_gif_load (char const *filename, int *x, int *y, int *comp, int req_comp); +extern stbi_uc *stbi_gif_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp); +extern int stbi_gif_info_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp); + +#ifndef STBI_NO_STDIO +extern int stbi_gif_test_file (FILE *f); +extern stbi_uc *stbi_gif_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp); +extern int stbi_gif_info (char const *filename, int *x, int *y, int *comp); +extern int stbi_gif_info_from_file (FILE *f, int *x, int *y, int *comp); +#endif + + +// this is not threadsafe +static const char *failure_reason; + +const char *stbi_failure_reason(void) +{ + return failure_reason; +} + +static int e(const char *str) +{ + failure_reason = str; + return 0; +} + +#ifdef STBI_NO_FAILURE_STRINGS + #define e(x,y) 0 +#elif defined(STBI_FAILURE_USERMSG) + #define e(x,y) e(y) +#else + #define e(x,y) e(x) +#endif + +#define epf(x,y) ((float *) (e(x,y)?NULL:NULL)) +#define epuc(x,y) ((unsigned char *) (e(x,y)?NULL:NULL)) + +void stbi_image_free(void *retval_from_stbi_load) +{ + free(retval_from_stbi_load); +} + +#define MAX_LOADERS 32 +stbi_loader *loaders[MAX_LOADERS]; +static int max_loaders = 0; + +int stbi_register_loader(stbi_loader *loader) +{ + int i; + for (i=0; i < MAX_LOADERS; ++i) { + // already present? + if (loaders[i] == loader) + return 1; + // end of the list? + if (loaders[i] == NULL) { + loaders[i] = loader; + max_loaders = i+1; + return 1; + } + } + // no room for it + return 0; +} + +#ifndef STBI_NO_HDR +static float *ldr_to_hdr(stbi_uc *data, int x, int y, int comp); +static stbi_uc *hdr_to_ldr(float *data, int x, int y, int comp); +#endif + +#ifndef STBI_NO_STDIO +unsigned char *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + FILE *f = fopen(filename, "rb"); + unsigned char *result; + if (!f) return epuc("can't fopen", "Unable to open file"); + result = stbi_load_from_file(f,x,y,comp,req_comp); + fclose(f); + return result; +} + +unsigned char *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + int i; + if (stbi_jpeg_test_file(f)) return stbi_jpeg_load_from_file(f,x,y,comp,req_comp); + if (stbi_png_test_file(f)) return stbi_png_load_from_file(f,x,y,comp,req_comp); + if (stbi_bmp_test_file(f)) return stbi_bmp_load_from_file(f,x,y,comp,req_comp); + if (stbi_gif_test_file(f)) return stbi_gif_load_from_file(f,x,y,comp,req_comp); + if (stbi_psd_test_file(f)) return stbi_psd_load_from_file(f,x,y,comp,req_comp); + if (stbi_pic_test_file(f)) return stbi_pic_load_from_file(f,x,y,comp,req_comp); + + #ifndef STBI_NO_HDR + if (stbi_hdr_test_file(f)) { + float *hdr = stbi_hdr_load_from_file(f, x,y,comp,req_comp); + return hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp); + } + #endif + + for (i=0; i < max_loaders; ++i) + if (loaders[i]->test_file(f)) + return loaders[i]->load_from_file(f,x,y,comp,req_comp); + // test tga last because it's a crappy test! 
+ if (stbi_tga_test_file(f)) + return stbi_tga_load_from_file(f,x,y,comp,req_comp); + return epuc("unknown image type", "Image not of any known type, or corrupt"); +} +#endif + +unsigned char *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + int i; + if (stbi_jpeg_test_memory(buffer,len)) return stbi_jpeg_load_from_memory(buffer,len,x,y,comp,req_comp); + if (stbi_png_test_memory(buffer,len)) return stbi_png_load_from_memory(buffer,len,x,y,comp,req_comp); + if (stbi_bmp_test_memory(buffer,len)) return stbi_bmp_load_from_memory(buffer,len,x,y,comp,req_comp); + if (stbi_gif_test_memory(buffer,len)) return stbi_gif_load_from_memory(buffer,len,x,y,comp,req_comp); + if (stbi_psd_test_memory(buffer,len)) return stbi_psd_load_from_memory(buffer,len,x,y,comp,req_comp); + if (stbi_pic_test_memory(buffer,len)) return stbi_pic_load_from_memory(buffer,len,x,y,comp,req_comp); + + #ifndef STBI_NO_HDR + if (stbi_hdr_test_memory(buffer, len)) { + float *hdr = stbi_hdr_load_from_memory(buffer, len,x,y,comp,req_comp); + return hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp); + } + #endif + + for (i=0; i < max_loaders; ++i) + if (loaders[i]->test_memory(buffer,len)) + return loaders[i]->load_from_memory(buffer,len,x,y,comp,req_comp); + // test tga last because it's a crappy test! + if (stbi_tga_test_memory(buffer,len)) + return stbi_tga_load_from_memory(buffer,len,x,y,comp,req_comp); + return epuc("unknown image type", "Image not of any known type, or corrupt"); +} + +#ifndef STBI_NO_HDR + +#ifndef STBI_NO_STDIO +float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + FILE *f = fopen(filename, "rb"); + float *result; + if (!f) return epf("can't fopen", "Unable to open file"); + result = stbi_loadf_from_file(f,x,y,comp,req_comp); + fclose(f); + return result; +} + +float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + unsigned char *data; + #ifndef STBI_NO_HDR + if (stbi_hdr_test_file(f)) + return stbi_hdr_load_from_file(f,x,y,comp,req_comp); + #endif + data = stbi_load_from_file(f, x, y, comp, req_comp); + if (data) + return ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp); + return epf("unknown image type", "Image not of any known type, or corrupt"); +} +#endif + +float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi_uc *data; + #ifndef STBI_NO_HDR + if (stbi_hdr_test_memory(buffer, len)) + return stbi_hdr_load_from_memory(buffer, len,x,y,comp,req_comp); + #endif + data = stbi_load_from_memory(buffer, len, x, y, comp, req_comp); + if (data) + return ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp); + return epf("unknown image type", "Image not of any known type, or corrupt"); +} +#endif + +// these is-hdr-or-not is defined independent of whether STBI_NO_HDR is +// defined, for API simplicity; if STBI_NO_HDR is defined, it always +// reports false! 
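// Illustrative sketch (not from the upstream source): choosing between the
// byte and float interfaces with the query described above; buffer and len
// stand for image data already read into memory.
//
//    if (stbi_is_hdr_from_memory(buffer, len)) {
//       float *hdr = stbi_loadf_from_memory(buffer, len, &x, &y, &comp, 0);
//       // linear float data straight from the Radiance decode
//    } else {
//       stbi_uc *ldr = stbi_load_from_memory(buffer, len, &x, &y, &comp, 0);
//       // 8-bit data; loading the same file via stbi_loadf_from_memory would
//       // instead promote it as pow(p/255, l2h_gamma) * l2h_scale
//    }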
+ +int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len) +{ + #ifndef STBI_NO_HDR + return stbi_hdr_test_memory(buffer, len); + #else + STBI_NOTUSED(buffer); + STBI_NOTUSED(len); + return 0; + #endif +} + +#ifndef STBI_NO_STDIO +extern int stbi_is_hdr (char const *filename) +{ + FILE *f = fopen(filename, "rb"); + int result=0; + if (f) { + result = stbi_is_hdr_from_file(f); + fclose(f); + } + return result; +} + +extern int stbi_is_hdr_from_file(FILE *f) +{ + #ifndef STBI_NO_HDR + return stbi_hdr_test_file(f); + #else + return 0; + #endif +} + +#endif + +#ifndef STBI_NO_HDR +static float h2l_gamma_i=1.0f/2.2f, h2l_scale_i=1.0f; +static float l2h_gamma=2.2f, l2h_scale=1.0f; + +void stbi_hdr_to_ldr_gamma(float gamma) { h2l_gamma_i = 1/gamma; } +void stbi_hdr_to_ldr_scale(float scale) { h2l_scale_i = 1/scale; } + +void stbi_ldr_to_hdr_gamma(float gamma) { l2h_gamma = gamma; } +void stbi_ldr_to_hdr_scale(float scale) { l2h_scale = scale; } +#endif + + +////////////////////////////////////////////////////////////////////////////// +// +// Common code used by all image loaders +// + +enum +{ + SCAN_load=0, + SCAN_type, + SCAN_header +}; + +typedef struct +{ + uint32 img_x, img_y; + int img_n, img_out_n; + + #ifndef STBI_NO_STDIO + FILE *img_file; + int buflen; + uint8 buffer_start[128]; + int from_file; + #endif + uint8 *img_buffer, *img_buffer_end; +} stbi; + +#ifndef STBI_NO_STDIO +static void start_file(stbi *s, FILE *f) +{ + s->img_file = f; + s->buflen = sizeof(s->buffer_start); + s->img_buffer_end = s->buffer_start + s->buflen; + s->img_buffer = s->img_buffer_end; + s->from_file = 1; +} +#endif + +static void start_mem(stbi *s, uint8 const *buffer, int len) +{ +#ifndef STBI_NO_STDIO + s->img_file = NULL; + s->from_file = 0; +#endif + s->img_buffer = (uint8 *) buffer; + s->img_buffer_end = (uint8 *) buffer+len; +} + +#ifndef STBI_NO_STDIO +static void refill_buffer(stbi *s) +{ + int n = fread(s->buffer_start, 1, s->buflen, s->img_file); + if (n == 0) { + s->from_file = 0; + s->img_buffer = s->img_buffer_end-1; + *s->img_buffer = 0; + } else { + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + n; + } +} +#endif + +__forceinline static int get8(stbi *s) +{ + if (s->img_buffer < s->img_buffer_end) + return *s->img_buffer++; +#ifndef STBI_NO_STDIO + if (s->from_file) { + refill_buffer(s); + return *s->img_buffer++; + } +#endif + return 0; +} + +__forceinline static int at_eof(stbi *s) +{ +#ifndef STBI_NO_STDIO + if (s->img_file) { + if (!feof(s->img_file)) return 0; + // if feof() is true, check if buffer = end + // special case: we've only got the special 0 character at the end + if (s->from_file == 0) return 1; + } +#endif + return s->img_buffer >= s->img_buffer_end; +} + +__forceinline static uint8 get8u(stbi *s) +{ + return (uint8) get8(s); +} + +static void skip(stbi *s, int n) +{ +#ifndef STBI_NO_STDIO + if (s->img_file) { + int blen = s->img_buffer_end - s->img_buffer; + if (blen < n) { + s->img_buffer = s->img_buffer_end; + fseek(s->img_file, n - blen, SEEK_CUR); + return; + } + } +#endif + s->img_buffer += n; +} + +static int getn(stbi *s, stbi_uc *buffer, int n) +{ +#ifndef STBI_NO_STDIO + if (s->img_file) { + int blen = s->img_buffer_end - s->img_buffer; + if (blen < n) { + int res; + memcpy(buffer, s->img_buffer, blen); + res = ((int) fread(buffer + blen, 1, n - blen, s->img_file) == (n-blen)); + s->img_buffer = s->img_buffer_end; + return res; + } + } +#endif + if (s->img_buffer+n <= s->img_buffer_end) { + memcpy(buffer, s->img_buffer, n); + 
s->img_buffer += n; + return 1; + } else + return 0; +} + +static int get16(stbi *s) +{ + int z = get8(s); + return (z << 8) + get8(s); +} + +static uint32 get32(stbi *s) +{ + uint32 z = get16(s); + return (z << 16) + get16(s); +} + +static int get16le(stbi *s) +{ + int z = get8(s); + return z + (get8(s) << 8); +} + +static uint32 get32le(stbi *s) +{ + uint32 z = get16le(s); + return z + (get16le(s) << 16); +} + +////////////////////////////////////////////////////////////////////////////// +// +// generic converter from built-in img_n to req_comp +// individual types do this automatically as much as possible (e.g. jpeg +// does all cases internally since it needs to colorspace convert anyway, +// and it never has alpha, so very few cases ). png can automatically +// interleave an alpha=255 channel, but falls back to this for other cases +// +// assume data buffer is malloced, so malloc a new one and free that one +// only failure mode is malloc failing + +static uint8 compute_y(int r, int g, int b) +{ + return (uint8) (((r*77) + (g*150) + (29*b)) >> 8); +} + +static unsigned char *convert_format(unsigned char *data, int img_n, int req_comp, uint x, uint y) +{ + int i,j; + unsigned char *good; + + if (req_comp == img_n) return data; + assert(req_comp >= 1 && req_comp <= 4); + + good = (unsigned char *) malloc(req_comp * x * y); + if (good == NULL) { + free(data); + return epuc("outofmem", "Out of memory"); + } + + for (j=0; j < (int) y; ++j) { + unsigned char *src = data + j * x * img_n ; + unsigned char *dest = good + j * x * req_comp; + + #define COMBO(a,b) ((a)*8+(b)) + #define CASE(a,b) case COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (COMBO(img_n, req_comp)) { + CASE(1,2) dest[0]=src[0], dest[1]=255; break; + CASE(1,3) dest[0]=dest[1]=dest[2]=src[0]; break; + CASE(1,4) dest[0]=dest[1]=dest[2]=src[0], dest[3]=255; break; + CASE(2,1) dest[0]=src[0]; break; + CASE(2,3) dest[0]=dest[1]=dest[2]=src[0]; break; + CASE(2,4) dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1]; break; + CASE(3,4) dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=255; break; + CASE(3,1) dest[0]=compute_y(src[0],src[1],src[2]); break; + CASE(3,2) dest[0]=compute_y(src[0],src[1],src[2]), dest[1] = 255; break; + CASE(4,1) dest[0]=compute_y(src[0],src[1],src[2]); break; + CASE(4,2) dest[0]=compute_y(src[0],src[1],src[2]), dest[1] = src[3]; break; + CASE(4,3) dest[0]=src[0],dest[1]=src[1],dest[2]=src[2]; break; + default: assert(0); + } + #undef CASE + } + + free(data); + return good; +} + +#ifndef STBI_NO_HDR +static float *ldr_to_hdr(stbi_uc *data, int x, int y, int comp) +{ + int i,k,n; + float *output = (float *) malloc(x * y * comp * sizeof(float)); + if (output == NULL) { free(data); return epf("outofmem", "Out of memory"); } + // compute number of non-alpha components + if (comp & 1) n = comp; else n = comp-1; + for (i=0; i < x*y; ++i) { + for (k=0; k < n; ++k) { + output[i*comp + k] = (float) pow(data[i*comp+k]/255.0f, l2h_gamma) * l2h_scale; + } + if (k < comp) output[i*comp + k] = data[i*comp+k]/255.0f; + } + free(data); + return output; +} + +#define float2int(x) ((int) (x)) +static stbi_uc *hdr_to_ldr(float *data, int x, int y, int comp) +{ + int i,k,n; + stbi_uc *output = (stbi_uc *) malloc(x * y * comp); + if (output == NULL) { free(data); return epuc("outofmem", "Out of memory"); } + // compute number of non-alpha components + if 
(comp & 1) n = comp; else n = comp-1; + for (i=0; i < x*y; ++i) { + for (k=0; k < n; ++k) { + float z = (float) pow(data[i*comp+k]*h2l_scale_i, h2l_gamma_i) * 255 + 0.5f; + if (z < 0) z = 0; + if (z > 255) z = 255; + output[i*comp + k] = (uint8) float2int(z); + } + if (k < comp) { + float z = data[i*comp+k] * 255 + 0.5f; + if (z < 0) z = 0; + if (z > 255) z = 255; + output[i*comp + k] = (uint8) float2int(z); + } + } + free(data); + return output; +} +#endif + +////////////////////////////////////////////////////////////////////////////// +// +// "baseline" JPEG/JFIF decoder (not actually fully baseline implementation) +// +// simple implementation +// - channel subsampling of at most 2 in each dimension +// - doesn't support delayed output of y-dimension +// - simple interface (only one output format: 8-bit interleaved RGB) +// - doesn't try to recover corrupt jpegs +// - doesn't allow partial loading, loading multiple at once +// - still fast on x86 (copying globals into locals doesn't help x86) +// - allocates lots of intermediate memory (full size of all components) +// - non-interleaved case requires this anyway +// - allows good upsampling (see next) +// high-quality +// - upsampled channels are bilinearly interpolated, even across blocks +// - quality integer IDCT derived from IJG's 'slow' +// performance +// - fast huffman; reasonable integer IDCT +// - uses a lot of intermediate memory, could cache poorly +// - load http://nothings.org/remote/anemones.jpg 3 times on 2.8Ghz P4 +// stb_jpeg: 1.34 seconds (MSVC6, default release build) +// stb_jpeg: 1.06 seconds (MSVC6, processor = Pentium Pro) +// IJL11.dll: 1.08 seconds (compiled by intel) +// IJG 1998: 0.98 seconds (MSVC6, makefile provided by IJG) +// IJG 1998: 0.95 seconds (MSVC6, makefile + proc=PPro) + +// huffman decoding acceleration +#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache + +typedef struct +{ + uint8 fast[1 << FAST_BITS]; + // weirdly, repacking this into AoS is a 10% speed loss, instead of a win + uint16 code[256]; + uint8 values[256]; + uint8 size[257]; + unsigned int maxcode[18]; + int delta[17]; // old 'firstsymbol' - old 'firstcode' +} huffman; + +typedef struct +{ + #ifdef STBI_SIMD + unsigned short dequant2[4][64]; + #endif + stbi s; + huffman huff_dc[4]; + huffman huff_ac[4]; + uint8 dequant[4][64]; + +// sizes for components, interleaved MCUs + int img_h_max, img_v_max; + int img_mcu_x, img_mcu_y; + int img_mcu_w, img_mcu_h; + +// definition of jpeg image component + struct + { + int id; + int h,v; + int tq; + int hd,ha; + int dc_pred; + + int x,y,w2,h2; + uint8 *data; + void *raw_data; + uint8 *linebuf; + } img_comp[4]; + + uint32 code_buffer; // jpeg entropy-coded buffer + int code_bits; // number of valid bits + unsigned char marker; // marker seen while filling entropy buffer + int nomore; // flag if we saw a marker so must stop + + int scan_n, order[4]; + int restart_interval, todo; +} jpeg; + +static int build_huffman(huffman *h, int *count) +{ + int i,j,k=0,code; + // build size list for each symbol (from JPEG spec) + for (i=0; i < 16; ++i) + for (j=0; j < count[i]; ++j) + h->size[k++] = (uint8) (i+1); + h->size[k] = 0; + + // compute actual symbols (from jpeg spec) + code = 0; + k = 0; + for(j=1; j <= 16; ++j) { + // compute delta to add to code to compute symbol id + h->delta[j] = k - code; + if (h->size[k] == j) { + while (h->size[k] == j) + h->code[k++] = (uint16) (code++); + if (code-1 >= (1 << j)) return e("bad code lengths","Corrupt JPEG"); + } + // compute largest 
code + 1 for this size, preshifted as needed later + h->maxcode[j] = code << (16-j); + code <<= 1; + } + h->maxcode[j] = 0xffffffff; + + // build non-spec acceleration table; 255 is flag for not-accelerated + memset(h->fast, 255, 1 << FAST_BITS); + for (i=0; i < k; ++i) { + int s = h->size[i]; + if (s <= FAST_BITS) { + int c = h->code[i] << (FAST_BITS-s); + int m = 1 << (FAST_BITS-s); + for (j=0; j < m; ++j) { + h->fast[c+j] = (uint8) i; + } + } + } + return 1; +} + +static void grow_buffer_unsafe(jpeg *j) +{ + do { + int b = j->nomore ? 0 : get8(&j->s); + if (b == 0xff) { + int c = get8(&j->s); + if (c != 0) { + j->marker = (unsigned char) c; + j->nomore = 1; + return; + } + } + j->code_buffer |= b << (24 - j->code_bits); + j->code_bits += 8; + } while (j->code_bits <= 24); +} + +// (1 << n) - 1 +static uint32 bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535}; + +// decode a jpeg huffman value from the bitstream +__forceinline static int decode(jpeg *j, huffman *h) +{ + unsigned int temp; + int c,k; + + if (j->code_bits < 16) grow_buffer_unsafe(j); + + // look at the top FAST_BITS and determine what symbol ID it is, + // if the code is <= FAST_BITS + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + k = h->fast[c]; + if (k < 255) { + int s = h->size[k]; + if (s > j->code_bits) + return -1; + j->code_buffer <<= s; + j->code_bits -= s; + return h->values[k]; + } + + // naive test is to shift the code_buffer down so k bits are + // valid, then test against maxcode. To speed this up, we've + // preshifted maxcode left so that it has (16-k) 0s at the + // end; in other words, regardless of the number of bits, it + // wants to be compared against something shifted to have 16; + // that way we don't need to shift inside the loop. + temp = j->code_buffer >> 16; + for (k=FAST_BITS+1 ; ; ++k) + if (temp < h->maxcode[k]) + break; + if (k == 17) { + // error! code not found + j->code_bits -= 16; + return -1; + } + + if (k > j->code_bits) + return -1; + + // convert the huffman code to the symbol id + c = ((j->code_buffer >> (32 - k)) & bmask[k]) + h->delta[k]; + assert((((j->code_buffer) >> (32 - h->size[c])) & bmask[h->size[c]]) == h->code[c]); + + // convert the id to a symbol + j->code_bits -= k; + j->code_buffer <<= k; + return h->values[c]; +} + +// combined JPEG 'receive' and JPEG 'extend', since baseline +// always extends everything it receives. +__forceinline static int extend_receive(jpeg *j, int n) +{ + unsigned int m = 1 << (n-1); + unsigned int k; + if (j->code_bits < n) grow_buffer_unsafe(j); + + #if 1 + k = stbi_lrot(j->code_buffer, n); + j->code_buffer = k & ~bmask[n]; + k &= bmask[n]; + j->code_bits -= n; + #else + k = (j->code_buffer >> (32 - n)) & bmask[n]; + j->code_bits -= n; + j->code_buffer <<= n; + #endif + // the following test is probably a random branch that won't + // predict well. I tried to table accelerate it but failed. + // maybe it's compiling as a conditional move? + if (k < m) + return (-1 << n) + k + 1; + else + return k; +} + +// given a value that's at position X in the zigzag stream, +// where does it appear in the 8x8 matrix coded as row-major? 
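// Worked example (added for clarity, not upstream): the table below answers
// that question; e.g. zigzag stream position 2 maps to entry 8, i.e. row 1,
// column 0 of the 8x8 block, and the 15 padding entries at the end keep a
// corrupt stream's out-of-range index pointing at coefficient 63.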
+static uint8 dezigzag[64+15] = +{ + 0, 1, 8, 16, 9, 2, 3, 10, + 17, 24, 32, 25, 18, 11, 4, 5, + 12, 19, 26, 33, 40, 48, 41, 34, + 27, 20, 13, 6, 7, 14, 21, 28, + 35, 42, 49, 56, 57, 50, 43, 36, + 29, 22, 15, 23, 30, 37, 44, 51, + 58, 59, 52, 45, 38, 31, 39, 46, + 53, 60, 61, 54, 47, 55, 62, 63, + // let corrupt input sample past end + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63 +}; + +// decode one 64-entry block-- +static int decode_block(jpeg *j, short data[64], huffman *hdc, huffman *hac, int b) +{ + int diff,dc,k; + int t = decode(j, hdc); + if (t < 0) return e("bad huffman code","Corrupt JPEG"); + + // 0 all the ac values now so we can do it 32-bits at a time + memset(data,0,64*sizeof(data[0])); + + diff = t ? extend_receive(j, t) : 0; + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + data[0] = (short) dc; + + // decode AC components, see JPEG spec + k = 1; + do { + int r,s; + int rs = decode(j, hac); + if (rs < 0) return e("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (rs != 0xf0) break; // end block + k += 16; + } else { + k += r; + // decode into unzigzag'd location + data[dezigzag[k++]] = (short) extend_receive(j,s); + } + } while (k < 64); + return 1; +} + +// take a -128..127 value and clamp it and convert to 0..255 +__forceinline static uint8 clamp(int x) +{ + // trick to use a single test to catch both cases + if ((unsigned int) x > 255) { + if (x < 0) return 0; + if (x > 255) return 255; + } + return (uint8) x; +} + +#define f2f(x) (int) (((x) * 4096 + 0.5)) +#define fsh(x) ((x) << 12) + +// derived from jidctint -- DCT_ISLOW +#define IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \ + int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \ + p2 = s2; \ + p3 = s6; \ + p1 = (p2+p3) * f2f(0.5411961f); \ + t2 = p1 + p3*f2f(-1.847759065f); \ + t3 = p1 + p2*f2f( 0.765366865f); \ + p2 = s0; \ + p3 = s4; \ + t0 = fsh(p2+p3); \ + t1 = fsh(p2-p3); \ + x0 = t0+t3; \ + x3 = t0-t3; \ + x1 = t1+t2; \ + x2 = t1-t2; \ + t0 = s7; \ + t1 = s5; \ + t2 = s3; \ + t3 = s1; \ + p3 = t0+t2; \ + p4 = t1+t3; \ + p1 = t0+t3; \ + p2 = t1+t2; \ + p5 = (p3+p4)*f2f( 1.175875602f); \ + t0 = t0*f2f( 0.298631336f); \ + t1 = t1*f2f( 2.053119869f); \ + t2 = t2*f2f( 3.072711026f); \ + t3 = t3*f2f( 1.501321110f); \ + p1 = p5 + p1*f2f(-0.899976223f); \ + p2 = p5 + p2*f2f(-2.562915447f); \ + p3 = p3*f2f(-1.961570560f); \ + p4 = p4*f2f(-0.390180644f); \ + t3 += p1+p4; \ + t2 += p2+p3; \ + t1 += p2+p4; \ + t0 += p1+p3; + +#ifdef STBI_SIMD +typedef unsigned short stbi_dequantize_t; +#else +typedef uint8 stbi_dequantize_t; +#endif + +// .344 seconds on 3*anemones.jpg +static void idct_block(uint8 *out, int out_stride, short data[64], stbi_dequantize_t *dequantize) +{ + int i,val[64],*v=val; + stbi_dequantize_t *dq = dequantize; + uint8 *o; + short *d = data; + + // columns + for (i=0; i < 8; ++i,++d,++dq, ++v) { + // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing + if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0 + && d[40]==0 && d[48]==0 && d[56]==0) { + // no shortcut 0 seconds + // (1|2|3|4|5|6|7)==0 0 seconds + // all separate -0.047 seconds + // 1 && 2|3 && 4|5 && 6|7: -0.047 seconds + int dcterm = d[0] * dq[0] << 2; + v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm; + } else { + IDCT_1D(d[ 0]*dq[ 0],d[ 8]*dq[ 8],d[16]*dq[16],d[24]*dq[24], + d[32]*dq[32],d[40]*dq[40],d[48]*dq[48],d[56]*dq[56]) + // constants scaled things up by 1<<12; let's bring them back + // down, but keep 2 extra bits of precision + x0 += 512; x1 
+= 512; x2 += 512; x3 += 512; + v[ 0] = (x0+t3) >> 10; + v[56] = (x0-t3) >> 10; + v[ 8] = (x1+t2) >> 10; + v[48] = (x1-t2) >> 10; + v[16] = (x2+t1) >> 10; + v[40] = (x2-t1) >> 10; + v[24] = (x3+t0) >> 10; + v[32] = (x3-t0) >> 10; + } + } + + for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) { + // no fast case since the first 1D IDCT spread components out + IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7]) + // constants scaled things up by 1<<12, plus we had 1<<2 from first + // loop, plus horizontal and vertical each scale by sqrt(8) so together + // we've got an extra 1<<3, so 1<<17 total we need to remove. + // so we want to round that, which means adding 0.5 * 1<<17, + // aka 65536. Also, we'll end up with -128 to 127 that we want + // to encode as 0..255 by adding 128, so we'll add that before the shift + x0 += 65536 + (128<<17); + x1 += 65536 + (128<<17); + x2 += 65536 + (128<<17); + x3 += 65536 + (128<<17); + // tried computing the shifts into temps, or'ing the temps to see + // if any were out of range, but that was slower + o[0] = clamp((x0+t3) >> 17); + o[7] = clamp((x0-t3) >> 17); + o[1] = clamp((x1+t2) >> 17); + o[6] = clamp((x1-t2) >> 17); + o[2] = clamp((x2+t1) >> 17); + o[5] = clamp((x2-t1) >> 17); + o[3] = clamp((x3+t0) >> 17); + o[4] = clamp((x3-t0) >> 17); + } +} + +#ifdef STBI_SIMD +static stbi_idct_8x8 stbi_idct_installed = idct_block; + +extern void stbi_install_idct(stbi_idct_8x8 func) +{ + stbi_idct_installed = func; +} +#endif + +#define MARKER_none 0xff +// if there's a pending marker from the entropy stream, return that +// otherwise, fetch from the stream and get a marker. if there's no +// marker, return 0xff, which is never a valid marker value +static uint8 get_marker(jpeg *j) +{ + uint8 x; + if (j->marker != MARKER_none) { x = j->marker; j->marker = MARKER_none; return x; } + x = get8u(&j->s); + if (x != 0xff) return MARKER_none; + while (x == 0xff) + x = get8u(&j->s); + return x; +} + +// in each scan, we'll have scan_n components, and the order +// of the components is specified by order[] +#define RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7) + +// after a restart interval, reset the entropy decoder and +// the dc prediction +static void reset(jpeg *j) +{ + j->code_bits = 0; + j->code_buffer = 0; + j->nomore = 0; + j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = 0; + j->marker = MARKER_none; + j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff; + // no more than 1<<31 MCUs if no restart_interal? 
that's plenty safe, + // since we don't even allow 1<<30 pixels +} + +static int parse_entropy_coded_data(jpeg *z) +{ + reset(z); + if (z->scan_n == 1) { + int i,j; + #ifdef STBI_SIMD + __declspec(align(16)) + #endif + short data[64]; + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + if (!decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+z->img_comp[n].ha, n)) return 0; + #ifdef STBI_SIMD + stbi_idct_installed(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data, z->dequant2[z->img_comp[n].tq]); + #else + idct_block(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data, z->dequant[z->img_comp[n].tq]); + #endif + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) grow_buffer_unsafe(z); + // if it's NOT a restart, then just bail, so we get corrupt data + // rather than no data + if (!RESTART(z->marker)) return 1; + reset(z); + } + } + } + } else { // interleaved! + int i,j,k,x,y; + short data[64]; + for (j=0; j < z->img_mcu_y; ++j) { + for (i=0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... process scan_n components in order + for (k=0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y=0; y < z->img_comp[n].v; ++y) { + for (x=0; x < z->img_comp[n].h; ++x) { + int x2 = (i*z->img_comp[n].h + x)*8; + int y2 = (j*z->img_comp[n].v + y)*8; + if (!decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+z->img_comp[n].ha, n)) return 0; + #ifdef STBI_SIMD + stbi_idct_installed(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data, z->dequant2[z->img_comp[n].tq]); + #else + idct_block(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data, z->dequant[z->img_comp[n].tq]); + #endif + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) grow_buffer_unsafe(z); + // if it's NOT a restart, then just bail, so we get corrupt data + // rather than no data + if (!RESTART(z->marker)) return 1; + reset(z); + } + } + } + } + return 1; +} + +static int process_marker(jpeg *z, int m) +{ + int L; + switch (m) { + case MARKER_none: // no marker found + return e("expected marker","Corrupt JPEG"); + + case 0xC2: // SOF - progressive + return e("progressive jpeg","JPEG format not supported (progressive)"); + + case 0xDD: // DRI - specify restart interval + if (get16(&z->s) != 4) return e("bad DRI len","Corrupt JPEG"); + z->restart_interval = get16(&z->s); + return 1; + + case 0xDB: // DQT - define quantization table + L = get16(&z->s)-2; + while (L > 0) { + int q = get8(&z->s); + int p = q >> 4; + int t = q & 15,i; + if (p != 0) return e("bad DQT type","Corrupt JPEG"); + if (t > 3) return e("bad DQT table","Corrupt JPEG"); + for (i=0; i < 64; ++i) + z->dequant[t][dezigzag[i]] = get8u(&z->s); + #ifdef STBI_SIMD + for (i=0; i < 64; ++i) + z->dequant2[t][i] = z->dequant[t][i]; + #endif + L -= 65; + } + return L==0; + + case 0xC4: // DHT - define huffman table + L = get16(&z->s)-2; + while 
(L > 0) { + uint8 *v; + int sizes[16],i,m=0; + int q = get8(&z->s); + int tc = q >> 4; + int th = q & 15; + if (tc > 1 || th > 3) return e("bad DHT header","Corrupt JPEG"); + for (i=0; i < 16; ++i) { + sizes[i] = get8(&z->s); + m += sizes[i]; + } + L -= 17; + if (tc == 0) { + if (!build_huffman(z->huff_dc+th, sizes)) return 0; + v = z->huff_dc[th].values; + } else { + if (!build_huffman(z->huff_ac+th, sizes)) return 0; + v = z->huff_ac[th].values; + } + for (i=0; i < m; ++i) + v[i] = get8u(&z->s); + L -= m; + } + return L==0; + } + // check for comment block or APP blocks + if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { + skip(&z->s, get16(&z->s)-2); + return 1; + } + return 0; +} + +// after we see SOS +static int process_scan_header(jpeg *z) +{ + int i; + int Ls = get16(&z->s); + z->scan_n = get8(&z->s); + if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s.img_n) return e("bad SOS component count","Corrupt JPEG"); + if (Ls != 6+2*z->scan_n) return e("bad SOS len","Corrupt JPEG"); + for (i=0; i < z->scan_n; ++i) { + int id = get8(&z->s), which; + int q = get8(&z->s); + for (which = 0; which < z->s.img_n; ++which) + if (z->img_comp[which].id == id) + break; + if (which == z->s.img_n) return 0; + z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3) return e("bad DC huff","Corrupt JPEG"); + z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3) return e("bad AC huff","Corrupt JPEG"); + z->order[i] = which; + } + if (get8(&z->s) != 0) return e("bad SOS","Corrupt JPEG"); + get8(&z->s); // should be 63, but might be 0 + if (get8(&z->s) != 0) return e("bad SOS","Corrupt JPEG"); + + return 1; +} + +static int process_frame_header(jpeg *z, int scan) +{ + stbi *s = &z->s; + int Lf,p,i,q, h_max=1,v_max=1,c; + Lf = get16(s); if (Lf < 11) return e("bad SOF len","Corrupt JPEG"); // JPEG + p = get8(s); if (p != 8) return e("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline + s->img_y = get16(s); if (s->img_y == 0) return e("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG + s->img_x = get16(s); if (s->img_x == 0) return e("0 width","Corrupt JPEG"); // JPEG requires + c = get8(s); + if (c != 3 && c != 1) return e("bad component count","Corrupt JPEG"); // JFIF requires + s->img_n = c; + for (i=0; i < c; ++i) { + z->img_comp[i].data = NULL; + z->img_comp[i].linebuf = NULL; + } + + if (Lf != 8+3*s->img_n) return e("bad SOF len","Corrupt JPEG"); + + for (i=0; i < s->img_n; ++i) { + z->img_comp[i].id = get8(s); + if (z->img_comp[i].id != i+1) // JFIF requires + if (z->img_comp[i].id != i) // some version of jpegtran outputs non-JFIF-compliant files! 
+ return e("bad component ID","Corrupt JPEG"); + q = get8(s); + z->img_comp[i].h = (q >> 4); if (!z->img_comp[i].h || z->img_comp[i].h > 4) return e("bad H","Corrupt JPEG"); + z->img_comp[i].v = q & 15; if (!z->img_comp[i].v || z->img_comp[i].v > 4) return e("bad V","Corrupt JPEG"); + z->img_comp[i].tq = get8(s); if (z->img_comp[i].tq > 3) return e("bad TQ","Corrupt JPEG"); + } + + if (scan != SCAN_load) return 1; + + if ((1 << 30) / s->img_x / s->img_n < s->img_y) return e("too large", "Image too large to decode"); + + for (i=0; i < s->img_n; ++i) { + if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h; + if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v; + } + + // compute interleaved mcu info + z->img_h_max = h_max; + z->img_v_max = v_max; + z->img_mcu_w = h_max * 8; + z->img_mcu_h = v_max * 8; + z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w; + z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h; + + for (i=0; i < s->img_n; ++i) { + // number of effective pixels (e.g. for non-interleaved MCU) + z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max; + z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max; + // to simplify generation, we'll allocate enough memory to decode + // the bogus oversized data from using interleaved MCUs and their + // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't + // discard the extra data until colorspace conversion + z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8; + z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8; + z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15); + if (z->img_comp[i].raw_data == NULL) { + for(--i; i >= 0; --i) { + free(z->img_comp[i].raw_data); + z->img_comp[i].data = NULL; + } + return e("outofmem", "Out of memory"); + } + // align blocks for installable-idct using mmx/sse + z->img_comp[i].data = (uint8*) (((size_t) z->img_comp[i].raw_data + 15) & ~15); + z->img_comp[i].linebuf = NULL; + } + + return 1; +} + +// use comparisons since in some cases we handle more than one case (e.g. 
SOF) +#define DNL(x) ((x) == 0xdc) +#define SOI(x) ((x) == 0xd8) +#define EOI(x) ((x) == 0xd9) +#define SOF(x) ((x) == 0xc0 || (x) == 0xc1) +#define SOS(x) ((x) == 0xda) + +static int decode_jpeg_header(jpeg *z, int scan) +{ + int m; + z->marker = MARKER_none; // initialize cached marker to empty + m = get_marker(z); + if (!SOI(m)) return e("no SOI","Corrupt JPEG"); + if (scan == SCAN_type) return 1; + m = get_marker(z); + while (!SOF(m)) { + if (!process_marker(z,m)) return 0; + m = get_marker(z); + while (m == MARKER_none) { + // some files have extra padding after their blocks, so ok, we'll scan + if (at_eof(&z->s)) return e("no SOF", "Corrupt JPEG"); + m = get_marker(z); + } + } + if (!process_frame_header(z, scan)) return 0; + return 1; +} + +static int decode_jpeg_image(jpeg *j) +{ + int m; + j->restart_interval = 0; + if (!decode_jpeg_header(j, SCAN_load)) return 0; + m = get_marker(j); + while (!EOI(m)) { + if (SOS(m)) { + if (!process_scan_header(j)) return 0; + if (!parse_entropy_coded_data(j)) return 0; + if (j->marker == MARKER_none ) { + // handle 0s at the end of image data from IP Kamera 9060 + while (!at_eof(&j->s)) { + int x = get8(&j->s); + if (x == 255) { + j->marker = get8u(&j->s); + break; + } else if (x != 0) { + return 0; + } + } + // if we reach eof without hitting a marker, get_marker() below will fail and we'll eventually return 0 + } + } else { + if (!process_marker(j, m)) return 0; + } + m = get_marker(j); + } + return 1; +} + +// static jfif-centered resampling (across block boundaries) + +typedef uint8 *(*resample_row_func)(uint8 *out, uint8 *in0, uint8 *in1, + int w, int hs); + +#define div4(x) ((uint8) ((x) >> 2)) + +static uint8 *resample_row_1(uint8 *out, uint8 *in_near, uint8 *in_far, int w, int hs) +{ + STBI_NOTUSED(out); + STBI_NOTUSED(in_far); + STBI_NOTUSED(w); + STBI_NOTUSED(hs); + return in_near; +} + +static uint8* resample_row_v_2(uint8 *out, uint8 *in_near, uint8 *in_far, int w, int hs) +{ + // need to generate two samples vertically for every one in input + int i; + STBI_NOTUSED(hs); + for (i=0; i < w; ++i) + out[i] = div4(3*in_near[i] + in_far[i] + 2); + return out; +} + +static uint8* resample_row_h_2(uint8 *out, uint8 *in_near, uint8 *in_far, int w, int hs) +{ + // need to generate two samples horizontally for every one in input + int i; + uint8 *input = in_near; + + if (w == 1) { + // if only one sample, can't do any interpolation + out[0] = out[1] = input[0]; + return out; + } + + out[0] = input[0]; + out[1] = div4(input[0]*3 + input[1] + 2); + for (i=1; i < w-1; ++i) { + int n = 3*input[i]+2; + out[i*2+0] = div4(n+input[i-1]); + out[i*2+1] = div4(n+input[i+1]); + } + out[i*2+0] = div4(input[w-2]*3 + input[w-1] + 2); + out[i*2+1] = input[w-1]; + + STBI_NOTUSED(in_far); + STBI_NOTUSED(hs); + + return out; +} + +#define div16(x) ((uint8) ((x) >> 4)) + +static uint8 *resample_row_hv_2(uint8 *out, uint8 *in_near, uint8 *in_far, int w, int hs) +{ + // need to generate 2x2 samples for every one in input + int i,t0,t1; + if (w == 1) { + out[0] = out[1] = div4(3*in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3*in_near[0] + in_far[0]; + out[0] = div4(t1+2); + for (i=1; i < w; ++i) { + t0 = t1; + t1 = 3*in_near[i]+in_far[i]; + out[i*2-1] = div16(3*t0 + t1 + 8); + out[i*2 ] = div16(3*t1 + t0 + 8); + } + out[w*2-1] = div4(t1+2); + + STBI_NOTUSED(hs); + + return out; +} + +static uint8 *resample_row_generic(uint8 *out, uint8 *in_near, uint8 *in_far, int w, int hs) +{ + // resample with nearest-neighbor + int i,j; + in_far = in_far; + for (i=0; 
i < w; ++i) + for (j=0; j < hs; ++j) + out[i*hs+j] = in_near[i]; + return out; +} + +#define float2fixed(x) ((int) ((x) * 65536 + 0.5)) + +// 0.38 seconds on 3*anemones.jpg (0.25 with processor = Pro) +// VC6 without processor=Pro is generating multiple LEAs per multiply! +static void YCbCr_to_RGB_row(uint8 *out, const uint8 *y, const uint8 *pcb, const uint8 *pcr, int count, int step) +{ + int i; + for (i=0; i < count; ++i) { + int y_fixed = (y[i] << 16) + 32768; // rounding + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr*float2fixed(1.40200f); + g = y_fixed - cr*float2fixed(0.71414f) - cb*float2fixed(0.34414f); + b = y_fixed + cb*float2fixed(1.77200f); + r >>= 16; + g >>= 16; + b >>= 16; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (uint8)r; + out[1] = (uint8)g; + out[2] = (uint8)b; + out[3] = 255; + out += step; + } +} + +#ifdef STBI_SIMD +static stbi_YCbCr_to_RGB_run stbi_YCbCr_installed = YCbCr_to_RGB_row; + +void stbi_install_YCbCr_to_RGB(stbi_YCbCr_to_RGB_run func) +{ + stbi_YCbCr_installed = func; +} +#endif + + +// clean up the temporary component buffers +static void cleanup_jpeg(jpeg *j) +{ + int i; + for (i=0; i < j->s.img_n; ++i) { + if (j->img_comp[i].data) { + free(j->img_comp[i].raw_data); + j->img_comp[i].data = NULL; + } + if (j->img_comp[i].linebuf) { + free(j->img_comp[i].linebuf); + j->img_comp[i].linebuf = NULL; + } + } +} + +typedef struct +{ + resample_row_func resample; + uint8 *line0,*line1; + int hs,vs; // expansion factor in each axis + int w_lores; // horizontal pixels pre-expansion + int ystep; // how far through vertical expansion we are + int ypos; // which pre-expansion row we're on +} stbi_resample; + +static uint8 *load_jpeg_image(jpeg *z, int *out_x, int *out_y, int *comp, int req_comp) +{ + int n, decode_n; + // validate req_comp + if (req_comp < 0 || req_comp > 4) return epuc("bad req_comp", "Internal error"); + z->s.img_n = 0; + + // load a jpeg image from whichever source + if (!decode_jpeg_image(z)) { cleanup_jpeg(z); return NULL; } + + // determine actual number of components to generate + n = req_comp ? 
req_comp : z->s.img_n; + + if (z->s.img_n == 3 && n < 3) + decode_n = 1; + else + decode_n = z->s.img_n; + + // resample and color-convert + { + int k; + uint i,j; + uint8 *output; + uint8 *coutput[4]; + + stbi_resample res_comp[4]; + + for (k=0; k < decode_n; ++k) { + stbi_resample *r = &res_comp[k]; + + // allocate line buffer big enough for upsampling off the edges + // with upsample factor of 4 + z->img_comp[k].linebuf = (uint8 *) malloc(z->s.img_x + 3); + if (!z->img_comp[k].linebuf) { cleanup_jpeg(z); return epuc("outofmem", "Out of memory"); } + + r->hs = z->img_h_max / z->img_comp[k].h; + r->vs = z->img_v_max / z->img_comp[k].v; + r->ystep = r->vs >> 1; + r->w_lores = (z->s.img_x + r->hs-1) / r->hs; + r->ypos = 0; + r->line0 = r->line1 = z->img_comp[k].data; + + if (r->hs == 1 && r->vs == 1) r->resample = resample_row_1; + else if (r->hs == 1 && r->vs == 2) r->resample = resample_row_v_2; + else if (r->hs == 2 && r->vs == 1) r->resample = resample_row_h_2; + else if (r->hs == 2 && r->vs == 2) r->resample = resample_row_hv_2; + else r->resample = resample_row_generic; + } + + // can't error after this so, this is safe + output = (uint8 *) malloc(n * z->s.img_x * z->s.img_y + 1); + if (!output) { cleanup_jpeg(z); return epuc("outofmem", "Out of memory"); } + + // now go ahead and resample + for (j=0; j < z->s.img_y; ++j) { + uint8 *out = output + n * z->s.img_x * j; + for (k=0; k < decode_n; ++k) { + stbi_resample *r = &res_comp[k]; + int y_bot = r->ystep >= (r->vs >> 1); + coutput[k] = r->resample(z->img_comp[k].linebuf, + y_bot ? r->line1 : r->line0, + y_bot ? r->line0 : r->line1, + r->w_lores, r->hs); + if (++r->ystep >= r->vs) { + r->ystep = 0; + r->line0 = r->line1; + if (++r->ypos < z->img_comp[k].y) + r->line1 += z->img_comp[k].w2; + } + } + if (n >= 3) { + uint8 *y = coutput[0]; + if (z->s.img_n == 3) { + #ifdef STBI_SIMD + stbi_YCbCr_installed(out, y, coutput[1], coutput[2], z->s.img_x, n); + #else + YCbCr_to_RGB_row(out, y, coutput[1], coutput[2], z->s.img_x, n); + #endif + } else + for (i=0; i < z->s.img_x; ++i) { + out[0] = out[1] = out[2] = y[i]; + out[3] = 255; // not used if n==3 + out += n; + } + } else { + uint8 *y = coutput[0]; + if (n == 1) + for (i=0; i < z->s.img_x; ++i) out[i] = y[i]; + else + for (i=0; i < z->s.img_x; ++i) *out++ = y[i], *out++ = 255; + } + } + cleanup_jpeg(z); + *out_x = z->s.img_x; + *out_y = z->s.img_y; + if (comp) *comp = z->s.img_n; // report original components, not output + return output; + } +} + +#ifndef STBI_NO_STDIO +unsigned char *stbi_jpeg_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + jpeg j; + start_file(&j.s, f); + return load_jpeg_image(&j, x,y,comp,req_comp); +} + +unsigned char *stbi_jpeg_load(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + unsigned char *data; + FILE *f = fopen(filename, "rb"); + if (!f) return NULL; + data = stbi_jpeg_load_from_file(f,x,y,comp,req_comp); + fclose(f); + return data; +} +#endif + +unsigned char *stbi_jpeg_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + #ifdef STBI_SMALL_STACK + unsigned char *result; + jpeg *j = (jpeg *) malloc(sizeof(*j)); + start_mem(&j->s, buffer, len); + result = load_jpeg_image(j,x,y,comp,req_comp); + free(j); + return result; + #else + jpeg j; + start_mem(&j.s, buffer,len); + return load_jpeg_image(&j, x,y,comp,req_comp); + #endif +} + +static int stbi_jpeg_info_raw(jpeg *j, int *x, int *y, int *comp) +{ + if (!decode_jpeg_header(j, SCAN_header)) + return 0; + if (x) *x = 
j->s.img_x; + if (y) *y = j->s.img_y; + if (comp) *comp = j->s.img_n; + return 1; +} + +#ifndef STBI_NO_STDIO +int stbi_jpeg_test_file(FILE *f) +{ + int n,r; + jpeg j; + n = ftell(f); + start_file(&j.s, f); + r = decode_jpeg_header(&j, SCAN_type); + fseek(f,n,SEEK_SET); + return r; +} + +int stbi_jpeg_info_from_file(FILE *f, int *x, int *y, int *comp) +{ + jpeg j; + long n = ftell(f); + int res; + start_file(&j.s, f); + res = stbi_jpeg_info_raw(&j, x, y, comp); + fseek(f, n, SEEK_SET); + return res; +} + +int stbi_jpeg_info(char const *filename, int *x, int *y, int *comp) +{ + FILE *f = fopen(filename, "rb"); + int result; + if (!f) return e("can't fopen", "Unable to open file"); + result = stbi_jpeg_info_from_file(f, x, y, comp); + fclose(f); + return result; +} +#endif + +int stbi_jpeg_test_memory(stbi_uc const *buffer, int len) +{ + jpeg j; + start_mem(&j.s, buffer,len); + return decode_jpeg_header(&j, SCAN_type); +} + +int stbi_jpeg_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp) +{ + jpeg j; + start_mem(&j.s, buffer, len); + return stbi_jpeg_info_raw(&j, x, y, comp); +} + +#ifndef STBI_NO_STDIO +extern int stbi_jpeg_info (char const *filename, int *x, int *y, int *comp); +extern int stbi_jpeg_info_from_file (FILE *f, int *x, int *y, int *comp); +#endif +extern int stbi_jpeg_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp); + +// public domain zlib decode v0.2 Sean Barrett 2006-11-18 +// simple implementation +// - all input must be provided in an upfront buffer +// - all output is written to a single output buffer (can malloc/realloc) +// performance +// - fast huffman + +// fast-way is faster to check than jpeg huffman, but slow way is slower +#define ZFAST_BITS 9 // accelerate all cases in default tables +#define ZFAST_MASK ((1 << ZFAST_BITS) - 1) + +// zlib-style huffman encoding +// (jpegs packs from left, zlib from right, so can't share code) +typedef struct +{ + uint16 fast[1 << ZFAST_BITS]; + uint16 firstcode[16]; + int maxcode[17]; + uint16 firstsymbol[16]; + uint8 size[288]; + uint16 value[288]; +} zhuffman; + +__forceinline static int bitreverse16(int n) +{ + n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1); + n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2); + n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4); + n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8); + return n; +} + +__forceinline static int bit_reverse(int v, int bits) +{ + assert(bits <= 16); + // to bit reverse n bits, reverse 16 and shift + // e.g. 
11 bits, bit reverse and shift away 5 + return bitreverse16(v) >> (16-bits); +} + +static int zbuild_huffman(zhuffman *z, uint8 *sizelist, int num) +{ + int i,k=0; + int code, next_code[16], sizes[17]; + + // DEFLATE spec for generating codes + memset(sizes, 0, sizeof(sizes)); + memset(z->fast, 255, sizeof(z->fast)); + for (i=0; i < num; ++i) + ++sizes[sizelist[i]]; + sizes[0] = 0; + for (i=1; i < 16; ++i) + assert(sizes[i] <= (1 << i)); + code = 0; + for (i=1; i < 16; ++i) { + next_code[i] = code; + z->firstcode[i] = (uint16) code; + z->firstsymbol[i] = (uint16) k; + code = (code + sizes[i]); + if (sizes[i]) + if (code-1 >= (1 << i)) return e("bad codelengths","Corrupt JPEG"); + z->maxcode[i] = code << (16-i); // preshift for inner loop + code <<= 1; + k += sizes[i]; + } + z->maxcode[16] = 0x10000; // sentinel + for (i=0; i < num; ++i) { + int s = sizelist[i]; + if (s) { + int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; + z->size[c] = (uint8)s; + z->value[c] = (uint16)i; + if (s <= ZFAST_BITS) { + int k = bit_reverse(next_code[s],s); + while (k < (1 << ZFAST_BITS)) { + z->fast[k] = (uint16) c; + k += (1 << s); + } + } + ++next_code[s]; + } + } + return 1; +} + +// zlib-from-memory implementation for PNG reading +// because PNG allows splitting the zlib stream arbitrarily, +// and it's annoying structurally to have PNG call ZLIB call PNG, +// we require PNG read all the IDATs and combine them into a single +// memory buffer + +typedef struct +{ + uint8 *zbuffer, *zbuffer_end; + int num_bits; + uint32 code_buffer; + + char *zout; + char *zout_start; + char *zout_end; + int z_expandable; + + zhuffman z_length, z_distance; +} zbuf; + +__forceinline static int zget8(zbuf *z) +{ + if (z->zbuffer >= z->zbuffer_end) return 0; + return *z->zbuffer++; +} + +static void fill_bits(zbuf *z) +{ + do { + assert(z->code_buffer < (1U << z->num_bits)); + z->code_buffer |= zget8(z) << z->num_bits; + z->num_bits += 8; + } while (z->num_bits <= 24); +} + +__forceinline static unsigned int zreceive(zbuf *z, int n) +{ + unsigned int k; + if (z->num_bits < n) fill_bits(z); + k = z->code_buffer & ((1 << n) - 1); + z->code_buffer >>= n; + z->num_bits -= n; + return k; +} + +__forceinline static int zhuffman_decode(zbuf *a, zhuffman *z) +{ + int b,s,k; + if (a->num_bits < 16) fill_bits(a); + b = z->fast[a->code_buffer & ZFAST_MASK]; + if (b < 0xffff) { + s = z->size[b]; + a->code_buffer >>= s; + a->num_bits -= s; + return z->value[b]; + } + + // not resolved by fast table, so compute it the slow way + // use jpeg approach, which requires MSbits at top + k = bit_reverse(a->code_buffer, 16); + for (s=ZFAST_BITS+1; ; ++s) + if (k < z->maxcode[s]) + break; + if (s == 16) return -1; // invalid code! 
+ // code size is s, so: + b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s]; + assert(z->size[b] == s); + a->code_buffer >>= s; + a->num_bits -= s; + return z->value[b]; +} + +static int expand(zbuf *z, int n) // need to make room for n bytes +{ + char *q; + int cur, limit; + if (!z->z_expandable) return e("output buffer limit","Corrupt PNG"); + cur = (int) (z->zout - z->zout_start); + limit = (int) (z->zout_end - z->zout_start); + while (cur + n > limit) + limit *= 2; + q = (char *) realloc(z->zout_start, limit); + if (q == NULL) return e("outofmem", "Out of memory"); + z->zout_start = q; + z->zout = q + cur; + z->zout_end = q + limit; + return 1; +} + +static int length_base[31] = { + 3,4,5,6,7,8,9,10,11,13, + 15,17,19,23,27,31,35,43,51,59, + 67,83,99,115,131,163,195,227,258,0,0 }; + +static int length_extra[31]= +{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 }; + +static int dist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, +257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0}; + +static int dist_extra[32] = +{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13}; + +static int parse_huffman_block(zbuf *a) +{ + for(;;) { + int z = zhuffman_decode(a, &a->z_length); + if (z < 256) { + if (z < 0) return e("bad huffman code","Corrupt PNG"); // error in huffman codes + if (a->zout >= a->zout_end) if (!expand(a, 1)) return 0; + *a->zout++ = (char) z; + } else { + uint8 *p; + int len,dist; + if (z == 256) return 1; + z -= 257; + len = length_base[z]; + if (length_extra[z]) len += zreceive(a, length_extra[z]); + z = zhuffman_decode(a, &a->z_distance); + if (z < 0) return e("bad huffman code","Corrupt PNG"); + dist = dist_base[z]; + if (dist_extra[z]) dist += zreceive(a, dist_extra[z]); + if (a->zout - a->zout_start < dist) return e("bad dist","Corrupt PNG"); + if (a->zout + len > a->zout_end) if (!expand(a, len)) return 0; + p = (uint8 *) (a->zout - dist); + while (len--) + *a->zout++ = *p++; + } + } +} + +static int compute_huffman_codes(zbuf *a) +{ + static uint8 length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 }; + zhuffman z_codelength; + uint8 lencodes[286+32+137];//padding for maximum single op + uint8 codelength_sizes[19]; + int i,n; + + int hlit = zreceive(a,5) + 257; + int hdist = zreceive(a,5) + 1; + int hclen = zreceive(a,4) + 4; + + memset(codelength_sizes, 0, sizeof(codelength_sizes)); + for (i=0; i < hclen; ++i) { + int s = zreceive(a,3); + codelength_sizes[length_dezigzag[i]] = (uint8) s; + } + if (!zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0; + + n = 0; + while (n < hlit + hdist) { + int c = zhuffman_decode(a, &z_codelength); + assert(c >= 0 && c < 19); + if (c < 16) + lencodes[n++] = (uint8) c; + else if (c == 16) { + c = zreceive(a,2)+3; + memset(lencodes+n, lencodes[n-1], c); + n += c; + } else if (c == 17) { + c = zreceive(a,3)+3; + memset(lencodes+n, 0, c); + n += c; + } else { + assert(c == 18); + c = zreceive(a,7)+11; + memset(lencodes+n, 0, c); + n += c; + } + } + if (n != hlit+hdist) return e("bad codelengths","Corrupt PNG"); + if (!zbuild_huffman(&a->z_length, lencodes, hlit)) return 0; + if (!zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0; + return 1; +} + +static int parse_uncompressed_block(zbuf *a) +{ + uint8 header[4]; + int len,nlen,k; + if (a->num_bits & 7) + zreceive(a, a->num_bits & 7); // discard + // drain the bit-packed data into header + k = 0; + while (a->num_bits > 0) { + header[k++] = (uint8) (a->code_buffer & 255); // wtf 
this warns? + a->code_buffer >>= 8; + a->num_bits -= 8; + } + assert(a->num_bits == 0); + // now fill header the normal way + while (k < 4) + header[k++] = (uint8) zget8(a); + len = header[1] * 256 + header[0]; + nlen = header[3] * 256 + header[2]; + if (nlen != (len ^ 0xffff)) return e("zlib corrupt","Corrupt PNG"); + if (a->zbuffer + len > a->zbuffer_end) return e("read past buffer","Corrupt PNG"); + if (a->zout + len > a->zout_end) + if (!expand(a, len)) return 0; + memcpy(a->zout, a->zbuffer, len); + a->zbuffer += len; + a->zout += len; + return 1; +} + +static int parse_zlib_header(zbuf *a) +{ + int cmf = zget8(a); + int cm = cmf & 15; + /* int cinfo = cmf >> 4; */ + int flg = zget8(a); + if ((cmf*256+flg) % 31 != 0) return e("bad zlib header","Corrupt PNG"); // zlib spec + if (flg & 32) return e("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png + if (cm != 8) return e("bad compression","Corrupt PNG"); // DEFLATE required for png + // window = 1 << (8 + cinfo)... but who cares, we fully buffer output + return 1; +} + +// @TODO: should statically initialize these for optimal thread safety +static uint8 default_length[288], default_distance[32]; +static void init_defaults(void) +{ + int i; // use <= to match clearly with spec + for (i=0; i <= 143; ++i) default_length[i] = 8; + for ( ; i <= 255; ++i) default_length[i] = 9; + for ( ; i <= 279; ++i) default_length[i] = 7; + for ( ; i <= 287; ++i) default_length[i] = 8; + + for (i=0; i <= 31; ++i) default_distance[i] = 5; +} + +int stbi_png_partial; // a quick hack to only allow decoding some of a PNG... I should implement real streaming support instead +static int parse_zlib(zbuf *a, int parse_header) +{ + int final, type; + if (parse_header) + if (!parse_zlib_header(a)) return 0; + a->num_bits = 0; + a->code_buffer = 0; + do { + final = zreceive(a,1); + type = zreceive(a,2); + if (type == 0) { + if (!parse_uncompressed_block(a)) return 0; + } else if (type == 3) { + return 0; + } else { + if (type == 1) { + // use fixed code lengths + if (!default_distance[31]) init_defaults(); + if (!zbuild_huffman(&a->z_length , default_length , 288)) return 0; + if (!zbuild_huffman(&a->z_distance, default_distance, 32)) return 0; + } else { + if (!compute_huffman_codes(a)) return 0; + } + if (!parse_huffman_block(a)) return 0; + } + if (stbi_png_partial && a->zout - a->zout_start > 65536) + break; + } while (!final); + return 1; +} + +static int do_zlib(zbuf *a, char *obuf, int olen, int exp, int parse_header) +{ + a->zout_start = obuf; + a->zout = obuf; + a->zout_end = obuf + olen; + a->z_expandable = exp; + + return parse_zlib(a, parse_header); +} + +char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen) +{ + zbuf a; + char *p = (char *) malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (uint8 *) buffer; + a.zbuffer_end = (uint8 *) buffer + len; + if (do_zlib(&a, p, initial_size, 1, 1)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + free(a.zout_start); + return NULL; + } +} + +char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen) +{ + return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen); +} + +char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header) +{ + zbuf a; + char *p = (char *) malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (uint8 *) buffer; + a.zbuffer_end = (uint8 *) buffer + len; + if 
(do_zlib(&a, p, initial_size, 1, parse_header)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + free(a.zout_start); + return NULL; + } +} + +int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen) +{ + zbuf a; + a.zbuffer = (uint8 *) ibuffer; + a.zbuffer_end = (uint8 *) ibuffer + ilen; + if (do_zlib(&a, obuffer, olen, 0, 1)) + return (int) (a.zout - a.zout_start); + else + return -1; +} + +char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen) +{ + zbuf a; + char *p = (char *) malloc(16384); + if (p == NULL) return NULL; + a.zbuffer = (uint8 *) buffer; + a.zbuffer_end = (uint8 *) buffer+len; + if (do_zlib(&a, p, 16384, 1, 0)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + free(a.zout_start); + return NULL; + } +} + +int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen) +{ + zbuf a; + a.zbuffer = (uint8 *) ibuffer; + a.zbuffer_end = (uint8 *) ibuffer + ilen; + if (do_zlib(&a, obuffer, olen, 0, 0)) + return (int) (a.zout - a.zout_start); + else + return -1; +} + +// public domain "baseline" PNG decoder v0.10 Sean Barrett 2006-11-18 +// simple implementation +// - only 8-bit samples +// - no CRC checking +// - allocates lots of intermediate memory +// - avoids problem of streaming data between subsystems +// - avoids explicit window management +// performance +// - uses stb_zlib, a PD zlib implementation with fast huffman decoding + + +typedef struct +{ + uint32 length; + uint32 type; +} chunk; + +#define PNG_TYPE(a,b,c,d) (((a) << 24) + ((b) << 16) + ((c) << 8) + (d)) + +static chunk get_chunk_header(stbi *s) +{ + chunk c; + c.length = get32(s); + c.type = get32(s); + return c; +} + +static int check_png_header(stbi *s) +{ + static uint8 png_sig[8] = { 137,80,78,71,13,10,26,10 }; + int i; + for (i=0; i < 8; ++i) + if (get8(s) != png_sig[i]) return e("bad png sig","Not a PNG"); + return 1; +} + +typedef struct +{ + stbi s; + uint8 *idata, *expanded, *out; +} png; + + +enum { + F_none=0, F_sub=1, F_up=2, F_avg=3, F_paeth=4, + F_avg_first, F_paeth_first +}; + +static uint8 first_row_filter[5] = +{ + F_none, F_sub, F_none, F_avg_first, F_paeth_first +}; + +static int paeth(int a, int b, int c) +{ + int p = a + b - c; + int pa = abs(p-a); + int pb = abs(p-b); + int pc = abs(p-c); + if (pa <= pb && pa <= pc) return a; + if (pb <= pc) return b; + return c; +} + +// create the png data from post-deflated data +static int create_png_image_raw(png *a, uint8 *raw, uint32 raw_len, int out_n, uint32 x, uint32 y) +{ + stbi *s = &a->s; + uint32 i,j,stride = x*out_n; + int k; + int img_n = s->img_n; // copy it into a local for later + assert(out_n == s->img_n || out_n == s->img_n+1); + if (stbi_png_partial) y = 1; + a->out = (uint8 *) malloc(x * y * out_n); + if (!a->out) return e("outofmem", "Out of memory"); + if (!stbi_png_partial) { + if (s->img_x == x && s->img_y == y) { + if (raw_len != (img_n * x + 1) * y) return e("not enough pixels","Corrupt PNG"); + } else { // interlaced: + if (raw_len < (img_n * x + 1) * y) return e("not enough pixels","Corrupt PNG"); + } + } + for (j=0; j < y; ++j) { + uint8 *cur = a->out + stride*j; + uint8 *prior = cur - stride; + int filter = *raw++; + if (filter > 4) return e("invalid filter","Corrupt PNG"); + // if first row, use special filter that doesn't sample previous row + if (j == 0) filter = first_row_filter[filter]; + // handle first pixel explicitly + for (k=0; k < img_n; ++k) { + 
switch (filter) { + case F_none : cur[k] = raw[k]; break; + case F_sub : cur[k] = raw[k]; break; + case F_up : cur[k] = raw[k] + prior[k]; break; + case F_avg : cur[k] = raw[k] + (prior[k]>>1); break; + case F_paeth : cur[k] = (uint8) (raw[k] + paeth(0,prior[k],0)); break; + case F_avg_first : cur[k] = raw[k]; break; + case F_paeth_first: cur[k] = raw[k]; break; + } + } + if (img_n != out_n) cur[img_n] = 255; + raw += img_n; + cur += out_n; + prior += out_n; + // this is a little gross, so that we don't switch per-pixel or per-component + if (img_n == out_n) { + #define CASE(f) \ + case f: \ + for (i=x-1; i >= 1; --i, raw+=img_n,cur+=img_n,prior+=img_n) \ + for (k=0; k < img_n; ++k) + switch (filter) { + CASE(F_none) cur[k] = raw[k]; break; + CASE(F_sub) cur[k] = raw[k] + cur[k-img_n]; break; + CASE(F_up) cur[k] = raw[k] + prior[k]; break; + CASE(F_avg) cur[k] = raw[k] + ((prior[k] + cur[k-img_n])>>1); break; + CASE(F_paeth) cur[k] = (uint8) (raw[k] + paeth(cur[k-img_n],prior[k],prior[k-img_n])); break; + CASE(F_avg_first) cur[k] = raw[k] + (cur[k-img_n] >> 1); break; + CASE(F_paeth_first) cur[k] = (uint8) (raw[k] + paeth(cur[k-img_n],0,0)); break; + } + #undef CASE + } else { + assert(img_n+1 == out_n); + #define CASE(f) \ + case f: \ + for (i=x-1; i >= 1; --i, cur[img_n]=255,raw+=img_n,cur+=out_n,prior+=out_n) \ + for (k=0; k < img_n; ++k) + switch (filter) { + CASE(F_none) cur[k] = raw[k]; break; + CASE(F_sub) cur[k] = raw[k] + cur[k-out_n]; break; + CASE(F_up) cur[k] = raw[k] + prior[k]; break; + CASE(F_avg) cur[k] = raw[k] + ((prior[k] + cur[k-out_n])>>1); break; + CASE(F_paeth) cur[k] = (uint8) (raw[k] + paeth(cur[k-out_n],prior[k],prior[k-out_n])); break; + CASE(F_avg_first) cur[k] = raw[k] + (cur[k-out_n] >> 1); break; + CASE(F_paeth_first) cur[k] = (uint8) (raw[k] + paeth(cur[k-out_n],0,0)); break; + } + #undef CASE + } + } + return 1; +} + +static int create_png_image(png *a, uint8 *raw, uint32 raw_len, int out_n, int interlaced) +{ + uint8 *final; + int p; + int save; + if (!interlaced) + return create_png_image_raw(a, raw, raw_len, out_n, a->s.img_x, a->s.img_y); + save = stbi_png_partial; + stbi_png_partial = 0; + + // de-interlacing + final = (uint8 *) malloc(a->s.img_x * a->s.img_y * out_n); + for (p=0; p < 7; ++p) { + int xorig[] = { 0,4,0,2,0,1,0 }; + int yorig[] = { 0,0,4,0,2,0,1 }; + int xspc[] = { 8,8,4,4,2,2,1 }; + int yspc[] = { 8,8,8,4,4,2,2 }; + int i,j,x,y; + // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 + x = (a->s.img_x - xorig[p] + xspc[p]-1) / xspc[p]; + y = (a->s.img_y - yorig[p] + yspc[p]-1) / yspc[p]; + if (x && y) { + if (!create_png_image_raw(a, raw, raw_len, out_n, x, y)) { + free(final); + return 0; + } + for (j=0; j < y; ++j) + for (i=0; i < x; ++i) + memcpy(final + (j*yspc[p]+yorig[p])*a->s.img_x*out_n + (i*xspc[p]+xorig[p])*out_n, + a->out + (j*x+i)*out_n, out_n); + free(a->out); + raw += (x*out_n+1)*y; + raw_len -= (x*out_n+1)*y; + } + } + a->out = final; + + stbi_png_partial = save; + return 1; +} + +static int compute_transparency(png *z, uint8 tc[3], int out_n) +{ + stbi *s = &z->s; + uint32 i, pixel_count = s->img_x * s->img_y; + uint8 *p = z->out; + + // compute color-based transparency, assuming we've + // already got 255 as the alpha value in the output + assert(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i=0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 
0 : 255); + p += 2; + } + } else { + for (i=0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; +} + +static int expand_palette(png *a, uint8 *palette, int len, int pal_img_n) +{ + uint32 i, pixel_count = a->s.img_x * a->s.img_y; + uint8 *p, *temp_out, *orig = a->out; + + p = (uint8 *) malloc(pixel_count * pal_img_n); + if (p == NULL) return e("outofmem", "Out of memory"); + + // between here and free(out) below, exitting would leak + temp_out = p; + + if (pal_img_n == 3) { + for (i=0; i < pixel_count; ++i) { + int n = orig[i]*4; + p[0] = palette[n ]; + p[1] = palette[n+1]; + p[2] = palette[n+2]; + p += 3; + } + } else { + for (i=0; i < pixel_count; ++i) { + int n = orig[i]*4; + p[0] = palette[n ]; + p[1] = palette[n+1]; + p[2] = palette[n+2]; + p[3] = palette[n+3]; + p += 4; + } + } + free(a->out); + a->out = temp_out; + + STBI_NOTUSED(len); + + return 1; +} + +static int stbi_unpremultiply_on_load = 0; +static int stbi_de_iphone_flag = 0; + +void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply) +{ + stbi_unpremultiply_on_load = flag_true_if_should_unpremultiply; +} +void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert) +{ + stbi_de_iphone_flag = flag_true_if_should_convert; +} + +static void stbi_de_iphone(png *z) +{ + stbi *s = &z->s; + uint32 i, pixel_count = s->img_x * s->img_y; + uint8 *p = z->out; + + if (s->img_out_n == 3) { // convert bgr to rgb + for (i=0; i < pixel_count; ++i) { + uint8 t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 3; + } + } else { + assert(s->img_out_n == 4); + if (stbi_unpremultiply_on_load) { + // convert bgr to rgb and unpremultiply + for (i=0; i < pixel_count; ++i) { + uint8 a = p[3]; + uint8 t = p[0]; + if (a) { + p[0] = p[2] * 255 / a; + p[1] = p[1] * 255 / a; + p[2] = t * 255 / a; + } else { + p[0] = p[2]; + p[2] = t; + } + p += 4; + } + } else { + // convert bgr to rgb + for (i=0; i < pixel_count; ++i) { + uint8 t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 4; + } + } + } +} + +static int parse_png_file(png *z, int scan, int req_comp) +{ + uint8 palette[1024], pal_img_n=0; + uint8 has_trans=0, tc[3]; + uint32 ioff=0, idata_limit=0, i, pal_len=0; + int first=1,k,interlace=0, iphone=0; + stbi *s = &z->s; + + if (!check_png_header(s)) return 0; + + if (scan == SCAN_type) return 1; + + for (;;) { + chunk c = get_chunk_header(s); + switch (c.type) { + case PNG_TYPE('C','g','B','I'): + iphone = stbi_de_iphone_flag; + skip(s, c.length); + break; + case PNG_TYPE('I','H','D','R'): { + int depth,color,comp,filter; + if (!first) return e("multiple IHDR","Corrupt PNG"); + first = 0; + if (c.length != 13) return e("bad IHDR len","Corrupt PNG"); + s->img_x = get32(s); if (s->img_x > (1 << 24)) return e("too large","Very large image (corrupt?)"); + s->img_y = get32(s); if (s->img_y > (1 << 24)) return e("too large","Very large image (corrupt?)"); + depth = get8(s); if (depth != 8) return e("8bit only","PNG not supported: 8-bit only"); + color = get8(s); if (color > 6) return e("bad ctype","Corrupt PNG"); + if (color == 3) pal_img_n = 3; else if (color & 1) return e("bad ctype","Corrupt PNG"); + comp = get8(s); if (comp) return e("bad comp method","Corrupt PNG"); + filter= get8(s); if (filter) return e("bad filter method","Corrupt PNG"); + interlace = get8(s); if (interlace>1) return e("bad interlace method","Corrupt PNG"); + if (!s->img_x || !s->img_y) return e("0-pixel image","Corrupt PNG"); + if (!pal_img_n) { + s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 
1 : 0); + if ((1 << 30) / s->img_x / s->img_n < s->img_y) return e("too large", "Image too large to decode"); + if (scan == SCAN_header) return 1; + } else { + // if paletted, then pal_n is our final components, and + // img_n is # components to decompress/filter. + s->img_n = 1; + if ((1 << 30) / s->img_x / 4 < s->img_y) return e("too large","Corrupt PNG"); + // if SCAN_header, have to scan to see if we have a tRNS + } + break; + } + + case PNG_TYPE('P','L','T','E'): { + if (first) return e("first not IHDR", "Corrupt PNG"); + if (c.length > 256*3) return e("invalid PLTE","Corrupt PNG"); + pal_len = c.length / 3; + if (pal_len * 3 != c.length) return e("invalid PLTE","Corrupt PNG"); + for (i=0; i < pal_len; ++i) { + palette[i*4+0] = get8u(s); + palette[i*4+1] = get8u(s); + palette[i*4+2] = get8u(s); + palette[i*4+3] = 255; + } + break; + } + + case PNG_TYPE('t','R','N','S'): { + if (first) return e("first not IHDR", "Corrupt PNG"); + if (z->idata) return e("tRNS after IDAT","Corrupt PNG"); + if (pal_img_n) { + if (scan == SCAN_header) { s->img_n = 4; return 1; } + if (pal_len == 0) return e("tRNS before PLTE","Corrupt PNG"); + if (c.length > pal_len) return e("bad tRNS len","Corrupt PNG"); + pal_img_n = 4; + for (i=0; i < c.length; ++i) + palette[i*4+3] = get8u(s); + } else { + if (!(s->img_n & 1)) return e("tRNS with alpha","Corrupt PNG"); + if (c.length != (uint32) s->img_n*2) return e("bad tRNS len","Corrupt PNG"); + has_trans = 1; + for (k=0; k < s->img_n; ++k) + tc[k] = (uint8) get16(s); // non 8-bit images will be larger + } + break; + } + + case PNG_TYPE('I','D','A','T'): { + if (first) return e("first not IHDR", "Corrupt PNG"); + if (pal_img_n && !pal_len) return e("no PLTE","Corrupt PNG"); + if (scan == SCAN_header) { s->img_n = pal_img_n; return 1; } + if (ioff + c.length > idata_limit) { + uint8 *p; + if (idata_limit == 0) idata_limit = c.length > 4096 ? 
c.length : 4096; + while (ioff + c.length > idata_limit) + idata_limit *= 2; + p = (uint8 *) realloc(z->idata, idata_limit); if (p == NULL) return e("outofmem", "Out of memory"); + z->idata = p; + } + if (!getn(s, z->idata+ioff,c.length)) return e("outofdata","Corrupt PNG"); + ioff += c.length; + break; + } + + case PNG_TYPE('I','E','N','D'): { + uint32 raw_len; + if (first) return e("first not IHDR", "Corrupt PNG"); + if (scan != SCAN_load) return 1; + if (z->idata == NULL) return e("no IDAT","Corrupt PNG"); + z->expanded = (uint8 *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, 16384, (int *) &raw_len, !iphone); + if (z->expanded == NULL) return 0; // zlib should set error + free(z->idata); z->idata = NULL; + if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans) + s->img_out_n = s->img_n+1; + else + s->img_out_n = s->img_n; + if (!create_png_image(z, z->expanded, raw_len, s->img_out_n, interlace)) return 0; + if (has_trans) + if (!compute_transparency(z, tc, s->img_out_n)) return 0; + if (iphone && s->img_out_n > 2) + stbi_de_iphone(z); + if (pal_img_n) { + // pal_img_n == 3 or 4 + s->img_n = pal_img_n; // record the actual colors we had + s->img_out_n = pal_img_n; + if (req_comp >= 3) s->img_out_n = req_comp; + if (!expand_palette(z, palette, pal_len, s->img_out_n)) + return 0; + } + free(z->expanded); z->expanded = NULL; + return 1; + } + + default: + // if critical, fail + if (first) return e("first not IHDR", "Corrupt PNG"); + if ((c.type & (1 << 29)) == 0) { + #ifndef STBI_NO_FAILURE_STRINGS + // not threadsafe + static char invalid_chunk[] = "XXXX chunk not known"; + invalid_chunk[0] = (uint8) (c.type >> 24); + invalid_chunk[1] = (uint8) (c.type >> 16); + invalid_chunk[2] = (uint8) (c.type >> 8); + invalid_chunk[3] = (uint8) (c.type >> 0); + #endif + return e(invalid_chunk, "PNG not supported: unknown chunk type"); + } + skip(s, c.length); + break; + } + // end of chunk, read and skip CRC + get32(s); + } +} + +static unsigned char *do_png(png *p, int *x, int *y, int *n, int req_comp) +{ + unsigned char *result=NULL; + p->expanded = NULL; + p->idata = NULL; + p->out = NULL; + if (req_comp < 0 || req_comp > 4) return epuc("bad req_comp", "Internal error"); + if (parse_png_file(p, SCAN_load, req_comp)) { + result = p->out; + p->out = NULL; + if (req_comp && req_comp != p->s.img_out_n) { + result = convert_format(result, p->s.img_out_n, req_comp, p->s.img_x, p->s.img_y); + p->s.img_out_n = req_comp; + if (result == NULL) return result; + } + *x = p->s.img_x; + *y = p->s.img_y; + if (n) *n = p->s.img_n; + } + free(p->out); p->out = NULL; + free(p->expanded); p->expanded = NULL; + free(p->idata); p->idata = NULL; + + return result; +} + +#ifndef STBI_NO_STDIO +unsigned char *stbi_png_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + png p; + start_file(&p.s, f); + return do_png(&p, x,y,comp,req_comp); +} + +unsigned char *stbi_png_load(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + unsigned char *data; + FILE *f = fopen(filename, "rb"); + if (!f) return NULL; + data = stbi_png_load_from_file(f,x,y,comp,req_comp); + fclose(f); + return data; +} +#endif + +unsigned char *stbi_png_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + png p; + start_mem(&p.s, buffer,len); + return do_png(&p, x,y,comp,req_comp); +} + +#ifndef STBI_NO_STDIO +int stbi_png_test_file(FILE *f) +{ + png p; + int n,r; + n = ftell(f); + start_file(&p.s, f); + r = parse_png_file(&p, 
SCAN_type,STBI_default); + fseek(f,n,SEEK_SET); + return r; +} +#endif + +int stbi_png_test_memory(stbi_uc const *buffer, int len) +{ + png p; + start_mem(&p.s, buffer, len); + return parse_png_file(&p, SCAN_type,STBI_default); +} + +static int stbi_png_info_raw(png *p, int *x, int *y, int *comp) +{ + if (!parse_png_file(p, SCAN_header, 0)) + return 0; + if (x) *x = p->s.img_x; + if (y) *y = p->s.img_y; + if (comp) *comp = p->s.img_n; + return 1; +} + +#ifndef STBI_NO_STDIO +int stbi_png_info (char const *filename, int *x, int *y, int *comp) +{ + int res; + FILE *f = fopen(filename, "rb"); + if (!f) return 0; + res = stbi_png_info_from_file(f, x, y, comp); + fclose(f); + return res; +} + +int stbi_png_info_from_file(FILE *f, int *x, int *y, int *comp) +{ + png p; + int res; + long n = ftell(f); + start_file(&p.s, f); + res = stbi_png_info_raw(&p, x, y, comp); + fseek(f, n, SEEK_SET); + return res; +} +#endif // !STBI_NO_STDIO + +int stbi_png_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp) +{ + png p; + start_mem(&p.s, buffer, len); + return stbi_png_info_raw(&p, x, y, comp); +} + +// Microsoft/Windows BMP image + +static int bmp_test(stbi *s) +{ + int sz; + if (get8(s) != 'B') return 0; + if (get8(s) != 'M') return 0; + get32le(s); // discard filesize + get16le(s); // discard reserved + get16le(s); // discard reserved + get32le(s); // discard data offset + sz = get32le(s); + if (sz == 12 || sz == 40 || sz == 56 || sz == 108) return 1; + return 0; +} + +#ifndef STBI_NO_STDIO +int stbi_bmp_test_file (FILE *f) +{ + stbi s; + int r,n = ftell(f); + start_file(&s,f); + r = bmp_test(&s); + fseek(f,n,SEEK_SET); + return r; +} +#endif + +int stbi_bmp_test_memory (stbi_uc const *buffer, int len) +{ + stbi s; + start_mem(&s, buffer, len); + return bmp_test(&s); +} + +// returns 0..31 for the highest set bit +static int high_bit(unsigned int z) +{ + int n=0; + if (z == 0) return -1; + if (z >= 0x10000) n += 16, z >>= 16; + if (z >= 0x00100) n += 8, z >>= 8; + if (z >= 0x00010) n += 4, z >>= 4; + if (z >= 0x00004) n += 2, z >>= 2; + if (z >= 0x00002) n += 1, z >>= 1; + return n; +} + +static int bitcount(unsigned int a) +{ + a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2 + a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4 + a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits + a = (a + (a >> 8)); // max 16 per 8 bits + a = (a + (a >> 16)); // max 32 per 8 bits + return a & 0xff; +} + +static int shiftsigned(int v, int shift, int bits) +{ + int result; + int z=0; + + if (shift < 0) v <<= -shift; + else v >>= shift; + result = v; + + z = bits; + while (z < 8) { + result += v >> z; + z += bits; + } + return result; +} + +static stbi_uc *bmp_load(stbi *s, int *x, int *y, int *comp, int req_comp) +{ + uint8 *out; + unsigned int mr=0,mg=0,mb=0,ma=0, fake_a=0; + stbi_uc pal[256][4]; + int psize=0,i,j,compress=0,width; + int bpp, flip_vertically, pad, target, offset, hsz; + if (get8(s) != 'B' || get8(s) != 'M') return epuc("not BMP", "Corrupt BMP"); + get32le(s); // discard filesize + get16le(s); // discard reserved + get16le(s); // discard reserved + offset = get32le(s); + hsz = get32le(s); + if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108) return epuc("unknown BMP", "BMP type not supported: unknown"); + if (hsz == 12) { + s->img_x = get16le(s); + s->img_y = get16le(s); + } else { + s->img_x = get32le(s); + s->img_y = get32le(s); + } + if (get16le(s) != 1) return epuc("bad BMP", "bad BMP"); + bpp = get16le(s); + if (bpp == 1) return epuc("monochrome", 
"BMP type not supported: 1-bit"); + flip_vertically = ((int) s->img_y) > 0; + s->img_y = abs((int) s->img_y); + if (hsz == 12) { + if (bpp < 24) + psize = (offset - 14 - 24) / 3; + } else { + compress = get32le(s); + if (compress == 1 || compress == 2) return epuc("BMP RLE", "BMP type not supported: RLE"); + get32le(s); // discard sizeof + get32le(s); // discard hres + get32le(s); // discard vres + get32le(s); // discard colorsused + get32le(s); // discard max important + if (hsz == 40 || hsz == 56) { + if (hsz == 56) { + get32le(s); + get32le(s); + get32le(s); + get32le(s); + } + if (bpp == 16 || bpp == 32) { + mr = mg = mb = 0; + if (compress == 0) { + if (bpp == 32) { + mr = 0xffu << 16; + mg = 0xffu << 8; + mb = 0xffu << 0; + ma = 0xffu << 24; + fake_a = 1; // @TODO: check for cases like alpha value is all 0 and switch it to 255 + } else { + mr = 31u << 10; + mg = 31u << 5; + mb = 31u << 0; + } + } else if (compress == 3) { + mr = get32le(s); + mg = get32le(s); + mb = get32le(s); + // not documented, but generated by photoshop and handled by mspaint + if (mr == mg && mg == mb) { + // ?!?!? + return epuc("bad BMP", "bad BMP"); + } + } else + return epuc("bad BMP", "bad BMP"); + } + } else { + assert(hsz == 108); + mr = get32le(s); + mg = get32le(s); + mb = get32le(s); + ma = get32le(s); + get32le(s); // discard color space + for (i=0; i < 12; ++i) + get32le(s); // discard color space parameters + } + if (bpp < 16) + psize = (offset - 14 - hsz) >> 2; + } + s->img_n = ma ? 4 : 3; + if (req_comp && req_comp >= 3) // we can directly decode 3 or 4 + target = req_comp; + else + target = s->img_n; // if they want monochrome, we'll post-convert + out = (stbi_uc *) malloc(target * s->img_x * s->img_y); + if (!out) return epuc("outofmem", "Out of memory"); + if (bpp < 16) { + int z=0; + if (psize == 0 || psize > 256) { free(out); return epuc("invalid", "Corrupt BMP"); } + for (i=0; i < psize; ++i) { + pal[i][2] = get8u(s); + pal[i][1] = get8u(s); + pal[i][0] = get8u(s); + if (hsz != 12) get8(s); + pal[i][3] = 255; + } + skip(s, offset - 14 - hsz - psize * (hsz == 12 ? 3 : 4)); + if (bpp == 4) width = (s->img_x + 1) >> 1; + else if (bpp == 8) width = s->img_x; + else { free(out); return epuc("bad bpp", "Corrupt BMP"); } + pad = (-width)&3; + for (j=0; j < (int) s->img_y; ++j) { + for (i=0; i < (int) s->img_x; i += 2) { + int v=get8(s),v2=0; + if (bpp == 4) { + v2 = v & 15; + v >>= 4; + } + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) out[z++] = 255; + if (i+1 == (int) s->img_x) break; + v = (bpp == 8) ? 
get8(s) : v2; + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) out[z++] = 255; + } + skip(s, pad); + } + } else { + int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0; + int z = 0; + int easy=0; + skip(s, offset - 14 - hsz); + if (bpp == 24) width = 3 * s->img_x; + else if (bpp == 16) width = 2*s->img_x; + else /* bpp = 32 and pad = 0 */ width=0; + pad = (-width) & 3; + if (bpp == 24) { + easy = 1; + } else if (bpp == 32) { + if (mb == 0xff && mg == 0xff00 && mr == 0xff000000 && ma == 0xff000000) + easy = 2; + } + if (!easy) { + if (!mr || !mg || !mb) return epuc("bad masks", "Corrupt BMP"); + // right shift amt to put high bit in position #7 + rshift = high_bit(mr)-7; rcount = bitcount(mr); + gshift = high_bit(mg)-7; gcount = bitcount(mr); + bshift = high_bit(mb)-7; bcount = bitcount(mr); + ashift = high_bit(ma)-7; acount = bitcount(mr); + } + for (j=0; j < (int) s->img_y; ++j) { + if (easy) { + for (i=0; i < (int) s->img_x; ++i) { + int a; + out[z+2] = get8u(s); + out[z+1] = get8u(s); + out[z+0] = get8u(s); + z += 3; + a = (easy == 2 ? get8(s) : 255); + if (target == 4) out[z++] = (uint8) a; + } + } else { + for (i=0; i < (int) s->img_x; ++i) { + uint32 v = (bpp == 16 ? get16le(s) : get32le(s)); + int a; + out[z++] = (uint8) shiftsigned(v & mr, rshift, rcount); + out[z++] = (uint8) shiftsigned(v & mg, gshift, gcount); + out[z++] = (uint8) shiftsigned(v & mb, bshift, bcount); + a = (ma ? shiftsigned(v & ma, ashift, acount) : 255); + if (target == 4) out[z++] = (uint8) a; + } + } + skip(s, pad); + } + } + if (flip_vertically) { + stbi_uc t; + for (j=0; j < (int) s->img_y>>1; ++j) { + stbi_uc *p1 = out + j *s->img_x*target; + stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target; + for (i=0; i < (int) s->img_x*target; ++i) { + t = p1[i], p1[i] = p2[i], p2[i] = t; + } + } + } + + if (req_comp && req_comp != target) { + out = convert_format(out, target, req_comp, s->img_x, s->img_y); + if (out == NULL) return out; // convert_format frees input on failure + } + + *x = s->img_x; + *y = s->img_y; + if (comp) *comp = target; + return out; +} + +#ifndef STBI_NO_STDIO +stbi_uc *stbi_bmp_load (char const *filename, int *x, int *y, int *comp, int req_comp) +{ + stbi_uc *data; + FILE *f = fopen(filename, "rb"); + if (!f) return NULL; + data = stbi_bmp_load_from_file(f, x,y,comp,req_comp); + fclose(f); + return data; +} + +stbi_uc *stbi_bmp_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp) +{ + stbi s; + start_file(&s, f); + return bmp_load(&s, x,y,comp,req_comp); +} +#endif + +stbi_uc *stbi_bmp_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi s; + start_mem(&s, buffer, len); + return bmp_load(&s, x,y,comp,req_comp); +} + +// Targa Truevision - TGA +// by Jonathan Dummer + +static int tga_info(stbi *s, int *x, int *y, int *comp) +{ + int tga_w, tga_h, tga_comp; + int sz; + get8u(s); // discard Offset + sz = get8u(s); // color type + if( sz > 1 ) return 0; // only RGB or indexed allowed + sz = get8u(s); // image type + // only RGB or grey allowed, +/- RLE + if ((sz != 1) && (sz != 2) && (sz != 3) && (sz != 9) && (sz != 10) && (sz != 11)) return 0; + get16le(s); // discard palette start + get16le(s); // discard palette length + get8(s); // discard bits per palette color entry + get16le(s); // discard x origin + get16le(s); // discard y origin + tga_w = get16le(s); + if( tga_w < 1 ) return 0; // test width + tga_h = get16le(s); + if( tga_h < 1 ) return 0; // test height + sz 
= get8(s); // bits per pixel + // only RGB or RGBA or grey allowed + if ((sz != 8) && (sz != 16) && (sz != 24) && (sz != 32)) return 0; + tga_comp = sz; + if (x) *x = tga_w; + if (y) *y = tga_h; + if (comp) *comp = tga_comp / 8; + return 1; // seems to have passed everything +} + +#ifndef STBI_NO_STDIO +int stbi_tga_info_from_file(FILE *f, int *x, int *y, int *comp) +{ + stbi s; + int r; + long n = ftell(f); + start_file(&s, f); + r = tga_info(&s, x, y, comp); + fseek(f, n, SEEK_SET); + return r; +} +#endif + +int stbi_tga_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp) +{ + stbi s; + start_mem(&s, buffer, len); + return tga_info(&s, x, y, comp); +} + +static int tga_test(stbi *s) +{ + int sz; + get8u(s); // discard Offset + sz = get8u(s); // color type + if ( sz > 1 ) return 0; // only RGB or indexed allowed + sz = get8u(s); // image type + if ( (sz != 1) && (sz != 2) && (sz != 3) && (sz != 9) && (sz != 10) && (sz != 11) ) return 0; // only RGB or grey allowed, +/- RLE + get16(s); // discard palette start + get16(s); // discard palette length + get8(s); // discard bits per palette color entry + get16(s); // discard x origin + get16(s); // discard y origin + if ( get16(s) < 1 ) return 0; // test width + if ( get16(s) < 1 ) return 0; // test height + sz = get8(s); // bits per pixel + if ( (sz != 8) && (sz != 16) && (sz != 24) && (sz != 32) ) return 0; // only RGB or RGBA or grey allowed + return 1; // seems to have passed everything +} + +#ifndef STBI_NO_STDIO +int stbi_tga_test_file (FILE *f) +{ + stbi s; + int r,n = ftell(f); + start_file(&s, f); + r = tga_test(&s); + fseek(f,n,SEEK_SET); + return r; +} +#endif + +int stbi_tga_test_memory (stbi_uc const *buffer, int len) +{ + stbi s; + start_mem(&s, buffer, len); + return tga_test(&s); +} + +static stbi_uc *tga_load(stbi *s, int *x, int *y, int *comp, int req_comp) +{ + // read in the TGA header stuff + int tga_offset = get8u(s); + int tga_indexed = get8u(s); + int tga_image_type = get8u(s); + int tga_is_RLE = 0; + int tga_palette_start = get16le(s); + int tga_palette_len = get16le(s); + int tga_palette_bits = get8u(s); + int tga_x_origin = get16le(s); + int tga_y_origin = get16le(s); + int tga_width = get16le(s); + int tga_height = get16le(s); + int tga_bits_per_pixel = get8u(s); + int tga_inverted = get8u(s); + // image data + unsigned char *tga_data; + unsigned char *tga_palette = NULL; + int i, j; + unsigned char raw_data[4]; + unsigned char trans_data[4]; + int RLE_count = 0; + int RLE_repeating = 0; + int read_next_pixel = 1; + + // do a tiny bit of precessing + if ( tga_image_type >= 8 ) + { + tga_image_type -= 8; + tga_is_RLE = 1; + } + /* int tga_alpha_bits = tga_inverted & 15; */ + tga_inverted = 1 - ((tga_inverted >> 5) & 1); + + // error check + if ( //(tga_indexed) || + (tga_width < 1) || (tga_height < 1) || + (tga_image_type < 1) || (tga_image_type > 3) || + ((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16) && + (tga_bits_per_pixel != 24) && (tga_bits_per_pixel != 32)) + ) + { + return NULL; + } + + // If I'm paletted, then I'll use the number of bits from the palette + if ( tga_indexed ) + { + tga_bits_per_pixel = tga_palette_bits; + } + + // tga info + *x = tga_width; + *y = tga_height; + if ( (req_comp < 1) || (req_comp > 4) ) + { + // just use whatever the file was + req_comp = tga_bits_per_pixel / 8; + *comp = req_comp; + } else + { + // force a new number of components + *comp = tga_bits_per_pixel/8; + } + tga_data = (unsigned char*)malloc( tga_width * tga_height * req_comp ); + + // 
skip to the data's starting position (offset usually = 0) + skip(s, tga_offset ); + // do I need to load a palette? + if ( tga_indexed ) + { + // any data to skip? (offset usually = 0) + skip(s, tga_palette_start ); + // load the palette + tga_palette = (unsigned char*)malloc( tga_palette_len * tga_palette_bits / 8 ); + if (!getn(s, tga_palette, tga_palette_len * tga_palette_bits / 8 )) + return NULL; + } + // load the data + trans_data[0] = trans_data[1] = trans_data[2] = trans_data[3] = 0; + for (i=0; i < tga_width * tga_height; ++i) + { + // if I'm in RLE mode, do I need to get a RLE chunk? + if ( tga_is_RLE ) + { + if ( RLE_count == 0 ) + { + // yep, get the next byte as a RLE command + int RLE_cmd = get8u(s); + RLE_count = 1 + (RLE_cmd & 127); + RLE_repeating = RLE_cmd >> 7; + read_next_pixel = 1; + } else if ( !RLE_repeating ) + { + read_next_pixel = 1; + } + } else + { + read_next_pixel = 1; + } + // OK, if I need to read a pixel, do it now + if ( read_next_pixel ) + { + // load however much data we did have + if ( tga_indexed ) + { + // read in 1 byte, then perform the lookup + int pal_idx = get8u(s); + if ( pal_idx >= tga_palette_len ) + { + // invalid index + pal_idx = 0; + } + pal_idx *= tga_bits_per_pixel / 8; + for (j = 0; j*8 < tga_bits_per_pixel; ++j) + { + raw_data[j] = tga_palette[pal_idx+j]; + } + } else + { + // read in the data raw + for (j = 0; j*8 < tga_bits_per_pixel; ++j) + { + raw_data[j] = get8u(s); + } + } + // convert raw to the intermediate format + switch (tga_bits_per_pixel) + { + case 8: + // Luminous => RGBA + trans_data[0] = raw_data[0]; + trans_data[1] = raw_data[0]; + trans_data[2] = raw_data[0]; + trans_data[3] = 255; + break; + case 16: + // Luminous,Alpha => RGBA + trans_data[0] = raw_data[0]; + trans_data[1] = raw_data[0]; + trans_data[2] = raw_data[0]; + trans_data[3] = raw_data[1]; + break; + case 24: + // BGR => RGBA + trans_data[0] = raw_data[2]; + trans_data[1] = raw_data[1]; + trans_data[2] = raw_data[0]; + trans_data[3] = 255; + break; + case 32: + // BGRA => RGBA + trans_data[0] = raw_data[2]; + trans_data[1] = raw_data[1]; + trans_data[2] = raw_data[0]; + trans_data[3] = raw_data[3]; + break; + } + // clear the reading flag for the next pixel + read_next_pixel = 0; + } // end of reading a pixel + // convert to final format + switch (req_comp) + { + case 1: + // RGBA => Luminance + tga_data[i*req_comp+0] = compute_y(trans_data[0],trans_data[1],trans_data[2]); + break; + case 2: + // RGBA => Luminance,Alpha + tga_data[i*req_comp+0] = compute_y(trans_data[0],trans_data[1],trans_data[2]); + tga_data[i*req_comp+1] = trans_data[3]; + break; + case 3: + // RGBA => RGB + tga_data[i*req_comp+0] = trans_data[0]; + tga_data[i*req_comp+1] = trans_data[1]; + tga_data[i*req_comp+2] = trans_data[2]; + break; + case 4: + // RGBA => RGBA + tga_data[i*req_comp+0] = trans_data[0]; + tga_data[i*req_comp+1] = trans_data[1]; + tga_data[i*req_comp+2] = trans_data[2]; + tga_data[i*req_comp+3] = trans_data[3]; + break; + } + // in case we're in RLE mode, keep counting down + --RLE_count; + } + // do I need to invert the image? 
+ if ( tga_inverted ) + { + for (j = 0; j*2 < tga_height; ++j) + { + int index1 = j * tga_width * req_comp; + int index2 = (tga_height - 1 - j) * tga_width * req_comp; + for (i = tga_width * req_comp; i > 0; --i) + { + unsigned char temp = tga_data[index1]; + tga_data[index1] = tga_data[index2]; + tga_data[index2] = temp; + ++index1; + ++index2; + } + } + } + // clear my palette, if I had one + if ( tga_palette != NULL ) + { + free( tga_palette ); + } + // the things I do to get rid of an error message, and yet keep + // Microsoft's C compilers happy... [8^( + tga_palette_start = tga_palette_len = tga_palette_bits = + tga_x_origin = tga_y_origin = 0; + // OK, done + return tga_data; +} + +#ifndef STBI_NO_STDIO +stbi_uc *stbi_tga_load (char const *filename, int *x, int *y, int *comp, int req_comp) +{ + stbi_uc *data; + FILE *f = fopen(filename, "rb"); + if (!f) return NULL; + data = stbi_tga_load_from_file(f, x,y,comp,req_comp); + fclose(f); + return data; +} + +stbi_uc *stbi_tga_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp) +{ + stbi s; + start_file(&s, f); + return tga_load(&s, x,y,comp,req_comp); +} +#endif + +stbi_uc *stbi_tga_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi s; + start_mem(&s, buffer, len); + return tga_load(&s, x,y,comp,req_comp); +} + + +// ************************************************************************************************* +// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB + +static int psd_test(stbi *s) +{ + if (get32(s) != 0x38425053) return 0; // "8BPS" + else return 1; +} + +#ifndef STBI_NO_STDIO +int stbi_psd_test_file(FILE *f) +{ + stbi s; + int r,n = ftell(f); + start_file(&s, f); + r = psd_test(&s); + fseek(f,n,SEEK_SET); + return r; +} +#endif + +int stbi_psd_test_memory(stbi_uc const *buffer, int len) +{ + stbi s; + start_mem(&s, buffer, len); + return psd_test(&s); +} + +static stbi_uc *psd_load(stbi *s, int *x, int *y, int *comp, int req_comp) +{ + int pixelCount; + int channelCount, compression; + int channel, i, count, len; + int w,h; + uint8 *out; + + // Check identifier + if (get32(s) != 0x38425053) // "8BPS" + return epuc("not PSD", "Corrupt PSD image"); + + // Check file type version. + if (get16(s) != 1) + return epuc("wrong version", "Unsupported version of PSD image"); + + // Skip 6 reserved bytes. + skip(s, 6 ); + + // Read the number of channels (R, G, B, A, etc). + channelCount = get16(s); + if (channelCount < 0 || channelCount > 16) + return epuc("wrong channel count", "Unsupported number of channels in PSD image"); + + // Read the rows and columns of the image. + h = get32(s); + w = get32(s); + + // Make sure the depth is 8 bits. + if (get16(s) != 8) + return epuc("unsupported bit depth", "PSD bit depth is not 8 bit"); + + // Make sure the color mode is RGB. + // Valid options are: + // 0: Bitmap + // 1: Grayscale + // 2: Indexed color + // 3: RGB color + // 4: CMYK color + // 7: Multichannel + // 8: Duotone + // 9: Lab color + if (get16(s) != 3) + return epuc("wrong color format", "PSD is not in RGB color format"); + + // Skip the Mode Data. (It's the palette for indexed color; other info for other modes.) + skip(s,get32(s) ); + + // Skip the image resources. (resolution, pen tool paths, etc) + skip(s, get32(s) ); + + // Skip the reserved data. + skip(s, get32(s) ); + + // Find out if the data is compressed. 
+ // Known values: + // 0: no compression + // 1: RLE compressed + compression = get16(s); + if (compression > 1) + return epuc("bad compression", "PSD has an unknown compression format"); + + // Create the destination image. + out = (stbi_uc *) malloc(4 * w*h); + if (!out) return epuc("outofmem", "Out of memory"); + pixelCount = w*h; + + // Initialize the data to zero. + //memset( out, 0, pixelCount * 4 ); + + // Finally, the image data. + if (compression) { + // RLE as used by .PSD and .TIFF + // Loop until you get the number of unpacked bytes you are expecting: + // Read the next source byte into n. + // If n is between 0 and 127 inclusive, copy the next n+1 bytes literally. + // Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times. + // Else if n is 128, noop. + // Endloop + + // The RLE-compressed data is preceeded by a 2-byte data count for each row in the data, + // which we're going to just skip. + skip(s, h * channelCount * 2 ); + + // Read the RLE data by channel. + for (channel = 0; channel < 4; channel++) { + uint8 *p; + + p = out+channel; + if (channel >= channelCount) { + // Fill this channel with default data. + for (i = 0; i < pixelCount; i++) *p = (channel == 3 ? 255 : 0), p += 4; + } else { + // Read the RLE data. + count = 0; + while (count < pixelCount) { + len = get8(s); + if (len == 128) { + // No-op. + } else if (len < 128) { + // Copy next len+1 bytes literally. + len++; + count += len; + while (len) { + *p = get8u(s); + p += 4; + len--; + } + } else if (len > 128) { + uint8 val; + // Next -len+1 bytes in the dest are replicated from next source byte. + // (Interpret len as a negative 8-bit int.) + len ^= 0x0FF; + len += 2; + val = get8u(s); + count += len; + while (len) { + *p = val; + p += 4; + len--; + } + } + } + } + } + + } else { + // We're at the raw image data. It's each channel in order (Red, Green, Blue, Alpha, ...) + // where each channel consists of an 8-bit value for each pixel in the image. + + // Read the data by channel. + for (channel = 0; channel < 4; channel++) { + uint8 *p; + + p = out + channel; + if (channel > channelCount) { + // Fill this channel with default data. + for (i = 0; i < pixelCount; i++) *p = channel == 3 ? 255 : 0, p += 4; + } else { + // Read the data. 
+ for (i = 0; i < pixelCount; i++) + *p = get8u(s), p += 4; + } + } + } + + if (req_comp && req_comp != 4) { + out = convert_format(out, 4, req_comp, w, h); + if (out == NULL) return out; // convert_format frees input on failure + } + + if (comp) *comp = channelCount; + *y = h; + *x = w; + + return out; +} + +#ifndef STBI_NO_STDIO +stbi_uc *stbi_psd_load(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + stbi_uc *data; + FILE *f = fopen(filename, "rb"); + if (!f) return NULL; + data = stbi_psd_load_from_file(f, x,y,comp,req_comp); + fclose(f); + return data; +} + +stbi_uc *stbi_psd_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + stbi s; + start_file(&s, f); + return psd_load(&s, x,y,comp,req_comp); +} +#endif + +stbi_uc *stbi_psd_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi s; + start_mem(&s, buffer, len); + return psd_load(&s, x,y,comp,req_comp); +} + +// ************************************************************************************************* +// Softimage PIC loader +// by Tom Seddon +// +// See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format +// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/ + +static int pic_is4(stbi *s,const char *str) +{ + int i; + for (i=0; i<4; ++i) + if (get8(s) != (stbi_uc)str[i]) + return 0; + + return 1; +} + +static int pic_test(stbi *s) +{ + int i; + + if (!pic_is4(s,"\x53\x80\xF6\x34")) + return 0; + + for(i=0;i<84;++i) + get8(s); + + if (!pic_is4(s,"PICT")) + return 0; + + return 1; +} + +typedef struct +{ + stbi_uc size,type,channel; +} pic_packet_t; + +static stbi_uc *pic_readval(stbi *s, int channel, stbi_uc *dest) +{ + int mask=0x80, i; + + for (i=0; i<4; ++i, mask>>=1) { + if (channel & mask) { + if (at_eof(s)) return epuc("bad file","PIC file too short"); + dest[i]=get8u(s); + } + } + + return dest; +} + +static void pic_copyval(int channel,stbi_uc *dest,const stbi_uc *src) +{ + int mask=0x80,i; + + for (i=0;i<4; ++i, mask>>=1) + if (channel&mask) + dest[i]=src[i]; +} + +static stbi_uc *pic_load2(stbi *s,int width,int height,int *comp, stbi_uc *result) +{ + int act_comp=0,num_packets=0,y,chained; + pic_packet_t packets[10]; + + // this will (should...) cater for even some bizarre stuff like having data + // for the same channel in multiple packets. + do { + pic_packet_t *packet; + + if (num_packets==sizeof(packets)/sizeof(packets[0])) + return epuc("bad format","too many packets"); + + packet = &packets[num_packets++]; + + chained = get8(s); + packet->size = get8u(s); + packet->type = get8u(s); + packet->channel = get8u(s); + + act_comp |= packet->channel; + + if (at_eof(s)) return epuc("bad file","file too short (reading packets)"); + if (packet->size != 8) return epuc("bad format","packet isn't 8bpp"); + } while (chained); + + *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel? 
+ + for(y=0; y<height; ++y) { + int packet_idx; + + for(packet_idx=0; packet_idx < num_packets; ++packet_idx) { + pic_packet_t *packet = &packets[packet_idx]; + stbi_uc *dest = result+y*width*4; + + switch (packet->type) { + default: + return epuc("bad format","packet has bad compression type"); + + case 0: {//uncompressed + int x; + + for(x=0;x<width;++x, dest+=4) + if (!pic_readval(s,packet->channel,dest)) + return 0; + break; + } + + case 1://Pure RLE + { + int left=width, i; + + while (left>0) { + stbi_uc count,value[4]; + + count=get8u(s); + if (at_eof(s)) return epuc("bad file","file too short (pure read count)"); + + if (count > left) + count = (uint8) left; + + if (!pic_readval(s,packet->channel,value)) return 0; + + for(i=0; i<count; ++i,dest+=4) + pic_copyval(packet->channel,dest,value); + left -= count; + } + } + break; + + case 2: {//Mixed RLE + int left=width; + while (left>0) { + int count = get8(s), i; + if (at_eof(s)) return epuc("bad file","file too short (mixed read count)"); + + if (count >= 128) { // Repeated + stbi_uc value[4]; + int i; + + if (count==128) + count = get16(s); + else + count -= 127; + if (count > left) + return epuc("bad file","scanline overrun"); + + if (!pic_readval(s,packet->channel,value)) + return 0; + + for(i=0;i<count;++i, dest+=4) + pic_copyval(packet->channel,dest,value); + } else { // Raw + ++count; + if (count>left) return epuc("bad file","scanline overrun"); + + for(i=0;i<count;++i, dest+=4) + if (!pic_readval(s,packet->channel,dest)) + return 0; + } + left-=count; + } + break; + } + } + } + + return result; +} + +static stbi_uc *pic_load(stbi *s,int *px,int *py,int *comp,int req_comp) +{ + stbi_uc *result; + int i, x,y; + + for (i=0; i<92; ++i) + get8(s); + + x = get16(s); + y = get16(s); + if (at_eof(s)) return epuc("bad file","file too short (pic header)"); + if ((1 << 28) / x < y) return epuc("too large", "Image too large to decode"); + + get32(s); //skip `ratio' + get16(s); //skip `fields' + get16(s); //skip `pad' + + // intermediate buffer is RGBA + result = (stbi_uc *) malloc(x*y*4); + memset(result, 0xff, x*y*4); + + if (!pic_load2(s,x,y,comp, result)) { + free(result); + result=0; + } + *px = x; + *py = y; + if (req_comp == 0) req_comp = *comp; + result=convert_format(result,4,req_comp,x,y); + + return result; +} + +int stbi_pic_test_memory(stbi_uc const *buffer, int len) +{ + stbi s; + start_mem(&s,buffer,len); + return pic_test(&s); +} + +stbi_uc *stbi_pic_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi s; + start_mem(&s,buffer,len); + return pic_load(&s,x,y,comp,req_comp); +} + +#ifndef STBI_NO_STDIO +int stbi_pic_test_file(FILE *f) +{ + int result; + long l = ftell(f); + stbi s; + start_file(&s,f); + result = pic_test(&s); + fseek(f,l,SEEK_SET); + return result; +} + +stbi_uc *stbi_pic_load(char const *filename,int *x, int *y, int *comp, int req_comp) +{ + stbi_uc *result; + FILE *f=fopen(filename,"rb"); + if (!f) return 0; + result = stbi_pic_load_from_file(f,x,y,comp,req_comp); + fclose(f); + return result; +} + +stbi_uc *stbi_pic_load_from_file(FILE *f,int *x, int *y, int *comp, int req_comp) +{ + stbi s; + start_file(&s,f); + return pic_load(&s,x,y,comp,req_comp); +} +#endif + +// ************************************************************************************************* +// GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb +typedef struct stbi_gif_lzw_struct { + int16 prefix; + uint8 first; + uint8 suffix; +} stbi_gif_lzw; + +typedef struct stbi_gif_struct +{ + int w,h; + stbi_uc *out; // output buffer (always 4 components) + int flags, bgindex, ratio, transparent, eflags; + uint8 pal[256][4]; + uint8 lpal[256][4]; + stbi_gif_lzw codes[4096]; + uint8 *color_table; + int parse, step; + int lflags; + int start_x, start_y; + int max_x, max_y; + int cur_x, cur_y; + int line_size; +} stbi_gif; + +static int 
gif_test(stbi *s) +{ + int sz; + if (get8(s) != 'G' || get8(s) != 'I' || get8(s) != 'F' || get8(s) != '8') return 0; + sz = get8(s); + if (sz != '9' && sz != '7') return 0; + if (get8(s) != 'a') return 0; + return 1; +} + +#ifndef STBI_NO_STDIO +int stbi_gif_test_file (FILE *f) +{ + stbi s; + int r,n = ftell(f); + start_file(&s,f); + r = gif_test(&s); + fseek(f,n,SEEK_SET); + return r; +} +#endif + +int stbi_gif_test_memory (stbi_uc const *buffer, int len) +{ + stbi s; + start_mem(&s, buffer, len); + return gif_test(&s); +} + +static void stbi_gif_parse_colortable(stbi *s, uint8 pal[256][4], int num_entries, int transp) +{ + int i; + for (i=0; i < num_entries; ++i) { + pal[i][2] = get8u(s); + pal[i][1] = get8u(s); + pal[i][0] = get8u(s); + pal[i][3] = transp ? 0 : 255; + } +} + +static int stbi_gif_header(stbi *s, stbi_gif *g, int *comp, int is_info) +{ + uint8 version; + if (get8(s) != 'G' || get8(s) != 'I' || get8(s) != 'F' || get8(s) != '8') + return e("not GIF", "Corrupt GIF"); + + version = get8u(s); + if (version != '7' && version != '9') return e("not GIF", "Corrupt GIF"); + if (get8(s) != 'a') return e("not GIF", "Corrupt GIF"); + + failure_reason = ""; + g->w = get16le(s); + g->h = get16le(s); + g->flags = get8(s); + g->bgindex = get8(s); + g->ratio = get8(s); + g->transparent = -1; + + if (comp != 0) *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments + + if (is_info) return 1; + + if (g->flags & 0x80) + stbi_gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1); + + return 1; +} + +static int stbi_gif_info_raw(stbi *s, int *x, int *y, int *comp) +{ + stbi_gif g; + if (!stbi_gif_header(s, &g, comp, 1)) return 0; + if (x) *x = g.w; + if (y) *y = g.h; + return 1; +} + +static void stbi_out_gif_code(stbi_gif *g, uint16 code) +{ + uint8 *p, *c; + + // recurse to decode the prefixes, since the linked-list is backwards, + // and working backwards through an interleaved image would be nasty + if (g->codes[code].prefix >= 0) + stbi_out_gif_code(g, g->codes[code].prefix); + + if (g->cur_y >= g->max_y) return; + + p = &g->out[g->cur_x + g->cur_y]; + c = &g->color_table[g->codes[code].suffix * 4]; + + if (c[3] >= 128) { + p[0] = c[2]; + p[1] = c[1]; + p[2] = c[0]; + p[3] = c[3]; + } + g->cur_x += 4; + + if (g->cur_x >= g->max_x) { + g->cur_x = g->start_x; + g->cur_y += g->step; + + while (g->cur_y >= g->max_y && g->parse > 0) { + g->step = (1 << g->parse) * g->line_size; + g->cur_y = g->start_y + (g->step >> 1); + --g->parse; + } + } +} + +static uint8 *stbi_process_gif_raster(stbi *s, stbi_gif *g) +{ + uint8 lzw_cs; + int32 len, code; + uint32 first; + int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear; + stbi_gif_lzw *p; + + lzw_cs = get8u(s); + clear = 1 << lzw_cs; + first = 1; + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + bits = 0; + valid_bits = 0; + for (code = 0; code < clear; code++) { + g->codes[code].prefix = -1; + g->codes[code].first = (uint8) code; + g->codes[code].suffix = (uint8) code; + } + + // support no starting clear code + avail = clear+2; + oldcode = -1; + + len = 0; + for(;;) { + if (valid_bits < codesize) { + if (len == 0) { + len = get8(s); // start new block + if (len == 0) + return g->out; + } + --len; + bits |= (int32) get8(s) << valid_bits; + valid_bits += 8; + } else { + int32 code = bits & codemask; + bits >>= codesize; + valid_bits -= codesize; + // @OPTIMIZE: is there some way we can accelerate the non-clear path? 
+ if (code == clear) { // clear code + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + avail = clear + 2; + oldcode = -1; + first = 0; + } else if (code == clear + 1) { // end of stream code + skip(s, len); + while ((len = get8(s)) > 0) + skip(s,len); + return g->out; + } else if (code <= avail) { + if (first) return epuc("no clear code", "Corrupt GIF"); + + if (oldcode >= 0) { + p = &g->codes[avail++]; + if (avail > 4096) return epuc("too many codes", "Corrupt GIF"); + p->prefix = (int16) oldcode; + p->first = g->codes[oldcode].first; + p->suffix = (code == avail) ? p->first : g->codes[code].first; + } else if (code == avail) + return epuc("illegal code in raster", "Corrupt GIF"); + + stbi_out_gif_code(g, (uint16) code); + + if ((avail & codemask) == 0 && avail <= 0x0FFF) { + codesize++; + codemask = (1 << codesize) - 1; + } + + oldcode = code; + } else { + return epuc("illegal code in raster", "Corrupt GIF"); + } + } + } +} + +static void stbi_fill_gif_background(stbi_gif *g) +{ + int i; + uint8 *c = g->pal[g->bgindex]; + // @OPTIMIZE: write a dword at a time + for (i = 0; i < g->w * g->h * 4; i += 4) { + uint8 *p = &g->out[i]; + p[0] = c[2]; + p[1] = c[1]; + p[2] = c[0]; + p[3] = c[3]; + } +} + +// this function is designed to support animated gifs, although stb_image doesn't support it +static uint8 *stbi_gif_load_next(stbi *s, stbi_gif *g, int *comp, int req_comp) +{ + int i; + uint8 *old_out = 0; + + if (g->out == 0) { + if (!stbi_gif_header(s, g, comp,0)) return 0; // failure_reason set by stbi_gif_header + g->out = (uint8 *) malloc(4 * g->w * g->h); + if (g->out == 0) return epuc("outofmem", "Out of memory"); + stbi_fill_gif_background(g); + } else { + // animated-gif-only path + if (((g->eflags & 0x1C) >> 2) == 3) { + old_out = g->out; + g->out = (uint8 *) malloc(4 * g->w * g->h); + if (g->out == 0) return epuc("outofmem", "Out of memory"); + memcpy(g->out, old_out, g->w*g->h*4); + } + } + + for (;;) { + switch (get8(s)) { + case 0x2C: /* Image Descriptor */ + { + int32 x, y, w, h; + uint8 *o; + + x = get16le(s); + y = get16le(s); + w = get16le(s); + h = get16le(s); + if (((x + w) > (g->w)) || ((y + h) > (g->h))) + return epuc("bad Image Descriptor", "Corrupt GIF"); + + g->line_size = g->w * 4; + g->start_x = x * 4; + g->start_y = y * g->line_size; + g->max_x = g->start_x + w * 4; + g->max_y = g->start_y + h * g->line_size; + g->cur_x = g->start_x; + g->cur_y = g->start_y; + + g->lflags = get8(s); + + if (g->lflags & 0x40) { + g->step = 8 * g->line_size; // first interlaced spacing + g->parse = 3; + } else { + g->step = g->line_size; + g->parse = 0; + } + + if (g->lflags & 0x80) { + stbi_gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1); + g->color_table = (uint8 *) g->lpal; + } else if (g->flags & 0x80) { + for (i=0; i < 256; ++i) // @OPTIMIZE: reset only the previous transparent + g->pal[i][3] = 255; + if (g->transparent >= 0 && (g->eflags & 0x01)) + g->pal[g->transparent][3] = 0; + g->color_table = (uint8 *) g->pal; + } else + return epuc("missing color table", "Corrupt GIF"); + + o = stbi_process_gif_raster(s, g); + if (o == NULL) return NULL; + + if (req_comp && req_comp != 4) + o = convert_format(o, 4, req_comp, g->w, g->h); + return o; + } + + case 0x21: // Comment Extension. + { + int len; + if (get8(s) == 0xF9) { // Graphic Control Extension. 
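// (Reference layout of the sub-block handled here, taken from the GIF89a spec rather than from this
//  patch:
//    0x21 0xF9 0x04 <packed> <delay lo> <delay hi> <transparent index> 0x00
//  <packed> is what gets stored in g->eflags: bit 0 is the transparency flag (tested as eflags & 0x01
//  elsewhere in this loader) and bits 2..4 are the disposal method ((eflags & 0x1C) >> 2 above).)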
+ len = get8(s); + if (len == 4) { + g->eflags = get8(s); + get16le(s); // delay + g->transparent = get8(s); + } else { + skip(s, len); + break; + } + } + while ((len = get8(s)) != 0) + skip(s, len); + break; + } + + case 0x3B: // gif stream termination code + return (uint8 *) 1; + + default: + return epuc("unknown code", "Corrupt GIF"); + } + } +} + +#ifndef STBI_NO_STDIO +stbi_uc *stbi_gif_load (char const *filename, int *x, int *y, int *comp, int req_comp) +{ + uint8 *data; + FILE *f = fopen(filename, "rb"); + if (!f) return NULL; + data = stbi_gif_load_from_file(f, x,y,comp,req_comp); + fclose(f); + return data; +} + +stbi_uc *stbi_gif_load_from_file (FILE *f, int *x, int *y, int *comp, int req_comp) +{ + uint8 *u = 0; + stbi s; + stbi_gif g={0}; + start_file(&s, f); + + u = stbi_gif_load_next(&s, &g, comp, req_comp); + if (u == (void *) 1) u = 0; // end of animated gif marker + if (u) { + *x = g.w; + *y = g.h; + } + + return u; +} +#endif + +stbi_uc *stbi_gif_load_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + uint8 *u = 0; + stbi s; + stbi_gif g={0}; + start_mem(&s, buffer, len); + u = stbi_gif_load_next(&s, &g, comp, req_comp); + if (u == (void *) 1) u = 0; // end of animated gif marker + if (u) { + *x = g.w; + *y = g.h; + } + return u; +} + +#ifndef STBI_NO_STDIO +int stbi_gif_info (char const *filename, int *x, int *y, int *comp) +{ + int res; + FILE *f = fopen(filename, "rb"); + if (!f) return 0; + res = stbi_gif_info_from_file(f, x, y, comp); + fclose(f); + return res; +} + +int stbi_gif_info_from_file(FILE *f, int *x, int *y, int *comp) +{ + stbi s; + int res; + long n = ftell(f); + start_file(&s, f); + res = stbi_gif_info_raw(&s, x, y, comp); + fseek(f, n, SEEK_SET); + return res; +} +#endif // !STBI_NO_STDIO + +int stbi_gif_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp) +{ + stbi s; + start_mem(&s, buffer, len); + return stbi_gif_info_raw(&s, x, y, comp); +} + + + + +// ************************************************************************************************* +// Radiance RGBE HDR loader +// originally by Nicolas Schulz +#ifndef STBI_NO_HDR +static int hdr_test(stbi *s) +{ + const char *signature = "#?RADIANCE\n"; + int i; + for (i=0; signature[i]; ++i) + if (get8(s) != signature[i]) + return 0; + return 1; +} + +int stbi_hdr_test_memory(stbi_uc const *buffer, int len) +{ + stbi s; + start_mem(&s, buffer, len); + return hdr_test(&s); +} + +#ifndef STBI_NO_STDIO +int stbi_hdr_test_file(FILE *f) +{ + stbi s; + int r,n = ftell(f); + start_file(&s, f); + r = hdr_test(&s); + fseek(f,n,SEEK_SET); + return r; +} +#endif + +#define HDR_BUFLEN 1024 +static char *hdr_gettoken(stbi *z, char *buffer) +{ + int len=0; + char c = '\0'; + + c = (char) get8(z); + + while (!at_eof(z) && c != '\n') { + buffer[len++] = c; + if (len == HDR_BUFLEN-1) { + // flush to end of line + while (!at_eof(z) && get8(z) != '\n') + ; + break; + } + c = (char) get8(z); + } + + buffer[len] = 0; + return buffer; +} + +static void hdr_convert(float *output, stbi_uc *input, int req_comp) +{ + if ( input[3] != 0 ) { + float f1; + // Exponent + f1 = (float) ldexp(1.0f, input[3] - (int)(128 + 8)); + if (req_comp <= 2) + output[0] = (input[0] + input[1] + input[2]) * f1 / 3; + else { + output[0] = input[0] * f1; + output[1] = input[1] * f1; + output[2] = input[2] * f1; + } + if (req_comp == 2) output[1] = 1; + if (req_comp == 4) output[3] = 1; + } else { + switch (req_comp) { + case 4: output[3] = 1; /* fallthrough */ + case 3: output[0] = 
output[1] = output[2] = 0; + break; + case 2: output[1] = 1; /* fallthrough */ + case 1: output[0] = 0; + break; + } + } +} + + +static float *hdr_load(stbi *s, int *x, int *y, int *comp, int req_comp) +{ + char buffer[HDR_BUFLEN]; + char *token; + int valid = 0; + int width, height; + stbi_uc *scanline; + float *hdr_data; + int len; + unsigned char count, value; + int i, j, k, c1,c2, z; + + + // Check identifier + if (strcmp(hdr_gettoken(s,buffer), "#?RADIANCE") != 0) + return epf("not HDR", "Corrupt HDR image"); + + // Parse header + for(;;) { + token = hdr_gettoken(s,buffer); + if (token[0] == 0) break; + if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1; + } + + if (!valid) return epf("unsupported format", "Unsupported HDR format"); + + // Parse width and height + // can't use sscanf() if we're not using stdio! + token = hdr_gettoken(s,buffer); + if (strncmp(token, "-Y ", 3)) return epf("unsupported data layout", "Unsupported HDR format"); + token += 3; + height = strtol(token, &token, 10); + while (*token == ' ') ++token; + if (strncmp(token, "+X ", 3)) return epf("unsupported data layout", "Unsupported HDR format"); + token += 3; + width = strtol(token, NULL, 10); + + *x = width; + *y = height; + + *comp = 3; + if (req_comp == 0) req_comp = 3; + + // Read data + hdr_data = (float *) malloc(height * width * req_comp * sizeof(float)); + + // Load image data + // image data is stored as some number of sca + if ( width < 8 || width >= 32768) { + // Read flat data + for (j=0; j < height; ++j) { + for (i=0; i < width; ++i) { + stbi_uc rgbe[4]; + main_decode_loop: + getn(s, rgbe, 4); + hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp); + } + } + } else { + // Read RLE-encoded data + scanline = NULL; + + for (j = 0; j < height; ++j) { + c1 = get8(s); + c2 = get8(s); + len = get8(s); + if (c1 != 2 || c2 != 2 || (len & 0x80)) { + // not run-length encoded, so we have to actually use THIS data as a decoded + // pixel (note this can't be a valid pixel--one of RGB must be >= 128) + uint8 rgbe[4]; + rgbe[0] = (uint8) c1; + rgbe[1] = (uint8) c2; + rgbe[2] = (uint8) len; + rgbe[3] = (uint8) get8u(s); + hdr_convert(hdr_data, rgbe, req_comp); + i = 1; + j = 0; + free(scanline); + goto main_decode_loop; // yes, this makes no sense + } + len <<= 8; + len |= get8(s); + if (len != width) { free(hdr_data); free(scanline); return epf("invalid decoded scanline length", "corrupt HDR"); } + if (scanline == NULL) scanline = (stbi_uc *) malloc(width * 4); + + for (k = 0; k < 4; ++k) { + i = 0; + while (i < width) { + count = get8u(s); + if (count > 128) { + // Run + value = get8u(s); + count -= 128; + for (z = 0; z < count; ++z) + scanline[i++ * 4 + k] = value; + } else { + // Dump + for (z = 0; z < count; ++z) + scanline[i++ * 4 + k] = get8u(s); + } + } + } + for (i=0; i < width; ++i) + hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp); + } + free(scanline); + } + + return hdr_data; +} + +#ifndef STBI_NO_STDIO +float *stbi_hdr_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + stbi s; + start_file(&s,f); + return hdr_load(&s,x,y,comp,req_comp); +} +#endif + +float *stbi_hdr_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi s; + start_mem(&s,buffer, len); + return hdr_load(&s,x,y,comp,req_comp); +} + +#endif // STBI_NO_HDR + + +#ifndef STBI_NO_STDIO +int stbi_info(char const *filename, int *x, int *y, int *comp) +{ + FILE *f = fopen(filename, "rb"); + int result; + if (!f) return e("can't 
fopen", "Unable to open file"); + result = stbi_info_from_file(f, x, y, comp); + fclose(f); + return result; +} + +int stbi_info_from_file(FILE *f, int *x, int *y, int *comp) +{ + if (stbi_jpeg_info_from_file(f, x, y, comp)) + return 1; + if (stbi_png_info_from_file(f, x, y, comp)) + return 1; + if (stbi_gif_info_from_file(f, x, y, comp)) + return 1; + // @TODO: stbi_bmp_info_from_file + // @TODO: stbi_psd_info_from_file + #ifndef STBI_NO_HDR + // @TODO: stbi_hdr_info_from_file + #endif + // test tga last because it's a crappy test! + if (stbi_tga_info_from_file(f, x, y, comp)) + return 1; + return e("unknown image type", "Image not of any known type, or corrupt"); +} +#endif // !STBI_NO_STDIO + +int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp) +{ + if (stbi_jpeg_info_from_memory(buffer, len, x, y, comp)) + return 1; + if (stbi_png_info_from_memory(buffer, len, x, y, comp)) + return 1; + if (stbi_gif_info_from_memory(buffer, len, x, y, comp)) + return 1; + // @TODO: stbi_bmp_info_from_memory + // @TODO: stbi_psd_info_from_memory + #ifndef STBI_NO_HDR + // @TODO: stbi_hdr_info_from_memory + #endif + // test tga last because it's a crappy test! + if (stbi_tga_info_from_memory(buffer, len, x, y, comp)) + return 1; + return e("unknown image type", "Image not of any known type, or corrupt"); +} + +#endif // STBI_HEADER_FILE_ONLY + +/* + revision history: + 1.29 (2010-08-16) various warning fixes from Aurelien Pocheville + 1.28 (2010-08-01) fix bug in GIF palette transparency (SpartanJ) + 1.27 (2010-08-01) + cast-to-uint8 to fix warnings + 1.26 (2010-07-24) + fix bug in file buffering for PNG reported by SpartanJ + 1.25 (2010-07-17) + refix trans_data warning (Won Chun) + 1.24 (2010-07-12) + perf improvements reading from files on platforms with lock-heavy fgetc() + minor perf improvements for jpeg + deprecated type-specific functions so we'll get feedback if they're needed + attempt to fix trans_data warning (Won Chun) + 1.23 fixed bug in iPhone support + 1.22 (2010-07-10) + removed image *writing* support + removed image *writing* support + stbi_info support from Jetro Lauha + GIF support from Jean-Marc Lienher + iPhone PNG-extensions from James Brown + warning-fixes from Nicolas Schulz and Janez Zemva (i.e. Janez (U+017D)emva) + 1.21 fix use of 'uint8' in header (reported by jon blow) + 1.20 added support for Softimage PIC, by Tom Seddon + 1.19 bug in interlaced PNG corruption check (found by ryg) + 1.18 2008-08-02 + fix a threading bug (local mutable static) + 1.17 support interlaced PNG + 1.16 major bugfix - convert_format converted one too many pixels + 1.15 initialize some fields for thread safety + 1.14 fix threadsafe conversion bug + header-file-only version (#define STBI_HEADER_FILE_ONLY before including) + 1.13 threadsafe + 1.12 const qualifiers in the API + 1.11 Support installable IDCT, colorspace conversion routines + 1.10 Fixes for 64-bit (don't use "unsigned long") + optimized upsampling by Fabian "ryg" Giesen + 1.09 Fix format-conversion for PSD code (bad global variables!) 
+ 1.08 Thatcher Ulrich's PSD code integrated by Nicolas Schulz + 1.07 attempt to fix C++ warning/errors again + 1.06 attempt to fix C++ warning/errors again + 1.05 fix TGA loading to return correct *comp and use good luminance calc + 1.04 default float alpha is 1, not 255; use 'void *' for stbi_image_free + 1.03 bugfixes to STBI_NO_STDIO, STBI_NO_HDR + 1.02 support for (subset of) HDR files, float interface for preferred access to them + 1.01 fix bug: possible bug in handling right-side up bmps... not sure + fix bug: the stbi_bmp_load() and stbi_tga_load() functions didn't work at all + 1.00 interface to zlib that skips zlib header + 0.99 correct handling of alpha in palette + 0.98 TGA loader by lonesock; dynamically add loaders (untested) + 0.97 jpeg errors on too large a file; also catch another malloc failure + 0.96 fix detection of invalid v value - particleman@mollyrocket forum + 0.95 during header scan, seek to markers in case of padding + 0.94 STBI_NO_STDIO to disable stdio usage; rename all #defines the same + 0.93 handle jpegtran output; verbose errors + 0.92 read 4,8,16,24,32-bit BMP files of several formats + 0.91 output 24-bit Windows 3.0 BMP files + 0.90 fix a few more warnings; bump version number to approach 1.0 + 0.61 bugfixes due to Marc LeBlanc, Christopher Lloyd + 0.60 fix compiling as c++ + 0.59 fix warnings: merge Dave Moore's -Wall fixes + 0.58 fix bug: zlib uncompressed mode len/nlen was wrong endian + 0.57 fix bug: jpg last huffman symbol before marker was >9 bits but less + than 16 available + 0.56 fix bug: zlib uncompressed mode len vs. nlen + 0.55 fix bug: restart_interval not initialized to 0 + 0.54 allow NULL for 'int *comp' + 0.53 fix bug in png 3->4; speedup png decoding + 0.52 png handles req_comp=3,4 directly; minor cleanup; jpeg comments + 0.51 obey req_comp requests, 1-component jpegs return as 1-component, + on 'test' only check type, not whether we support this variant +*/ Index: ps/trunk/libraries/source/nvtt/src/src/CMakeLists.txt =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/CMakeLists.txt +++ ps/trunk/libraries/source/nvtt/src/src/CMakeLists.txt @@ -1,23 +1,21 @@ +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${NV_SOURCE_DIR}/extern/poshlib) +INCLUDE_DIRECTORIES(${NV_SOURCE_DIR}/extern/stb) + SUBDIRS(nvcore) SUBDIRS(nvmath) SUBDIRS(nvimage) +SUBDIRS(nvthread) SUBDIRS(nvtt) +SUBDIRS(bc6h) +SUBDIRS(bc7) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) +# Make PNG optional (we disable it on macOS) +SET(PNG TRUE CACHE BOOL "") -# initial variables -SET(GLUT TRUE CACHE BOOL "") -SET(GLEW TRUE CACHE BOOL "") -SET(CG TRUE CACHE BOOL "") -SET(CUDA TRUE CACHE BOOL "") -SET(OPENEXR TRUE CACHE BOOL "") -SET(JPEG TRUE CACHE BOOL "") -SET(PNG TRUE CACHE BOOL "") -SET(TIFF TRUE CACHE BOOL "") - -# OpenGL -INCLUDE(FindOpenGL) +# OpenGL +INCLUDE(FindOpenGL) IF(OPENGL_FOUND) MESSAGE(STATUS "Looking for OpenGL - found") ELSE(OPENGL_FOUND) @@ -25,15 +23,12 @@ ENDIF(OPENGL_FOUND) # GLUT -IF(GLUT) - INCLUDE(${NV_CMAKE_DIR}/FindGLUT.cmake) - #INCLUDE(FindGLUT) - IF(GLUT_FOUND) - MESSAGE(STATUS "Looking for GLUT - found") - ELSE(GLUT_FOUND) - MESSAGE(STATUS "Looking for GLUT - not found") - ENDIF(GLUT_FOUND) -ENDIF(GLUT) +#INCLUDE(FindGLUT) +#IF(GLUT_FOUND) +# MESSAGE(STATUS "Looking for GLUT - found") +#ELSE(GLUT_FOUND) +# MESSAGE(STATUS "Looking for GLUT - not found") +#ENDIF(GLUT_FOUND) # DirectX 
INCLUDE(${NV_CMAKE_DIR}/FindDirectX.cmake) @@ -44,105 +39,118 @@ ENDIF(DX10_FOUND) # GLEW -IF(GLEW) - INCLUDE(${NV_CMAKE_DIR}/FindGLEW.cmake) - IF(GLEW_FOUND) - MESSAGE(STATUS "Looking for GLEW - found") - ELSE(GLEW_FOUND) - MESSAGE(STATUS "Looking for GLEW - not found") - ENDIF(GLEW_FOUND) -ENDIF(GLEW) +#INCLUDE(${NV_CMAKE_DIR}/FindGLEW.cmake) +#IF(GLEW_FOUND) +# MESSAGE(STATUS "Looking for GLEW - found") +#ELSE(GLEW_FOUND) +# MESSAGE(STATUS "Looking for GLEW - not found") +#ENDIF(GLEW_FOUND) # Cg -IF(CG) - INCLUDE(${NV_CMAKE_DIR}/FindCg.cmake) - IF(CG_FOUND) - MESSAGE(STATUS "Looking for Cg - found") - ELSE(CG_FOUND) - MESSAGE(STATUS "Looking for Cg - not found") - ENDIF(CG_FOUND) -ENDIF(CG) +#INCLUDE(${NV_CMAKE_DIR}/FindCg.cmake) +#IF(CG_FOUND) +# MESSAGE(STATUS "Looking for Cg - found") +#ELSE(CG_FOUND) +# MESSAGE(STATUS "Looking for Cg - not found") +#ENDIF(CG_FOUND) # CUDA -IF(CUDA) - INCLUDE(${NV_CMAKE_DIR}/FindCUDA.cmake) - IF(CUDA_FOUND) - SET(HAVE_CUDA ${CUDA_FOUND} CACHE BOOL "Set to TRUE if CUDA is found, FALSE otherwise") - MESSAGE(STATUS "Looking for CUDA - found") - ELSE(CUDA_FOUND) - MESSAGE(STATUS "Looking for CUDA - not found") - ENDIF(CUDA_FOUND) -ENDIF(CUDA) +#FIND_PACKAGE(CUDA) +#IF(CUDA_FOUND) +# IF(MINGW) +# MESSAGE(STATUS "Looking for CUDA - not supported on MinGW") +# UNSET(CUDA_FOUND) +# ENDIF(MINGW) +# IF(CUDA_FOUND) +# SET(HAVE_CUDA ${CUDA_FOUND} CACHE BOOL "Set to TRUE if CUDA is found, FALSE otherwise") +# MESSAGE(STATUS "Looking for CUDA - found") +# ENDIF(CUDA_FOUND) +#ELSE(CUDA_FOUND) +# MESSAGE(STATUS "Looking for CUDA - not found") +#ENDIF(CUDA_FOUND) # Maya -INCLUDE(${NV_CMAKE_DIR}/FindMaya.cmake) -IF(MAYA_FOUND) - SET(HAVE_MAYA ${MAYA_FOUND} CACHE BOOL "Set to TRUE if Maya is found, FALSE otherwise") - MESSAGE(STATUS "Looking for Maya - found") -ELSE(MAYA_FOUND) - MESSAGE(STATUS "Looking for Maya - not found") -ENDIF(MAYA_FOUND) - -# JPEG -IF(JPEG) - INCLUDE(FindJPEG) - IF(JPEG_FOUND) - SET(HAVE_JPEG ${JPEG_FOUND} CACHE BOOL "Set to TRUE if JPEG is found, FALSE otherwise") - MESSAGE(STATUS "Looking for JPEG - found") - ELSE(JPEG_FOUND) - MESSAGE(STATUS "Looking for JPEG - not found") - ENDIF(JPEG_FOUND) -ENDIF(JPEG) - -# PNG -IF(PNG) - INCLUDE(FindPNG) - IF(PNG_FOUND) - SET(HAVE_PNG ${PNG_FOUND} CACHE BOOL "Set to TRUE if PNG is found, FALSE otherwise") - MESSAGE(STATUS "Looking for PNG - found") - ELSE(PNG_FOUND) - MESSAGE(STATUS "Looking for PNG - not found") - ENDIF(PNG_FOUND) -ENDIF(PNG) - -# TIFF -IF(TIFF) - INCLUDE(FindTIFF) - IF(TIFF_FOUND) - SET(HAVE_TIFF ${TIFF_FOUND} CACHE BOOL "Set to TRUE if TIFF is found, FALSE otherwise") - MESSAGE(STATUS "Looking for TIFF - found") - ELSE(TIFF_FOUND) - MESSAGE(STATUS "Looking for TIFF - not found") - ENDIF(TIFF_FOUND) -ENDIF(TIFF) - -# OpenEXR -IF(OPENEXR) - INCLUDE(${NV_CMAKE_DIR}/FindOpenEXR.cmake) - IF(OPENEXR_FOUND) - SET(HAVE_OPENEXR ${OPENEXR_FOUND} CACHE BOOL "Set to TRUE if OpenEXR is found, FALSE otherwise") - MESSAGE(STATUS "Looking for OpenEXR - found") - ELSE(OPENEXR_FOUND) - MESSAGE(STATUS "Looking for OpenEXR - not found") - ENDIF(OPENEXR_FOUND) -ENDIF(OPENEXR) - -# Qt -# We don't actually use this and it requires having Qt4 installed, so why is this in here? 
-#FIND_PACKAGE(Qt4) +#INCLUDE(${NV_CMAKE_DIR}/FindMaya.cmake) +#IF(MAYA_FOUND) +# SET(HAVE_MAYA ${MAYA_FOUND} CACHE BOOL "Set to TRUE if Maya is found, FALSE otherwise") +# MESSAGE(STATUS "Looking for Maya - found") +#ELSE(MAYA_FOUND) +# MESSAGE(STATUS "Looking for Maya - not found") +#ENDIF(MAYA_FOUND) + +# FreeImage +#INCLUDE(${NV_CMAKE_DIR}/FindFreeImage.cmake) +#IF(FREEIMAGE_FOUND) +# SET(HAVE_FREEIMAGE ${FREEIMAGE_FOUND} CACHE BOOL "Set to TRUE if FreeImage is found, FALSE otherwise") +# MESSAGE(STATUS "Looking for FreeImage - found") +#ELSE(FREEIMAGE_FOUND) +# MESSAGE(STATUS "Looking for FreeImage - not found") +#ENDIF(FREEIMAGE_FOUND) + +# JPEG +#INCLUDE(FindJPEG) +#IF(JPEG_FOUND) +# SET(HAVE_JPEG ${JPEG_FOUND} CACHE BOOL "Set to TRUE if JPEG is found, FALSE otherwise") +# MESSAGE(STATUS "Looking for JPEG - found") +#ELSE(JPEG_FOUND) +# MESSAGE(STATUS "Looking for JPEG - not found") +#ENDIF(JPEG_FOUND) + +# PNG +IF(PNG) + INCLUDE(FindPNG) + IF(PNG_FOUND) + SET(HAVE_PNG ${PNG_FOUND} CACHE BOOL "Set to TRUE if PNG is found, FALSE otherwise") + MESSAGE(STATUS "Looking for PNG - found") + ELSE(PNG_FOUND) + MESSAGE(STATUS "Looking for PNG - not found") + ENDIF(PNG_FOUND) +ENDIF(PNG) + +# TIFF +#SET(TIFF_NAMES libtiff) +#INCLUDE(FindTIFF) +#IF(TIFF_FOUND) +# SET(HAVE_TIFF ${TIFF_FOUND} CACHE BOOL "Set to TRUE if TIFF is found, FALSE otherwise") +# MESSAGE(STATUS "Looking for TIFF - found") +#ELSE(TIFF_FOUND) +# MESSAGE(STATUS "Looking for TIFF - not found") +#ENDIF(TIFF_FOUND) + +# OpenEXR +#INCLUDE(${NV_CMAKE_DIR}/FindOpenEXR.cmake) +#IF(OPENEXR_FOUND) +# SET(HAVE_OPENEXR ${OPENEXR_FOUND} CACHE BOOL "Set to TRUE if OpenEXR is found, FALSE otherwise") +# MESSAGE(STATUS "Looking for OpenEXR - found") +#ELSE(OPENEXR_FOUND) +# MESSAGE(STATUS "Looking for OpenEXR - not found") +#ENDIF(OPENEXR_FOUND) + +# OpenMP +#INCLUDE(FindOpenMP) +#IF(OPENMP_FOUND) +# SET(HAVE_OPENMP ${OPENMP_FOUND} CACHE BOOL "Set to TRUE if OpenMP is found, FALSE otherwise") +# MESSAGE(STATUS "Looking for OpenMP - found") +# SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") +# SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +#ELSE(OPENMP_FOUND) +# MESSAGE(STATUS "Looking for OpenMP - not found") +#ENDIF(OPENMP_FOUND) # Threads FIND_PACKAGE(Threads REQUIRED) MESSAGE(STATUS "Use thread library: ${CMAKE_THREAD_LIBS_INIT}") +SET(LIBS ${LIBS} ${CMAKE_THREAD_LIBS_INIT}) # configuration file INCLUDE(CheckIncludeFiles) -CHECK_INCLUDE_FILES(unistd.h HAVE_UNISTD_H) -CHECK_INCLUDE_FILES(stdarg.h HAVE_STDARG_H) -CHECK_INCLUDE_FILES(signal.h HAVE_SIGNAL_H) -CHECK_INCLUDE_FILES(execinfo.h HAVE_EXECINFO_H) -CHECK_INCLUDE_FILES(malloc.h HAVE_MALLOC_H) +CHECK_INCLUDE_FILES("unistd.h" HAVE_UNISTD_H) +CHECK_INCLUDE_FILES("stdarg.h" HAVE_STDARG_H) +CHECK_INCLUDE_FILES("signal.h" HAVE_SIGNAL_H) +CHECK_INCLUDE_FILES("execinfo.h" HAVE_EXECINFO_H) +CHECK_INCLUDE_FILES("malloc.h" HAVE_MALLOC_H) +CHECK_INCLUDE_FILES("dispatch/dispatch.h" HAVE_DISPATCH_H) CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/nvconfig.h.in ${CMAKE_CURRENT_BINARY_DIR}/nvconfig.h) Index: ps/trunk/libraries/source/nvtt/src/src/bc6h/CMakeLists.txt =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc6h/CMakeLists.txt +++ ps/trunk/libraries/source/nvtt/src/src/bc6h/CMakeLists.txt @@ -0,0 +1,22 @@ +PROJECT(bc6h) + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +SET(BC6H_SRCS + bits.h + shapes_two.h + tile.h + zoh_utils.cpp + zoh_utils.h + zoh.cpp + zoh.h + zohone.cpp + zohtwo.cpp) + 
+ADD_LIBRARY(bc6h STATIC ${BC6H_SRCS}) + +IF(NOT WIN32) + IF(CMAKE_COMPILER_IS_GNUCXX) + SET_TARGET_PROPERTIES(bc6h PROPERTIES COMPILE_FLAGS -fPIC) + ENDIF(CMAKE_COMPILER_IS_GNUCXX) +ENDIF(NOT WIN32) Index: ps/trunk/libraries/source/nvtt/src/src/bc6h/bits.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc6h/bits.h +++ ps/trunk/libraries/source/nvtt/src/src/bc6h/bits.h @@ -0,0 +1,76 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ +#pragma once +#ifndef _ZOH_BITS_H +#define _ZOH_BITS_H + +// read/write a bitstream + +#include "nvcore/Debug.h" + +namespace ZOH { + +class Bits +{ +public: + + Bits(char *data, int maxdatabits) { nvAssert (data && maxdatabits > 0); bptr = bend = 0; bits = data; maxbits = maxdatabits; readonly = 0;} + Bits(const char *data, int availdatabits) { nvAssert (data && availdatabits > 0); bptr = 0; bend = availdatabits; cbits = data; maxbits = availdatabits; readonly = 1;} + + void write(int value, int nbits) { + nvAssert (nbits >= 0 && nbits < 32); + nvAssert (sizeof(int)>= 4); + for (int i=0; i<nbits; ++i) + writeone(value>>i); + } + int read(int nbits) { + nvAssert (nbits >= 0 && nbits < 32); + nvAssert (sizeof(int)>= 4); + int out = 0; + for (int i=0; i<nbits; ++i) + out |= readone()<<i; + return out; + } + int getptr() { return bptr; } + void setptr(int ptr) { nvAssert (ptr >= 0 && ptr < maxbits); bptr = ptr; } + int getsize() { return bend; } + +private: + int bptr; // next bit to read + int bend; // last written bit + 1 + char *bits; // ptr to user bit stream + const char *cbits; // ptr to const user bit stream + int maxbits; // max size of user bit stream + char readonly; // 1 if this is a read-only stream + + int readone() { + nvAssert (bptr < bend); + if (bptr >= bend) return 0; + int bit = (readonly ? cbits[bptr>>3] : bits[bptr>>3]) & (1 << (bptr & 7)); + ++bptr; + return bit != 0; + } + void writeone(int bit) { + nvAssert (!readonly); // "Writing a read-only bit stream" + nvAssert (bptr < maxbits); + if (bptr >= maxbits) return; + if (bit&1) + bits[bptr>>3] |= 1 << (bptr & 7); + else + bits[bptr>>3] &= ~(1 << (bptr & 7)); + if (bptr++ >= bend) bend = bptr; + } +}; + +} + +#endif Index: ps/trunk/libraries/source/nvtt/src/src/bc6h/shapes_two.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc6h/shapes_two.h +++ ps/trunk/libraries/source/nvtt/src/src/bc6h/shapes_two.h @@ -0,0 +1,133 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License.
+*/ +#pragma once +#ifndef _ZOH_SHAPES_TWO_H +#define _ZOH_SHAPES_TWO_H + +// shapes for two regions + +#define NREGIONS 2 +#define NSHAPES 64 +#define SHAPEBITS 6 + +static const int shapes[NSHAPES*16] = +{ +0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, +0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, +0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, +0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, + +0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, +0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, +0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, +0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, + +0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, +0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, +0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, + +0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + +0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, +1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, +1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, +1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, + +0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, +0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, +0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, +0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, + +0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, +0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, +0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, +0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, + +0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, +0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, +1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, +1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, + +0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, +0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, +0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, +0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, + +0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, +1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, +0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, +1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, + +0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, +0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, +1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, +1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, + +0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, +1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, +1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, +0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, + +0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, +1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, +0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, + +0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, +1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, +1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, +0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, + +0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, +1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, +1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, +1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, + +0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, +1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, +0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, +0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, + +}; + +#define REGION(x,y,si) shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16] + +static const int shapeindex_to_compressed_indices[NSHAPES*2] = +{ + 0,15, 0,15, 0,15, 0,15, + 0,15, 
0,15, 0,15, 0,15, + 0,15, 0,15, 0,15, 0,15, + 0,15, 0,15, 0,15, 0,15, + + 0,15, 0, 2, 0, 8, 0, 2, + 0, 2, 0, 8, 0, 8, 0,15, + 0, 2, 0, 8, 0, 2, 0, 2, + 0, 8, 0, 8, 0, 2, 0, 2, + + 0,15, 0,15, 0, 6, 0, 8, + 0, 2, 0, 8, 0,15, 0,15, + 0, 2, 0, 8, 0, 2, 0, 2, + 0, 2, 0,15, 0,15, 0, 6, + + 0, 6, 0, 2, 0, 6, 0, 8, + 0,15, 0,15, 0, 2, 0, 2, + 0,15, 0,15, 0,15, 0,15, + 0,15, 0, 2, 0, 2, 0,15 + +}; +#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region) shapeindex_to_compressed_indices[(si)*2+(region)] + +#endif Index: ps/trunk/libraries/source/nvtt/src/src/bc6h/tile.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc6h/tile.h +++ ps/trunk/libraries/source/nvtt/src/src/bc6h/tile.h @@ -0,0 +1,83 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ +#pragma once +#ifndef _ZOH_TILE_H +#define _ZOH_TILE_H + +#include "zoh_utils.h" +#include "nvmath/Vector.h" +#include + +namespace ZOH { + +//#define USE_IMPORTANCE_MAP 1 // define this if you want to increase importance of some pixels in tile +class Tile +{ +public: + // NOTE: this returns the appropriately-clamped BIT PATTERN of the half as an INTEGRAL float value + static float half2float(uint16 h) + { + return (float) Utils::ushort_to_format(h); + } + // NOTE: this is the inverse of the above operation + static uint16 float2half(float f) + { + return Utils::format_to_ushort((int)f); + } + + // look for adjacent pixels that are identical. if there are enough of them, increase their importance + void generate_importance_map() + { + // initialize + for (int y=0; y= size_x || yn < 0 || yn >= size_y) + return false; + return( (data[y][x].x == data[yn][xn].x) && + (data[y][x].y == data[yn][xn].y) && + (data[y][x].z == data[yn][xn].z) ); + } + +#ifdef USE_IMPORTANCE_MAP + bool match_4_neighbor(int x, int y) + { + return is_equal(x,y,x-1,y) || is_equal(x,y,x+1,y) || is_equal(x,y,x,y-1) || is_equal(x,y,x,y+1); + } +#else + bool match_4_neighbor(int x, int y) + { + return false; + } +#endif + + Tile() {}; + ~Tile(){}; + Tile(int xs, int ys) {size_x = xs; size_y = ys;} + + static const int TILE_H = 4; + static const int TILE_W = 4; + static const int TILE_TOTAL = TILE_H * TILE_W; + nv::Vector3 data[TILE_H][TILE_W]; + float importance_map[TILE_H][TILE_W]; + int size_x, size_y; // actual size of tile +}; + +} + +#endif // _ZOH_TILE_H Index: ps/trunk/libraries/source/nvtt/src/src/bc6h/zoh.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc6h/zoh.h +++ ps/trunk/libraries/source/nvtt/src/src/bc6h/zoh.h @@ -0,0 +1,65 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+ +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ +#pragma once +#ifndef _ZOH_H +#define _ZOH_H + +#include "tile.h" + +namespace ZOH { + +// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f + +static const int NREGIONS_TWO = 2; +static const int NREGIONS_ONE = 1; +static const int NCHANNELS = 3; + +struct FltEndpts +{ + nv::Vector3 A; + nv::Vector3 B; +}; + +struct IntEndpts +{ + int A[NCHANNELS]; + int B[NCHANNELS]; +}; + +struct ComprEndpts +{ + uint A[NCHANNELS]; + uint B[NCHANNELS]; +}; + +static const int BLOCKSIZE=16; +static const int BITSIZE=128; + +void compress(const Tile &t, char *block); +void decompress(const char *block, Tile &t); + +float compressone(const Tile &t, char *block); +float compresstwo(const Tile &t, char *block); +void decompressone(const char *block, Tile &t); +void decompresstwo(const char *block, Tile &t); + +float refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block); +float roughtwo(const Tile &tile, int shape, FltEndpts endpts[NREGIONS_TWO]); + +float refineone(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_ONE], char *block); +float roughone(const Tile &tile, int shape, FltEndpts endpts[NREGIONS_ONE]); + +bool isone(const char *block); + +} + +#endif // _ZOH_H Index: ps/trunk/libraries/source/nvtt/src/src/bc6h/zoh.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc6h/zoh.cpp +++ ps/trunk/libraries/source/nvtt/src/src/bc6h/zoh.cpp @@ -0,0 +1,197 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +// the zoh compressor and decompressor + +#include "tile.h" +#include "zoh.h" + +#include // memcpy + +using namespace ZOH; + + +bool ZOH::isone(const char *block) +{ + char code = block[0] & 0x1F; + + return (code == 0x03 || code == 0x07 || code == 0x0b || code == 0x0f); +} + +void ZOH::compress(const Tile &t, char *block) +{ + char oneblock[ZOH::BLOCKSIZE], twoblock[ZOH::BLOCKSIZE]; + + float mseone = ZOH::compressone(t, oneblock); + float msetwo = ZOH::compresstwo(t, twoblock); + + if (mseone <= msetwo) + memcpy(block, oneblock, ZOH::BLOCKSIZE); + else + memcpy(block, twoblock, ZOH::BLOCKSIZE); +} + +void ZOH::decompress(const char *block, Tile &t) +{ + if (ZOH::isone(block)) + ZOH::decompressone(block, t); + else + ZOH::decompresstwo(block, t); +} + +/* +void ZOH::compress(string inf, string zohf) +{ + Array2D pixels; + int w, h; + char block[ZOH::BLOCKSIZE]; + + Exr::readRgba(inf, pixels, w, h); + FILE *zohfile = fopen(zohf.c_str(), "wb"); + if (zohfile == NULL) throw "Unable to open .zoh file for write"; + + // stuff for progress bar O.o + int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W); + int tilecnt = 0; + int ndots = 25; + int dotcnt = 0; + printf("Progress ["); + for (int i=0; i (ntiles * dotcnt)/ndots) { printf("."); fflush(stdout); ++dotcnt; } + } + } + + printf("]\n"); // advance to next line finally + + if (fclose(zohfile)) throw "Close failed on .zoh file"; +} + +static int str2int(std::string s) +{ + int thing; + std::stringstream str (stringstream::in | stringstream::out); + str << s; + str >> thing; + return thing; +} + +// zoh file name is ...-w-h.zoh, extract width and height +static void extract(string zohf, int &w, int &h) +{ + size_t n = zohf.rfind('.', zohf.length()-1); + size_t n1 = zohf.rfind('-', n-1); + size_t n2 = zohf.rfind('-', n1-1); + string width = zohf.substr(n2+1, n1-n2-1); + w = str2int(width); + string height = zohf.substr(n1+1, n-n1-1); + h = str2int(height); +} + +static int mode_to_prec[] = { + 10,7,11,10, + 10,7,11,11, + 10,7,11,12, + 10,7,9,16, + 10,7,8,-1, + 10,7,8,-1, + 10,7,8,-1, + 10,7,6,-1, +}; + +static int shapeindexhist[32], modehist[32], prechistone[16], prechisttwo[16], oneregion, tworegions; + +static void stats(char block[ZOH::BLOCKSIZE]) +{ + char mode = block[0] & 0x1F; if ((mode & 0x3) == 0) mode = 0; if ((mode & 0x3) == 1) mode = 1; modehist[mode]++; + int prec = mode_to_prec[mode]; + nvAssert (prec != -1); + if (!ZOH::isone(block)) + { + tworegions++; + prechisttwo[prec]++; + int shapeindex = ((block[0] & 0xe0) >> 5) | ((block[1] & 0x3) << 3); + shapeindexhist[shapeindex]++; + } + else + { + oneregion++; + prechistone[prec]++; + } +} + +static void printstats() +{ + printf("\nPrecision histogram 10b to 16b one region: "); for (int i=10; i<=16; ++i) printf("%d,", prechistone[i]); + printf("\nPrecision histogram 6b to 11b two regions: "); for (int i=6; i<=11; ++i) printf("%d,", prechisttwo[i]); + printf("\nMode histogram: "); for (int i=0; i<32; ++i) printf("%d,", modehist[i]); + printf("\nShape index histogram: "); for (int i=0; i<32; ++i) printf("%d,", shapeindexhist[i]); + printf("\nOne region %5.2f%% Two regions %5.2f%%", 100.0*oneregion/float(oneregion+tworegions), 100.0*tworegions/float(oneregion+tworegions)); + printf("\n"); +} + +void ZOH::decompress(string zohf, string outf) +{ + Array2D pixels; + int w, h; + char block[ZOH::BLOCKSIZE]; + + extract(zohf, w, h); + FILE *zohfile = fopen(zohf.c_str(), "rb"); + if (zohfile == NULL) throw "Unable to open .zoh file for read"; + 
pixels.resizeErase(h, w); + + // convert to tiles and decompress each tile + for (int y=0; y + +using namespace nv; +using namespace ZOH; + +static const int denom7_weights_64[] = {0, 9, 18, 27, 37, 46, 55, 64}; // divided by 64 +static const int denom15_weights_64[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64}; // divided by 64 + +/*static*/ Format Utils::FORMAT; + +int Utils::lerp(int a, int b, int i, int denom) +{ + nvDebugCheck (denom == 3 || denom == 7 || denom == 15); + nvDebugCheck (i >= 0 && i <= denom); + + int round = 32, shift = 6; + const int *weights; + + switch(denom) + { + case 3: denom *= 5; i *= 5; // fall through to case 15 + case 15: weights = denom15_weights_64; break; + case 7: weights = denom7_weights_64; break; + default: nvDebugCheck(0); + } + + return (a*weights[denom-i] +b*weights[i] + round) >> shift; +} + +Vector3 Utils::lerp(const Vector3& a, const Vector3 &b, int i, int denom) +{ + nvDebugCheck (denom == 3 || denom == 7 || denom == 15); + nvDebugCheck (i >= 0 && i <= denom); + + int shift = 6; + const int *weights; + + switch(denom) + { + case 3: denom *= 5; i *= 5; // fall through to case 15 + case 15: weights = denom15_weights_64; break; + case 7: weights = denom7_weights_64; break; + default: nvUnreachable(); + } + + // no need to round these as this is an exact division + return (a*float(weights[denom-i]) +b*float(weights[i])) / float(1 << shift); +} + + +/* + For unsigned f16, clamp the input to [0,F16MAX]. Thus u15. + For signed f16, clamp the input to [-F16MAX,F16MAX]. Thus s16. + + The conversions proceed as follows: + + unsigned f16: get bits. if high bit set, clamp to 0, else clamp to F16MAX. + signed f16: get bits. extract exp+mantissa and clamp to F16MAX. return -value if sign bit was set, else value + unsigned int: get bits. return as a positive value. + signed int. get bits. return as a value in -32768..32767. + + The inverse conversions are just the inverse of the above. +*/ + +// clamp the 3 channels of the input vector to the allowable range based on FORMAT +// note that each channel is a float storing the allowable range as a bit pattern converted to float +// that is, for unsigned f16 say, we would clamp each channel to the range [0, F16MAX] + +void Utils::clamp(Vector3 &v) +{ + for (int i=0; i<3; ++i) + { + switch(Utils::FORMAT) + { + case UNSIGNED_F16: + if (v.component[i] < 0.0) v.component[i] = 0; + else if (v.component[i] > F16MAX) v.component[i] = F16MAX; + break; + + case SIGNED_F16: + if (v.component[i] < -F16MAX) v.component[i] = -F16MAX; + else if (v.component[i] > F16MAX) v.component[i] = F16MAX; + break; + + default: + nvUnreachable(); + } + } +} + +// convert a u16 value to s17 (represented as an int) based on the format expected +int Utils::ushort_to_format(unsigned short input) +{ + int out, s; + + // clamp to the valid range we are expecting + switch (Utils::FORMAT) + { + case UNSIGNED_F16: + if (input & F16S_MASK) out = 0; + else if (input > F16MAX) out = F16MAX; + else out = input; + break; + + case SIGNED_F16: + s = input & F16S_MASK; + input &= F16EM_MASK; + if (input > F16MAX) out = F16MAX; + else out = input; + out = s ? 
-out : out; + break; + } + return out; +} + +// convert a s17 value to u16 based on the format expected +unsigned short Utils::format_to_ushort(int input) +{ + unsigned short out; + + // clamp to the valid range we are expecting + switch (Utils::FORMAT) + { + case UNSIGNED_F16: + nvDebugCheck (input >= 0 && input <= F16MAX); + out = input; + break; + + case SIGNED_F16: + nvDebugCheck (input >= -F16MAX && input <= F16MAX); + // convert to sign-magnitude + int s; + if (input < 0) { s = F16S_MASK; input = -input; } + else { s = 0; } + out = s | input; + break; + } + return out; +} + +// quantize the input range into equal-sized bins +int Utils::quantize(float value, int prec) +{ + int q, ivalue, s; + + nvDebugCheck (prec > 1); // didn't bother to make it work for 1 + + value = (float)floor(value + 0.5); + + int bias = (prec > 10) ? ((1<<(prec-1))-1) : 0; // bias precisions 11..16 to get a more accurate quantization + + switch (Utils::FORMAT) + { + case UNSIGNED_F16: + nvDebugCheck (value >= 0 && value <= F16MAX); + ivalue = (int)value; + q = ((ivalue << prec) + bias) / (F16MAX+1); + nvDebugCheck (q >= 0 && q < (1 << prec)); + break; + + case SIGNED_F16: + nvDebugCheck (value >= -F16MAX && value <= F16MAX); + // convert to sign-magnitude + ivalue = (int)value; + if (ivalue < 0) { s = 1; ivalue = -ivalue; } else s = 0; + + q = ((ivalue << (prec-1)) + bias) / (F16MAX+1); + if (s) + q = -q; + nvDebugCheck (q > -(1 << (prec-1)) && q < (1 << (prec-1))); + break; + } + + return q; +} + +int Utils::finish_unquantize(int q, int prec) +{ + if (Utils::FORMAT == UNSIGNED_F16) + return (q * 31) >> 6; // scale the magnitude by 31/64 + else if (Utils::FORMAT == SIGNED_F16) + return (q < 0) ? -(((-q) * 31) >> 5) : (q * 31) >> 5; // scale the magnitude by 31/32 + else + return q; +} + +// unquantize each bin to midpoint of original bin range, except +// for the end bins which we push to an endpoint of the bin range. +// we do this to ensure we can represent all possible original values. +// the asymmetric end bins do not affect PSNR for the test images. +// +// code this function assuming an arbitrary bit pattern as the encoded block +int Utils::unquantize(int q, int prec) +{ + int unq, s; + + nvDebugCheck (prec > 1); // not implemented for prec 1 + + switch (Utils::FORMAT) + { + // modify this case to move the multiplication by 31 after interpolation. + // Need to use finish_unquantize. + + // since we have 16 bits available, let's unquantize this to 16 bits unsigned + // thus the scale factor is [0-7c00)/[0-10000) = 31/64 + case UNSIGNED_F16: + if (prec >= 15) + unq = q; + else if (q == 0) + unq = 0; + else if (q == ((1<<prec)-1)) + unq = U16MAX; + else + unq = (q * (U16MAX+1) + (U16MAX+1)/2) >> prec; + break; + + // here, let's stick with S16 (no apparent quality benefit from going to S17) + // range is (-7c00..7c00)/(-8000..8000) = 31/32 + case SIGNED_F16: + // don't remove this test even though it appears equivalent to the code below + // as it isn't -- the code below can overflow for prec = 16 + if (prec >= 16) + unq = q; + else + { + if (q < 0) { s = 1; q = -q; } else s = 0; + + if (q == 0) + unq = 0; + else if (q >= ((1<<(prec-1))-1)) + unq = s ? -S16MAX : S16MAX; + else + { + unq = (q * (S16MAX+1) + (S16MAX+1)/2) >> (prec-1); + if (s) + unq = -unq; + } + } + break; + } + return unq; +} + + + +// pick a norm!
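// (Illustrative note: with NORM_EUCLIDEAN, norm() below returns the squared Euclidean distance, so
//  no sqrt is needed where errors are only compared -- e.g. norm((0,0,0),(1,2,2)) = 1 + 4 + 4 = 9,
//  whereas NORM_ABS would give |1| + |2| + |2| = 5.)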
+#define NORM_EUCLIDEAN 1 + +float Utils::norm(const Vector3 &a, const Vector3 &b) +{ +#ifdef NORM_EUCLIDEAN + return lengthSquared(a - b); +#endif +#ifdef NORM_ABS + Vector3 err = a - b; + return fabs(err.x) + fabs(err.y) + fabs(err.z); +#endif +} + +// parse [{:}]{,} +// the pointer starts here ^ +// name is 1 or 2 chars and matches field names. start and end are decimal numbers +void Utils::parse(const char *encoding, int &ptr, Field &field, int &endbit, int &len) +{ + if (ptr <= 0) return; + --ptr; + if (encoding[ptr] == ',') --ptr; + nvDebugCheck (encoding[ptr] == ']'); + --ptr; + endbit = 0; + int scale = 1; + while (encoding[ptr] != ':' && encoding[ptr] != '[') + { + nvDebugCheck(encoding[ptr] >= '0' && encoding[ptr] <= '9'); + endbit += (encoding[ptr--] - '0') * scale; + scale *= 10; + } + int startbit = 0; scale = 1; + if (encoding[ptr] == '[') + startbit = endbit; + else + { + ptr--; + while (encoding[ptr] != '[') + { + nvDebugCheck(encoding[ptr] >= '0' && encoding[ptr] <= '9'); + startbit += (encoding[ptr--] - '0') * scale; + scale *= 10; + } + } + len = startbit - endbit + 1; // startbit>=endbit note + --ptr; + if (encoding[ptr] == 'm') field = FIELD_M; + else if (encoding[ptr] == 'd') field = FIELD_D; + else { + // it's wxyz + nvDebugCheck (encoding[ptr] >= 'w' && encoding[ptr] <= 'z'); + int foo = encoding[ptr--] - 'w'; + // now it is r g or b + if (encoding[ptr] == 'r') foo += 10; + else if (encoding[ptr] == 'g') foo += 20; + else if (encoding[ptr] == 'b') foo += 30; + else nvDebugCheck(0); + field = (Field) foo; + } +} + + Index: ps/trunk/libraries/source/nvtt/src/src/bc6h/zohone.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc6h/zohone.cpp +++ ps/trunk/libraries/source/nvtt/src/src/bc6h/zohone.cpp @@ -0,0 +1,799 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +// one region zoh compress/decompress code +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +#include "bits.h" +#include "tile.h" +#include "zoh.h" +#include "zoh_utils.h" + +#include "nvmath/Vector.inl" +#include "nvmath/Fitting.h" + +#include // strlen +#include // FLT_MAX + +using namespace nv; +using namespace ZOH; + +#define NINDICES 16 +#define INDEXBITS 4 +#define HIGH_INDEXBIT (1<<(INDEXBITS-1)) +#define DENOM (NINDICES-1) + +#define NSHAPES 1 + +static const int shapes[NSHAPES] = +{ + 0x0000 +}; // only 1 shape + +#define REGION(x,y,shapeindex) ((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0) + +#define POS_TO_X(pos) ((pos)&3) +#define POS_TO_Y(pos) (((pos)>>2)&3) + +#define NDELTA 2 + +struct Chanpat +{ + int prec[NDELTA]; // precision pattern for one channel +}; + +struct Pattern +{ + Chanpat chan[NCHANNELS];// allow different bit patterns per channel -- but we still want constant precision per channel + int transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define MAXMODEBITS 5 +#define MAXMODES (1<> 2) & 3 and x = index & 3 +static void swap_indices(IntEndpts endpts[NREGIONS_ONE], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex) +{ + int index_positions[NREGIONS_ONE]; + + index_positions[0] = 0; // since WLOG we have the high bit of the shapes at 0 + + for (int region = 0; region < NREGIONS_ONE; ++region) + { + int x = index_positions[region] & 3; + int y = (index_positions[region] >> 2) & 3; + nvDebugCheck(REGION(x,y,shapeindex) == region); // double check the table + if (indices[y][x] & HIGH_INDEXBIT) + { + // high bit is set, swap the endpts and indices for this region + int t; + for (int i=0; i> endbit, len); break; + case FIELD_RW: out.write(rw >> endbit, len); break; + case FIELD_RX: out.write(rx >> endbit, len); break; + case FIELD_GW: out.write(gw >> endbit, len); break; + case FIELD_GX: out.write(gx >> endbit, len); break; + case FIELD_BW: out.write(bw >> endbit, len); break; + case FIELD_BX: out.write(bx >> endbit, len); break; + + case FIELD_D: + case FIELD_RY: + case FIELD_RZ: + case FIELD_GY: + case FIELD_GZ: + case FIELD_BY: + case FIELD_BZ: + default: nvUnreachable(); + } + } +} + +static void read_header(Bits &in, ComprEndpts endpts[NREGIONS_ONE], Pattern &p) +{ + // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode + int mode = in.read(2); + if (mode != 0x00 && mode != 0x01) + mode = (in.read(3) << 2) | mode; + + int pat_index = mode_to_pat[mode]; + + nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS); + nvDebugCheck (in.getptr() == patterns[pat_index].modebits); + + p = patterns[pat_index]; + + int d; + int rw, rx; + int gw, gx; + int bw, bx; + + d = 0; + rw = rx = 0; + gw = gx = 0; + bw = bx = 0; + + int ptr = int(strlen(p.encoding)); + + while (ptr) + { + Field field; + int endbit, len; + + // !!!UNDONE: get rid of string parsing!!! 
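// (Illustrative example of the encoding-string parsing used above and below; the token shown here is
//  an example, the real strings live in the pattern table earlier in this file: Utils::parse() in
//  zoh_utils.cpp walks the string backwards one comma-separated token at a time, so a token written
//  as "rw[9:0]" would yield field = FIELD_RW, endbit = 0, len = 10, and the loop below would then
//  execute rw |= in.read(10) << 0; a single-bit token such as "bx[4]" gives len = 1, endbit = 4.)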
+ Utils::parse(p.encoding, ptr, field, endbit, len); + + switch(field) + { + case FIELD_M: break; // already processed so ignore + case FIELD_RW: rw |= in.read(len) << endbit; break; + case FIELD_RX: rx |= in.read(len) << endbit; break; + case FIELD_GW: gw |= in.read(len) << endbit; break; + case FIELD_GX: gx |= in.read(len) << endbit; break; + case FIELD_BW: bw |= in.read(len) << endbit; break; + case FIELD_BX: bx |= in.read(len) << endbit; break; + + case FIELD_D: + case FIELD_RY: + case FIELD_RZ: + case FIELD_GY: + case FIELD_GZ: + case FIELD_BY: + case FIELD_BZ: + default: nvUnreachable(); + } + } + + nvDebugCheck (in.getptr() == 128 - 63); + + endpts[0].A[0] = rw; endpts[0].B[0] = rx; + endpts[0].A[1] = gw; endpts[0].B[1] = gx; + endpts[0].A[2] = bw; endpts[0].B[2] = bx; +} + +// compress index 0 +static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out) +{ + for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos) + { + int x = POS_TO_X(pos); + int y = POS_TO_Y(pos); + + out.write(indices[y][x], INDEXBITS - ((pos == 0) ? 1 : 0)); + } +} + +static void emit_block(const ComprEndpts endpts[NREGIONS_ONE], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block) +{ + Bits out(block, ZOH::BITSIZE); + + write_header(endpts, p, out); + + write_indices(indices, shapeindex, out); + + nvDebugCheck(out.getptr() == ZOH::BITSIZE); +} + +static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES]) +{ + // scale endpoints + int a, b; // really need a IntVector3... + + a = Utils::unquantize(endpts.A[0], prec); + b = Utils::unquantize(endpts.B[0], prec); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].x = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec)); + + a = Utils::unquantize(endpts.A[1], prec); + b = Utils::unquantize(endpts.B[1], prec); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].y = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec)); + + a = Utils::unquantize(endpts.A[2], prec); + b = Utils::unquantize(endpts.B[2], prec); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].z = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec)); +} + +// position 0 was compressed +static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W]) +{ + for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos) + { + int x = POS_TO_X(pos); + int y = POS_TO_Y(pos); + + indices[y][x]= in.read(INDEXBITS - ((pos == 0) ? 
1 : 0)); + } +} + +void ZOH::decompressone(const char *block, Tile &t) +{ + Bits in(block, ZOH::BITSIZE); + + Pattern p; + IntEndpts endpts[NREGIONS_ONE]; + ComprEndpts compr_endpts[NREGIONS_ONE]; + + read_header(in, compr_endpts, p); + int shapeindex = 0; // only one shape + + decompress_endpts(compr_endpts, endpts, p); + + Vector3 palette[NREGIONS_ONE][NINDICES]; + for (int r = 0; r < NREGIONS_ONE; ++r) + generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]); + + // read indices + int indices[Tile::TILE_H][Tile::TILE_W]; + + read_indices(in, shapeindex, indices); + + nvDebugCheck(in.getptr() == ZOH::BITSIZE); + + // lookup + for (int y = 0; y < Tile::TILE_H; y++) + for (int x = 0; x < Tile::TILE_W; x++) + t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]]; +} + +// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr +static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec) +{ + Vector3 palette[NINDICES]; + float toterr = 0; + Vector3 err; + + generate_palette_quantized(endpts, prec, palette); + + for (int i = 0; i < np; ++i) + { + float err, besterr; + + besterr = Utils::norm(colors[i], palette[0]) * importance[i]; + + for (int j = 1; j < NINDICES && besterr > 0; ++j) + { + err = Utils::norm(colors[i], palette[j]) * importance[i]; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_ONE], int prec, + int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_ONE]) +{ + // build list of possibles + Vector3 palette[NREGIONS_ONE][NINDICES]; + + for (int region = 0; region < NREGIONS_ONE; ++region) + { + generate_palette_quantized(endpts[region], prec, &palette[region][0]); + toterr[region] = 0; + } + + Vector3 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr; + + besterr = Utils::norm(tile.data[y][x], palette[region][0]); + indices[y][x] = 0; + + for (int i = 1; i < NINDICES && besterr > 0; ++i) + { + err = Utils::norm(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[y][x] = i; + } + } + toterr[region] += besterr; + } +} + +static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts, + float old_err, int do_b) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndpts temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + + // copy real endpoints so we can perturb them + for (int i=0; i>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, 
importance, np, temp_endpts, prec); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + } + } + // if this was an improvement, move the endpoint and continue search from there + if (improved) + { + if (do_b == 0) + new_endpts.A[ch] += beststep; + else + new_endpts.B[ch] += beststep; + } + } + return min_err; +} + +static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts) +{ + float opt_err = orig_err; + for (int ch = 0; ch < NCHANNELS; ++ch) + { + opt_endpts.A[ch] = orig_endpts.A[ch]; + opt_endpts.B[ch] = orig_endpts.B[ch]; + } + /* + err0 = perturb(rgb0, delta0) + err1 = perturb(rgb1, delta1) + if (err0 < err1) + if (err0 >= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndpts new_a, new_b; + IntEndpts new_endpt; + int do_b; + + // now optimize each channel separately + for (int ch = 0; ch < NCHANNELS; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + opt_endpts.A[ch] = new_a.A[ch]; + opt_err = err0; + do_b = 1; // do B next + } + else + { + if (err1 >= opt_err) + continue; + opt_endpts.B[ch] = new_b.B[ch]; + opt_err = err1; + do_b = 0; // do A next + } + + // now alternate endpoints and keep trying until there is no improvement + for (;;) + { + float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b); + if (err >= opt_err) + break; + if (do_b == 0) + opt_endpts.A[ch] = new_endpt.A[ch]; + else + opt_endpts.B[ch] = new_endpt.B[ch]; + opt_err = err; + do_b = 1 - do_b; // now move the other endpoint + } + } +} + +static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_ONE], + const IntEndpts orig_endpts[NREGIONS_ONE], int prec, IntEndpts opt_endpts[NREGIONS_ONE]) +{ + Vector3 pixels[Tile::TILE_TOTAL]; + float importance[Tile::TILE_TOTAL]; + float err = 0; + + for (int region=0; region 0; ++i) + { + err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x]; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +float ZOH::roughone(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_ONE]) +{ + for (int region=0; region maxp) maxp = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + minp*direction; + endpts[region].B = mean + maxp*direction; + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + Utils::clamp(endpts[region].A); + Utils::clamp(endpts[region].B); + } + + return map_colors(tile, shapeindex, endpts); +} + +float ZOH::compressone(const Tile &t, char *block) +{ + int 
shapeindex_best = 0; + FltEndpts endptsbest[NREGIONS_ONE], tempendpts[NREGIONS_ONE]; + float msebest = FLT_MAX; + + /* + collect the mse values that are within 5% of the best values + optimize each one and choose the best + */ + // hack for now -- just use the best value WORK + for (int i=0; i0.0; ++i) + { + float mse = roughone(t, i, tempendpts); + if (mse < msebest) + { + msebest = mse; + shapeindex_best = i; + memcpy(endptsbest, tempendpts, sizeof(endptsbest)); + } + + } + return refineone(t, shapeindex_best, endptsbest, block); +} Index: ps/trunk/libraries/source/nvtt/src/src/bc6h/zohtwo.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc6h/zohtwo.cpp +++ ps/trunk/libraries/source/nvtt/src/src/bc6h/zohtwo.cpp @@ -0,0 +1,883 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +// two regions zoh compress/decompress code +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +/* optimization algorithm + + get initial float endpoints + convert endpoints using 16 bit precision, transform, and get bit delta. choose likely endpoint compression candidates. + note that there will be 1 or 2 candidates; 2 will be chosen when the delta values are close to the max possible. + for each EC candidate in order from max precision to smaller precision + convert endpoints using the appropriate precision. + optimize the endpoints and minimize square error. save the error and index assignments. apply index compression as well. + (thus the endpoints and indices are in final form.) + transform and get bit delta. + if the bit delta fits, exit + if we ended up with no candidates somehow, choose the tail set of EC candidates and retry. this should happen hardly ever. + add a state variable to nvDebugCheck we only do this once. + convert to bit stream. + return the error. + + Global optimization + order all tiles based on their errors + do something special for high-error tiles + the goal here is to try to avoid tiling artifacts. but I think this is a research problem. let's just generate an error image... + + display an image that shows partitioning and precision selected for each tile +*/ + +#include "bits.h" +#include "tile.h" +#include "zoh.h" +#include "zoh_utils.h" + +#include "nvmath/Fitting.h" +#include "nvmath/Vector.inl" + +#include // strlen +#include // FLT_MAX + +using namespace nv; +using namespace ZOH; + +#define NINDICES 8 +#define INDEXBITS 3 +#define HIGH_INDEXBIT (1<<(INDEXBITS-1)) +#define DENOM (NINDICES-1) + +// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like? +// i.e. can we search shapes in a particular order so we can see the global error minima easily and +// stop without having to touch all shapes? 
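// A note on the "bit delta" test in the comment above: once a candidate pattern's transform is
// applied, the non-base endpoints are stored as signed per-channel differences from the base
// endpoint, and a candidate precision is usable only if every difference fits the pattern's
// delta width. This is an illustrative sketch only, not the fit test the compressor itself
// uses; delta_fits is a placeholder name.
static bool delta_fits(int base, int endpoint, int deltaprec)
{
	// a signed delta d fits in p bits iff -(1 << (p-1)) <= d < (1 << (p-1))
	const int d = endpoint - base;
	return d >= -(1 << (deltaprec - 1)) && d < (1 << (deltaprec - 1));
}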
+ +#include "shapes_two.h" +// use only the first 32 available shapes +#undef NSHAPES +#undef SHAPEBITS +#define NSHAPES 32 +#define SHAPEBITS 5 + +#define POS_TO_X(pos) ((pos)&3) +#define POS_TO_Y(pos) (((pos)>>2)&3) + +#define NDELTA 4 + +struct Chanpat +{ + int prec[NDELTA]; // precision pattern for one channel +}; + +struct Pattern +{ + Chanpat chan[NCHANNELS]; // allow different bit patterns per channel -- but we still want constant precision per channel + int transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define MAXMODEBITS 5 +#define MAXMODES (1<> endbit, len); break; + case FIELD_D: out.write( d >> endbit, len); break; + case FIELD_RW: out.write(rw >> endbit, len); break; + case FIELD_RX: out.write(rx >> endbit, len); break; + case FIELD_RY: out.write(ry >> endbit, len); break; + case FIELD_RZ: out.write(rz >> endbit, len); break; + case FIELD_GW: out.write(gw >> endbit, len); break; + case FIELD_GX: out.write(gx >> endbit, len); break; + case FIELD_GY: out.write(gy >> endbit, len); break; + case FIELD_GZ: out.write(gz >> endbit, len); break; + case FIELD_BW: out.write(bw >> endbit, len); break; + case FIELD_BX: out.write(bx >> endbit, len); break; + case FIELD_BY: out.write(by >> endbit, len); break; + case FIELD_BZ: out.write(bz >> endbit, len); break; + default: nvUnreachable(); + } + } +} + +static bool read_header(Bits &in, ComprEndpts endpts[NREGIONS_TWO], int &shapeindex, Pattern &p) +{ + // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode + int mode = in.read(2); + if (mode != 0x00 && mode != 0x01) + mode = (in.read(3) << 2) | mode; + + int pat_index = mode_to_pat[mode]; + + if (pat_index == -2) + return false; // reserved mode found + + nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS); + nvDebugCheck (in.getptr() == patterns[pat_index].modebits); + + p = patterns[pat_index]; + + int d; + int rw, rx, ry, rz; + int gw, gx, gy, gz; + int bw, bx, by, bz; + + d = 0; + rw = rx = ry = rz = 0; + gw = gx = gy = gz = 0; + bw = bx = by = bz = 0; + + int ptr = int(strlen(p.encoding)); + + while (ptr) + { + Field field; + int endbit, len; + + // !!!UNDONE: get rid of string parsing!!! 
+ Utils::parse(p.encoding, ptr, field, endbit, len); + + switch(field) + { + case FIELD_M: break; // already processed so ignore + case FIELD_D: d |= in.read(len) << endbit; break; + case FIELD_RW: rw |= in.read(len) << endbit; break; + case FIELD_RX: rx |= in.read(len) << endbit; break; + case FIELD_RY: ry |= in.read(len) << endbit; break; + case FIELD_RZ: rz |= in.read(len) << endbit; break; + case FIELD_GW: gw |= in.read(len) << endbit; break; + case FIELD_GX: gx |= in.read(len) << endbit; break; + case FIELD_GY: gy |= in.read(len) << endbit; break; + case FIELD_GZ: gz |= in.read(len) << endbit; break; + case FIELD_BW: bw |= in.read(len) << endbit; break; + case FIELD_BX: bx |= in.read(len) << endbit; break; + case FIELD_BY: by |= in.read(len) << endbit; break; + case FIELD_BZ: bz |= in.read(len) << endbit; break; + default: nvUnreachable(); + } + } + + nvDebugCheck (in.getptr() == 128 - 46); + + shapeindex = d; + endpts[0].A[0] = rw; endpts[0].B[0] = rx; endpts[1].A[0] = ry; endpts[1].B[0] = rz; + endpts[0].A[1] = gw; endpts[0].B[1] = gx; endpts[1].A[1] = gy; endpts[1].B[1] = gz; + endpts[0].A[2] = bw; endpts[0].B[2] = bx; endpts[1].A[2] = by; endpts[1].B[2] = bz; + + return true; +} + +static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out) +{ + int positions[NREGIONS_TWO]; + + for (int r = 0; r < NREGIONS_TWO; ++r) + positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r); + + for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos) + { + int x = POS_TO_X(pos); + int y = POS_TO_Y(pos); + + bool match = false; + + for (int r = 0; r < NREGIONS_TWO; ++r) + if (positions[r] == pos) { match = true; break; } + + out.write(indices[y][x], INDEXBITS - (match ? 1 : 0)); + } +} + +static void emit_block(const ComprEndpts compr_endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block) +{ + Bits out(block, ZOH::BITSIZE); + + write_header(compr_endpts, shapeindex, p, out); + + write_indices(indices, shapeindex, out); + + nvDebugCheck(out.getptr() == ZOH::BITSIZE); +} + +static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES]) +{ + // scale endpoints + int a, b; // really need a IntVector3... + + a = Utils::unquantize(endpts.A[0], prec); + b = Utils::unquantize(endpts.B[0], prec); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].x = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec)); + + a = Utils::unquantize(endpts.A[1], prec); + b = Utils::unquantize(endpts.B[1], prec); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].y = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec)); + + a = Utils::unquantize(endpts.A[2], prec); + b = Utils::unquantize(endpts.B[2], prec); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].z = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec)); +} + +static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W]) +{ + int positions[NREGIONS_TWO]; + + for (int r = 0; r < NREGIONS_TWO; ++r) + positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r); + + for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos) + { + int x = POS_TO_X(pos); + int y = POS_TO_Y(pos); + + bool match = false; + + for (int r = 0; r < NREGIONS_TWO; ++r) + if (positions[r] == pos) { match = true; break; } + + indices[y][x]= in.read(INDEXBITS - (match ? 
1 : 0)); + } +} + +void ZOH::decompresstwo(const char *block, Tile &t) +{ + Bits in(block, ZOH::BITSIZE); + + Pattern p; + IntEndpts endpts[NREGIONS_TWO]; + ComprEndpts compr_endpts[NREGIONS_TWO]; + int shapeindex; + + if (!read_header(in, compr_endpts, shapeindex, p)) + { + // reserved mode, return all zeroes + for (int y = 0; y < Tile::TILE_H; y++) + for (int x = 0; x < Tile::TILE_W; x++) + t.data[y][x] = Vector3(0.0f); + + return; + } + + decompress_endpts(compr_endpts, endpts, p); + + Vector3 palette[NREGIONS_TWO][NINDICES]; + for (int r = 0; r < NREGIONS_TWO; ++r) + generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]); + + int indices[Tile::TILE_H][Tile::TILE_W]; + + read_indices(in, shapeindex, indices); + + nvDebugCheck(in.getptr() == ZOH::BITSIZE); + + // lookup + for (int y = 0; y < Tile::TILE_H; y++) + for (int x = 0; x < Tile::TILE_W; x++) + t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]]; +} + +// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr +static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec) +{ + Vector3 palette[NINDICES]; + float toterr = 0; + Vector3 err; + + generate_palette_quantized(endpts, prec, palette); + + for (int i = 0; i < np; ++i) + { + float err, besterr; + + besterr = Utils::norm(colors[i], palette[0]) * importance[i]; + + for (int j = 1; j < NINDICES && besterr > 0; ++j) + { + err = Utils::norm(colors[i], palette[j]) * importance[i]; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_TWO], int prec, + int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_TWO]) +{ + // build list of possibles + Vector3 palette[NREGIONS_TWO][NINDICES]; + + for (int region = 0; region < NREGIONS_TWO; ++region) + { + generate_palette_quantized(endpts[region], prec, &palette[region][0]); + toterr[region] = 0; + } + + Vector3 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr; + + besterr = Utils::norm(tile.data[y][x], palette[region][0]); + indices[y][x] = 0; + + for (int i = 1; i < NINDICES && besterr > 0; ++i) + { + err = Utils::norm(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[y][x] = i; + } + } + toterr[region] += besterr; + } +} + +static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts, + float old_err, int do_b) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndpts temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + + // copy real endpoints so we can perturb them + for (int i=0; i>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = 
new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, temp_endpts, prec); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + } + } + // if this was an improvement, move the endpoint and continue search from there + if (improved) + { + if (do_b == 0) + new_endpts.A[ch] += beststep; + else + new_endpts.B[ch] += beststep; + } + } + return min_err; +} + +static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts) +{ + float opt_err = orig_err; + for (int ch = 0; ch < NCHANNELS; ++ch) + { + opt_endpts.A[ch] = orig_endpts.A[ch]; + opt_endpts.B[ch] = orig_endpts.B[ch]; + } + /* + err0 = perturb(rgb0, delta0) + err1 = perturb(rgb1, delta1) + if (err0 < err1) + if (err0 >= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndpts new_a, new_b; + IntEndpts new_endpt; + int do_b; + + // now optimize each channel separately + for (int ch = 0; ch < NCHANNELS; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + opt_endpts.A[ch] = new_a.A[ch]; + opt_err = err0; + do_b = 1; // do B next + } + else + { + if (err1 >= opt_err) + continue; + opt_endpts.B[ch] = new_b.B[ch]; + opt_err = err1; + do_b = 0; // do A next + } + + // now alternate endpoints and keep trying until there is no improvement + for (;;) + { + float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b); + if (err >= opt_err) + break; + if (do_b == 0) + opt_endpts.A[ch] = new_endpt.A[ch]; + else + opt_endpts.B[ch] = new_endpt.B[ch]; + opt_err = err; + do_b = 1 - do_b; // now move the other endpoint + } + } +} + +static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_TWO], + const IntEndpts orig_endpts[NREGIONS_TWO], int prec, IntEndpts opt_endpts[NREGIONS_TWO]) +{ + Vector3 pixels[Tile::TILE_TOTAL]; + float importance[Tile::TILE_TOTAL]; + float err = 0; + + for (int region=0; region 0; ++i) + { + err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x]; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +float ZOH::roughtwo(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_TWO]) +{ + for (int region=0; region maxp) maxp = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + minp*direction; + endpts[region].B = mean + maxp*direction; + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + Utils::clamp(endpts[region].A); + 
Utils::clamp(endpts[region].B); + } + + return map_colors(tile, shapeindex, endpts); +} + +float ZOH::compresstwo(const Tile &t, char *block) +{ + int shapeindex_best = 0; + FltEndpts endptsbest[NREGIONS_TWO], tempendpts[NREGIONS_TWO]; + float msebest = FLT_MAX; + + /* + collect the mse values that are within 5% of the best values + optimize each one and choose the best + */ + // hack for now -- just use the best value WORK + for (int i=0; i0.0; ++i) + { + float mse = roughtwo(t, i, tempendpts); + if (mse < msebest) + { + msebest = mse; + shapeindex_best = i; + memcpy(endptsbest, tempendpts, sizeof(endptsbest)); + } + + } + return refinetwo(t, shapeindex_best, endptsbest, block); +} + Index: ps/trunk/libraries/source/nvtt/src/src/bc7/CMakeLists.txt =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/CMakeLists.txt +++ ps/trunk/libraries/source/nvtt/src/src/bc7/CMakeLists.txt @@ -0,0 +1,30 @@ +PROJECT(bc7) + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +SET(BC7_SRCS + avpcl.cpp + avpcl.h + avpcl_mode0.cpp + avpcl_mode1.cpp + avpcl_mode2.cpp + avpcl_mode3.cpp + avpcl_mode4.cpp + avpcl_mode5.cpp + avpcl_mode6.cpp + avpcl_mode7.cpp + bits.h + endpts.h + shapes_three.h + shapes_two.h + tile.h + avpcl_utils.cpp + avpcl_utils.h) + +ADD_LIBRARY(bc7 STATIC ${BC7_SRCS}) + +IF(NOT WIN32) + IF(CMAKE_COMPILER_IS_GNUCXX) + SET_TARGET_PROPERTIES(bc7 PROPERTIES COMPILE_FLAGS -fPIC) + ENDIF(CMAKE_COMPILER_IS_GNUCXX) +ENDIF(NOT WIN32) Index: ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl.h +++ ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl.h @@ -0,0 +1,99 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +#ifndef _AVPCL_H +#define _AVPCL_H + +#include "tile.h" +#include "bits.h" + +#define DISABLE_EXHAUSTIVE 1 // define this if you don't want to spend a lot of time on exhaustive compression +#define USE_ZOH_INTERP 1 // use zoh interpolator, otherwise use exact avpcl interpolators +#define USE_ZOH_INTERP_ROUNDED 1 // use the rounded versions! 
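// Note on DISABLE_EXHAUSTIVE: when it is defined, the per-channel exhaustive endpoint scan in
// the avpcl_mode*.cpp compressors is clamped to a fixed +/-3 neighbourhood around each
// optimized endpoint (see the "#ifdef DISABLE_EXHAUSTIVE" blocks there) instead of widening
// with the current error, trading a little quality for much shorter compression time.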
+ +namespace AVPCL { + +static const int NREGIONS_TWO = 2; +static const int NREGIONS_THREE = 3; + +static const int BLOCKSIZE=16; +static const int BITSIZE=128; + +// global flags +extern bool flag_premult; +extern bool flag_nonuniform; +extern bool flag_nonuniform_ati; + +// global mode +extern bool mode_rgb; // true if image had constant alpha = 255 + +void compress(const Tile &t, char *block); +void decompress(const char *block, Tile &t); + +float compress_mode0(const Tile &t, char *block); +void decompress_mode0(const char *block, Tile &t); + +float compress_mode1(const Tile &t, char *block); +void decompress_mode1(const char *block, Tile &t); + +float compress_mode2(const Tile &t, char *block); +void decompress_mode2(const char *block, Tile &t); + +float compress_mode3(const Tile &t, char *block); +void decompress_mode3(const char *block, Tile &t); + +float compress_mode4(const Tile &t, char *block); +void decompress_mode4(const char *block, Tile &t); + +float compress_mode5(const Tile &t, char *block); +void decompress_mode5(const char *block, Tile &t); + +float compress_mode6(const Tile &t, char *block); +void decompress_mode6(const char *block, Tile &t); + +float compress_mode7(const Tile &t, char *block); +void decompress_mode7(const char *block, Tile &t); + +inline int getmode(Bits &in) +{ + int mode = 0; + + if (in.read(1)) mode = 0; + else if (in.read(1)) mode = 1; + else if (in.read(1)) mode = 2; + else if (in.read(1)) mode = 3; + else if (in.read(1)) mode = 4; + else if (in.read(1)) mode = 5; + else if (in.read(1)) mode = 6; + else if (in.read(1)) mode = 7; + else mode = 8; // reserved + return mode; +} +inline int getmode(const char *block) +{ + int bits = block[0], mode = 0; + + if (bits & 1) mode = 0; + else if ((bits&3) == 2) mode = 1; + else if ((bits&7) == 4) mode = 2; + else if ((bits & 0xF) == 8) mode = 3; + else if ((bits & 0x1F) == 16) mode = 4; + else if ((bits & 0x3F) == 32) mode = 5; + else if ((bits & 0x7F) == 64) mode = 6; + else if ((bits & 0xFF) == 128) mode = 7; + else mode = 8; // reserved + return mode; +} + +} + +#endif Index: ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl.cpp +++ ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl.cpp @@ -0,0 +1,264 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +// the avpcl compressor and decompressor + +#include "tile.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include +#include + +using namespace nv; +using namespace AVPCL; + +// global flags +bool AVPCL::flag_premult = false; +bool AVPCL::flag_nonuniform = false; +bool AVPCL::flag_nonuniform_ati = false; + +// global mode +bool AVPCL::mode_rgb = false; // true if image had constant alpha = 255 + +void AVPCL::compress(const Tile &t, char *block) +{ + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + + float mse_mode0 = AVPCL::compress_mode0(t, tempblock); if(mse_mode0 < msebest) { msebest = mse_mode0; memcpy(block, tempblock, AVPCL::BLOCKSIZE); } + float mse_mode1 = AVPCL::compress_mode1(t, tempblock); if(mse_mode1 < msebest) { msebest = mse_mode1; memcpy(block, tempblock, AVPCL::BLOCKSIZE); } + float mse_mode2 = AVPCL::compress_mode2(t, tempblock); if(mse_mode2 < msebest) { msebest = mse_mode2; memcpy(block, tempblock, AVPCL::BLOCKSIZE); } + float mse_mode3 = AVPCL::compress_mode3(t, tempblock); if(mse_mode3 < msebest) { msebest = mse_mode3; memcpy(block, tempblock, AVPCL::BLOCKSIZE); } + float mse_mode4 = AVPCL::compress_mode4(t, tempblock); if(mse_mode4 < msebest) { msebest = mse_mode4; memcpy(block, tempblock, AVPCL::BLOCKSIZE); } + float mse_mode5 = AVPCL::compress_mode5(t, tempblock); if(mse_mode5 < msebest) { msebest = mse_mode5; memcpy(block, tempblock, AVPCL::BLOCKSIZE); } + float mse_mode6 = AVPCL::compress_mode6(t, tempblock); if(mse_mode6 < msebest) { msebest = mse_mode6; memcpy(block, tempblock, AVPCL::BLOCKSIZE); } + float mse_mode7 = AVPCL::compress_mode7(t, tempblock); if(mse_mode7 < msebest) { msebest = mse_mode7; memcpy(block, tempblock, AVPCL::BLOCKSIZE); } + + /*if (errfile) + { + float errs[21]; + int nerrs = 8; + errs[0] = mse_mode0; + errs[1] = mse_mode1; + errs[2] = mse_mode2; + errs[3] = mse_mode3; + errs[4] = mse_mode4; + errs[5] = mse_mode5; + errs[6] = mse_mode6; + errs[7] = mse_mode7; + if (fwrite(errs, sizeof(float), nerrs, errfile) != nerrs) + throw "Write error on error file"; + }*/ +} + +/* +static int getbit(char *b, int start) +{ + if (start < 0 || start >= 128) return 0; // out of range + + int ix = start >> 3; + return (b[ix] & (1 << (start & 7))) != 0; +} + +static int getbits(char *b, int start, int len) +{ + int out = 0; + for (int i=0; i= 128) return; // out of range + + int ix = start >> 3; + + if (bit & 1) + b[ix] |= (1 << (start & 7)); + else + b[ix] &= ~(1 << (start & 7)); +} + +static void setbits(char *b, int start, int len, int bits) +{ + for (int i=0; i> i); +} +*/ + +void AVPCL::decompress(const char *cblock, Tile &t) +{ + char block[AVPCL::BLOCKSIZE]; + memcpy(block, cblock, AVPCL::BLOCKSIZE); + + switch(getmode(block)) + { + case 0: AVPCL::decompress_mode0(block, t); break; + case 1: AVPCL::decompress_mode1(block, t); break; + case 2: AVPCL::decompress_mode2(block, t); break; + case 3: AVPCL::decompress_mode3(block, t); break; + case 4: AVPCL::decompress_mode4(block, t); break; + case 5: AVPCL::decompress_mode5(block, t); break; + case 6: AVPCL::decompress_mode6(block, t); break; + case 7: AVPCL::decompress_mode7(block, t); break; + case 8: // return a black tile if you get a reserved mode + for (int y=0; y pixels; + int w, h; + char block[AVPCL::BLOCKSIZE]; + + Targa::read(inf, pixels, w, h); + FILE *avpclfile = fopen(avpclf.c_str(), "wb"); + if (avpclfile == NULL) throw "Unable to open .avpcl file for write"; + FILE *errfile = NULL; + if (errf != "") + { + errfile = 
fopen(errf.c_str(), "wb"); + if (errfile == NULL) throw "Unable to open error file for write"; + } + + // Look at alpha channel and override the premult flag if alpha is constant (but only if premult is set) + if (AVPCL::flag_premult) + { + if (AVPCL::mode_rgb) + { + AVPCL::flag_premult = false; + cout << endl << "NOTE: Source image alpha is constant 255, turning off premultiplied-alpha error metric." << endl << endl; + } + } + + // stuff for progress bar O.o + int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W); + int tilecnt = 0; + clock_t start, prev, cur; + + start = prev = clock(); + + // convert to tiles and compress each tile + for (int y=0; y> thing; + return thing; +} + +// avpcl file name is ...-w-h-RGB[A].avpcl, extract width and height +static void extract(string avpclf, int &w, int &h, bool &mode_rgb) +{ + size_t n = avpclf.rfind('.', avpclf.length()-1); + size_t n1 = avpclf.rfind('-', n-1); + size_t n2 = avpclf.rfind('-', n1-1); + size_t n3 = avpclf.rfind('-', n2-1); + // ...-wwww-hhhh-RGB[A].avpcl + // ^ ^ ^ ^ + // n3 n2 n1 n n3 pixels; + int w, h; + char block[AVPCL::BLOCKSIZE]; + + extract(avpclf, w, h, AVPCL::mode_rgb); + FILE *avpclfile = fopen(avpclf.c_str(), "rb"); + if (avpclfile == NULL) throw "Unable to open .avpcl file for read"; + pixels.resizeErase(h, w); + + // convert to tiles and decompress each tile + for (int y=0; y +#include + +#include "shapes_three.h" + +// use only the first 16 available shapes +#undef NSHAPES +#undef SHAPEBITS +#define NSHAPES 16 +#define SHAPEBITS 4 + +using namespace nv; +using namespace AVPCL; + +#define NLSBMODES 4 // number of different lsb modes per region. since we have two .1 per region, that can have 4 values + +#define NINDICES 8 +#define INDEXBITS 3 +#define HIGH_INDEXBIT (1<<(INDEXBITS-1)) +#define DENOM (NINDICES-1) +#define BIAS (DENOM/2) + +// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like? +// i.e. can we search shapes in a particular order so we can see the global error minima easily and +// stop without having to touch all shapes? + +#define POS_TO_X(pos) ((pos)&3) +#define POS_TO_Y(pos) (((pos)>>2)&3) + +#define NBITSIZES (NREGIONS*2) +#define ABITINDEX(region) (2*(region)+0) +#define BBITINDEX(region) (2*(region)+1) + +struct ChanBits +{ + int nbitsizes[NBITSIZES]; // bitsizes for one channel +}; + +struct Pattern +{ + ChanBits chan[NCHANNELS_RGB];// bit patterns used per channel + int transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define NPATTERNS 1 + +static Pattern patterns[NPATTERNS] = +{ + // red green blue xfm mode mb + 4,4,4,4,4,4, 4,4,4,4,4,4, 4,4,4,4,4,4, 0, 0x1, 1, "", // really 444.1 x 6 +}; + +struct RegionPrec +{ + int endpt_a_prec[NCHANNELS_RGB]; + int endpt_b_prec[NCHANNELS_RGB]; +}; + +struct PatternPrec +{ + RegionPrec region_precs[NREGIONS]; +}; + +// this is the precision for each channel and region +// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this! +static PatternPrec pattern_precs[NPATTERNS] = +{ + 4,4,4, 4,4,4, 4,4,4, 4,4,4, 4,4,4, 4,4,4, +}; + +// return # of bits needed to store n. 
handle signed or unsigned cases properly +static int nbits(int n, bool issigned) +{ + int nb; + if (n==0) + return 0; // no bits needed for 0, signed or not + else if (n > 0) + { + for (nb=0; n; ++nb, n>>=1) ; + return nb + (issigned?1:0); + } + else + { + nvAssert (issigned); + for (nb=0; n<-1; ++nb, n>>=1) ; + return nb + 1; + } +} + +static void transform_forward(IntEndptsRGB_2 ep[NREGIONS]) +{ + nvUnreachable(); +} + +static void transform_inverse(IntEndptsRGB_2 ep[NREGIONS]) +{ + nvUnreachable(); +} + +// endpoints are 555,555; reduce to 444,444 and put the lsb bit majority in compr_bits +static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_2& compr_endpts) +{ + int onescnt; + + onescnt = 0; + for (int j=0; j> 1; + nvAssert (compr_endpts.A[j] < 16); + } + compr_endpts.a_lsb = onescnt >= 2; + + onescnt = 0; + for (int j=0; j> 1; + nvAssert (compr_endpts.B[j] < 16); + } + compr_endpts.b_lsb = onescnt >= 2; +} + +static void uncompress_one(const IntEndptsRGB_2& compr_endpts, IntEndptsRGB& endpts) +{ + for (int j=0; j= 0 && pat_index < NPATTERNS); + nvAssert (in.getptr() == patterns[pat_index].modebits); + + shapeindex = in.read(SHAPEBITS); + p = patterns[pat_index]; + + for (int j=0; j 0; ++j) + { + float err = Utils::metric4(colors[i], palette[j]) * importance[i]; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[i] = j; + } + } + toterr += besterr; + + // check for early exit + if (toterr > current_err) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + indices[k] = -1; + + return FLT_MAX; + } + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_2 endpts[NREGIONS], const PatternPrec &pattern_prec, + int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + for (int region = 0; region < NREGIONS; ++region) + { + generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]); + toterr[region] = 0; + } + + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[y][x] = i; + } + } + toterr[region] += besterr; + } +} + +// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's +// this function returns either old_err or a value smaller (if it was successful in improving the error) +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGB_2 &old_endpts, IntEndptsRGB_2 &new_endpts, + float old_err, int do_b, int indices[Tile::TILE_TOTAL]) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndptsRGB_2 temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 
2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + for (int i=0; i 5000 perturb endpoints 50% of precision +// if err > 1000 25% +// if err > 200 12.5% +// if err > 40 6.25% +// for np = 16 -- adjust error thresholds as a function of np +// always ensure endpoint ordering is preserved (no need to overlap the scan) +// if orig_err returned from this is less than its input value, then indices[] will contain valid indices +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float &orig_err, IntEndptsRGB_2 &opt_endpts, int indices[Tile::TILE_TOTAL]) +{ + IntEndptsRGB_2 temp_endpts; + float best_err = orig_err; + int aprec = region_prec.endpt_a_prec[ch]; + int bprec = region_prec.endpt_b_prec[ch]; + int good_indices[Tile::TILE_TOTAL]; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i 5000.0*thr_scale) { adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; } + else if (orig_err > 1000.0*thr_scale) { adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; } + else if (orig_err > 200.0*thr_scale) { adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; } + else if (orig_err > 40.0*thr_scale) { adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; } + adelta = max(adelta, 3); + bdelta = max(bdelta, 3); + +#ifdef DISABLE_EXHAUSTIVE + adelta = bdelta = 3; +#endif + + temp_endpts = opt_endpts; + + // ok figure out the range of A and B + int alow = max(0, opt_endpts.A[ch] - adelta); + int ahigh = min((1<= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndptsRGB_2 new_a, new_b; + IntEndptsRGB_2 new_endpt; + int do_b; + int orig_indices[Tile::TILE_TOTAL]; + int new_indices[Tile::TILE_TOTAL]; + int temp_indices0[Tile::TILE_TOTAL]; + int temp_indices1[Tile::TILE_TOTAL]; + + // now optimize each channel separately + // for the first error improvement, we save the indices. then, for any later improvement, we compare the indices + // if they differ, we restart the loop (which then falls back to looking for a first improvement.) 
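	// Each perturb_one() call below runs a shrinking-step search on a single endpoint channel:
	// the step is halved every round ("step >>= 1" in perturb_one above) and a move is kept only
	// when the re-mapped error improves, so the per-channel cost grows with the endpoint bit
	// width rather than with the full endpoint range.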
+ for (int ch = 0; ch < NCHANNELS_RGB; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + for (int i=0; i= opt_err) + continue; + + for (int i=0; i= opt_err) + break; + + for (int i=0; i> 1) & 1; + + // make sure we have a valid error for temp_in + // we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts + // (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position) + float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); + + // now try to optimize these endpoints + float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); + + // if we find an improvement, update the best so far and correct the output endpoints and errors + if (temp_out_err < best_err) + { + best_err = temp_out_err; + opt_err[region] = temp_out_err; + opt_endpts[region] = temp_out; + } + } + } +} + +/* optimization algorithm + for each pattern + convert endpoints using pattern precision + assign indices and get initial error + compress indices (and possibly reorder endpoints) + transform endpoints + if transformed endpoints fit pattern + get original endpoints back + optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better + compress new indices + transform new endpoints + if new endpoints fit pattern AND if error is improved + emit compressed block with new data + else + emit compressed block with original data // to try to preserve maximum endpoint precision +*/ + +static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block) +{ + float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS]; + IntEndptsRGB_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS]; + int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W]; + + for (int sp = 0; sp < NPATTERNS; ++sp) + { + quantize_endpts(endpts, pattern_precs[sp], orig_endpts); + assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err); + swap_indices(orig_endpts, orig_indices, shapeindex_best); + if (patterns[sp].transformed) + transform_forward(orig_endpts); + // apply a heuristic here -- we check if the endpoints fit before we try to optimize them. + // the assumption made is that if they don't fit now, they won't fit after optimizing. 
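	// For mode 0 the single entry in "patterns" has its transform flag set to 0, so the two
	// "if (patterns[sp].transformed)" branches in this function never execute -- which is why
	// transform_forward() and transform_inverse() above are plain nvUnreachable() stubs.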
+ if (endpts_fit(orig_endpts, patterns[sp])) + { + if (patterns[sp].transformed) + transform_inverse(orig_endpts); + optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts); + assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err); + // (nreed) Commented out asserts because they go off all the time...not sure why + //for (int i=0; i 255.0f) v.x = 255.0f; + if (v.y < 0.0f) v.y = 0.0f; + if (v.y > 255.0f) v.y = 255.0f; + if (v.z < 0.0f) v.z = 0.0f; + if (v.z > 255.0f) v.z = 255.0f; + v.w = 255.0f; +} + +static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES]) +{ + for (int region = 0; region < NREGIONS; ++region) + for (int i = 0; i < NINDICES; ++i) + palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM); +} + +// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined +static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + generate_palette_unquantized(endpts, palette); + + float toterr = 0; + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching. this works for most norms. + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +// for this mode, we assume alpha = 255 constant and compress only the RGB portion. +// however, we do the error check against the actual alpha values supplied for the tile. +static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS]) +{ + for (int region=0; region maxp) maxp = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + minp*Vector4(direction, 0); + endpts[region].B = mean + maxp*Vector4(direction, 0); + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + clamp(endpts[region].A); + clamp(endpts[region].B); + } + + return map_colors(tile, shapeindex, endpts); +} + +static void swap(float *list1, int *list2, int i, int j) +{ + float t = list1[i]; list1[i] = list1[j]; list1[j] = t; + int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1; +} + +float AVPCL::compress_mode0(const Tile &t, char *block) +{ + // number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES + // NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out + const int NITEMS=NSHAPES/4; + + // pick the best NITEMS shapes and refine these. 
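	// With shapes_three.h restricted to NSHAPES = 16 above, NITEMS works out to NSHAPES/4 = 4:
	// the 16 three-region partitions are ranked by the rough (principal-direction) error below,
	// and only the 4 best-ranked shapes are run through the full refine() pass.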
+ struct { + FltEndpts endpts[NREGIONS]; + } all[NSHAPES]; + float roughmse[NSHAPES]; + int index[NSHAPES]; + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + + for (int i=0; i roughmse[j]) + swap(roughmse, index, i, j); + + for (int i=0; i0; ++i) + { + int shape = index[i]; + float mse = refine(t, shape, &all[shape].endpts[0], tempblock); + if (mse < msebest) + { + memcpy(block, tempblock, sizeof(tempblock)); + msebest = mse; + } + } + return msebest; +} + Index: ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode1.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode1.cpp +++ ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode1.cpp @@ -0,0 +1,1047 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +// x10 (666x2).1 (666x2).1 64p 3bi + +#include "bits.h" +#include "tile.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include "nvmath/Matrix.inl" +#include "nvmath/Fitting.h" +#include "avpcl_utils.h" +#include "endpts.h" +#include +#include + +#include "shapes_two.h" + +using namespace nv; +using namespace AVPCL; + +#define NLSBMODES 2 // number of different lsb modes per region. since we have one .1 per region, that can have 2 values + +#define NINDICES 8 +#define INDEXBITS 3 +#define HIGH_INDEXBIT (1<<(INDEXBITS-1)) +#define DENOM (NINDICES-1) +#define BIAS (DENOM/2) + +// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like? +// i.e. can we search shapes in a particular order so we can see the global error minima easily and +// stop without having to touch all shapes? + +#define POS_TO_X(pos) ((pos)&3) +#define POS_TO_Y(pos) (((pos)>>2)&3) + +#define NBITSIZES (NREGIONS*2) +#define ABITINDEX(region) (2*(region)+0) +#define BBITINDEX(region) (2*(region)+1) + +struct ChanBits +{ + int nbitsizes[NBITSIZES]; // bitsizes for one channel +}; + +struct Pattern +{ + ChanBits chan[NCHANNELS_RGB];// bit patterns used per channel + int transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define NPATTERNS 1 + +static Pattern patterns[NPATTERNS] = +{ + // red green blue xfm mode mb + 6,6,6,6, 6,6,6,6, 6,6,6,6, 0, 0x2, 2, "", +}; + +struct RegionPrec +{ + int endpt_a_prec[NCHANNELS_RGB]; + int endpt_b_prec[NCHANNELS_RGB]; +}; + +struct PatternPrec +{ + RegionPrec region_precs[NREGIONS]; +}; + + +// this is the precision for each channel and region +// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this! +static PatternPrec pattern_precs[NPATTERNS] = +{ + 6,6,6, 6,6,6, 6,6,6, 6,6,6, +}; + +// return # of bits needed to store n. 
handle signed or unsigned cases properly +static int nbits(int n, bool issigned) +{ + int nb; + if (n==0) + return 0; // no bits needed for 0, signed or not + else if (n > 0) + { + for (nb=0; n; ++nb, n>>=1) ; + return nb + (issigned?1:0); + } + else + { + nvAssert (issigned); + for (nb=0; n<-1; ++nb, n>>=1) ; + return nb + 1; + } +} + + +static void transform_forward(IntEndptsRGB_1 ep[NREGIONS]) +{ + nvUnreachable(); +} + +static void transform_inverse(IntEndptsRGB_1 ep[NREGIONS]) +{ + nvUnreachable(); +} + +// endpoints are 777,777; reduce to 666,666 and put the lsb bit majority in compr_bits +static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_1& compr_endpts) +{ + int onescnt; + + onescnt = 0; + for (int j=0; j> 1; + onescnt += endpts.B[j] & 1; + compr_endpts.B[j] = endpts.B[j] >> 1; + nvAssert (compr_endpts.A[j] < 64); + nvAssert (compr_endpts.B[j] < 64); + } + compr_endpts.lsb = onescnt >= 3; +} + +static void uncompress_one(const IntEndptsRGB_1& compr_endpts, IntEndptsRGB& endpts) +{ + for (int j=0; j= 0 && pat_index < NPATTERNS); + nvAssert (in.getptr() == patterns[pat_index].modebits); + + shapeindex = in.read(SHAPEBITS); + p = patterns[pat_index]; + + for (int j=0; j 0; ++j) + { + float err = Utils::metric4(colors[i], palette[j]) * importance[i]; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[i] = j; + } + } + toterr += besterr; + + // check for early exit + if (toterr > current_err) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + indices[k] = -1; + + return FLT_MAX; + } + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_1 endpts[NREGIONS], const PatternPrec &pattern_prec, + int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + for (int region = 0; region < NREGIONS; ++region) + { + generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]); + toterr[region] = 0; + } + + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[y][x] = i; + } + } + toterr[region] += besterr; + } +} + +// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's +// this function returns either old_err or a value smaller (if it was successful in improving the error) +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGB_1 &old_endpts, IntEndptsRGB_1 &new_endpts, + float old_err, int do_b, int indices[Tile::TILE_TOTAL]) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndptsRGB_1 temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + 
if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + for (int i=0; i 5000 perturb endpoints 50% of precision +// if err > 1000 25% +// if err > 200 12.5% +// if err > 40 6.25% +// for np = 16 -- adjust error thresholds as a function of np +// always ensure endpoint ordering is preserved (no need to overlap the scan) +// if orig_err returned from this is less than its input value, then indices[] will contain valid indices +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGB_1 &opt_endpts, int indices[Tile::TILE_TOTAL]) +{ + IntEndptsRGB_1 temp_endpts; + float best_err = orig_err; + int aprec = region_prec.endpt_a_prec[ch]; + int bprec = region_prec.endpt_b_prec[ch]; + int good_indices[Tile::TILE_TOTAL]; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i 5000.0*thr_scale) { adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; } + else if (orig_err > 1000.0*thr_scale) { adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; } + else if (orig_err > 200.0*thr_scale) { adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; } + else if (orig_err > 40.0*thr_scale) { adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; } + adelta = max(adelta, 3); + bdelta = max(bdelta, 3); + +#ifdef DISABLE_EXHAUSTIVE + adelta = bdelta = 3; +#endif + + temp_endpts = opt_endpts; + + // ok figure out the range of A and B + int alow = max(0, opt_endpts.A[ch] - adelta); + int ahigh = min((1<= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndptsRGB_1 new_a, new_b; + IntEndptsRGB_1 new_endpt; + int do_b; + int orig_indices[Tile::TILE_TOTAL]; + int new_indices[Tile::TILE_TOTAL]; + int temp_indices0[Tile::TILE_TOTAL]; + int temp_indices1[Tile::TILE_TOTAL]; + + // now optimize each channel separately + // for the first error improvement, we save the indices. then, for any later improvement, we compare the indices + // if they differ, we restart the loop (which then falls back to looking for a first improvement.) 
+ for (int ch = 0; ch < NCHANNELS_RGB; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + for (int i=0; i= opt_err) + continue; + + for (int i=0; i= opt_err) + break; + + for (int i=0; i 255.0f) v.x = 255.0f; + if (v.y < 0.0f) v.y = 0.0f; + if (v.y > 255.0f) v.y = 255.0f; + if (v.z < 0.0f) v.z = 0.0f; + if (v.z > 255.0f) v.z = 255.0f; + v.w = 255.0f; +} + +static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES]) +{ + for (int region = 0; region < NREGIONS; ++region) + for (int i = 0; i < NINDICES; ++i) + palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM); +} + +// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined +static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + generate_palette_unquantized(endpts, palette); + + float toterr = 0; + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + float err = Utils::metric4(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x]; + + if (err > besterr) // error increased, so we're done searching. this works for most norms. + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS]) +{ + for (int region=0; region maxp) maxp = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + minp*Vector4(direction, 0); + endpts[region].B = mean + maxp*Vector4(direction, 0); + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + clamp(endpts[region].A); + clamp(endpts[region].B); + } + + return map_colors(tile, shapeindex, endpts); +} + +static void swap(float *list1, int *list2, int i, int j) +{ + float t = list1[i]; list1[i] = list1[j]; list1[j] = t; + int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1; +} + +float AVPCL::compress_mode1(const Tile &t, char *block) +{ + // number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES + // NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out + const int NITEMS=NSHAPES/4; + + // pick the best NITEMS shapes and refine these. 
+ struct { + FltEndpts endpts[NREGIONS]; + } all[NSHAPES]; + float roughmse[NSHAPES]; + int index[NSHAPES]; + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + + for (int i=0; i roughmse[j]) + swap(roughmse, index, i, j); + + for (int i=0; i0; ++i) + { + int shape = index[i]; + float mse = refine(t, shape, &all[shape].endpts[0], tempblock); + if (mse < msebest) + { + memcpy(block, tempblock, sizeof(tempblock)); + msebest = mse; + } + } + return msebest; +} + Index: ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode2.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode2.cpp +++ ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode2.cpp @@ -0,0 +1,1004 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +// x100 555x6 64p 2bi + +#include "bits.h" +#include "tile.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include "nvmath/Matrix.inl" +#include "nvmath/Fitting.h" +#include "avpcl_utils.h" +#include "endpts.h" +#include +#include + +#include "shapes_three.h" + +using namespace nv; +using namespace AVPCL; + +#define NINDICES 4 +#define INDEXBITS 2 +#define HIGH_INDEXBIT (1<<(INDEXBITS-1)) +#define DENOM (NINDICES-1) +#define BIAS (DENOM/2) + +// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like? +// i.e. can we search shapes in a particular order so we can see the global error minima easily and +// stop without having to touch all shapes? + +#define POS_TO_X(pos) ((pos)&3) +#define POS_TO_Y(pos) (((pos)>>2)&3) + +#define NBITSIZES 6 + +struct ChanBits +{ + int nbitsizes[NBITSIZES]; // bitsizes for one channel +}; + +struct Pattern +{ + ChanBits chan[NCHANNELS_RGB];// bit patterns used per channel + int transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define NPATTERNS 1 + +static Pattern patterns[NPATTERNS] = +{ + // red green blue xfm mode mb + 5,5,5,5,5,5, 5,5,5,5,5,5, 5,5,5,5,5,5, 0, 0x4, 3, "", +}; + + +struct RegionPrec +{ + int endpt_a_prec[NCHANNELS_RGB]; + int endpt_b_prec[NCHANNELS_RGB]; +}; + +struct PatternPrec +{ + RegionPrec region_precs[NREGIONS_THREE]; +}; + + +// this is the precision for each channel and region +// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this! + +static PatternPrec pattern_precs[NPATTERNS] = +{ + 5,5,5, 5,5,5, 5,5,5, 5,5,5, 5,5,5, 5,5,5, +}; + +// return # of bits needed to store n. 
handle signed or unsigned cases properly +static int nbits(int n, bool issigned) +{ + int nb; + if (n==0) + return 0; // no bits needed for 0, signed or not + else if (n > 0) + { + for (nb=0; n; ++nb, n>>=1) ; + return nb + (issigned?1:0); + } + else + { + nvAssert (issigned); + for (nb=0; n<-1; ++nb, n>>=1) ; + return nb + 1; + } +} + +#define R_0 ep[0].A[i] +#define R_1 ep[0].B[i] +#define R_2 ep[1].A[i] +#define R_3 ep[1].B[i] + +static void transform_forward(IntEndptsRGB ep[NREGIONS]) +{ + for (int i=0; i= 0 && pat_index < NPATTERNS); + nvAssert (in.getptr() == patterns[pat_index].modebits); + + shapeindex = in.read(SHAPEBITS); + + p = patterns[pat_index]; + + for (int j=0; j 0; ++j) + { + float err = Utils::metric4(colors[i], palette[j]) * importance[i]; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[i] = j; + } + } + toterr += besterr; + + // check for early exit + if (toterr > current_err) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + indices[k] = -1; + + return FLT_MAX; + } + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB endpts[NREGIONS_THREE], const PatternPrec &pattern_prec, + int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_THREE]) +{ + // build list of possibles + Vector4 palette[NREGIONS_THREE][NINDICES]; + + for (int region = 0; region < NREGIONS_THREE; ++region) + { + generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]); + toterr[region] = 0; + } + + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[y][x] = i; + } + } + toterr[region] += besterr; + } +} + +// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's +// this function returns either old_err or a value smaller (if it was successful in improving the error) +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGB &old_endpts, IntEndptsRGB &new_endpts, + float old_err, int do_b, int indices[Tile::TILE_TOTAL]) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndptsRGB temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + for (int i=0; 
i 5000 perturb endpoints 50% of precision +// if err > 1000 25% +// if err > 200 12.5% +// if err > 40 6.25% +// for np = 16 -- adjust error thresholds as a function of np +// always ensure endpoint ordering is preserved (no need to overlap the scan) +// if orig_err returned from this is less than its input value, then indices[] will contain valid indices +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGB &opt_endpts, int indices[Tile::TILE_TOTAL]) +{ + IntEndptsRGB temp_endpts; + float best_err = orig_err; + int aprec = region_prec.endpt_a_prec[ch]; + int bprec = region_prec.endpt_b_prec[ch]; + int good_indices[Tile::TILE_TOTAL]; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i 5000.0*thr_scale) { adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; } + else if (orig_err > 1000.0*thr_scale) { adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; } + else if (orig_err > 200.0*thr_scale) { adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; } + else if (orig_err > 40.0*thr_scale) { adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; } + adelta = max(adelta, 3); + bdelta = max(bdelta, 3); + +#ifdef DISABLE_EXHAUSTIVE + adelta = bdelta = 3; +#endif + + temp_endpts = opt_endpts; + + // ok figure out the range of A and B + int alow = max(0, opt_endpts.A[ch] - adelta); + int ahigh = min((1<= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndptsRGB new_a, new_b; + IntEndptsRGB new_endpt; + int do_b; + int orig_indices[Tile::TILE_TOTAL]; + int new_indices[Tile::TILE_TOTAL]; + int temp_indices0[Tile::TILE_TOTAL]; + int temp_indices1[Tile::TILE_TOTAL]; + + // now optimize each channel separately + // for the first error improvement, we save the indices. then, for any later improvement, we compare the indices + // if they differ, we restart the loop (which then falls back to looking for a first improvement.) 
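As a side note on the exhaustive pass referenced above: its scan window is sized from the current error, per the "if err > 5000 perturb endpoints 50% of precision" threshold table, and clamped to at least 3 steps. A minimal sketch of that sizing is below; the exact thr_scale expression (np relative to a 16-pixel tile) is an assumption here, following the "adjust error thresholds as a function of np" comment.

#include <algorithm>

// prec: endpoint precision in bits; np: number of pixels in the region.
int search_delta(float orig_err, int prec, int np)
{
    const float thr_scale = static_cast<float>(np) / 16.0f;       // assumed scaling
    int delta = 0;
    if      (orig_err > 5000.0f * thr_scale) delta = (1 << prec) / 2;   // scan 50% of the range
    else if (orig_err > 1000.0f * thr_scale) delta = (1 << prec) / 4;   // 25%
    else if (orig_err >  200.0f * thr_scale) delta = (1 << prec) / 8;   // 12.5%
    else if (orig_err >   40.0f * thr_scale) delta = (1 << prec) / 16;  // 6.25%
    return std::max(delta, 3);                                          // never fewer than +/-3 steps
}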
+ for (int ch = 0; ch < NCHANNELS_RGB; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + for (int i=0; i= opt_err) + continue; + + for (int i=0; i= opt_err) + break; + + for (int i=0; i 255.0f) v.x = 255.0f; + if (v.y < 0.0f) v.y = 0.0f; + if (v.y > 255.0f) v.y = 255.0f; + if (v.z < 0.0f) v.z = 0.0f; + if (v.z > 255.0f) v.z = 255.0f; + v.w = 255.0f; +} + +static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_THREE], Vector4 palette[NREGIONS_THREE][NINDICES]) +{ + for (int region = 0; region < NREGIONS_THREE; ++region) + for (int i = 0; i < NINDICES; ++i) + palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM); +} + +// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined +static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_THREE]) +{ + // build list of possibles + Vector4 palette[NREGIONS_THREE][NINDICES]; + + generate_palette_unquantized(endpts, palette); + + float toterr = 0; + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching. this works for most norms. + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_THREE]) +{ + for (int region=0; region maxp) maxp = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + minp*Vector4(direction, 0); + endpts[region].B = mean + maxp*Vector4(direction, 0); + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + clamp(endpts[region].A); + clamp(endpts[region].B); + } + + return map_colors(tile, shapeindex, endpts); +} + +static void swap(float *list1, int *list2, int i, int j) +{ + float t = list1[i]; list1[i] = list1[j]; list1[j] = t; + int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1; +} + +float AVPCL::compress_mode2(const Tile &t, char *block) +{ + // number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES + // NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out + const int NITEMS=NSHAPES/4; + + // pick the best NITEMS shapes and refine these. 
+ struct { + FltEndpts endpts[NREGIONS_THREE]; + } all[NSHAPES]; + float roughmse[NSHAPES]; + int index[NSHAPES]; + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + + for (int i=0; i roughmse[j]) + swap(roughmse, index, i, j); + + for (int i=0; i0; ++i) + { + int shape = index[i]; + float mse = refine(t, shape, &all[shape].endpts[0], tempblock); + if (mse < msebest) + { + memcpy(block, tempblock, sizeof(tempblock)); + msebest = mse; + } + } + return msebest; +} + Index: ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode3.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode3.cpp +++ ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode3.cpp @@ -0,0 +1,1059 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +// x1000 777.1x4 64p 2bi (30b) + +#include "bits.h" +#include "tile.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include "nvmath/Matrix.inl" +#include "nvmath/Fitting.h" +#include "avpcl_utils.h" +#include "endpts.h" +#include +#include + +#include "shapes_two.h" + +using namespace nv; +using namespace AVPCL; + +#define NLSBMODES 4 // number of different lsb modes per region. since we have two .1 per region, that can have 4 values + +#define NINDICES 4 +#define INDEXBITS 2 +#define HIGH_INDEXBIT (1<<(INDEXBITS-1)) +#define DENOM (NINDICES-1) +#define BIAS (DENOM/2) + +// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like? +// i.e. can we search shapes in a particular order so we can see the global error minima easily and +// stop without having to touch all shapes? + +#define POS_TO_X(pos) ((pos)&3) +#define POS_TO_Y(pos) (((pos)>>2)&3) + +#define NBITSIZES (NREGIONS*2) +#define ABITINDEX(region) (2*(region)+0) +#define BBITINDEX(region) (2*(region)+1) + +struct ChanBits +{ + int nbitsizes[NBITSIZES]; // bitsizes for one channel +}; + +struct Pattern +{ + ChanBits chan[NCHANNELS_RGB];// bit patterns used per channel + int transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define NPATTERNS 1 +#define NREGIONS 2 + +static Pattern patterns[NPATTERNS] = +{ + // red green blue xfm mode mb + 7,7,7,7, 7,7,7,7, 7,7,7,7, 0, 0x8, 4, "", +}; + +struct RegionPrec +{ + int endpt_a_prec[NCHANNELS_RGB]; + int endpt_b_prec[NCHANNELS_RGB]; +}; + +struct PatternPrec +{ + RegionPrec region_precs[NREGIONS]; +}; + + +// this is the precision for each channel and region +// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this! 
+static PatternPrec pattern_precs[NPATTERNS] = +{ + 7,7,7, 7,7,7, 7,7,7, 7,7,7, +}; + +// return # of bits needed to store n. handle signed or unsigned cases properly +static int nbits(int n, bool issigned) +{ + int nb; + if (n==0) + return 0; // no bits needed for 0, signed or not + else if (n > 0) + { + for (nb=0; n; ++nb, n>>=1) ; + return nb + (issigned?1:0); + } + else + { + nvAssert (issigned); + for (nb=0; n<-1; ++nb, n>>=1) ; + return nb + 1; + } +} + +static void transform_forward(IntEndptsRGB_2 ep[NREGIONS]) +{ + nvUnreachable(); +} + +static void transform_inverse(IntEndptsRGB_2 ep[NREGIONS]) +{ + nvUnreachable(); +} + +// endpoints are 888,888; reduce to 777,777 and put the lsb bit majority in compr_bits +static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_2& compr_endpts) +{ + int onescnt; + + onescnt = 0; + for (int j=0; j> 1; + nvAssert (compr_endpts.A[j] < 128); + } + compr_endpts.a_lsb = onescnt >= 2; + + onescnt = 0; + for (int j=0; j> 1; + nvAssert (compr_endpts.B[j] < 128); + } + compr_endpts.b_lsb = onescnt >= 2; +} + +static void uncompress_one(const IntEndptsRGB_2& compr_endpts, IntEndptsRGB& endpts) +{ + for (int j=0; j= 0 && pat_index < NPATTERNS); + nvAssert (in.getptr() == patterns[pat_index].modebits); + + shapeindex = in.read(SHAPEBITS); + p = patterns[pat_index]; + + for (int j=0; j 0; ++j) + { + float err = Utils::metric4(colors[i], palette[j]) * importance[i]; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[i] = j; + } + } + toterr += besterr; + + // check for early exit + if (toterr > current_err) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + indices[k] = -1; + + return FLT_MAX; + } + } + return toterr; +} + +static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_2 endpts[NREGIONS], const PatternPrec &pattern_prec, + int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + for (int region = 0; region < NREGIONS; ++region) + { + generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]); + toterr[region] = 0; + } + + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[y][x] = i; + } + } + toterr[region] += besterr; + } +} + +// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's +// this function returns either old_err or a value smaller (if it was successful in improving the error) +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGB_2 &old_endpts, IntEndptsRGB_2 &new_endpts, + float old_err, int do_b, int indices[Tile::TILE_TOTAL]) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndptsRGB_2 temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i>= 1) + { + bool improved = false; + for 
(int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + for (int i=0; i 5000 perturb endpoints 50% of precision +// if err > 1000 25% +// if err > 200 12.5% +// if err > 40 6.25% +// for np = 16 -- adjust error thresholds as a function of np +// always ensure endpoint ordering is preserved (no need to overlap the scan) +// if orig_err returned from this is less than its input value, then indices[] will contain valid indices +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float &orig_err, IntEndptsRGB_2 &opt_endpts, int indices[Tile::TILE_TOTAL]) +{ + IntEndptsRGB_2 temp_endpts; + float best_err = orig_err; + int aprec = region_prec.endpt_a_prec[ch]; + int bprec = region_prec.endpt_b_prec[ch]; + int good_indices[Tile::TILE_TOTAL]; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i 5000.0*thr_scale) { adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; } + else if (orig_err > 1000.0*thr_scale) { adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; } + else if (orig_err > 200.0*thr_scale) { adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; } + else if (orig_err > 40.0*thr_scale) { adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; } + adelta = max(adelta, 3); + bdelta = max(bdelta, 3); + +#ifdef DISABLE_EXHAUSTIVE + adelta = bdelta = 3; +#endif + + temp_endpts = opt_endpts; + + // ok figure out the range of A and B + int alow = max(0, opt_endpts.A[ch] - adelta); + int ahigh = min((1<= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndptsRGB_2 new_a, new_b; + IntEndptsRGB_2 new_endpt; + int do_b; + int orig_indices[Tile::TILE_TOTAL]; + int new_indices[Tile::TILE_TOTAL]; + int temp_indices0[Tile::TILE_TOTAL]; + int temp_indices1[Tile::TILE_TOTAL]; + + // now optimize each channel separately + // for the first error improvement, we save the indices. then, for any later improvement, we compare the indices + // if they differ, we restart the loop (which then falls back to looking for a first improvement.) 
+ for (int ch = 0; ch < NCHANNELS_RGB; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + for (int i=0; i= opt_err) + continue; + + for (int i=0; i= opt_err) + break; + + for (int i=0; i> 1) & 1; + + // make sure we have a valid error for temp_in + // we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts + // (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position) + float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); + + // now try to optimize these endpoints + float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); + + // if we find an improvement, update the best so far and correct the output endpoints and errors + if (temp_out_err < best_err) + { + best_err = temp_out_err; + opt_err[region] = temp_out_err; + opt_endpts[region] = temp_out; + } + } + } +} + +/* optimization algorithm + for each pattern + convert endpoints using pattern precision + assign indices and get initial error + compress indices (and possibly reorder endpoints) + transform endpoints + if transformed endpoints fit pattern + get original endpoints back + optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better + compress new indices + transform new endpoints + if new endpoints fit pattern AND if error is improved + emit compressed block with new data + else + emit compressed block with original data // to try to preserve maximum endpoint precision +*/ + +static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block) +{ + float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS]; + IntEndptsRGB_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS]; + int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W]; + + for (int sp = 0; sp < NPATTERNS; ++sp) + { + quantize_endpts(endpts, pattern_precs[sp], orig_endpts); + assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err); + swap_indices(orig_endpts, orig_indices, shapeindex_best); + if (patterns[sp].transformed) + transform_forward(orig_endpts); + // apply a heuristic here -- we check if the endpoints fit before we try to optimize them. + // the assumption made is that if they don't fit now, they won't fit after optimizing. 
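One step of the refine() flow above that is easy to miss is swap_indices: if the anchor pixel's index has its high bit set, the endpoints are swapped and every index in the region is complemented, so the anchor index's high bit is guaranteed to be 0 and can be dropped from the encoding (write_indices writes one fewer bit for it). A minimal standalone sketch of that trick, with assumed scalar endpoints rather than the real structs:

#include <utility>

void make_anchor_high_bit_zero(int& endA, int& endB, int* indices, int n,
                               int nindices, int high_bit, int anchor)
{
    if (indices[anchor] & high_bit)
    {
        std::swap(endA, endB);                        // reversing A/B reverses the palette order
        for (int i = 0; i < n; ++i)
            indices[i] = (nindices - 1) - indices[i]; // complementing keeps each pixel's color
    }
}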
+ if (endpts_fit(orig_endpts, patterns[sp])) + { + if (patterns[sp].transformed) + transform_inverse(orig_endpts); + optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts); + assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err); + // (nreed) Commented out asserts because they go off all the time...not sure why + //for (int i=0; i 255.0f) v.x = 255.0f; + if (v.y < 0.0f) v.y = 0.0f; + if (v.y > 255.0f) v.y = 255.0f; + if (v.z < 0.0f) v.z = 0.0f; + if (v.z > 255.0f) v.z = 255.0f; + v.w = 255.0f; +} + +static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES]) +{ + for (int region = 0; region < NREGIONS; ++region) + for (int i = 0; i < NINDICES; ++i) + palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM); +} + +// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined +static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + generate_palette_unquantized(endpts, palette); + + float toterr = 0; + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching. this works for most norms. + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS]) +{ + for (int region=0; region maxp) maxp = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + minp*Vector4(direction, 0); + endpts[region].B = mean + maxp*Vector4(direction, 0); + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + clamp(endpts[region].A); + clamp(endpts[region].B); + } + + return map_colors(tile, shapeindex, endpts); +} + +static void swap(float *list1, int *list2, int i, int j) +{ + float t = list1[i]; list1[i] = list1[j]; list1[j] = t; + int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1; +} + +float AVPCL::compress_mode3(const Tile &t, char *block) +{ + // number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES + // NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out + const int NITEMS=NSHAPES/4; + + // pick the best NITEMS shapes and refine these. 
+ struct { + FltEndpts endpts[NREGIONS]; + } all[NSHAPES]; + float roughmse[NSHAPES]; + int index[NSHAPES]; + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + + for (int i=0; i roughmse[j]) + swap(roughmse, index, i, j); + + for (int i=0; i0; ++i) + { + int shape = index[i]; + float mse = refine(t, shape, &all[shape].endpts[0], tempblock); + if (mse < msebest) + { + memcpy(block, tempblock, sizeof(tempblock)); + msebest = mse; + } + } + return msebest; +} + Index: ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode4.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode4.cpp +++ ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode4.cpp @@ -0,0 +1,1214 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +// x10000 2r 1i 555x2 6x2 2bi 3bi + +#include "bits.h" +#include "tile.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include "nvmath/Matrix.inl" +#include "nvmath/Fitting.h" +#include "avpcl_utils.h" +#include "endpts.h" +#include +#include + +using namespace nv; +using namespace AVPCL; + +// there are 2 index arrays. INDEXMODE selects between the arrays being 2 & 3 bits or 3 & 2 bits +// array 0 is always the RGB array and array 1 is always the A array +#define NINDEXARRAYS 2 +#define INDEXARRAY_RGB 0 +#define INDEXARRAY_A 1 +#define INDEXARRAY_2BITS(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXARRAY_A : INDEXARRAY_RGB) +#define INDEXARRAY_3BITS(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_3BITS) ? INDEXARRAY_A : INDEXARRAY_RGB) + +#define NINDICES3 8 +#define INDEXBITS3 3 +#define HIGH_INDEXBIT3 (1<<(INDEXBITS3-1)) +#define DENOM3 (NINDICES3-1) +#define BIAS3 (DENOM3/2) + +#define NINDICES2 4 +#define INDEXBITS2 2 +#define HIGH_INDEXBIT2 (1<<(INDEXBITS2-1)) +#define DENOM2 (NINDICES2-1) +#define BIAS2 (DENOM2/2) + +#define NINDICES_RGB(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES3 : NINDICES2) +#define INDEXBITS_RGB(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS3 : INDEXBITS2) +#define HIGH_INDEXBIT_RGB(indexmode)((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT3 : HIGH_INDEXBIT2) +#define DENOM_RGB(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM3 : DENOM2) +#define BIAS_RGB(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? BIAS3 : BIAS2) + +#define NINDICES_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES2 : NINDICES3) +#define INDEXBITS_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS2 : INDEXBITS3) +#define HIGH_INDEXBIT_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT2 : HIGH_INDEXBIT3) +#define DENOM_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM2 : DENOM3) +#define BIAS_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? 
BIAS2 : BIAS3) + +#define NSHAPES 1 + +static int shapes[NSHAPES] = +{ + 0x0000, +}; + +#define REGION(x,y,shapeindex) ((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0) + +#define NREGIONS 1 // keep the region stuff in just in case... + +// encoded index compression location: region 0 is always at 0,0. + +#define NBITSIZES 2 // one endpoint pair + +struct ChanBits +{ + int nbitsizes[NBITSIZES]; // bitsizes for one channel +}; + +struct Pattern +{ + ChanBits chan[NCHANNELS_RGBA];// bit patterns used per channel + int transform_mode; // x0 means alpha channel not transformed, x1 otherwise. 0x rgb not transformed, 1x otherwise. + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define TRANSFORM_MODE_ALPHA 1 +#define TRANSFORM_MODE_RGB 2 + +#define NPATTERNS 1 + +static Pattern patterns[NPATTERNS] = +{ + // red green blue alpha xfm mode mb encoding + 5,5, 5,5, 5,5, 6,6, 0x0, 0x10, 5, "", +}; + +struct RegionPrec +{ + int endpt_a_prec[NCHANNELS_RGBA]; + int endpt_b_prec[NCHANNELS_RGBA]; +}; + +struct PatternPrec +{ + RegionPrec region_precs[NREGIONS]; +}; + +// this is the precision for each channel and region +// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this! +static PatternPrec pattern_precs[NPATTERNS] = +{ + 5,5,5,6, 5,5,5,6, +}; + + +// return # of bits needed to store n. handle signed or unsigned cases properly +static int nbits(int n, bool issigned) +{ + int nb; + if (n==0) + return 0; // no bits needed for 0, signed or not + else if (n > 0) + { + for (nb=0; n; ++nb, n>>=1) ; + return nb + (issigned?1:0); + } + else + { + nvAssert (issigned); + for (nb=0; n<-1; ++nb, n>>=1) ; + return nb + 1; + } +} + +#define R_0 ep[0].A[i] +#define R_1 ep[0].B[i] + +static void transform_forward(int transform_mode, IntEndptsRGBA ep[NREGIONS]) +{ + int i; + + if (transform_mode & TRANSFORM_MODE_RGB) + for (i=CHANNEL_R; i> 2) & 3 and x = index & 3 +static void swap_indices(int shapeindex, int indexmode, IntEndptsRGBA endpts[NREGIONS], int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W]) +{ + int index_positions[NREGIONS]; + + index_positions[0] = 0; // since WLOG we have the high bit of the shapes at 0 + + for (int region = 0; region < NREGIONS; ++region) + { + int x = index_positions[region] & 3; + int y = (index_positions[region] >> 2) & 3; + nvAssert(REGION(x,y,shapeindex) == region); // double check the table + + // swap RGB + if (indices[INDEXARRAY_RGB][y][x] & HIGH_INDEXBIT_RGB(indexmode)) + { + // high bit is set, swap the endpts and indices for this region + int t; + for (int i=CHANNEL_R; i<=CHANNEL_B; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; } + + for (int y = 0; y < Tile::TILE_H; y++) + for (int x = 0; x < Tile::TILE_W; x++) + if (REGION(x,y,shapeindex) == region) + indices[INDEXARRAY_RGB][y][x] = NINDICES_RGB(indexmode) - 1 - indices[INDEXARRAY_RGB][y][x]; + } + + // swap A + if (indices[INDEXARRAY_A][y][x] & HIGH_INDEXBIT_A(indexmode)) + { + // high bit is set, swap the endpts and indices for this region + int t; + for (int i=CHANNEL_A; i<=CHANNEL_A; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; } + + for (int y = 0; y < Tile::TILE_H; y++) + for (int x = 0; x < Tile::TILE_W; x++) + if (REGION(x,y,shapeindex) == region) + indices[INDEXARRAY_A][y][x] = NINDICES_A(indexmode) - 1 - indices[INDEXARRAY_A][y][x]; + } + } 
+} + +static bool endpts_fit(IntEndptsRGBA endpts[NREGIONS], const Pattern &p) +{ + return true; +} + +static void write_header(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, int rotatemode, int indexmode, Bits &out) +{ + // ignore shapeindex + out.write(p.mode, p.modebits); + out.write(rotatemode, ROTATEMODE_BITS); + out.write(indexmode, INDEXMODE_BITS); + for (int i=0; i= 0 && pat_index < NPATTERNS); + nvAssert (in.getptr() == patterns[pat_index].modebits); + + p = patterns[pat_index]; + + shapeindex = 0; // we don't have any + + rotatemode = in.read(ROTATEMODE_BITS); + indexmode = in.read(INDEXMODE_BITS); + for (int i=0; i>2][i&3], INDEXBITS2 - (i==0?1:0)); // write i..[1:0] or i..[0] + + // then the 3 bit indices + nvAssert ((indices[INDEXARRAY_3BITS(indexmode)][0][0] & HIGH_INDEXBIT3) == 0); + for (int i = 0; i < Tile::TILE_TOTAL; ++i) + out.write(indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3], INDEXBITS3 - (i==0?1:0)); // write i..[2:0] or i..[1:0] +} + +static void read_indices(Bits &in, int shapeindex, int indexmode, int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W]) +{ + // the indices we shorten is always index 0 + + // do the 2 bit indices first + for (int i = 0; i < Tile::TILE_TOTAL; ++i) + indices[INDEXARRAY_2BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS2 - (i==0?1:0)); // read i..[1:0] or i..[0] + + // then the 3 bit indices + for (int i = 0; i < Tile::TILE_TOTAL; ++i) + indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS3 - (i==0?1:0)); // read i..[1:0] or i..[0] +} + +static void emit_block(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int rotatemode, int indexmode, char *block) +{ + Bits out(block, AVPCL::BITSIZE); + + write_header(endpts, shapeindex, p, rotatemode, indexmode, out); + + write_indices(indices, shapeindex, indexmode, out); + + nvAssert(out.getptr() == AVPCL::BITSIZE); +} + +static void generate_palette_quantized_rgb_a(const IntEndptsRGBA &endpts, const RegionPrec ®ion_prec, int indexmode, Vector3 palette_rgb[NINDICES3], float palette_a[NINDICES3]) +{ + // scale endpoints for RGB + int a, b; + + a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]); + b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]); + + // interpolate R + for (int i = 0; i < NINDICES_RGB(indexmode); ++i) + palette_rgb[i].x = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode))); + + a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]); + b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]); + + // interpolate G + for (int i = 0; i < NINDICES_RGB(indexmode); ++i) + palette_rgb[i].y = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode))); + + a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]); + b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]); + + // interpolate B + for (int i = 0; i < NINDICES_RGB(indexmode); ++i) + palette_rgb[i].z = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode))); + + a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]); + b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]); + + // interpolate A + for (int i = 0; i < NINDICES_A(indexmode); ++i) + palette_a[i] = float(Utils::lerp(a, b, i, BIAS_A(indexmode), DENOM_A(indexmode))); + +} + +static void sign_extend(Pattern &p, IntEndptsRGBA endpts[NREGIONS]) +{ + for (int i=0; i 0; ++j) + { + err = Utils::metric1(a, palette_a[j], rotatemode); + 
+ if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + palette_alpha = palette_a[j]; + indices[INDEXARRAY_A][i] = j; + } + } + toterr += besterr; // squared-error norms are additive since we don't do the square root + + // do RGB index + besterr = FLT_MAX; + for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j) + { + err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) : + Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[j], palette_alpha); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_RGB][i] = j; + } + } + toterr += besterr; + if (toterr > current_besterr) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + { + indices[INDEXARRAY_RGB][k] = -1; + indices[INDEXARRAY_A][k] = -1; + } + return FLT_MAX; + } + } + else + { + // do RGB index + besterr = FLT_MAX; + int bestindex; + for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j) + { + err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) : + Utils::metric3premult_alphain(rgb, palette_rgb[j], rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + bestindex = j; + indices[INDEXARRAY_RGB][i] = j; + } + } + palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[bestindex]).x : + (rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[bestindex]).y : + (rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[bestindex]).z : nvCheckMacro(0); + toterr += besterr; + + // do A index + besterr = FLT_MAX; + for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j) + { + err = !AVPCL::flag_premult ? 
Utils::metric1(a, palette_a[j], rotatemode) : + Utils::metric1premult(a, tile_alpha, palette_a[j], palette_alpha, rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_A][i] = j; + } + } + toterr += besterr; // squared-error norms are additive since we don't do the square root + if (toterr > current_besterr) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + { + indices[INDEXARRAY_RGB][k] = -1; + indices[INDEXARRAY_A][k] = -1; + } + return FLT_MAX; + } + } + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, int rotatemode, int indexmode, IntEndptsRGBA endpts[NREGIONS], const PatternPrec &pattern_prec, + int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS]) +{ + Vector3 palette_rgb[NREGIONS][NINDICES3]; // could be nindices2 + float palette_a[NREGIONS][NINDICES3]; // could be nindices2 + + for (int region = 0; region < NREGIONS; ++region) + { + generate_palette_quantized_rgb_a(endpts[region], pattern_prec.region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]); + toterr[region] = 0; + } + + Vector3 rgb; + float a; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr; + float palette_alpha = 0, tile_alpha = 0; + + rgb.x = (tile.data[y][x]).x; + rgb.y = (tile.data[y][x]).y; + rgb.z = (tile.data[y][x]).z; + a = (tile.data[y][x]).w; + + if(AVPCL::flag_premult) + tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (tile.data[y][x]).x : + (rotatemode == ROTATEMODE_RGBA_RABG) ? (tile.data[y][x]).y : + (rotatemode == ROTATEMODE_RGBA_RGAB) ? (tile.data[y][x]).z : (tile.data[y][x]).w; + + // compute the two indices separately + // if we're doing premultiplied alpha, we need to choose first the index that + // determines the alpha value, and then do the other index + + if (rotatemode == ROTATEMODE_RGBA_RGBA) + { + // do A index first as it has the alpha + besterr = FLT_MAX; + for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i) + { + err = Utils::metric1(a, palette_a[region][i], rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_A][y][x] = i; + palette_alpha = palette_a[region][i]; + } + } + toterr[region] += besterr; // squared-error norms are additive since we don't do the square root + + // do RGB index + besterr = FLT_MAX; + for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i) + { + err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) : + Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[region][i], palette_alpha); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_RGB][y][x] = i; + } + } + toterr[region] += besterr; + } + else + { + // do RGB index first as it has the alpha + besterr = FLT_MAX; + int bestindex; + for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i) + { + err = !AVPCL::flag_premult ? 
Utils::metric3(rgb, palette_rgb[region][i], rotatemode) : + Utils::metric3premult_alphain(rgb, palette_rgb[region][i], rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_RGB][y][x] = i; + bestindex = i; + } + } + palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[region][bestindex]).x : + (rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[region][bestindex]).y : + (rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[region][bestindex]).z : nvCheckMacro(0); + toterr[region] += besterr; + + // do A index + besterr = FLT_MAX; + for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i) + { + err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[region][i], rotatemode) : + Utils::metric1premult(a, tile_alpha, palette_a[region][i], palette_alpha, rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_A][y][x] = i; + } + } + toterr[region] += besterr; // squared-error norms are additive since we don't do the square root + } + } +} + +// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's +// this function returns either old_err or a value smaller (if it was successful in improving the error) +static float perturb_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec ®ion_prec, const IntEndptsRGBA &old_endpts, IntEndptsRGBA &new_endpts, + float old_err, int do_b, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndptsRGBA temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + + for (int j=0; j>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, min_err, temp_indices); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + for (int j=0; j 5000 perturb endpoints 50% of precision +// if err > 1000 25% +// if err > 200 12.5% +// if err > 40 6.25% +// for np = 16 -- adjust error thresholds as a function of np +// always ensure endpoint ordering is preserved (no need to overlap the scan) +static float exhaustive(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGBA &opt_endpts, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) +{ + IntEndptsRGBA temp_endpts; + float best_err = orig_err; + int aprec = region_prec.endpt_a_prec[ch]; + int bprec = region_prec.endpt_b_prec[ch]; + int good_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + + for (int j=0; j 5000.0*thr_scale) { adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; } + else if (orig_err > 1000.0*thr_scale) { adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; } + else if (orig_err > 
200.0*thr_scale) { adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; } + else if (orig_err > 40.0*thr_scale) { adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; } + adelta = max(adelta, 3); + bdelta = max(bdelta, 3); + +#ifdef DISABLE_EXHAUSTIVE + adelta = bdelta = 3; +#endif + + temp_endpts = opt_endpts; + + // ok figure out the range of A and B + int alow = max(0, opt_endpts.A[ch] - adelta); + int ahigh = min((1<= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndptsRGBA new_a, new_b; + IntEndptsRGBA new_endpt; + int do_b; + int orig_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + int new_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + int temp_indices0[NINDEXARRAYS][Tile::TILE_TOTAL]; + int temp_indices1[NINDEXARRAYS][Tile::TILE_TOTAL]; + + // now optimize each channel separately + for (int ch = 0; ch < NCHANNELS_RGBA; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + for (int j=0; j= opt_err) + continue; + + for (int j=0; j= opt_err) + break; + + for (int j=0; j 255.0f) v.x = 255.0f; + if (v.y < 0.0f) v.y = 0.0f; + if (v.y > 255.0f) v.y = 255.0f; + if (v.z < 0.0f) v.z = 0.0f; + if (v.z > 255.0f) v.z = 255.0f; + if (v.w < 0.0f) v.w = 0.0f; + if (v.w > 255.0f) v.w = 255.0f; +} + +// compute initial endpoints for the "RGB" portion and the "A" portion. +// Note these channels may have been rotated. +static void rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS]) +{ + for (int region=0; region maxp) maxp = dp; + + dp = alphas[i] - mean.w; + if (dp < mina) mina = dp; + if (dp > maxa) maxa = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + Vector4(minp*direction, mina); + endpts[region].B = mean + Vector4(maxp*direction, maxa); + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + clamp(endpts[region].A); + clamp(endpts[region].B); + } +} + +float AVPCL::compress_mode4(const Tile &t, char *block) +{ + FltEndpts endpts[NREGIONS]; + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + int shape = 0; + Tile t1; + + // try all rotations. refine tries the 2 different indexings. 
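For context on the rotation loop that follows: each rotate mode swaps the alpha channel with one colour channel before compression, so the separately indexed, differently quantized alpha slot can carry whichever channel benefits most (ROTATEMODE_RGBA_AGBR puts red there, RABG green, RGAB blue). The sketch below illustrates that swap only; the numeric enum values and the Pixel layout are assumptions for illustration, not the rotate_tile implementation.

#include <utility>

struct Pixel { float r, g, b, a; };

void rotate_pixel(Pixel& p, int rotatemode)
{
    switch (rotatemode)
    {
    case 0: break;                       // RGBA: no swap
    case 1: std::swap(p.r, p.a); break;  // AGBR: alpha slot carries red
    case 2: std::swap(p.g, p.a); break;  // RABG: alpha slot carries green
    case 3: std::swap(p.b, p.a); break;  // RGAB: alpha slot carries blue
    }
}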
+ for (int r = 0; r < NROTATEMODES && msebest > 0; ++r) + { + rotate_tile(t, r, t1); + rough(t1, shape, endpts); + for (int i = 0; i < NINDEXMODES && msebest > 0; ++i) + { + float mse = refine(t1, shape, r, i, endpts, tempblock); + if (mse < msebest) + { + memcpy(block, tempblock, sizeof(tempblock)); + msebest = mse; + } + } + } + return msebest; +} Index: ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode5.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode5.cpp +++ ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode5.cpp @@ -0,0 +1,1216 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +// x100000 2r 777x2 8x2 2bi 2bi + +#include "bits.h" +#include "tile.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include "nvmath/Matrix.inl" +#include "nvmath/Fitting.h" +#include "avpcl_utils.h" +#include "endpts.h" +#include +#include + +using namespace nv; +using namespace AVPCL; + +// there are 2 index arrays. INDEXMODE selects between the arrays being 2 & 3 bits or 3 & 2 bits +// array 0 is always the RGB array and array 1 is always the A array +#define NINDEXARRAYS 2 +#define INDEXARRAY_RGB 0 +#define INDEXARRAY_A 1 +#define INDEXARRAY_2BITS(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXARRAY_A : INDEXARRAY_RGB) +#define INDEXARRAY_3BITS(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_3BITS) ? INDEXARRAY_A : INDEXARRAY_RGB) + +#define NINDICES3 4 +#define INDEXBITS3 2 +#define HIGH_INDEXBIT3 (1<<(INDEXBITS3-1)) +#define DENOM3 (NINDICES3-1) +#define BIAS3 (DENOM3/2) + +#define NINDICES2 4 +#define INDEXBITS2 2 +#define HIGH_INDEXBIT2 (1<<(INDEXBITS2-1)) +#define DENOM2 (NINDICES2-1) +#define BIAS2 (DENOM2/2) + +#define NINDICES_RGB(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES3 : NINDICES2) +#define INDEXBITS_RGB(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS3 : INDEXBITS2) +#define HIGH_INDEXBIT_RGB(indexmode)((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT3 : HIGH_INDEXBIT2) +#define DENOM_RGB(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM3 : DENOM2) +#define BIAS_RGB(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? BIAS3 : BIAS2) + +#define NINDICES_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES2 : NINDICES3) +#define INDEXBITS_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS2 : INDEXBITS3) +#define HIGH_INDEXBIT_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT2 : HIGH_INDEXBIT3) +#define DENOM_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM2 : DENOM3) +#define BIAS_A(indexmode) ((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? 
BIAS2 : BIAS3) + +#define NSHAPES 1 + +static int shapes[NSHAPES] = +{ + 0x0000, +}; + +#define REGION(x,y,shapeindex) ((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0) + +#define NREGIONS 1 // keep the region stuff in just in case... + +// encoded index compression location: region 0 is always at 0,0. + +#define NBITSIZES 2 // one endpoint pair + +struct ChanBits +{ + int nbitsizes[NBITSIZES]; // bitsizes for one channel +}; + +struct Pattern +{ + ChanBits chan[NCHANNELS_RGBA];// bit patterns used per channel + int transform_mode; // x0 means alpha channel not transformed, x1 otherwise. 0x rgb not transformed, 1x otherwise. + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define TRANSFORM_MODE_ALPHA 1 +#define TRANSFORM_MODE_RGB 2 + +#define NPATTERNS 1 + +static Pattern patterns[NPATTERNS] = +{ + // red green blue alpha xfm mode mb encoding + 7,7, 7,7, 7,7, 8,8, 0x0, 0x20, 6, "", +}; + +struct RegionPrec +{ + int endpt_a_prec[NCHANNELS_RGBA]; + int endpt_b_prec[NCHANNELS_RGBA]; +}; + +struct PatternPrec +{ + RegionPrec region_precs[NREGIONS]; +}; + +// this is the precision for each channel and region +// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this! +static PatternPrec pattern_precs[NPATTERNS] = +{ + 7,7,7,8, 7,7,7,8, +}; + + +// return # of bits needed to store n. handle signed or unsigned cases properly +static int nbits(int n, bool issigned) +{ + int nb; + if (n==0) + return 0; // no bits needed for 0, signed or not + else if (n > 0) + { + for (nb=0; n; ++nb, n>>=1) ; + return nb + (issigned?1:0); + } + else + { + nvAssert (issigned); + for (nb=0; n<-1; ++nb, n>>=1) ; + return nb + 1; + } +} + +#define R_0 ep[0].A[i] +#define R_1 ep[0].B[i] + +static void transform_forward(int transform_mode, IntEndptsRGBA ep[NREGIONS]) +{ + int i; + + if (transform_mode & TRANSFORM_MODE_RGB) + for (i=CHANNEL_R; i> 2) & 3 and x = index & 3 +static void swap_indices(int shapeindex, int indexmode, IntEndptsRGBA endpts[NREGIONS], int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W]) +{ + int index_positions[NREGIONS]; + + index_positions[0] = 0; // since WLOG we have the high bit of the shapes at 0 + + for (int region = 0; region < NREGIONS; ++region) + { + int x = index_positions[region] & 3; + int y = (index_positions[region] >> 2) & 3; + nvAssert(REGION(x,y,shapeindex) == region); // double check the table + + // swap RGB + if (indices[INDEXARRAY_RGB][y][x] & HIGH_INDEXBIT_RGB(indexmode)) + { + // high bit is set, swap the endpts and indices for this region + int t; + for (int i=CHANNEL_R; i<=CHANNEL_B; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; } + + for (int y = 0; y < Tile::TILE_H; y++) + for (int x = 0; x < Tile::TILE_W; x++) + if (REGION(x,y,shapeindex) == region) + indices[INDEXARRAY_RGB][y][x] = NINDICES_RGB(indexmode) - 1 - indices[INDEXARRAY_RGB][y][x]; + } + + // swap A + if (indices[INDEXARRAY_A][y][x] & HIGH_INDEXBIT_A(indexmode)) + { + // high bit is set, swap the endpts and indices for this region + int t; + for (int i=CHANNEL_A; i<=CHANNEL_A; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; } + + for (int y = 0; y < Tile::TILE_H; y++) + for (int x = 0; x < Tile::TILE_W; x++) + if (REGION(x,y,shapeindex) == region) + indices[INDEXARRAY_A][y][x] = NINDICES_A(indexmode) - 1 - indices[INDEXARRAY_A][y][x]; + } + } 
+} + +static bool endpts_fit(IntEndptsRGBA endpts[NREGIONS], const Pattern &p) +{ + return true; +} + +static void write_header(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, int rotatemode, int indexmode, Bits &out) +{ + // ignore shapeindex + out.write(p.mode, p.modebits); + out.write(rotatemode, ROTATEMODE_BITS); +// out.write(indexmode, INDEXMODE_BITS); + for (int i=0; i= 0 && pat_index < NPATTERNS); + nvAssert (in.getptr() == patterns[pat_index].modebits); + + p = patterns[pat_index]; + + shapeindex = 0; // we don't have any + + rotatemode = in.read(ROTATEMODE_BITS); + + indexmode = 0; // we don't have any + + for (int i=0; i>2][i&3], INDEXBITS2 - (i==0?1:0)); // write i..[1:0] or i..[0] + + // then the 3 bit indices + nvAssert ((indices[INDEXARRAY_3BITS(indexmode)][0][0] & HIGH_INDEXBIT3) == 0); + for (int i = 0; i < Tile::TILE_TOTAL; ++i) + out.write(indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3], INDEXBITS3 - (i==0?1:0)); // write i..[2:0] or i..[1:0] +} + +static void read_indices(Bits &in, int shapeindex, int indexmode, int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W]) +{ + // the indices we shorten is always index 0 + + // do the 2 bit indices first + for (int i = 0; i < Tile::TILE_TOTAL; ++i) + indices[INDEXARRAY_2BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS2 - (i==0?1:0)); // read i..[1:0] or i..[0] + + // then the 3 bit indices + for (int i = 0; i < Tile::TILE_TOTAL; ++i) + indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS3 - (i==0?1:0)); // read i..[1:0] or i..[0] +} + +static void emit_block(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int rotatemode, int indexmode, char *block) +{ + Bits out(block, AVPCL::BITSIZE); + + write_header(endpts, shapeindex, p, rotatemode, indexmode, out); + + write_indices(indices, shapeindex, indexmode, out); + + nvAssert(out.getptr() == AVPCL::BITSIZE); +} + +static void generate_palette_quantized_rgb_a(const IntEndptsRGBA &endpts, const RegionPrec ®ion_prec, int indexmode, Vector3 palette_rgb[NINDICES3], float palette_a[NINDICES3]) +{ + // scale endpoints for RGB + int a, b; + + a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]); + b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]); + + // interpolate R + for (int i = 0; i < NINDICES_RGB(indexmode); ++i) + palette_rgb[i].x = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode))); + + a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]); + b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]); + + // interpolate G + for (int i = 0; i < NINDICES_RGB(indexmode); ++i) + palette_rgb[i].y = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode))); + + a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]); + b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]); + + // interpolate B + for (int i = 0; i < NINDICES_RGB(indexmode); ++i) + palette_rgb[i].z = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode))); + + a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]); + b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]); + + // interpolate A + for (int i = 0; i < NINDICES_A(indexmode); ++i) + palette_a[i] = float(Utils::lerp(a, b, i, BIAS_A(indexmode), DENOM_A(indexmode))); +} + +static void sign_extend(Pattern &p, IntEndptsRGBA endpts[NREGIONS]) +{ + for (int i=0; i 0; ++j) + { + err = Utils::metric1(a, palette_a[j], 
rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + palette_alpha = palette_a[j]; + indices[INDEXARRAY_A][i] = j; + } + } + toterr += besterr; // squared-error norms are additive since we don't do the square root + + // do RGB index + besterr = FLT_MAX; + for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j) + { + err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) : + Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[j], palette_alpha); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_RGB][i] = j; + } + } + toterr += besterr; + if (toterr > current_besterr) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + { + indices[INDEXARRAY_RGB][k] = -1; + indices[INDEXARRAY_A][k] = -1; + } + return FLT_MAX; + } + } + else + { + // do RGB index + besterr = FLT_MAX; + int bestindex; + for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j) + { + err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) : + Utils::metric3premult_alphain(rgb, palette_rgb[j], rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + bestindex = j; + indices[INDEXARRAY_RGB][i] = j; + } + } + palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[bestindex]).x : + (rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[bestindex]).y : + (rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[bestindex]).z : nvCheckMacro(0); + toterr += besterr; + + // do A index + besterr = FLT_MAX; + for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j) + { + err = !AVPCL::flag_premult ? 
Utils::metric1(a, palette_a[j], rotatemode) : + Utils::metric1premult(a, tile_alpha, palette_a[j], palette_alpha, rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_A][i] = j; + } + } + toterr += besterr; // squared-error norms are additive since we don't do the square root + if (toterr > current_besterr) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + { + indices[INDEXARRAY_RGB][k] = -1; + indices[INDEXARRAY_A][k] = -1; + } + return FLT_MAX; + } + } + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, int rotatemode, int indexmode, IntEndptsRGBA endpts[NREGIONS], const PatternPrec &pattern_prec, + int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS]) +{ + Vector3 palette_rgb[NREGIONS][NINDICES3]; // could be nindices2 + float palette_a[NREGIONS][NINDICES3]; // could be nindices2 + + for (int region = 0; region < NREGIONS; ++region) + { + generate_palette_quantized_rgb_a(endpts[region], pattern_prec.region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]); + toterr[region] = 0; + } + + Vector3 rgb; + float a; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr; + float palette_alpha = 0, tile_alpha = 0; + + rgb.x = (tile.data[y][x]).x; + rgb.y = (tile.data[y][x]).y; + rgb.z = (tile.data[y][x]).z; + a = (tile.data[y][x]).w; + + if(AVPCL::flag_premult) + tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (tile.data[y][x]).x : + (rotatemode == ROTATEMODE_RGBA_RABG) ? (tile.data[y][x]).y : + (rotatemode == ROTATEMODE_RGBA_RGAB) ? (tile.data[y][x]).z : (tile.data[y][x]).w; + + // compute the two indices separately + // if we're doing premultiplied alpha, we need to choose first the index that + // determines the alpha value, and then do the other index + + if (rotatemode == ROTATEMODE_RGBA_RGBA) + { + // do A index first as it has the alpha + besterr = FLT_MAX; + for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i) + { + err = Utils::metric1(a, palette_a[region][i], rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_A][y][x] = i; + palette_alpha = palette_a[region][i]; + } + } + toterr[region] += besterr; // squared-error norms are additive since we don't do the square root + + // do RGB index + besterr = FLT_MAX; + for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i) + { + err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) : + Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[region][i], palette_alpha); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_RGB][y][x] = i; + } + } + toterr[region] += besterr; + } + else + { + // do RGB index first as it has the alpha + besterr = FLT_MAX; + int bestindex; + for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i) + { + err = !AVPCL::flag_premult ? 
Utils::metric3(rgb, palette_rgb[region][i], rotatemode) : + Utils::metric3premult_alphain(rgb, palette_rgb[region][i], rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_RGB][y][x] = i; + bestindex = i; + } + } + palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[region][bestindex]).x : + (rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[region][bestindex]).y : + (rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[region][bestindex]).z : nvCheckMacro(0); + toterr[region] += besterr; + + // do A index + besterr = FLT_MAX; + for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i) + { + err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[region][i], rotatemode) : + Utils::metric1premult(a, tile_alpha, palette_a[region][i], palette_alpha, rotatemode); + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[INDEXARRAY_A][y][x] = i; + } + } + toterr[region] += besterr; // squared-error norms are additive since we don't do the square root + } + } +} + +// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's +// this function returns either old_err or a value smaller (if it was successful in improving the error) +static float perturb_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec ®ion_prec, const IntEndptsRGBA &old_endpts, IntEndptsRGBA &new_endpts, + float old_err, int do_b, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndptsRGBA temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + + for (int j=0; j>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, min_err, temp_indices); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + for (int j=0; j 5000 perturb endpoints 50% of precision +// if err > 1000 25% +// if err > 200 12.5% +// if err > 40 6.25% +// for np = 16 -- adjust error thresholds as a function of np +// always ensure endpoint ordering is preserved (no need to overlap the scan) +static float exhaustive(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGBA &opt_endpts, int indices[NINDEXARRAYS][Tile::TILE_TOTAL]) +{ + IntEndptsRGBA temp_endpts; + float best_err = orig_err; + int aprec = region_prec.endpt_a_prec[ch]; + int bprec = region_prec.endpt_b_prec[ch]; + int good_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + + for (int j=0; j 5000.0*thr_scale) { adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; } + else if (orig_err > 1000.0*thr_scale) { adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; } + else if (orig_err > 
200.0*thr_scale) { adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; } + else if (orig_err > 40.0*thr_scale) { adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; } + adelta = max(adelta, 3); + bdelta = max(bdelta, 3); + +#ifdef DISABLE_EXHAUSTIVE + adelta = bdelta = 3; +#endif + + temp_endpts = opt_endpts; + + // ok figure out the range of A and B + int alow = max(0, opt_endpts.A[ch] - adelta); + int ahigh = min((1<= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndptsRGBA new_a, new_b; + IntEndptsRGBA new_endpt; + int do_b; + int orig_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + int new_indices[NINDEXARRAYS][Tile::TILE_TOTAL]; + int temp_indices0[NINDEXARRAYS][Tile::TILE_TOTAL]; + int temp_indices1[NINDEXARRAYS][Tile::TILE_TOTAL]; + + // now optimize each channel separately + for (int ch = 0; ch < NCHANNELS_RGBA; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + for (int j=0; j= opt_err) + continue; + + for (int j=0; j= opt_err) + break; + + for (int j=0; j 255.0f) v.x = 255.0f; + if (v.y < 0.0f) v.y = 0.0f; + if (v.y > 255.0f) v.y = 255.0f; + if (v.z < 0.0f) v.z = 0.0f; + if (v.z > 255.0f) v.z = 255.0f; + if (v.w < 0.0f) v.w = 0.0f; + if (v.w > 255.0f) v.w = 255.0f; +} + +// compute initial endpoints for the "RGB" portion and the "A" portion. +// Note these channels may have been rotated. +static void rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS]) +{ + for (int region=0; region maxp) maxp = dp; + + dp = alphas[i] - mean.w; + if (dp < mina) mina = dp; + if (dp > maxa) maxa = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + Vector4(minp*direction, mina); + endpts[region].B = mean + Vector4(maxp*direction, maxa); + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + clamp(endpts[region].A); + clamp(endpts[region].B); + } +} + +float AVPCL::compress_mode5(const Tile &t, char *block) +{ + FltEndpts endpts[NREGIONS]; + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + int shape = 0; + Tile t1; + + // try all rotations. refine tries the 2 different indexings. 
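+	// ROTATEMODE_BITS == 2 (see avpcl_utils.h), so NROTATEMODES == 4: RGBA (identity), AGBR,
+	// RABG and RGAB. The winning rotation is stored in the block header by write_header()
+	// and recovered by read_header() when decoding.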
+ for (int r = 0; r < NROTATEMODES && msebest > 0; ++r) + { + rotate_tile(t, r, t1); + rough(t1, shape, endpts); +// for (int i = 0; i < NINDEXMODES && msebest > 0; ++i) + for (int i = 0; i < 1 && msebest > 0; ++i) + { + float mse = refine(t1, shape, r, i, endpts, tempblock); + if (mse < msebest) + { + memcpy(block, tempblock, sizeof(tempblock)); + msebest = mse; + } + } + } + return msebest; +} Index: ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode6.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode6.cpp +++ ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode6.cpp @@ -0,0 +1,1055 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +// x1000000 7777.1x2 4bi + +#include "bits.h" +#include "tile.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include "nvmath/Matrix.inl" +#include "nvmath/Fitting.h" +#include "avpcl_utils.h" +#include "endpts.h" +#include +#include + +using namespace nv; +using namespace AVPCL; + +#define NLSBMODES 4 // number of different lsb modes per region. since we have two .1 per region, that can have 4 values + +#define NINDICES 16 +#define INDEXBITS 4 +#define HIGH_INDEXBIT (1<<(INDEXBITS-1)) +#define DENOM (NINDICES-1) +#define BIAS (DENOM/2) + +#define NSHAPES 1 + +static int shapes[NSHAPES] = +{ + 0x0000, +}; + +#define REGION(x,y,shapeindex) ((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0) + +#define NREGIONS 1 + +#define NBITSIZES (NREGIONS*2) +#define ABITINDEX(region) (2*(region)+0) +#define BBITINDEX(region) (2*(region)+1) + +struct ChanBits +{ + int nbitsizes[NBITSIZES]; // bitsizes for one channel +}; + +struct Pattern +{ + ChanBits chan[NCHANNELS_RGBA];// bit patterns used per channel + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define NPATTERNS 1 + +static Pattern patterns[NPATTERNS] = +{ + // red green blue alpha mode mb verilog + 7,7, 7,7, 7,7, 7,7, 0x40, 7, "", +}; + +struct RegionPrec +{ + int endpt_a_prec[NCHANNELS_RGBA]; + int endpt_b_prec[NCHANNELS_RGBA]; +}; + +struct PatternPrec +{ + RegionPrec region_precs[NREGIONS]; +}; + +// this is the precision for each channel and region +// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this! +static PatternPrec pattern_precs[NPATTERNS] = +{ + 7,7,7,7, 7,7,7,7, +}; + +// return # of bits needed to store n. 
handle signed or unsigned cases properly +static int nbits(int n, bool issigned) +{ + int nb; + if (n==0) + return 0; // no bits needed for 0, signed or not + else if (n > 0) + { + for (nb=0; n; ++nb, n>>=1) ; + return nb + (issigned?1:0); + } + else + { + nvAssert (issigned); + for (nb=0; n<-1; ++nb, n>>=1) ; + return nb + 1; + } +} + +/* +we're using this table to assign lsbs +abgr >=2 correct +0000 0 0 +0001 0 0 +0010 0 0 +0011 1 x1 +0100 0 0 +0101 1 x1 +0110 1 x1 +0111 1 1 +1000 0 0 +1001 1 x0 +1010 1 x0 +1011 1 1 +1100 1 x0 +1101 1 1 +1110 1 1 +1111 1 1 + +we need 8 0's and 8 1's. the x's can be either 0 or 1 as long as you get 8/8. +I choose to assign the lsbs so that the rgb channels are as good as possible. +*/ + +// 8888 ->7777.1, use the "correct" column above to assign the lsb +static void compress_one(const IntEndptsRGBA& endpts, IntEndptsRGBA_2& compr_endpts) +{ + int onescnt; + + onescnt = 0; + for (int j=0; j> 1; + nvAssert (compr_endpts.A[j] < 128); + } + compr_endpts.a_lsb = onescnt >= 2; + + onescnt = 0; + for (int j=0; j> 1; + nvAssert (compr_endpts.B[j] < 128); + } + compr_endpts.b_lsb = onescnt >= 2; +} + +static void uncompress_one(const IntEndptsRGBA_2& compr_endpts, IntEndptsRGBA& endpts) +{ + for (int j=0; j> 2) & 3 and x = index & 3 +static void swap_indices(IntEndptsRGBA_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex) +{ + int index_positions[NREGIONS]; + + index_positions[0] = 0; // since WLOG we have the high bit of the shapes at 0 + + for (int region = 0; region < NREGIONS; ++region) + { + int x = index_positions[region] & 3; + int y = (index_positions[region] >> 2) & 3; + nvAssert(REGION(x,y,shapeindex) == region); // double check the table + if (indices[y][x] & HIGH_INDEXBIT) + { + // high bit is set, swap the endpts and indices for this region + int t; + for (int i=0; i= 0 && pat_index < NPATTERNS); + nvAssert (in.getptr() == patterns[pat_index].modebits); + + p = patterns[pat_index]; + + shapeindex = 0; // we don't have any + + for (int j=0; j>2][i&3], INDEXBITS-1); // write i..[2:0] + else + out.write(indices[i>>2][i&3], INDEXBITS); // write i..[3:0] + } + +} + +static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W]) +{ + // the index we shorten is always index 0 + for (int i = 0; i < Tile::TILE_TOTAL; ++i) + { + if (i==0) + indices[i>>2][i&3] = in.read(INDEXBITS-1); // read i..[1:0] + else + indices[i>>2][i&3] = in.read(INDEXBITS); // read i..[2:0] + } +} + +static void emit_block(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block) +{ + Bits out(block, AVPCL::BITSIZE); + + write_header(endpts, shapeindex, p, out); + + write_indices(indices, shapeindex, out); + + nvAssert(out.getptr() == AVPCL::BITSIZE); +} + +static void generate_palette_quantized(const IntEndptsRGBA_2 &endpts_2, const RegionPrec ®ion_prec, Vector4 palette[NINDICES]) +{ + IntEndptsRGBA endpts; + + uncompress_one(endpts_2, endpts); + + // scale endpoints + int a, b; // really need a IntVec4... 
+ + a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]+1); // +1 since we are in uncompressed space + b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]+1); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].x = float(Utils::lerp(a, b, i, BIAS, DENOM)); + + a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]+1); + b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]+1); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].y = float(Utils::lerp(a, b, i, BIAS, DENOM)); + + a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]+1); + b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]+1); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].z = float(Utils::lerp(a, b, i, BIAS, DENOM)); + + a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]+1); + b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]+1); + + // interpolate + for (int i = 0; i < NINDICES; ++i) + palette[i].w = float(Utils::lerp(a, b, i, BIAS, DENOM)); +} + +void AVPCL::decompress_mode6(const char *block, Tile &t) +{ + Bits in(block, AVPCL::BITSIZE); + + Pattern p; + IntEndptsRGBA_2 endpts[NREGIONS]; + int shapeindex, pat_index; + + read_header(in, endpts, shapeindex, p, pat_index); + + Vector4 palette[NREGIONS][NINDICES]; + for (int r = 0; r < NREGIONS; ++r) + generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]); + + int indices[Tile::TILE_H][Tile::TILE_W]; + + read_indices(in, shapeindex, indices); + + nvAssert(in.getptr() == AVPCL::BITSIZE); + + // lookup + for (int y = 0; y < Tile::TILE_H; y++) + for (int x = 0; x < Tile::TILE_W; x++) + t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]]; +} + +// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr +static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGBA_2 &endpts, const RegionPrec ®ion_prec, float current_err, int indices[Tile::TILE_TOTAL]) +{ + Vector4 palette[NINDICES]; + float toterr = 0; + Vector4 err; + + generate_palette_quantized(endpts, region_prec, palette); + + for (int i = 0; i < np; ++i) + { + float err, besterr = FLT_MAX; + + for (int j = 0; j < NINDICES && besterr > 0; ++j) + { + err = !AVPCL::flag_premult ? 
Utils::metric4(colors[i], palette[j]) : + Utils::metric4premult(colors[i], palette[j]) ; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[i] = j; + } + } + toterr += besterr; + + // check for early exit + if (toterr > current_err) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + indices[k] = -1; + + return FLT_MAX; + } + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGBA_2 endpts[NREGIONS], const PatternPrec &pattern_prec, + int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + for (int region = 0; region < NREGIONS; ++region) + { + generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]); + toterr[region] = 0; + } + + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = !AVPCL::flag_premult ? Utils::metric4(tile.data[y][x], palette[region][i]) : + Utils::metric4premult(tile.data[y][x], palette[region][i]) ; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[y][x] = i; + } + } + toterr[region] += besterr; + } +} + +// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's +// this function returns either old_err or a value smaller (if it was successful in improving the error) +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGBA_2 &old_endpts, IntEndptsRGBA_2 &new_endpts, + float old_err, int do_b, int indices[Tile::TILE_TOTAL]) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndptsRGBA_2 temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + for (int i=0; i 5000 perturb endpoints 50% of precision +// if err > 1000 25% +// if err > 200 12.5% +// if err > 40 6.25% +// for np = 16 -- adjust error thresholds as a function of np +// always ensure endpoint ordering is preserved (no need to overlap the scan) +// if orig_err returned from this is less than its input value, then indices[] will contain valid indices +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGBA_2 &opt_endpts, int indices[Tile::TILE_TOTAL]) +{ + IntEndptsRGBA_2 temp_endpts; + float best_err = 
orig_err; + int aprec = region_prec.endpt_a_prec[ch]; + int bprec = region_prec.endpt_b_prec[ch]; + int good_indices[Tile::TILE_TOTAL]; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i 5000.0*thr_scale) { adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; } + else if (orig_err > 1000.0*thr_scale) { adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; } + else if (orig_err > 200.0*thr_scale) { adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; } + else if (orig_err > 40.0*thr_scale) { adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; } + adelta = max(adelta, 3); + bdelta = max(bdelta, 3); + +#ifdef DISABLE_EXHAUSTIVE + adelta = bdelta = 3; +#endif + + temp_endpts = opt_endpts; + + // ok figure out the range of A and B + int alow = max(0, opt_endpts.A[ch] - adelta); + int ahigh = min((1<= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndptsRGBA_2 new_a, new_b; + IntEndptsRGBA_2 new_endpt; + int do_b; + int orig_indices[Tile::TILE_TOTAL]; + int new_indices[Tile::TILE_TOTAL]; + int temp_indices0[Tile::TILE_TOTAL]; + int temp_indices1[Tile::TILE_TOTAL]; + + // now optimize each channel separately + // for the first error improvement, we save the indices. then, for any later improvement, we compare the indices + // if they differ, we restart the loop (which then falls back to looking for a first improvement.) + for (int ch = 0; ch < NCHANNELS_RGBA; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + for (int i=0; i= opt_err) + continue; + + for (int i=0; i= opt_err) + break; + + for (int i=0; i> 1) & 1; + + // make sure we have a valid error for temp_in + // we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts + // (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position) + float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); + + // now try to optimize these endpoints + float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); + + // if we find an improvement, update the best so far and correct the output endpoints and errors + if (temp_out_err < best_err) + { + best_err = temp_out_err; + opt_err[region] = temp_out_err; + opt_endpts[region] = temp_out; + } + } + } +} + +/* optimization algorithm + for each pattern + convert endpoints using pattern precision + assign indices and get initial error + compress indices (and possibly reorder endpoints) + transform endpoints + if transformed endpoints fit pattern + get original endpoints back + optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better + compress new indices + transform new endpoints + if new endpoints fit pattern AND if error is improved + emit compressed block with new data + else + emit compressed 
block with original data // to try to preserve maximum endpoint precision + + simplify the above given that there is no transform now and that endpoints will always fit +*/ + +static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block) +{ + float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS]; + IntEndptsRGBA_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS]; + int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W]; + + for (int sp = 0; sp < NPATTERNS; ++sp) + { + quantize_endpts(endpts, pattern_precs[sp], orig_endpts); + assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err); + swap_indices(orig_endpts, orig_indices, shapeindex_best); + + optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts); + + assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err); + // (nreed) Commented out asserts because they go off all the time...not sure why + //for (int i=0; i 255.0f) v.x = 255.0f; + if (v.y < 0.0f) v.y = 0.0f; + if (v.y > 255.0f) v.y = 255.0f; + if (v.z < 0.0f) v.z = 0.0f; + if (v.z > 255.0f) v.z = 255.0f; + if (v.w < 0.0f) v.w = 0.0f; + if (v.w > 255.0f) v.w = 255.0f; +} + +static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES]) +{ + for (int region = 0; region < NREGIONS; ++region) + for (int i = 0; i < NINDICES; ++i) + palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM); +} + +// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined +static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + generate_palette_unquantized(endpts, palette); + + float toterr = 0; + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr; + + besterr = Utils::metric4(tile.data[y][x], palette[region][0]); + + for (int i = 1; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching. this works for most norms. + break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS]) +{ + for (int region=0; region maxp) maxp = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + minp*direction; + endpts[region].B = mean + maxp*direction; + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + clamp(endpts[region].A); + clamp(endpts[region].B); + } + + return map_colors(tile, shapeindex, endpts); +} + +static void swap(float *list1, int *list2, int i, int j) +{ + float t = list1[i]; list1[i] = list1[j]; list1[j] = t; + int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1; +} + +float AVPCL::compress_mode6(const Tile &t, char *block) +{ + // number of rough cases to look at. 
reasonable values of this are 1, NSHAPES/4, and NSHAPES + // NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out + const int NITEMS=1; + + // pick the best NITEMS shapes and refine these. + struct { + FltEndpts endpts[NREGIONS]; + } all[NSHAPES]; + float roughmse[NSHAPES]; + int index[NSHAPES]; + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + + for (int i=0; i roughmse[j]) + swap(roughmse, index, i, j); + + for (int i=0; i0; ++i) + { + int shape = index[i]; + float mse = refine(t, shape, &all[shape].endpts[0], tempblock); + if (mse < msebest) + { + memcpy(block, tempblock, sizeof(tempblock)); + msebest = mse; + } + } + return msebest; +} + Index: ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode7.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode7.cpp +++ ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_mode7.cpp @@ -0,0 +1,1094 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis + +// x10000000 5555.1x4 64p 2bi (30b) + +#include "bits.h" +#include "tile.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include "nvmath/Matrix.inl" +#include "nvmath/Fitting.h" +#include "avpcl_utils.h" +#include "endpts.h" +#include +#include + +#include "shapes_two.h" + +using namespace nv; +using namespace AVPCL; + +#define NLSBMODES 4 // number of different lsb modes per region. since we have two .1 per region, that can have 4 values + +#define NINDICES 4 +#define INDEXBITS 2 +#define HIGH_INDEXBIT (1<<(INDEXBITS-1)) +#define DENOM (NINDICES-1) +#define BIAS (DENOM/2) + +// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like? +// i.e. can we search shapes in a particular order so we can see the global error minima easily and +// stop without having to touch all shapes? 
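+// Rough bit budget implied by the mode comment near the top of this file (an illustrative
+// tally, assuming the 64 partitions from shapes_two.h cost 6 shape bits):
+//   8 mode bits + 6 shape bits + 2 regions * 2 endpoints * 4 channels * 5 bits (= 80)
+//   + 4 shared lsbs + 16 pixels * 2 index bits - 2 anchor bits (= 30) = 128 bits.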
+ +#define POS_TO_X(pos) ((pos)&3) +#define POS_TO_Y(pos) (((pos)>>2)&3) + +#define NBITSIZES (NREGIONS*2) +#define ABITINDEX(region) (2*(region)+0) +#define BBITINDEX(region) (2*(region)+1) + +struct ChanBits +{ + int nbitsizes[NBITSIZES]; // bitsizes for one channel +}; + +struct Pattern +{ + ChanBits chan[NCHANNELS_RGBA];// bit patterns used per channel + int transformed; // if 0, deltas are unsigned and no transform; otherwise, signed and transformed + int mode; // associated mode value + int modebits; // number of mode bits + const char *encoding; // verilog description of encoding for this mode +}; + +#define NPATTERNS 1 +#define NREGIONS 2 + +static Pattern patterns[NPATTERNS] = +{ + // red green blue alpha xfm mode mb + 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5, 0, 0x80, 8, "", +}; + +struct RegionPrec +{ + int endpt_a_prec[NCHANNELS_RGBA]; + int endpt_b_prec[NCHANNELS_RGBA]; +}; + +struct PatternPrec +{ + RegionPrec region_precs[NREGIONS]; +}; + + +// this is the precision for each channel and region +// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this! +static PatternPrec pattern_precs[NPATTERNS] = +{ + 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5, +}; + +// return # of bits needed to store n. handle signed or unsigned cases properly +static int nbits(int n, bool issigned) +{ + int nb; + if (n==0) + return 0; // no bits needed for 0, signed or not + else if (n > 0) + { + for (nb=0; n; ++nb, n>>=1) ; + return nb + (issigned?1:0); + } + else + { + nvAssert (issigned); + for (nb=0; n<-1; ++nb, n>>=1) ; + return nb + 1; + } +} + +static void transform_forward(IntEndptsRGBA_2 ep[NREGIONS]) +{ + nvUnreachable(); +} + +static void transform_inverse(IntEndptsRGBA_2 ep[NREGIONS]) +{ + nvUnreachable(); +} + +/* +we're using this table to assign lsbs +abgr >=2 correct +0000 0 0 +0001 0 0 +0010 0 0 +0011 1 x1 +0100 0 0 +0101 1 x1 +0110 1 x1 +0111 1 1 +1000 0 0 +1001 1 x0 +1010 1 x0 +1011 1 1 +1100 1 x0 +1101 1 1 +1110 1 1 +1111 1 1 + +we need 8 0's and 8 1's. the x's can be either 0 or 1 as long as you get 8/8. +I choose to assign the lsbs so that the rgb channels are as good as possible. +*/ + +// 6666 ->5555.1, use the "correct" column above to assign the lsb +static void compress_one(const IntEndptsRGBA& endpts, IntEndptsRGBA_2& compr_endpts) +{ + int onescnt; + + onescnt = 0; + for (int j=0; j> 1; + nvAssert (compr_endpts.A[j] < 32); + } + compr_endpts.a_lsb = onescnt >= 2; + + onescnt = 0; + for (int j=0; j> 1; + nvAssert (compr_endpts.B[j] < 32); + } + compr_endpts.b_lsb = onescnt >= 2; +} + +static void uncompress_one(const IntEndptsRGBA_2& compr_endpts, IntEndptsRGBA& endpts) +{ + for (int j=0; j> 2) & 3 and x = index & 3 +static void swap_indices(IntEndptsRGBA_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex) +{ + for (int region = 0; region < NREGIONS; ++region) + { + int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region); + + int x = POS_TO_X(position); + int y = POS_TO_Y(position); + nvAssert(REGION(x,y,shapeindex) == region); // double check the table + if (indices[y][x] & HIGH_INDEXBIT) + { + // high bit is set, swap the endpts and indices for this region + int t; + for (int i=0; i= 0 && pat_index < NPATTERNS); + nvAssert (in.getptr() == patterns[pat_index].modebits); + + shapeindex = in.read(SHAPEBITS); + p = patterns[pat_index]; + + for (int j=0; j 0; ++j) + { + err = !AVPCL::flag_premult ? 
Utils::metric4(colors[i], palette[j]) : + Utils::metric4premult(colors[i], palette[j]) ; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[i] = j; + } + } + toterr += besterr; + + // check for early exit + if (toterr > current_err) + { + // fill out bogus index values so it's initialized at least + for (int k = i; k < np; ++k) + indices[k] = -1; + + return FLT_MAX; + } + } + return toterr; +} + +// assign indices given a tile, shape, and quantized endpoints, return toterr for each region +static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGBA_2 endpts[NREGIONS], const PatternPrec &pattern_prec, + int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + for (int region = 0; region < NREGIONS; ++region) + { + generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]); + toterr[region] = 0; + } + + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = !AVPCL::flag_premult ? Utils::metric4(tile.data[y][x], palette[region][i]) : + Utils::metric4premult(tile.data[y][x], palette[region][i]) ; + + if (err > besterr) // error increased, so we're done searching + break; + if (err < besterr) + { + besterr = err; + indices[y][x] = i; + } + } + toterr[region] += besterr; + } +} + +// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's +// this function returns either old_err or a value smaller (if it was successful in improving the error) +static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, const IntEndptsRGBA_2 &old_endpts, IntEndptsRGBA_2 &new_endpts, + float old_err, int do_b, int indices[Tile::TILE_TOTAL]) +{ + // we have the old endpoints: old_endpts + // we have the perturbed endpoints: new_endpts + // we have the temporary endpoints: temp_endpts + + IntEndptsRGBA_2 temp_endpts; + float min_err = old_err; // start with the best current error + int beststep; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i>= 1) + { + bool improved = false; + for (int sign = -1; sign <= 1; sign += 2) + { + if (do_b == 0) + { + temp_endpts.A[ch] = new_endpts.A[ch] + sign * step; + if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec)) + continue; + } + else + { + temp_endpts.B[ch] = new_endpts.B[ch] + sign * step; + if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec)) + continue; + } + + float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices); + + if (err < min_err) + { + improved = true; + min_err = err; + beststep = sign * step; + for (int i=0; i 5000 perturb endpoints 50% of precision +// if err > 1000 25% +// if err > 200 12.5% +// if err > 40 6.25% +// for np = 16 -- adjust error thresholds as a function of np +// always ensure endpoint ordering is preserved (no need to overlap the scan) +// if orig_err returned from this is less than its input value, then indices[] will contain valid indices +static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec ®ion_prec, float orig_err, IntEndptsRGBA_2 &opt_endpts, int indices[Tile::TILE_TOTAL]) +{ + IntEndptsRGBA_2 temp_endpts; + float best_err = 
orig_err; + int aprec = region_prec.endpt_a_prec[ch]; + int bprec = region_prec.endpt_b_prec[ch]; + int good_indices[Tile::TILE_TOTAL]; + int temp_indices[Tile::TILE_TOTAL]; + + for (int i=0; i 5000.0*thr_scale) { adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; } + else if (orig_err > 1000.0*thr_scale) { adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; } + else if (orig_err > 200.0*thr_scale) { adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; } + else if (orig_err > 40.0*thr_scale) { adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; } + adelta = max(adelta, 3); + bdelta = max(bdelta, 3); + +#ifdef DISABLE_EXHAUSTIVE + adelta = bdelta = 3; +#endif + + temp_endpts = opt_endpts; + + // ok figure out the range of A and B + int alow = max(0, opt_endpts.A[ch] - adelta); + int ahigh = min((1<= initial_error) break + rgb0 += delta0 + next = 1 + else + if (err1 >= initial_error) break + rgb1 += delta1 + next = 0 + initial_err = map() + for (;;) + err = perturb(next ? rgb1:rgb0, delta) + if (err >= initial_err) break + next? rgb1 : rgb0 += delta + initial_err = err + */ + IntEndptsRGBA_2 new_a, new_b; + IntEndptsRGBA_2 new_endpt; + int do_b; + int orig_indices[Tile::TILE_TOTAL]; + int new_indices[Tile::TILE_TOTAL]; + int temp_indices0[Tile::TILE_TOTAL]; + int temp_indices1[Tile::TILE_TOTAL]; + + // now optimize each channel separately + // for the first error improvement, we save the indices. then, for any later improvement, we compare the indices + // if they differ, we restart the loop (which then falls back to looking for a first improvement.) + for (int ch = 0; ch < NCHANNELS_RGBA; ++ch) + { + // figure out which endpoint when perturbed gives the most improvement and start there + // if we just alternate, we can easily end up in a local minima + float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0); // perturb endpt A + float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1); // perturb endpt B + + if (err0 < err1) + { + if (err0 >= opt_err) + continue; + + for (int i=0; i= opt_err) + continue; + + for (int i=0; i= opt_err) + break; + + for (int i=0; i> 1) & 1; + + // make sure we have a valid error for temp_in + // we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts + // (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position) + float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices); + + // now try to optimize these endpoints + float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out); + + // if we find an improvement, update the best so far and correct the output endpoints and errors + if (temp_out_err < best_err) + { + best_err = temp_out_err; + opt_err[region] = temp_out_err; + opt_endpts[region] = temp_out; + } + } + } +} + +/* optimization algorithm + for each pattern + convert endpoints using pattern precision + assign indices and get initial error + compress indices (and possibly reorder endpoints) + transform endpoints + if transformed endpoints fit pattern + get original endpoints back + optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better + compress new indices + transform new endpoints + if new endpoints fit pattern AND if error is improved + emit compressed block with new data + else + emit compressed 
block with original data // to try to preserve maximum endpoint precision +*/ + +static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block) +{ + float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS]; + IntEndptsRGBA_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS]; + int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W]; + + for (int sp = 0; sp < NPATTERNS; ++sp) + { + quantize_endpts(endpts, pattern_precs[sp], orig_endpts); + assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err); + swap_indices(orig_endpts, orig_indices, shapeindex_best); + if (patterns[sp].transformed) + transform_forward(orig_endpts); + // apply a heuristic here -- we check if the endpoints fit before we try to optimize them. + // the assumption made is that if they don't fit now, they won't fit after optimizing. + if (endpts_fit(orig_endpts, patterns[sp])) + { + if (patterns[sp].transformed) + transform_inverse(orig_endpts); + optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts); + assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err); + // (nreed) Commented out asserts because they go off all the time...not sure why + //for (int i=0; i 255.0f) v.x = 255.0f; + if (v.y < 0.0f) v.y = 0.0f; + if (v.y > 255.0f) v.y = 255.0f; + if (v.z < 0.0f) v.z = 0.0f; + if (v.z > 255.0f) v.z = 255.0f; + if (v.w < 0.0f) v.w = 0.0f; + if (v.w > 255.0f) v.w = 255.0f; +} + +static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES]) +{ + for (int region = 0; region < NREGIONS; ++region) + for (int i = 0; i < NINDICES; ++i) + palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM); +} + +// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined +static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS]) +{ + // build list of possibles + Vector4 palette[NREGIONS][NINDICES]; + + generate_palette_unquantized(endpts, palette); + + float toterr = 0; + Vector4 err; + + for (int y = 0; y < tile.size_y; y++) + for (int x = 0; x < tile.size_x; x++) + { + int region = REGION(x,y,shapeindex); + float err, besterr = FLT_MAX; + + for (int i = 0; i < NINDICES && besterr > 0; ++i) + { + err = Utils::metric4(tile.data[y][x], palette[region][i]); + + if (err > besterr) // error increased, so we're done searching. this works for most norms. 
+ break; + if (err < besterr) + besterr = err; + } + toterr += besterr; + } + return toterr; +} + +static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS]) +{ + for (int region=0; region maxp) maxp = dp; + } + + // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values + endpts[region].A = mean + minp*direction; + endpts[region].B = mean + maxp*direction; + + // clamp endpoints + // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best + // shape based on endpoints being clamped + clamp(endpts[region].A); + clamp(endpts[region].B); + } + + return map_colors(tile, shapeindex, endpts); +} + +static void swap(float *list1, int *list2, int i, int j) +{ + float t = list1[i]; list1[i] = list1[j]; list1[j] = t; + int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1; +} + +float AVPCL::compress_mode7(const Tile &t, char *block) +{ + // number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES + // NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out + const int NITEMS=NSHAPES/4; + + // pick the best NITEMS shapes and refine these. + struct { + FltEndpts endpts[NREGIONS]; + } all[NSHAPES]; + float roughmse[NSHAPES]; + int index[NSHAPES]; + char tempblock[AVPCL::BLOCKSIZE]; + float msebest = FLT_MAX; + + for (int i=0; i roughmse[j]) + swap(roughmse, index, i, j); + + for (int i=0; i0; ++i) + { + int shape = index[i]; + float mse = refine(t, shape, &all[shape].endpts[0], tempblock); + if (mse < msebest) + { + memcpy(block, tempblock, sizeof(tempblock)); + msebest = mse; + } + } + return msebest; +} + Index: ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_utils.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_utils.h +++ ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_utils.h @@ -0,0 +1,61 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +// utility class holding common routines +#ifndef _AVPCL_UTILS_H +#define _AVPCL_UTILS_H + +#include "nvmath/Vector.h" + +namespace AVPCL { + +inline int SIGN_EXTEND(int x, int nb) { return ((((x)&(1<<((nb)-1)))?((~0)<<(nb)):0)|(x)); } + +static const int INDEXMODE_BITS = 1; // 2 different index modes +static const int NINDEXMODES = (1<<(INDEXMODE_BITS)); +static const int INDEXMODE_ALPHA_IS_3BITS = 0; +static const int INDEXMODE_ALPHA_IS_2BITS = 1; + +static const int ROTATEMODE_BITS = 2; // 4 different rotate modes +static const int NROTATEMODES = (1<<(ROTATEMODE_BITS)); +static const int ROTATEMODE_RGBA_RGBA = 0; +static const int ROTATEMODE_RGBA_AGBR = 1; +static const int ROTATEMODE_RGBA_RABG = 2; +static const int ROTATEMODE_RGBA_RGAB = 3; + +class Utils +{ +public: + // error metrics + static float metric4(nv::Vector4::Arg a, nv::Vector4::Arg b); + static float metric3(nv::Vector3::Arg a, nv::Vector3::Arg b, int rotatemode); + static float metric1(float a, float b, int rotatemode); + + static float metric4premult(nv::Vector4::Arg rgba0, nv::Vector4::Arg rgba1); + static float metric3premult_alphaout(nv::Vector3::Arg rgb0, float a0, nv::Vector3::Arg rgb1, float a1); + static float metric3premult_alphain(nv::Vector3::Arg rgb0, nv::Vector3::Arg rgb1, int rotatemode); + static float metric1premult(float rgb0, float a0, float rgb1, float a1, int rotatemode); + + static float premult(float r, float a); + + // quantization and unquantization + static int unquantize(int q, int prec); + static int quantize(float value, int prec); + + // lerping + static int lerp(int a, int b, int i, int bias, int denom); + static nv::Vector4 lerp(nv::Vector4::Arg a, nv::Vector4::Arg b, int i, int bias, int denom); +}; + +} + +#endif \ No newline at end of file Index: ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_utils.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_utils.cpp +++ ps/trunk/libraries/source/nvtt/src/src/bc7/avpcl_utils.cpp @@ -0,0 +1,390 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +// Utility and common routines + +#include "avpcl_utils.h" +#include "avpcl.h" +#include "nvcore/Debug.h" +#include "nvmath/Vector.inl" +#include + +using namespace nv; +using namespace AVPCL; + +static const int denom7_weights[] = {0, 9, 18, 27, 37, 46, 55, 64}; // divided by 64 +static const int denom15_weights[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64}; // divided by 64 + +int Utils::lerp(int a, int b, int i, int bias, int denom) +{ +#ifdef USE_ZOH_INTERP + nvAssert (denom == 3 || denom == 7 || denom == 15); + nvAssert (i >= 0 && i <= denom); + nvAssert (bias >= 0 && bias <= denom/2); + nvAssert (a >= 0 && b >= 0); + + int round = 0; +#ifdef USE_ZOH_INTERP_ROUNDED + round = 32; +#endif + + switch (denom) + { + case 3: denom *= 5; i *= 5; // fall through to case 15 + case 15:return (a*denom15_weights[denom-i] + b*denom15_weights[i] + round) >> 6; + case 7: return (a*denom7_weights[denom-i] + b*denom7_weights[i] + round) >> 6; + default: nvUnreachable(); return 0; + } +#else + return (((a)*((denom)-i)+(b)*(i)+(bias))/(denom)); // simple exact interpolation +#endif +} + +Vector4 Utils::lerp(Vector4::Arg a, Vector4::Arg b, int i, int bias, int denom) +{ +#ifdef USE_ZOH_INTERP + nvAssert (denom == 3 || denom == 7 || denom == 15); + nvAssert (i >= 0 && i <= denom); + nvAssert (bias >= 0 && bias <= denom/2); +// nvAssert (a >= 0 && b >= 0); + + // no need to bias these as this is an exact division + + switch (denom) + { + case 3: denom *= 5; i *= 5; // fall through to case 15 + case 15:return (a*float(denom15_weights[denom-i]) + b*float(denom15_weights[i])) / 64.0f; + case 7: return (a*float(denom7_weights[denom-i]) + b*float(denom7_weights[i])) / 64.0f; + default: nvUnreachable(); return Vector4(0); + } +#else + return (((a)*((denom)-i)+(b)*(i)+(bias))/(denom)); // simple exact interpolation +#endif +} + + +int Utils::unquantize(int q, int prec) +{ + int unq; + + nvAssert (prec > 3); // we only want to do one replicate + +#ifdef USE_ZOH_QUANT + if (prec >= 8) + unq = q; + else if (q == 0) + unq = 0; + else if (q == ((1<> prec; +#else + // avpcl unquantizer -- bit replicate + unq = (q << (8-prec)) | (q >> (2*prec-8)); +#endif + + return unq; +} + +// quantize to the best value -- i.e., minimize unquantize error +int Utils::quantize(float value, int prec) +{ + int q, unq; + + nvAssert (prec > 3); // we only want to do one replicate + + unq = (int)floor(value + 0.5f); + nvAssert (unq <= 255); + +#ifdef USE_ZOH_QUANT + q = (prec >= 8) ? unq : (unq << prec) / 256; +#else + // avpcl quantizer -- scale properly for best possible bit-replicated result + q = (unq * ((1<= 0 && q < (1 << prec)); + + return q; +} + +float Utils::metric4(Vector4::Arg a, Vector4::Arg b) +{ + Vector4 err = a - b; + + // if nonuniform, select weights and weigh away + if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati) + { + float rwt, gwt, bwt; + if (AVPCL::flag_nonuniform) + { + rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; + } + else /*if (AVPCL::flag_nonuniform_ati)*/ + { + rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; + } + + // weigh the components + err.x *= rwt; + err.y *= gwt; + err.z *= bwt; + } + + return lengthSquared(err); +} + +// WORK -- implement rotatemode for the below -- that changes where the rwt, gwt, and bwt's go. 
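+// Worked example for Utils::lerp() above (illustrative values, assuming USE_ZOH_INTERP and
+// USE_ZOH_INTERP_ROUNDED are defined): lerp(60, 200, 5, bias, 15)
+//   = (60*denom15_weights[15-5] + 200*denom15_weights[5] + 32) >> 6
+//   = (60*43 + 200*21 + 32) >> 6 = 6812 >> 6 = 106;
+// the bias argument is ignored on this path, and denom == 3 is promoted to 15 (denom *= 5,
+// i *= 5) so the same 16-entry weight table serves both 2-bit and 4-bit indices.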
+float Utils::metric3(Vector3::Arg a, Vector3::Arg b, int rotatemode) +{ + Vector3 err = a - b; + + // if nonuniform, select weights and weigh away + if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati) + { + float rwt, gwt, bwt; + if (AVPCL::flag_nonuniform) + { + rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; + } + else if (AVPCL::flag_nonuniform_ati) + { + rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; + } + + // adjust weights based on rotatemode + switch(rotatemode) + { + case ROTATEMODE_RGBA_RGBA: break; + case ROTATEMODE_RGBA_AGBR: rwt = 1.0f; break; + case ROTATEMODE_RGBA_RABG: gwt = 1.0f; break; + case ROTATEMODE_RGBA_RGAB: bwt = 1.0f; break; + default: nvUnreachable(); + } + + // weigh the components + err.x *= rwt; + err.y *= gwt; + err.z *= bwt; + } + + return lengthSquared(err); +} + +float Utils::metric1(const float a, const float b, int rotatemode) +{ + float err = a - b; + + // if nonuniform, select weights and weigh away + if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati) + { + float rwt, gwt, bwt, awt; + if (AVPCL::flag_nonuniform) + { + rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; + } + else if (AVPCL::flag_nonuniform_ati) + { + rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; + } + + // adjust weights based on rotatemode + switch(rotatemode) + { + case ROTATEMODE_RGBA_RGBA: awt = 1.0f; break; + case ROTATEMODE_RGBA_AGBR: awt = rwt; break; + case ROTATEMODE_RGBA_RABG: awt = gwt; break; + case ROTATEMODE_RGBA_RGAB: awt = bwt; break; + default: nvUnreachable(); + } + + // weigh the components + err *= awt; + } + + return err * err; +} + +float Utils::premult(float r, float a) +{ + // note that the args are really integers stored in floats + int R = int(r), A = int(a); + + nvAssert ((R==r) && (A==a)); + + return float((R*A + 127)/255); +} + +static void premult4(Vector4& rgba) +{ + rgba.x = Utils::premult(rgba.x, rgba.w); + rgba.y = Utils::premult(rgba.y, rgba.w); + rgba.z = Utils::premult(rgba.z, rgba.w); +} + +static void premult3(Vector3& rgb, float a) +{ + rgb.x = Utils::premult(rgb.x, a); + rgb.y = Utils::premult(rgb.y, a); + rgb.z = Utils::premult(rgb.z, a); +} + +float Utils::metric4premult(Vector4::Arg a, Vector4::Arg b) +{ + Vector4 pma = a, pmb = b; + + premult4(pma); + premult4(pmb); + + Vector4 err = pma - pmb; + + // if nonuniform, select weights and weigh away + if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati) + { + float rwt, gwt, bwt; + if (AVPCL::flag_nonuniform) + { + rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; + } + else /*if (AVPCL::flag_nonuniform_ati)*/ + { + rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; + } + + // weigh the components + err.x *= rwt; + err.y *= gwt; + err.z *= bwt; + } + + return lengthSquared(err); +} + +float Utils::metric3premult_alphaout(Vector3::Arg rgb0, float a0, Vector3::Arg rgb1, float a1) +{ + Vector3 pma = rgb0, pmb = rgb1; + + premult3(pma, a0); + premult3(pmb, a1); + + Vector3 err = pma - pmb; + + // if nonuniform, select weights and weigh away + if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati) + { + float rwt, gwt, bwt; + if (AVPCL::flag_nonuniform) + { + rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; + } + else /*if (AVPCL::flag_nonuniform_ati)*/ + { + rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; + } + + // weigh the components + err.x *= rwt; + err.y *= gwt; + err.z *= bwt; + } + + return lengthSquared(err); +} + +float Utils::metric3premult_alphain(Vector3::Arg rgb0, Vector3::Arg rgb1, int rotatemode) +{ + Vector3 pma = rgb0, pmb = rgb1; + + switch(rotatemode) + { + case ROTATEMODE_RGBA_RGBA: + // this 
function isn't supposed to be called for this rotatemode + nvUnreachable(); + break; + case ROTATEMODE_RGBA_AGBR: + pma.y = premult(pma.y, pma.x); + pma.z = premult(pma.z, pma.x); + pmb.y = premult(pmb.y, pmb.x); + pmb.z = premult(pmb.z, pmb.x); + break; + case ROTATEMODE_RGBA_RABG: + pma.x = premult(pma.x, pma.y); + pma.z = premult(pma.z, pma.y); + pmb.x = premult(pmb.x, pmb.y); + pmb.z = premult(pmb.z, pmb.y); + break; + case ROTATEMODE_RGBA_RGAB: + pma.x = premult(pma.x, pma.z); + pma.y = premult(pma.y, pma.z); + pmb.x = premult(pmb.x, pmb.z); + pmb.y = premult(pmb.y, pmb.z); + break; + default: nvUnreachable(); + } + + Vector3 err = pma - pmb; + + // if nonuniform, select weights and weigh away + if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati) + { + float rwt, gwt, bwt; + if (AVPCL::flag_nonuniform) + { + rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; + } + else /*if (AVPCL::flag_nonuniform_ati)*/ + { + rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; + } + + // weigh the components + err.x *= rwt; + err.y *= gwt; + err.z *= bwt; + } + + return lengthSquared(err); +} + +float Utils::metric1premult(float rgb0, float a0, float rgb1, float a1, int rotatemode) +{ + float err = premult(rgb0, a0) - premult(rgb1, a1); + + // if nonuniform, select weights and weigh away + if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati) + { + float rwt, gwt, bwt, awt; + if (AVPCL::flag_nonuniform) + { + rwt = 0.299f; gwt = 0.587f; bwt = 0.114f; + } + else if (AVPCL::flag_nonuniform_ati) + { + rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f; + } + + // adjust weights based on rotatemode + switch(rotatemode) + { + case ROTATEMODE_RGBA_RGBA: awt = 1.0f; break; + case ROTATEMODE_RGBA_AGBR: awt = rwt; break; + case ROTATEMODE_RGBA_RABG: awt = gwt; break; + case ROTATEMODE_RGBA_RGAB: awt = bwt; break; + default: nvUnreachable(); + } + + // weigh the components + err *= awt; + } + + return err * err; +} Index: ps/trunk/libraries/source/nvtt/src/src/bc7/bits.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/bits.h +++ ps/trunk/libraries/source/nvtt/src/src/bc7/bits.h @@ -0,0 +1,76 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +#ifndef _AVPCL_BITS_H +#define _AVPCL_BITS_H + +// read/write a bitstream + +#include "nvcore/Debug.h" + +namespace AVPCL { + +class Bits +{ +public: + + Bits(char *data, int maxdatabits) { nvAssert (data && maxdatabits > 0); bptr = bend = 0; bits = data; maxbits = maxdatabits; readonly = 0;} + Bits(const char *data, int availdatabits) { nvAssert (data && availdatabits > 0); bptr = 0; bend = availdatabits; cbits = data; maxbits = availdatabits; readonly = 1;} + + void write(int value, int nbits) { + nvAssert (nbits >= 0 && nbits < 32); + nvAssert (sizeof(int)>= 4); + for (int i=0; i<nbits; ++i) + writeone(value>>i); + } + int read(int nbits) { + nvAssert (nbits >= 0 && nbits < 32); + nvAssert (sizeof(int)>= 4); + int out = 0; + for (int i=0; i<nbits; ++i) + out |= readone()<<i; + return out; + } + int getptr() { return bptr; } + void setptr(int ptr) { nvAssert (ptr >= 0 && ptr < maxbits); bptr = ptr; } + int getsize() { return bend; } + +private: + int bptr; // next bit to read + int bend; // last written bit + 1 + char *bits; // ptr to user bit stream + const char *cbits; // ptr to const user bit stream + int maxbits; // max size of user bit stream + char readonly; // 1 if this is a read-only stream + + int readone() { + nvAssert (bptr < bend); + if (bptr >= bend) return 0; + int bit = (readonly ? cbits[bptr>>3] : bits[bptr>>3]) & (1 << (bptr & 7)); + ++bptr; + return bit != 0; + } + void writeone(int bit) { + nvAssert (!readonly); // "Writing a read-only bit stream" + nvAssert (bptr < maxbits); + if (bptr >= maxbits) return; + if (bit&1) + bits[bptr>>3] |= 1 << (bptr & 7); + else + bits[bptr>>3] &= ~(1 << (bptr & 7)); + if (bptr++ >= bend) bend = bptr; + } +}; + +} + +#endif \ No newline at end of file Index: ps/trunk/libraries/source/nvtt/src/src/bc7/endpts.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/endpts.h +++ ps/trunk/libraries/source/nvtt/src/src/bc7/endpts.h @@ -0,0 +1,81 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License.
+*/ + +#ifndef _AVPCL_ENDPTS_H +#define _AVPCL_ENDPTS_H + +// endpoint definitions and routines to search through endpoint space + +#include "nvmath/Vector.h" + +namespace AVPCL { + +static const int NCHANNELS_RGB = 3; +static const int NCHANNELS_RGBA = 4; +static const int CHANNEL_R = 0; +static const int CHANNEL_G = 1; +static const int CHANNEL_B = 2; +static const int CHANNEL_A = 3; + +struct FltEndpts +{ + nv::Vector4 A; + nv::Vector4 B; +}; + +struct IntEndptsRGB +{ + int A[NCHANNELS_RGB]; + int B[NCHANNELS_RGB]; +}; + +struct IntEndptsRGB_1 +{ + int A[NCHANNELS_RGB]; + int B[NCHANNELS_RGB]; + int lsb; // shared lsb for A and B +}; + +struct IntEndptsRGB_2 +{ + int A[NCHANNELS_RGB]; + int B[NCHANNELS_RGB]; + int a_lsb; // lsb for A + int b_lsb; // lsb for B +}; + + +struct IntEndptsRGBA +{ + int A[NCHANNELS_RGBA]; + int B[NCHANNELS_RGBA]; +}; + +struct IntEndptsRGBA_2 +{ + int A[NCHANNELS_RGBA]; + int B[NCHANNELS_RGBA]; + int a_lsb; // lsb for A + int b_lsb; // lsb for B +}; + +struct IntEndptsRGBA_2a +{ + int A[NCHANNELS_RGBA]; + int B[NCHANNELS_RGBA]; + int a_lsb; // lsb for RGB channels of A + int b_lsb; // lsb for RGB channels of A +}; + +} + +#endif Index: ps/trunk/libraries/source/nvtt/src/src/bc7/shapes_three.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/shapes_three.h +++ ps/trunk/libraries/source/nvtt/src/src/bc7/shapes_three.h @@ -0,0 +1,132 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +#ifndef _AVPCL_SHAPES_THREE_H +#define _AVPCL_SHAPES_THREE_H + +// shapes for 3 regions + +#define NREGIONS 3 +#define NSHAPES 64 +#define SHAPEBITS 6 + +static int shapes[NSHAPES*16] = +{ +0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 2, 2, +0, 0, 1, 1, 0, 0, 1, 1, 2, 0, 0, 1, 0, 0, 2, 2, +0, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, +2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 0, 1, 1, 1, + +0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1, +0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1, +1, 1, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, +1, 1, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, + +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, +0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 2, +1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 1, 2, +2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 1, 2, + +0, 1, 1, 2, 0, 1, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1, +0, 1, 1, 2, 0, 1, 2, 2, 0, 1, 1, 2, 2, 0, 0, 1, +0, 1, 1, 2, 0, 1, 2, 2, 1, 1, 2, 2, 2, 2, 0, 0, +0, 1, 1, 2, 0, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 0, + +0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 2, 2, +0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, +0, 1, 1, 2, 2, 0, 0, 1, 1, 1, 2, 2, 0, 0, 2, 2, +1, 1, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, 1, 1, 1, 1, + +0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, +0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, +0, 2, 2, 2, 2, 2, 2, 1, 0, 1, 2, 2, 2, 2, 1, 0, +0, 2, 2, 2, 2, 2, 2, 1, 0, 1, 2, 2, 2, 2, 1, 0, + +0, 1, 2, 2, 0, 0, 1, 2, 0, 1, 1, 0, 0, 0, 0, 0, +0, 1, 2, 2, 0, 0, 1, 2, 1, 2, 2, 1, 0, 1, 1, 0, +0, 0, 1, 1, 1, 1, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, +0, 0, 0, 0, 2, 2, 2, 2, 0, 1, 1, 0, 1, 2, 2, 1, + +0, 0, 2, 2, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, +1, 1, 0, 2, 0, 1, 1, 0, 0, 1, 2, 2, 2, 0, 0, 0, +1, 1, 0, 2, 2, 0, 0, 2, 0, 1, 2, 2, 2, 2, 1, 1, +0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, 2, 1, + +0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 1, 1, 0, 1, 2, 0, +0, 0, 0, 2, 0, 0, 2, 2, 0, 0, 1, 2, 0, 1, 2, 0, +1, 1, 2, 2, 0, 0, 1, 2, 0, 0, 2, 2, 0, 1, 2, 0, +1, 2, 2, 2, 0, 0, 1, 1, 0, 2, 2, 2, 0, 1, 2, 0, + +0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 0, 1, 1, +1, 1, 1, 1, 1, 2, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, +2, 2, 2, 2, 2, 0, 1, 2, 1, 2, 0, 1, 1, 1, 2, 2, +0, 0, 0, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 0, 1, 1, + +0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 2, 2, +1, 1, 2, 2, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 2, 2, +2, 2, 0, 0, 2, 2, 2, 2, 2, 1, 2, 1, 0, 0, 2, 2, +0, 0, 1, 1, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 2, 2, + +0, 0, 2, 2, 0, 2, 2, 0, 0, 1, 0, 1, 0, 0, 0, 0, +0, 0, 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, +0, 0, 2, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, 1, +0, 0, 1, 1, 1, 2, 2, 1, 0, 1, 0, 1, 2, 1, 2, 1, + +0, 1, 0, 1, 0, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, +0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, +0, 1, 0, 1, 0, 2, 2, 2, 0, 0, 0, 2, 2, 1, 1, 2, +2, 2, 2, 2, 0, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, + +0, 2, 2, 2, 0, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, +0, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 0, 0, 0, 0, 0, +0, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 0, 2, 1, 1, 2, +0, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 2, + +0, 1, 1, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 0, 0, +0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 0, 0, +2, 2, 2, 2, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 0, 0, +2, 2, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 1, 1, 2, + +0, 0, 0, 2, 0, 2, 2, 2, 0, 1, 0, 1, 0, 1, 1, 1, +0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 1, +0, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, +0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, +}; + +#define REGION(x,y,si) shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16] + +static int shapeindex_to_compressed_indices[NSHAPES*3] = +{ + 0, 3,15, 0, 3, 8, 0,15, 8, 0,15, 3, + 0, 8,15, 0, 
3,15, 0,15, 3, 0,15, 8, + 0, 8,15, 0, 8,15, 0, 6,15, 0, 6,15, + 0, 6,15, 0, 5,15, 0, 3,15, 0, 3, 8, + + 0, 3,15, 0, 3, 8, 0, 8,15, 0,15, 3, + 0, 3,15, 0, 3, 8, 0, 6,15, 0,10, 8, + 0, 5, 3, 0, 8,15, 0, 8, 6, 0, 6,10, + 0, 8,15, 0, 5,15, 0,15,10, 0,15, 8, + + 0, 8,15, 0,15, 3, 0, 3,15, 0, 5,10, + 0, 6,10, 0,10, 8, 0, 8, 9, 0,15,10, + 0,15, 6, 0, 3,15, 0,15, 8, 0, 5,15, + 0,15, 3, 0,15, 6, 0,15, 6, 0,15, 8, + + 0, 3,15, 0,15, 3, 0, 5,15, 0, 5,15, + 0, 5,15, 0, 8,15, 0, 5,15, 0,10,15, + 0, 5,15, 0,10,15, 0, 8,15, 0,13,15, + 0,15, 3, 0,12,15, 0, 3,15, 0, 3, 8 + +}; +#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region) shapeindex_to_compressed_indices[(si)*3+(region)] + +#endif Index: ps/trunk/libraries/source/nvtt/src/src/bc7/shapes_two.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/shapes_two.h +++ ps/trunk/libraries/source/nvtt/src/src/bc7/shapes_two.h @@ -0,0 +1,133 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. +*/ + +#ifndef _AVPCL_SHAPES_TWO_H +#define _AVPCL_SHAPES_TWO_H + +// shapes for two regions + +#define NREGIONS 2 +#define NSHAPES 64 +#define SHAPEBITS 6 + +static int shapes[NSHAPES*16] = +{ +0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, +0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, +0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, +0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, + +0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, +0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, +0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, +0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, + +0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, +0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, +0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, +0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, + +0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, +1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + +0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, +1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, +1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, +1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, + +0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, +0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, +0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, +0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, + +0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, +0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, +0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, +0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, + +0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, +0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, +1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, +1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, + +0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, +0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, +0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, +0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, + +0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, +1, 1, 0, 0, 0, 1, 
0, 1, 1, 0, 0, 1, 1, 0, 1, 0, +0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, +1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, + +0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, +0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, +1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, +1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, + +0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, +1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, +1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, +0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, + +0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, +1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, +0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, + +0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, +1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, +1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, +0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, + +0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, +1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, +1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, +1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, + +0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, +1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, +0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, +0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, + +}; + +#define REGION(x,y,si) shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16] + +static int shapeindex_to_compressed_indices[NSHAPES*2] = +{ + 0,15, 0,15, 0,15, 0,15, + 0,15, 0,15, 0,15, 0,15, + 0,15, 0,15, 0,15, 0,15, + 0,15, 0,15, 0,15, 0,15, + + 0,15, 0, 2, 0, 8, 0, 2, + 0, 2, 0, 8, 0, 8, 0,15, + 0, 2, 0, 8, 0, 2, 0, 2, + 0, 8, 0, 8, 0, 2, 0, 2, + + 0,15, 0,15, 0, 6, 0, 8, + 0, 2, 0, 8, 0,15, 0,15, + 0, 2, 0, 8, 0, 2, 0, 2, + 0, 2, 0,15, 0,15, 0, 6, + + 0, 6, 0, 2, 0, 6, 0, 8, + 0,15, 0,15, 0, 2, 0, 2, + 0,15, 0,15, 0,15, 0,15, + 0,15, 0, 2, 0, 2, 0,15 + +}; +#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region) shapeindex_to_compressed_indices[(si)*2+(region)] + +#endif Index: ps/trunk/libraries/source/nvtt/src/src/bc7/tile.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/bc7/tile.h +++ ps/trunk/libraries/source/nvtt/src/src/bc7/tile.h @@ -0,0 +1,41 @@ +/* +Copyright 2007 nVidia, Inc. +Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. + +You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + +See the License for the specific language governing permissions and limitations under the License. 
+*/ + +#ifndef _AVPCL_TILE_H +#define _AVPCL_TILE_H + +#include "nvmath/Vector.h" +#include <math.h> +#include "avpcl_utils.h" + +namespace AVPCL { + +// extract a tile of pixels from an array + +class Tile +{ +public: + static const int TILE_H = 4; + static const int TILE_W = 4; + static const int TILE_TOTAL = TILE_H * TILE_W; + nv::Vector4 data[TILE_H][TILE_W]; + float importance_map[TILE_H][TILE_W]; + int size_x, size_y; // actual size of tile + + Tile() {}; + ~Tile(){}; + Tile(int xs, int ys) {size_x = xs; size_y = ys;} +}; + +} + +#endif \ No newline at end of file Index: ps/trunk/libraries/source/nvtt/src/src/nvconfig.h.in =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvconfig.h.in +++ ps/trunk/libraries/source/nvtt/src/src/nvconfig.h.in @@ -7,10 +7,15 @@ #cmakedefine HAVE_EXECINFO_H #cmakedefine HAVE_MALLOC_H -#cmakedefine HAVE_PNG -#cmakedefine HAVE_JPEG -#cmakedefine HAVE_TIFF -#cmakedefine HAVE_OPENEXR +#cmakedefine HAVE_OPENMP +#cmakedefine HAVE_DISPATCH_H + +#define HAVE_STBIMAGE +//#cmakedefine HAVE_PNG +//#cmakedefine HAVE_JPEG +//#cmakedefine HAVE_TIFF +//#cmakedefine HAVE_OPENEXR +//#cmakedefine HAVE_FREEIMAGE #cmakedefine HAVE_MAYA Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Array.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Array.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Array.h @@ -0,0 +1,182 @@ +// This code is in the public domain -- Ignacio Castaño + +#pragma once +#ifndef NV_CORE_ARRAY_H +#define NV_CORE_ARRAY_H + +/* +This array class requires the elements to be relocable; it uses memmove and realloc. Ideally I should be +using swap, but I honestly don't care. The only thing that you should be aware of is that internal pointers +are not supported. + +Note also that push_back and resize does not support inserting arguments elements that are in the same +container. This is forbidden to prevent an extra copy. +*/ + + +#include "Memory.h" +#include "Debug.h" +#include "ForEach.h" // PseudoIndex + + +namespace nv +{ + class Stream; + + /** + * Replacement for std::vector that is easier to debug and provides + * some nice foreach enumerators. + */ + template <typename T> + class NVCORE_CLASS Array { + public: + typedef uint size_type; + + // Default constructor. + NV_FORCEINLINE Array() : m_buffer(NULL), m_capacity(0), m_size(0) {} + + // Copy constructor. + NV_FORCEINLINE Array(const Array & a) : m_buffer(NULL), m_capacity(0), m_size(0) { + copy(a.m_buffer, a.m_size); + } + + // Constructor that initializes the vector with the given elements. + NV_FORCEINLINE Array(const T * ptr, uint num) : m_buffer(NULL), m_capacity(0), m_size(0) { + copy(ptr, num); + } + + // Allocate array. + NV_FORCEINLINE explicit Array(uint capacity) : m_buffer(NULL), m_capacity(0), m_size(0) { + setArrayCapacity(capacity); + } + + // Destructor. + NV_FORCEINLINE ~Array() { + clear(); + free(m_buffer); + } + + + /// Const element access. + NV_FORCEINLINE const T & operator[]( uint index ) const + { + nvDebugCheck(index < m_size); + return m_buffer[index]; + } + NV_FORCEINLINE const T & at( uint index ) const + { + nvDebugCheck(index < m_size); + return m_buffer[index]; + } + + /// Element access. + NV_FORCEINLINE T & operator[] ( uint index ) + { + nvDebugCheck(index < m_size); + return m_buffer[index]; + } + NV_FORCEINLINE T & at( uint index ) + { + nvDebugCheck(index < m_size); + return m_buffer[index]; + } + + /// Get vector size.
+ NV_FORCEINLINE uint size() const { return m_size; } + + /// Get vector size. + NV_FORCEINLINE uint count() const { return m_size; } + + /// Get vector capacity. + NV_FORCEINLINE uint capacity() const { return m_capacity; } + + /// Get const vector pointer. + NV_FORCEINLINE const T * buffer() const { return m_buffer; } + + /// Get vector pointer. + NV_FORCEINLINE T * buffer() { return m_buffer; } + + /// Provide begin/end pointers for C++11 range-based for loops. + NV_FORCEINLINE T * begin() { return m_buffer; } + NV_FORCEINLINE T * end() { return m_buffer + m_size; } + NV_FORCEINLINE const T * begin() const { return m_buffer; } + NV_FORCEINLINE const T * end() const { return m_buffer + m_size; } + + /// Is vector empty. + NV_FORCEINLINE bool isEmpty() const { return m_size == 0; } + + /// Is a null vector. + NV_FORCEINLINE bool isNull() const { return m_buffer == NULL; } + + + T & append(); + void push_back( const T & val ); + void pushBack( const T & val ); + Array & append( const T & val ); + Array & operator<< ( T & t ); + void pop_back(); + void popBack(uint count = 1); + void popFront(uint count = 1); + const T & back() const; + T & back(); + const T & front() const; + T & front(); + bool contains(const T & e) const; + bool find(const T & element, uint * indexPtr) const; + bool find(const T & element, uint begin, uint end, uint * indexPtr) const; + void removeAt(uint index); + bool remove(const T & element); + void insertAt(uint index, const T & val = T()); + void append(const Array & other); + void append(const T other[], uint count); + void replaceWithLast(uint index); + void resize(uint new_size); + void resize(uint new_size, const T & elem); + void fill(const T & elem); + void clear(); + void shrink(); + void reserve(uint desired_size); + void copy(const T * data, uint count); + Array & operator=( const Array & a ); + T * release(); + + + // Array enumerator. + typedef uint PseudoIndex; + + NV_FORCEINLINE PseudoIndex start() const { return 0; } + NV_FORCEINLINE bool isDone(const PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); return i == this->m_size; } + NV_FORCEINLINE void advance(PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); i++; } + +#if NV_CC_MSVC + NV_FORCEINLINE T & operator[]( const PseudoIndexWrapper & i ) { + return m_buffer[i(this)]; + } + NV_FORCEINLINE const T & operator[]( const PseudoIndexWrapper & i ) const { + return m_buffer[i(this)]; + } +#endif + + // Friends. 
+ template <typename Typ> + friend Stream & operator<< ( Stream & s, Array<Typ> & p ); + + template <typename Typ> + friend void swap(Array<Typ> & a, Array<Typ> & b); + + + protected: + + void setArraySize(uint new_size); + void setArrayCapacity(uint new_capacity); + + T * m_buffer; + uint m_capacity; + uint m_size; + + }; + + +} // nv namespace + +#endif // NV_CORE_ARRAY_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Array.inl =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Array.inl +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Array.inl @@ -0,0 +1,438 @@ +// This code is in the public domain -- Ignacio Castaño + +#pragma once +#ifndef NV_CORE_ARRAY_INL +#define NV_CORE_ARRAY_INL + +#include "Array.h" + +#include "Stream.h" +#include "Utils.h" // swap + +#include <string.h> // memmove +#include <new> // for placement new + + + +namespace nv +{ + template <typename T> + NV_FORCEINLINE T & Array<T>::append() + { + uint old_size = m_size; + uint new_size = m_size + 1; + + setArraySize(new_size); + + construct_range(m_buffer, new_size, old_size); + + return m_buffer[old_size]; // Return reference to last element. + } + + // Push an element at the end of the vector. + template <typename T> + NV_FORCEINLINE void Array<T>::push_back( const T & val ) + { +#if 1 + nvDebugCheck(&val < m_buffer || &val >= m_buffer+m_size); + + uint old_size = m_size; + uint new_size = m_size + 1; + + setArraySize(new_size); + + construct_range(m_buffer, new_size, old_size, val); +#else + uint new_size = m_size + 1; + + if (new_size > m_capacity) + { + // @@ Is there any way to avoid this copy? + // @@ Can we create a copy without side effects? Ie. without calls to constructor/destructor. Use alloca + memcpy? + // @@ Assert instead of copy? + const T copy(val); // create a copy in case value is inside of this array. + + setArraySize(new_size); + + new (m_buffer+new_size-1) T(copy); + } + else + { + m_size = new_size; + new(m_buffer+new_size-1) T(val); + } +#endif // 0/1 + } + template <typename T> + NV_FORCEINLINE void Array<T>::pushBack( const T & val ) + { + push_back(val); + } + template <typename T> + NV_FORCEINLINE Array<T> & Array<T>::append( const T & val ) + { + push_back(val); + return *this; + } + + // Qt like push operator. + template <typename T> + NV_FORCEINLINE Array<T> & Array<T>::operator<< ( T & t ) + { + push_back(t); + return *this; + } + + // Pop the element at the end of the vector. + template <typename T> + NV_FORCEINLINE void Array<T>::pop_back() + { + nvDebugCheck( m_size > 0 ); + resize( m_size - 1 ); + } + template <typename T> + NV_FORCEINLINE void Array<T>::popBack(uint count) + { + nvDebugCheck(m_size >= count); + resize(m_size - count); + } + + template <typename T> + NV_FORCEINLINE void Array<T>::popFront(uint count) + { + nvDebugCheck(m_size >= count); + //resize(m_size - count); + + if (m_size == count) { + clear(); + } + else { + destroy_range(m_buffer, 0, count); + + memmove(m_buffer, m_buffer + count, sizeof(T) * (m_size - count)); + + m_size -= count; + } + + } + + + // Get back element. + template <typename T> + NV_FORCEINLINE const T & Array<T>::back() const + { + nvDebugCheck( m_size > 0 ); + return m_buffer[m_size-1]; + } + + // Get back element. + template <typename T> + NV_FORCEINLINE T & Array<T>::back() + { + nvDebugCheck( m_size > 0 ); + return m_buffer[m_size-1]; + } + + // Get front element. + template <typename T> + NV_FORCEINLINE const T & Array<T>::front() const + { + nvDebugCheck( m_size > 0 ); + return m_buffer[0]; + } + + // Get front element. + template <typename T> + NV_FORCEINLINE T & Array<T>::front() + { + nvDebugCheck( m_size > 0 ); + return m_buffer[0]; + } + + // Check if the given element is contained in the array.
+ template <typename T> + NV_FORCEINLINE bool Array<T>::contains(const T & e) const + { + return find(e, NULL); + } + + // Return true if element found. + template <typename T> + NV_FORCEINLINE bool Array<T>::find(const T & element, uint * indexPtr) const + { + return find(element, 0, m_size, indexPtr); + } + + // Return true if element found within the given range. + template <typename T> + NV_FORCEINLINE bool Array<T>::find(const T & element, uint begin, uint end, uint * indexPtr) const + { + return ::nv::find(element, m_buffer, begin, end, indexPtr); + } + + + // Remove the element at the given index. This is an expensive operation! + template <typename T> + void Array<T>::removeAt(uint index) + { + nvDebugCheck(index >= 0 && index < m_size); + + if (m_size == 1) { + clear(); + } + else { + m_buffer[index].~T(); + + memmove(m_buffer+index, m_buffer+index+1, sizeof(T) * (m_size - 1 - index)); + m_size--; + } + } + + // Remove the first instance of the given element. + template <typename T> + bool Array<T>::remove(const T & element) + { + uint index; + if (find(element, &index)) { + removeAt(index); + return true; + } + return false; + } + + // Insert the given element at the given index shifting all the elements up. + template <typename T> + void Array<T>::insertAt(uint index, const T & val/*=T()*/) + { + nvDebugCheck( index >= 0 && index <= m_size ); + + setArraySize(m_size + 1); + + if (index < m_size - 1) { + memmove(m_buffer+index+1, m_buffer+index, sizeof(T) * (m_size - 1 - index)); + } + + // Copy-construct into the newly opened slot. + new(m_buffer+index) T(val); + } + + // Append the given data to our vector. + template <typename T> + NV_FORCEINLINE void Array<T>::append(const Array<T> & other) + { + append(other.m_buffer, other.m_size); + } + + // Append the given data to our vector. + template <typename T> + void Array<T>::append(const T other[], uint count) + { + if (count > 0) { + const uint old_size = m_size; + + setArraySize(m_size + count); + + for (uint i = 0; i < count; i++ ) { + new(m_buffer + old_size + i) T(other[i]); + } + } + } + + + // Remove the given element by replacing it with the last one. + template <typename T> + void Array<T>::replaceWithLast(uint index) + { + nvDebugCheck( index < m_size ); + nv::swap(m_buffer[index], back()); // @@ Is this OK when index == size-1? + (m_buffer+m_size-1)->~T(); + m_size--; + } + + // Resize the vector preserving existing elements. + template <typename T> + void Array<T>::resize(uint new_size) + { + uint old_size = m_size; + + // Destruct old elements (if we're shrinking). + destroy_range(m_buffer, new_size, old_size); + + setArraySize(new_size); + + // Call default constructors + construct_range(m_buffer, new_size, old_size); + } + + + // Resize the vector preserving existing elements and initializing the + // new ones with the given value. + template <typename T> + void Array<T>::resize(uint new_size, const T & elem) + { + nvDebugCheck(&elem < m_buffer || &elem > m_buffer+m_size); + + uint old_size = m_size; + + // Destruct old elements (if we're shrinking). + destroy_range(m_buffer, new_size, old_size); + + setArraySize(new_size); + + // Call copy constructors + construct_range(m_buffer, new_size, old_size, elem); + } + + // Fill array with the given value. + template <typename T> + void Array<T>::fill(const T & elem) + { + fill(m_buffer, m_size, elem); + } + + // Clear the buffer. + template <typename T> + NV_FORCEINLINE void Array<T>::clear() + { + nvDebugCheck(isValidPtr(m_buffer)); + + // Destruct old elements + destroy_range(m_buffer, 0, m_size); + + m_size = 0; + } + + // Shrink the allocated vector.
+ template <typename T> + NV_FORCEINLINE void Array<T>::shrink() + { + if (m_size < m_capacity) { + setArrayCapacity(m_size); + } + } + + // Preallocate space. + template <typename T> + NV_FORCEINLINE void Array<T>::reserve(uint desired_size) + { + if (desired_size > m_capacity) { + setArrayCapacity(desired_size); + } + } + + // Copy elements to this array. Resizes it if needed. + template <typename T> + NV_FORCEINLINE void Array<T>::copy(const T * data, uint count) + { +#if 1 // More simple, but maybe not be as efficient? + destroy_range(m_buffer, 0, m_size); + + setArraySize(count); + + construct_range(m_buffer, count, 0, data); +#else + const uint old_size = m_size; + + destroy_range(m_buffer, count, old_size); + + setArraySize(count); + + copy_range(m_buffer, data, old_size); + + construct_range(m_buffer, count, old_size, data); +#endif + } + + // Assignment operator. + template <typename T> + NV_FORCEINLINE Array<T> & Array<T>::operator=( const Array<T> & a ) + { + copy(a.m_buffer, a.m_size); + return *this; + } + + // Release ownership of allocated memory and returns pointer to it. + template <typename T> + T * Array<T>::release() { + T * tmp = m_buffer; + m_buffer = NULL; + m_capacity = 0; + m_size = 0; + return tmp; + } + + + + // Change array size. + template <typename T> + inline void Array<T>::setArraySize(uint new_size) { + m_size = new_size; + + if (new_size > m_capacity) { + uint new_buffer_size; + if (m_capacity == 0) { + // first allocation is exact + new_buffer_size = new_size; + } + else { + // following allocations grow array by 25% + new_buffer_size = new_size + (new_size >> 2); + } + + setArrayCapacity( new_buffer_size ); + } + } + + // Change array capacity. + template <typename T> + inline void Array<T>::setArrayCapacity(uint new_capacity) { + nvDebugCheck(new_capacity >= m_size); + + if (new_capacity == 0) { + // free the buffer. + if (m_buffer != NULL) { + free(m_buffer); + m_buffer = NULL; + } + } + else { + // realloc the buffer + m_buffer = realloc(m_buffer, new_capacity); + } + + m_capacity = new_capacity; + } + + // Array serialization. + template <typename T> + inline Stream & operator<< ( Stream & s, Array<T> & p ) + { + if (s.isLoading()) { + uint size; + s << size; + p.resize( size ); + } + else { + s << p.m_size; + } + + for (uint i = 0; i < p.m_size; i++) { + s << p.m_buffer[i]; + } + + return s; + } + + // Swap the members of the two given vectors. + template <typename T> + inline void swap(Array<T> & a, Array<T> & b) + { + nv::swap(a.m_buffer, b.m_buffer); + nv::swap(a.m_capacity, b.m_capacity); + nv::swap(a.m_size, b.m_size); + } + + +} // nv namespace + +#endif // NV_CORE_ARRAY_INL Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/BitArray.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/BitArray.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/BitArray.h @@ -1,168 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#ifndef NV_CORE_BITARRAY_H -#define NV_CORE_BITARRAY_H - -#include <nvcore/nvcore.h> -#include <nvcore/Containers.h> - -namespace nv -{ - -/// Count the bits of @a x. -inline uint bitsSet(uint8 x) { - uint count = 0; - for(; x != 0; x >>= 1) { - count += (x & 1); - } - return count; -} - - -/// Count the bits of @a x. -inline uint bitsSet(uint32 x, int bits) { - uint count = 0; - for(; x != 0 && bits != 0; x >>= 1, bits--) { - count += (x & 1); - } - return count; -} - - -/// Simple bit array. -class BitArray -{ -public: - - /// Default ctor. - BitArray() {} - - /// Ctor with initial m_size. - BitArray(uint sz) - { - resize(sz); - } - - /// Get array m_size. - uint size() const { return m_size; } - - /// Clear array m_size.
- void clear() { resize(0); } - - /// Set array m_size. - void resize(uint sz) - { - m_size = sz; - m_bitArray.resize( (m_size + 7) >> 3 ); - } - - /// Get bit. - bool bitAt(uint b) const - { - nvDebugCheck( b < m_size ); - return (m_bitArray[b >> 3] & (1 << (b & 7))) != 0; - } - - /// Set a bit. - void setBitAt(uint b) - { - nvDebugCheck( b < m_size ); - m_bitArray[b >> 3] |= (1 << (b & 7)); - } - - /// Clear a bit. - void clearBitAt( uint b ) - { - nvDebugCheck( b < m_size ); - m_bitArray[b >> 3] &= ~(1 << (b & 7)); - } - - /// Clear all the bits. - void clearAll() - { - memset(m_bitArray.unsecureBuffer(), 0, m_bitArray.size()); - } - - /// Set all the bits. - void setAll() - { - memset(m_bitArray.unsecureBuffer(), 0xFF, m_bitArray.size()); - } - - /// Toggle all the bits. - void toggleAll() - { - const uint byte_num = m_bitArray.size(); - for(uint b = 0; b < byte_num; b++) { - m_bitArray[b] ^= 0xFF; - } - } - - /// Get a byte of the bit array. - const uint8 & byteAt(uint index) const - { - return m_bitArray[index]; - } - - /// Set the given byte of the byte array. - void setByteAt(uint index, uint8 b) - { - m_bitArray[index] = b; - } - - /// Count the number of bits set. - uint countSetBits() const - { - const uint num = m_bitArray.size(); - if( num == 0 ) { - return 0; - } - - uint count = 0; - for(uint i = 0; i < num - 1; i++) { - count += bitsSet(m_bitArray[i]); - } - count += bitsSet(m_bitArray[num-1], m_size & 0x7); - - //piDebugCheck(count + countClearBits() == m_size); - return count; - } - - /// Count the number of bits clear. - uint countClearBits() const { - - const uint num = m_bitArray.size(); - if( num == 0 ) { - return 0; - } - - uint count = 0; - for(uint i = 0; i < num - 1; i++) { - count += bitsSet(~m_bitArray[i]); - } - count += bitsSet(~m_bitArray[num-1], m_size & 0x7); - - //piDebugCheck(count + countSetBits() == m_size); - return count; - } - - friend void swap(BitArray & a, BitArray & b) - { - swap(a.m_size, b.m_size); - swap(a.m_bitArray, b.m_bitArray); - } - - -private: - - /// Number of bits stored. - uint m_size; - - /// Array of bits. 
- Array m_bitArray; - -}; - -} // nv namespace - -#endif // _PI_CORE_BITARRAY_H_ Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/CMakeLists.txt =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/CMakeLists.txt +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/CMakeLists.txt @@ -1,27 +1,24 @@ PROJECT(nvcore) -ADD_SUBDIRECTORY(poshlib) SET(CORE_SRCS - nvcore.h - Ptr.h - BitArray.h - Memory.h - Memory.cpp - Debug.h - Debug.cpp - Containers.h - StrLib.h - StrLib.cpp - Stream.h - StdStream.h - TextReader.h - TextReader.cpp - TextWriter.h - TextWriter.cpp - Radix.h - Radix.cpp - Library.h - Library.cpp) + nvcore.h + Array.h + Debug.h Debug.cpp + DefsGnucDarwin.h + DefsGnucLinux.h + DefsGnucWin32.h + DefsVcWin32.h + FileSystem.h FileSystem.cpp + ForEach.h + Memory.h Memory.cpp + Ptr.h + RefCounted.h + StrLib.h StrLib.cpp + Stream.h + StdStream.h + TextWriter.h TextWriter.cpp + Timer.h Timer.cpp + Utils.h) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) @@ -29,19 +26,24 @@ ADD_DEFINITIONS(-DNVCORE_EXPORTS) IF(UNIX) - SET(LIBS ${LIBS} ${CMAKE_DL_LIBS}) + SET(LIBS ${LIBS} ${CMAKE_DL_LIBS}) ENDIF(UNIX) IF(NVCORE_SHARED) - ADD_DEFINITIONS(-DNVCORE_SHARED=1) - ADD_LIBRARY(nvcore SHARED ${CORE_SRCS}) + ADD_DEFINITIONS(-DNVCORE_SHARED=1) + ADD_LIBRARY(nvcore SHARED ${CORE_SRCS}) ELSE(NVCORE_SHARED) - ADD_LIBRARY(nvcore ${CORE_SRCS}) + ADD_LIBRARY(nvcore ${CORE_SRCS}) ENDIF(NVCORE_SHARED) TARGET_LINK_LIBRARIES(nvcore ${LIBS}) +# On NetBSD and FreeBSD backtrace() is provided by libexecinfo, not libc. +if (CMAKE_SYSTEM_NAME MATCHES "NetBSD" OR CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + TARGET_LINK_LIBRARIES(nvcore execinfo) +endif() + INSTALL(TARGETS nvcore - RUNTIME DESTINATION ${BINDIR} - LIBRARY DESTINATION ${LIBDIR} - ARCHIVE DESTINATION ${LIBDIR}) + RUNTIME DESTINATION ${BINDIR} + LIBRARY DESTINATION ${LIBDIR} + ARCHIVE DESTINATION ${LIBDIR}) Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Containers.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Containers.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Containers.h @@ -1,1059 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#ifndef NV_CORE_CONTAINER_H -#define NV_CORE_CONTAINER_H - -/* -These containers are based on Thatcher Ulrich containers, -donated to the Public Domain. - -I've also borrowed some ideas from the Qt toolkit, specially the cool -foreach iterator. - -TODO -Do not use memmove in insert & remove, use copy ctors instead. -*/ - - -// nvcore -#include "nvcore.h" -#include "Memory.h" -#include "Debug.h" - -#include // memmove -#include // for placement new - - -#if NV_CC_GNUC // If typeof is available: - -#define NV_FOREACH(i, container) \ - typedef typeof(container) NV_STRING_JOIN2(cont,__LINE__); \ - for(NV_STRING_JOIN2(cont,__LINE__)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i)) -/* -#define NV_FOREACH(i, container) \ - for(typename typeof(container)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i)) -*/ - -#else // If typeof not available: - -struct PseudoIndexWrapper { - template - PseudoIndexWrapper(const T & container) { - nvStaticCheck(sizeof(typename T::PseudoIndex) <= sizeof(memory)); - new (memory) typename T::PseudoIndex(container.start()); - } - // PseudoIndex cannot have a dtor! 
- - template typename T::PseudoIndex & operator()(const T * container) { - return *reinterpret_cast(memory); - } - template const typename T::PseudoIndex & operator()(const T * container) const { - return *reinterpret_cast(memory); - } - - uint8 memory[4]; // Increase the size if we have bigger enumerators. -}; - -#define NV_FOREACH(i, container) \ - for(PseudoIndexWrapper i(container); !(container).isDone(i(&(container))); (container).advance(i(&(container)))) - -#endif - -// Declare foreach keyword. -#if !defined NV_NO_USE_KEYWORDS -# define foreach NV_FOREACH -#endif - - - -namespace nv -{ - // Templates - - /// Return the maximum of two values. - template - inline const T & max(const T & a, const T & b) - { - //return std::max(a, b); - if( a < b ) { - return b; - } - return a; - } - - /// Return the minimum of two values. - template - inline const T & min(const T & a, const T & b) - { - //return std::min(a, b); - if( b < a ) { - return b; - } - return a; - } - - /// Clamp between two values. - template - inline const T & clamp(const T & x, const T & a, const T & b) - { - return min(max(x, a), b); - } - - /// Swap two values. - template - inline void swap(T & a, T & b) - { - //return std::swap(a, b); - T temp = a; - a = b; - b = temp; - } - - template struct hash - { - inline uint sdbm_hash(const void * data_in, uint size, uint h = 5381) - { - const uint8 * data = (const uint8 *) data_in; - uint i = 0; - while (i < size) { - h = (h << 16) + (h << 6) - h + (uint) data[i++]; - } - return h; - } - - uint operator()(const Key & k) { - return sdbm_hash(&k, sizeof(Key)); - } - }; - template <> struct hash - { - uint operator()(int x) const { return x; } - }; - template <> struct hash - { - uint operator()(uint x) const { return x; } - }; - - /// Delete all the elements of a container. - template - void deleteAll(T & container) - { - for(typename T::PseudoIndex i = container.start(); !container.isDone(i); container.advance(i)) - { - delete container[i]; - } - } - - - /** Return the next power of two. - * @see http://graphics.stanford.edu/~seander/bithacks.html - * @warning Behaviour for 0 is undefined. - * @note isPowerOfTwo(x) == true -> nextPowerOfTwo(x) == x - * @note nextPowerOfTwo(x) = 2 << log2(x-1) - */ - inline uint nextPowerOfTwo( uint x ) - { - nvDebugCheck( x != 0 ); - #if 1 // On modern CPUs this is as fast as using the bsr instruction. - x--; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - return x+1; - #else - uint p = 1; - while( x > p ) { - p += p; - } - return p; - #endif - } - - /// Return true if @a n is a power of two. - inline bool isPowerOfTwo( uint n ) - { - return (n & (n-1)) == 0; - } - - /// Simple iterator interface. - template - struct Iterator - { - virtual void advance(); - virtual bool isDone(); - virtual T current(); - }; - - - /** - * Replacement for std::vector that is easier to debug and provides - * some nice foreach enumerators. - */ - template - class NVCORE_CLASS Array { - public: - - /// Ctor. - Array() : m_buffer(NULL), m_size(0), m_buffer_size(0) - { - } - - /// Copy ctor. - Array( const Array & a ) : m_buffer(NULL), m_size(0), m_buffer_size(0) - { - copy(a.m_buffer, a.m_size); - } - - /// Ctor that initializes the vector with the given elements. - Array( const T * ptr, int num ) : m_buffer(NULL), m_size(0), m_buffer_size(0) - { - copy(ptr, num); - } - - /// Allocate array. - explicit Array(uint capacity) : m_buffer(NULL), m_size(0), m_buffer_size(0) - { - allocate(capacity); - } - - - /// Dtor. 
- ~Array() - { - clear(); - allocate(0); - } - - - /// Const and save vector access. - const T & operator[]( uint index ) const - { - nvDebugCheck(index < m_size); - return m_buffer[index]; - } - - /// Safe vector access. - T & operator[] ( uint index ) - { - nvDebugCheck(index < m_size); - return m_buffer[index]; - } - - - /// Get vector size. - uint size() const { return m_size; } - - /// Get vector size. - uint count() const { return m_size; } - - /// Get const vector pointer. - const T * buffer() const { return m_buffer; } - - /// Get vector pointer. - T * unsecureBuffer() { return m_buffer; } - - /// Is vector empty. - bool isEmpty() const { return m_size == 0; } - - /// Is a null vector. - bool isNull() const { return m_buffer == NULL; } - - - /// Push an element at the end of the vector. - void push_back( const T & val ) - { - uint new_size = m_size + 1; - - if (new_size > m_buffer_size) - { - const T copy(val); // create a copy in case value is inside of this array. - resize(new_size); - m_buffer[new_size-1] = copy; - } - else - { - m_size = new_size; - new(m_buffer+new_size-1) T(val); - } - } - void pushBack( const T & val ) - { - push_back(val); - } - void append( const T & val ) - { - push_back(val); - } - - /// Qt like push operator. - Array & operator<< ( T & t ) - { - push_back(t); - return *this; - } - - /// Pop and return element at the end of the vector. - void pop_back() - { - nvDebugCheck( m_size > 0 ); - resize( m_size - 1 ); - } - void popBack() - { - pop_back(); - } - - /// Get back element. - const T & back() const - { - nvDebugCheck( m_size > 0 ); - return m_buffer[m_size-1]; - } - - /// Get back element. - T & back() - { - nvDebugCheck( m_size > 0 ); - return m_buffer[m_size-1]; - } - - /// Get front element. - const T & front() const - { - nvDebugCheck( m_size > 0 ); - return m_buffer[0]; - } - - /// Get front element. - T & front() - { - nvDebugCheck( m_size > 0 ); - return m_buffer[0]; - } - - /// Check if the given element is contained in the array. - bool contains(const T & e) const - { - for (uint i = 0; i < m_size; i++) { - if (m_buffer[i] == e) return true; - } - return false; - } - - /// Remove the element at the given index. This is an expensive operation! - void removeAt( uint index ) - { - nvCheck(index >= 0 && index < m_size); - - if( m_size == 1 ) { - clear(); - } - else { - m_buffer[index].~T(); - - memmove( m_buffer+index, m_buffer+index+1, sizeof(T) * (m_size - 1 - index) ); - m_size--; - } - } - - /// Remove the first instance of the given element. - void remove(const T & element) - { - for(PseudoIndex i = start(); !isDone(i); advance(i)) { - removeAt(i); - break; - } - } - - /// Insert the given element at the given index shifting all the elements up. - void insertAt( uint index, const T & val = T() ) - { - nvCheck( index >= 0 && index <= m_size ); - - resize( m_size + 1 ); - - if( index < m_size - 1 ) { - memmove( m_buffer+index+1, m_buffer+index, sizeof(T) * (m_size - 1 - index) ); - } - - // Copy-construct into the newly opened slot. - new(m_buffer+index) T(val); - } - - /// Append the given data to our vector. - void append(const Array & other) - { - append(other.m_buffer, other.m_size); - } - - /// Append the given data to our vector. - void append(const T other[], uint count) - { - if( count > 0 ) { - const uint old_size = m_size; - resize(m_size + count); - // Must use operator=() to copy elements, in case of side effects (e.g. ref-counting). 
- for( uint i = 0; i < count; i++ ) { - m_buffer[old_size + i] = other[i]; - } - } - } - - - /// Remove the given element by replacing it with the last one. - void replaceWithLast(uint index) - { - nvDebugCheck( index < m_size ); - m_buffer[index] = back(); - (m_buffer+m_size-1)->~T(); - m_size--; - } - - - /// Resize the vector preserving existing elements. - void resize(uint new_size) - { - uint i; - uint old_size = m_size; - m_size = new_size; - - // Destruct old elements (if we're shrinking). - for( i = new_size; i < old_size; i++ ) { - (m_buffer+i)->~T(); // Explicit call to the destructor - } - - if( m_size == 0 ) { - //Allocate(0); // Don't shrink automatically. - } - else if( m_size <= m_buffer_size/* && m_size > m_buffer_size >> 1*/) { - // don't compact yet. - nvDebugCheck(m_buffer != NULL); - } - else { - uint new_buffer_size; - if( m_buffer_size == 0 ) { - // first allocation - new_buffer_size = m_size; - } - else { - // growing - new_buffer_size = m_size + (m_size >> 2); - } - allocate( new_buffer_size ); - } - - // Call default constructors - for( i = old_size; i < new_size; i++ ) { - new(m_buffer+i) T; // placement new - } - } - - - /// Resize the vector preserving existing elements and initializing the - /// new ones with the given value. - void resize( uint new_size, const T &elem ) - { - uint i; - uint old_size = m_size; - m_size = new_size; - - // Destruct old elements (if we're shrinking). - for( i = new_size; i < old_size; i++ ) { - (m_buffer+i)->~T(); // Explicit call to the destructor - } - - if( m_size == 0 ) { - //Allocate(0); // Don't shrink automatically. - } - else if( m_size <= m_buffer_size && m_size > m_buffer_size >> 1 ) { - // don't compact yet. - } - else { - uint new_buffer_size; - if( m_buffer_size == 0 ) { - // first allocation - new_buffer_size = m_size; - } - else { - // growing - new_buffer_size = m_size + (m_size >> 2); - } - allocate( new_buffer_size ); - } - - // Call copy constructors - for( i = old_size; i < new_size; i++ ) { - new(m_buffer+i) T( elem ); // placement new - } - } - - /// Tighten the memory used by the container. - void tighten() - { - // TODO Reallocate only if worth. - } - - /// Clear the buffer. - void clear() - { - resize(0); - } - - /// Shrink the allocated vector. - void shrink() - { - if( m_size < m_buffer_size ) { - allocate(m_size); - } - } - - /// Preallocate space. - void reserve(uint desired_size) - { - if( desired_size > m_buffer_size ) { - allocate( desired_size ); - } - } - - /// Copy memory to our vector. Resizes the vector if needed. - void copy( const T * ptr, uint num ) - { - resize( num ); - for(uint i = 0; i < m_size; i++) { - m_buffer[i] = ptr[i]; - } - } - - /// Assignment operator. - void operator=( const Array & a ) - { - copy( a.m_buffer, a.m_size ); - } - - /* - /// Array serialization. - friend Stream & operator<< ( Stream & s, Array & p ) - { - if( s.isLoading() ) { - uint size; - s << size; - p.resize( size ); - } - else { - s << p.m_size; - } - - for( uint i = 0; i < p.m_size; i++ ) { - s << p.m_buffer[i]; - } - - return s; - } - */ - - // Array enumerator. 
- typedef uint PseudoIndex; - - PseudoIndex start() const { return 0; } - bool isDone(const PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); return i == this->m_size; }; - void advance(PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); i++; } - - #if NV_CC_MSVC - T & operator[]( const PseudoIndexWrapper & i ) { - return m_buffer[i(this)]; - } - const T & operator[]( const PseudoIndexWrapper & i ) const { - return m_buffer[i(this)]; - } - #endif - - - /// Swap the members of this vector and the given vector. - friend void swap(Array & a, Array & b) - { - swap(a.m_buffer, b.m_buffer); - swap(a.m_size, b.m_size); - swap(a.m_buffer_size, b.m_buffer_size); - } - - - private: - - /// Change buffer size. - void allocate( uint rsize ) - { - m_buffer_size = rsize; - - // free the buffer. - if( m_buffer_size == 0 ) { - if( m_buffer ) { - free( m_buffer ); - m_buffer = NULL; - } - } - - // realloc the buffer - else { - if( m_buffer ) m_buffer = (T *) realloc(m_buffer, sizeof(T) * m_buffer_size); - else m_buffer = (T *) ::malloc(sizeof(T) * m_buffer_size); - } - } - - - private: - T * m_buffer; - uint m_size; - uint m_buffer_size; - }; - - - - /** Thatcher Ulrich's hash table. - * - * Hash table, linear probing, internal chaining. One - * interesting/nice thing about this implementation is that the table - * itself is a flat chunk of memory containing no pointers, only - * relative indices. If the key and value types of the hash contain - * no pointers, then the hash can be serialized using raw IO. Could - * come in handy. - * - * Never shrinks, unless you explicitly clear() it. Expands on - * demand, though. For best results, if you know roughly how big your - * table will be, default it to that size when you create it. - */ - template > - class NVCORE_CLASS HashMap - { - NV_FORBID_COPY(HashMap) - public: - - /// Default ctor. - HashMap() : entry_count(0), size_mask(-1), table(NULL) { } - - /// Ctor with size hint. - explicit HashMap(int size_hint) : entry_count(0), size_mask(-1), table(NULL) { setCapacity(size_hint); } - - /// Dtor. - ~HashMap() { clear(); } - - - /// Set a new or existing value under the key, to the value. - void set(const T& key, const U& value) - { - int index = findIndex(key); - if (index >= 0) - { - E(index).value = value; - return; - } - - // Entry under key doesn't exist. - add(key, value); - } - - - /// Add a new value to the hash table, under the specified key. - void add(const T& key, const U& value) - { - nvCheck(findIndex(key) == -1); - - checkExpand(); - nvCheck(table != NULL); - entry_count++; - - const uint hash_value = hash_functor()(key); - const int index = hash_value & size_mask; - - Entry * natural_entry = &(E(index)); - - if (natural_entry->isEmpty()) - { - // Put the new entry in. - new (natural_entry) Entry(key, value, -1, hash_value); - } - else - { - // Find a blank spot. - int blank_index = index; - for (;;) - { - blank_index = (blank_index + 1) & size_mask; - if (E(blank_index).isEmpty()) break; // found it - } - Entry * blank_entry = &E(blank_index); - - if (int(natural_entry->hash_value & size_mask) == index) - { - // Collision. Link into this chain. - - // Move existing list head. - new (blank_entry) Entry(*natural_entry); // placement new, copy ctor - - // Put the new info in the natural entry. - natural_entry->key = key; - natural_entry->value = value; - natural_entry->next_in_chain = blank_index; - natural_entry->hash_value = hash_value; - } - else - { - // Existing entry does not naturally - // belong in this slot. 
Existing - // entry must be moved. - - // Find natural location of collided element (i.e. root of chain) - int collided_index = natural_entry->hash_value & size_mask; - for (;;) - { - Entry * e = &E(collided_index); - if (e->next_in_chain == index) - { - // Here's where we need to splice. - new (blank_entry) Entry(*natural_entry); - e->next_in_chain = blank_index; - break; - } - collided_index = e->next_in_chain; - nvCheck(collided_index >= 0 && collided_index <= size_mask); - } - - // Put the new data in the natural entry. - natural_entry->key = key; - natural_entry->value = value; - natural_entry->hash_value = hash_value; - natural_entry->next_in_chain = -1; - } - } - } - - - /// Remove the first value under the specified key. - bool remove(const T& key) - { - if (table == NULL) - { - return false; - } - - int index = findIndex(key); - if (index < 0) - { - return false; - } - - Entry * entry = &E(index); - - if( entry->isEndOfChain() ) { - entry->clear(); - } - else { - // Get next entry. - Entry & next_entry = E(entry->next_in_chain); - - // Copy next entry in this place. - new (entry) Entry(next_entry); - - next_entry.clear(); - } - - entry_count--; - - return true; - } - - - /// Remove all entries from the hash table. - void clear() - { - if (table != NULL) - { - // Delete the entries. - for (int i = 0, n = size_mask; i <= n; i++) - { - Entry * e = &E(i); - if (e->isEmpty() == false) - { - e->clear(); - } - } - free(table); - table = NULL; - entry_count = 0; - size_mask = -1; - } - } - - - /// Returns true if the hash is empty. - bool isEmpty() const - { - return table == NULL || entry_count == 0; - } - - - /** Retrieve the value under the given key. - * - * If there's no value under the key, then return false and leave - * *value alone. - * - * If there is a value, return true, and set *value to the entry's - * value. - * - * If value == NULL, return true or false according to the - * presence of the key, but don't touch *value. - */ - bool get(const T& key, U* value = NULL) const - { - int index = findIndex(key); - if (index >= 0) - { - if (value) { - *value = E(index).value; // take care with side-effects! - } - return true; - } - return false; - } - - /// Determine if the given key is contained in the hash. - bool contains(const T & key) const - { - return get(key); - } - - /// Number of entries in the hash. - int size() const - { - return entry_count; - } - - /// Number of entries in the hash. - int count() const - { - return size(); - } - - - /** - * Resize the hash table to fit one more entry. Often this - * doesn't involve any action. - */ - void checkExpand() - { - if (table == NULL) { - // Initial creation of table. Make a minimum-sized table. - setRawCapacity(16); - } - else if (entry_count * 3 > (size_mask + 1) * 2) { - // Table is more than 2/3rds full. Expand. - setRawCapacity(entry_count * 2); - } - } - - - /// Hint the bucket count to >= n. - void resize(int n) - { - // Not really sure what this means in relation to - // STLport's hash_map... they say they "increase the - // bucket count to at least n" -- but does that mean - // their real capacity after resize(n) is more like - // n*2 (since they do linked-list chaining within - // buckets?). - setCapacity(n); - } - - /** - * Size the hash so that it can comfortably contain the given - * number of elements. If the hash already contains more - * elements than new_size, then this may be a no-op. 
- */ - void setCapacity(int new_size) - { - int new_raw_size = (new_size * 3) / 2; - if (new_raw_size < size()) { return; } - - setRawCapacity(new_raw_size); - } - - /// Behaves much like std::pair. - struct Entry - { - int next_in_chain; // internal chaining for collisions - uint hash_value; // avoids recomputing. Worthwhile? - T key; - U value; - - Entry() : next_in_chain(-2) {} - Entry(const Entry& e) - : next_in_chain(e.next_in_chain), hash_value(e.hash_value), key(e.key), value(e.value) - { - } - Entry(const T& k, const U& v, int next, int hash) - : next_in_chain(next), hash_value(hash), key(k), value(v) - { - } - bool isEmpty() const { return next_in_chain == -2; } - bool isEndOfChain() const { return next_in_chain == -1; } - - void clear() - { - key.~T(); // placement delete - value.~U(); // placement delete - next_in_chain = -2; - } - }; - - - // HashMap enumerator. - typedef int PseudoIndex; - PseudoIndex start() const { PseudoIndex i = 0; findNext(i); return i; } - bool isDone(const PseudoIndex & i) const { nvDebugCheck(i <= size_mask+1); return i == size_mask+1; }; - void advance(PseudoIndex & i) const { nvDebugCheck(i <= size_mask+1); i++; findNext(i); } - - #if NV_CC_GNUC - Entry & operator[]( const PseudoIndex & i ) { - return E(i); - } - const Entry & operator[]( const PseudoIndex & i ) const { - return E(i); - } - #elif NV_CC_MSVC - Entry & operator[]( const PseudoIndexWrapper & i ) { - return E(i(this)); - } - const Entry & operator[]( const PseudoIndexWrapper & i ) const { - return E(i(this)); - } - #endif - - - - private: - - // Find the index of the matching entry. If no match, then return -1. - int findIndex(const T& key) const - { - if (table == NULL) return -1; - - uint hash_value = hash_functor()(key); - int index = hash_value & size_mask; - - const Entry * e = &E(index); - if (e->isEmpty()) return -1; - if (int(e->hash_value & size_mask) != index) return -1; // occupied by a collider - - for (;;) - { - nvCheck((e->hash_value & size_mask) == (hash_value & size_mask)); - - if (e->hash_value == hash_value && e->key == key) - { - // Found it. - return index; - } - nvDebugCheck(! (e->key == key)); // keys are equal, but hash differs! - - // Keep looking through the chain. - index = e->next_in_chain; - if (index == -1) break; // end of chain - - nvCheck(index >= 0 && index <= size_mask); - e = &E(index); - - nvCheck(e->isEmpty() == false); - } - return -1; - } - - // Helpers. - Entry & E(int index) - { - nvDebugCheck(table != NULL); - nvDebugCheck(index >= 0 && index <= size_mask); - return table[index]; - } - const Entry & E(int index) const - { - nvDebugCheck(table != NULL); - nvDebugCheck(index >= 0 && index <= size_mask); - return table[index]; - } - - - /** - * Resize the hash table to the given size (Rehash the - * contents of the current table). The arg is the number of - * hash table entries, not the number of elements we should - * actually contain (which will be less than this). - */ - void setRawCapacity(int new_size) - { - if (new_size <= 0) { - // Special case. - clear(); - return; - } - - // Force new_size to be a power of two. 
- new_size = nextPowerOfTwo(new_size); - - HashMap new_hash; - new_hash.table = (Entry *) ::malloc(sizeof(Entry) * new_size); - nvDebugCheck(new_hash.table != NULL); - - new_hash.entry_count = 0; - new_hash.size_mask = new_size - 1; - for (int i = 0; i < new_size; i++) - { - new_hash.E(i).next_in_chain = -2; // mark empty - } - - // Copy stuff to new_hash - if (table != NULL) - { - for (int i = 0, n = size_mask; i <= n; i++) - { - Entry * e = &E(i); - if (e->isEmpty() == false) - { - // Insert old entry into new hash. - new_hash.add(e->key, e->value); - e->clear(); // placement delete of old element - } - } - - // Delete our old data buffer. - free(table); - } - - // Steal new_hash's data. - entry_count = new_hash.entry_count; - size_mask = new_hash.size_mask; - table = new_hash.table; - new_hash.entry_count = 0; - new_hash.size_mask = -1; - new_hash.table = NULL; - } - - // Move the enumerator to the next valid element. - void findNext(PseudoIndex & i) const { - while (i <= size_mask && E(i).isEmpty()) { - i++; - } - } - - - int entry_count; - int size_mask; - Entry * table; - - }; - - - -} // nv namespace - -#endif // NV_CORE_CONTAINER_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Debug.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Debug.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Debug.h @@ -1,131 +1,217 @@ -// This code is in the public domain -- castanyo@yahoo.es +// This code is in the public domain -- Ignacio Castaño +#pragma once #ifndef NV_CORE_DEBUG_H #define NV_CORE_DEBUG_H -#include +#include "nvcore.h" + +#include // va_list -#if defined(HAVE_STDARG_H) -# include // va_list -#endif -#define NV_ABORT_DEBUG 1 -#define NV_ABORT_IGNORE 2 -#define NV_ABORT_EXIT 3 +// Make sure we are using our assert. +#undef assert -#undef assert // avoid conflicts with assert method. +#define NV_ABORT_DEBUG 1 +#define NV_ABORT_IGNORE 2 +#define NV_ABORT_EXIT 3 #define nvNoAssert(exp) \ - do { \ - (void)sizeof(exp); \ - } while(0) + NV_MULTI_LINE_MACRO_BEGIN \ + (void)sizeof(exp); \ + NV_MULTI_LINE_MACRO_END #if NV_NO_ASSERT -# define nvAssert(exp) nvNoAssert(exp) -# define nvCheck(exp) nvNoAssert(exp) -# define nvDebugAssert(exp) nvNoAssert(exp) -# define nvDebugCheck(exp) nvNoAssert(exp) -# define nvDebugBreak() nvNoAssert(0) +# define nvAssert(exp) nvNoAssert(exp) +# define nvCheck(exp) nvNoAssert(exp) +# define nvDebugAssert(exp) nvNoAssert(exp) +# define nvDebugCheck(exp) nvNoAssert(exp) +# define nvDebugBreak() nvNoAssert(0) #else // NV_NO_ASSERT -# if NV_CC_MSVC - // @@ Does this work in msvc-6 and earlier? - // @@ Do I have to include ? 
-# define nvDebugBreak() __debugbreak() - // define nvDebugBreak() __asm int 3 -# elif NV_CC_GNUC && NV_CPU_PPC && NV_OS_DARWIN -# define nvDebugBreak() __asm__ volatile ("trap"); -# elif NV_CC_GNUC && NV_CPU_X86 && NV_OS_DARWIN -# define nvDebugBreak() __asm__ volatile ("int3"); -# elif NV_CC_GNUC && NV_CPU_X86 -# define nvDebugBreak() __asm__ ( "int %0" : :"I"(3) ) -# else -# include -# define nvDebugBreak() raise(SIGTRAP); - // define nvDebugBreak() *((int *)(0)) = 0 -# endif - -# define nvAssertMacro(exp) \ - do { \ - if(!(exp)) { \ - if( nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG ) { \ - nvDebugBreak(); \ - } \ - } \ - } while(false) - -# define nvAssert(exp) nvAssertMacro(exp) -# define nvCheck(exp) nvAssertMacro(exp) - -# if defined(_DEBUG) -# define nvDebugAssert(exp) nvAssertMacro(exp) -# define nvDebugCheck(exp) nvAssertMacro(exp) -# else // _DEBUG -# define nvDebugAssert(exp) nvNoAssert(exp) -# define nvDebugCheck(exp) nvNoAssert(exp) -# endif // _DEBUG +# if NV_CC_MSVC + // @@ Does this work in msvc-6 and earlier? +# define nvDebugBreak() __debugbreak() +//# define nvDebugBreak() __asm { int 3 } +# elif NV_OS_ORBIS +# define nvDebugBreak() __debugbreak() +# elif NV_CC_GNUC +# define nvDebugBreak() __builtin_trap() +# else +# error "No nvDebugBreak()!" +# endif + +/* +# elif NV_CC_GNUC || NV_CPU_PPC && NV_OS_DARWIN + // @@ Use __builtin_trap() on GCC +# define nvDebugBreak() __asm__ volatile ("trap") +# elif (NV_CC_GNUC || NV_CPU_X86 || NV_CPU_X86_64) && NV_OS_DARWIN +# define nvDebugBreak() __asm__ volatile ("int3") +# elif NV_CC_GNUC || NV_CPU_X86 || NV_CPU_X86_64 +# define nvDebugBreak() __asm__ ( "int %0" : :"I"(3) ) +# else +# include +# define nvDebugBreak() raise(SIGTRAP) +# endif +*/ + +#define nvDebugBreakOnce() \ + NV_MULTI_LINE_MACRO_BEGIN \ + static bool firstTime = true; \ + if (firstTime) { firstTime = false; nvDebugBreak(); } \ + NV_MULTI_LINE_MACRO_END + +#define nvAssertMacro(exp) \ + NV_MULTI_LINE_MACRO_BEGIN \ + if (!(exp)) { \ + if (nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG) { \ + nvDebugBreak(); \ + } \ + } \ + NV_MULTI_LINE_MACRO_END + +// GCC, LLVM need "##" before the __VA_ARGS__, MSVC doesn't care +#define nvAssertMacroWithIgnoreAll(exp,...) \ + NV_MULTI_LINE_MACRO_BEGIN \ + static bool ignoreAll = false; \ + if (!ignoreAll && !(exp)) { \ + int result = nvAbort(#exp, __FILE__, __LINE__, __FUNC__, ##__VA_ARGS__); \ + if (result == NV_ABORT_DEBUG) { \ + nvDebugBreak(); \ + } else if (result == NV_ABORT_IGNORE) { \ + ignoreAll = true; \ + } \ + } \ + NV_MULTI_LINE_MACRO_END + +// Interesting assert macro from Insomniac: +// http://www.gdcvault.com/play/1015319/Developing-Imperfect-Software-How-to +// Used as follows: +// if (nvCheck(i < count)) { +// normal path +// } else { +// fixup code. +// } +// This style of macro could be combined with __builtin_expect to let the compiler know failure is unlikely. +#define nvCheckMacro(exp) \ + (\ + (exp) ? true : ( \ + (nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG) ? (nvDebugBreak(), true) : ( false ) \ + ) \ + ) + + +#define nvAssert(exp) nvAssertMacro(exp) +#define nvCheck(exp) nvAssertMacro(exp) + +#if defined(_DEBUG) +# define nvDebugAssert(exp) nvAssertMacro(exp) +# define nvDebugCheck(exp) nvAssertMacro(exp) +#else // _DEBUG +# define nvDebugAssert(exp) nvNoAssert(exp) +# define nvDebugCheck(exp) nvNoAssert(exp) +#endif // _DEBUG #endif // NV_NO_ASSERT // Use nvAssume for very simple expresions only: nvAssume(0), nvAssume(value == true), etc. 
+/*#if !defined(_DEBUG) +# if NV_CC_MSVC +# define nvAssume(exp) __assume(exp) +# else +# define nvAssume(exp) nvCheck(exp) +# endif +#else +# define nvAssume(exp) nvCheck(exp) +#endif*/ + #if defined(_DEBUG) -# if NV_CC_MSVC -# define nvAssume(exp) __assume(exp) -# else -# define nvAssume(exp) nvCheck(exp) -# endif +# if NV_CC_MSVC +# define nvUnreachable() nvAssert(0 && "unreachable"); __assume(0) +# else +# define nvUnreachable() nvAssert(0 && "unreachable"); __builtin_unreachable() +# endif #else -# define nvAssume(exp) nvCheck(exp) +# if NV_CC_MSVC +# define nvUnreachable() __assume(0) +# else +# define nvUnreachable() __builtin_unreachable() +# endif #endif -#define nvError(x) nvAbort(x, __FILE__, __LINE__, __FUNC__) -#define nvWarning(x) nvDebug("*** Warning %s/%d: %s\n", __FILE__, __LINE__, (x)) +#define nvError(x) nvAbort(x, __FILE__, __LINE__, __FUNC__) +#define nvWarning(x) nvDebugPrint("*** Warning %s/%d: %s\n", __FILE__, __LINE__, (x)) +#ifndef NV_DEBUG_PRINT +#define NV_DEBUG_PRINT 1 //defined(_DEBUG) +#endif -#if PI_CC_MSVC -// @@ I'm not sure it's a good idea to use the default static assert. -# define nvStaticCheck(x) _STATIC_ASSERT(x) +#if NV_DEBUG_PRINT +#define nvDebug(...) nvDebugPrint(__VA_ARGS__) +#else +#if NV_CC_MSVC +#define nvDebug(...) __noop(__VA_ARGS__) #else -# define nvStaticCheck(x) typedef char NV_DO_STRING_JOIN2(__static_assert_,__LINE__)[(x)] -// define nvStaticCheck(x) switch(0) { case 0: case x:; } +#define nvDebug(...) ((void)0) // Non-msvc platforms do not evaluate arguments? #endif +#endif + -NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = 0); -NVCORE_API void NV_CDECL nvDebug( const char *msg, ... ) __attribute__((format (printf, 1, 2))); +NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = NULL, const char * msg = NULL, ...) __attribute__((format (printf, 5, 6))); +NVCORE_API void NV_CDECL nvDebugPrint( const char *msg, ... ) __attribute__((format (printf, 1, 2))); namespace nv { - /** Message handler interface. */ - struct MessageHandler { - virtual void log(const char * str, va_list arg) = 0; - virtual ~MessageHandler() {} - }; - - /** Assert handler interface. */ - struct AssertHandler { - virtual int assert(const char *exp, const char *file, int line, const char *func = 0) = 0; - virtual ~AssertHandler() {} - }; - - - namespace debug - { - NVCORE_API void dumpInfo(); - - // These functions are not thread safe. - NVCORE_API void setMessageHandler( MessageHandler * messageHandler ); - NVCORE_API void resetMessageHandler(); - - NVCORE_API void setAssertHandler( AssertHandler * assertHanlder ); - NVCORE_API void resetAssertHandler(); - - NVCORE_API void enableSigHandler(); - NVCORE_API void disableSigHandler(); - } + inline bool isValidPtr(const void * ptr) { + #if NV_CPU_X86_64 || POSH_CPU_PPC64 + if (ptr == NULL) return true; + if (reinterpret_cast(ptr) < 0x10000ULL) return false; + if (reinterpret_cast(ptr) >= 0x000007FFFFFEFFFFULL) return false; + #else + if (reinterpret_cast(ptr) == 0xcccccccc) return false; + if (reinterpret_cast(ptr) == 0xcdcdcdcd) return false; + if (reinterpret_cast(ptr) == 0xdddddddd) return false; + if (reinterpret_cast(ptr) == 0xffffffff) return false; + #endif + return true; + } + + // Message handler interface. + struct MessageHandler { + virtual void log(const char * str, va_list arg) = 0; + virtual ~MessageHandler() {} + }; + + // Assert handler interface. 
+ struct AssertHandler { + virtual int assertion(const char *exp, const char *file, int line, const char *func, const char *msg, va_list arg) = 0; + virtual ~AssertHandler() {} + }; + + + namespace debug + { + NVCORE_API void dumpInfo(); + NVCORE_API void dumpCallstack( MessageHandler *messageHandler, int callstackLevelsToSkip = 0 ); + + NVCORE_API void setMessageHandler( MessageHandler * messageHandler ); + NVCORE_API void resetMessageHandler(); + + NVCORE_API void setAssertHandler( AssertHandler * assertHanlder ); + NVCORE_API void resetAssertHandler(); + + NVCORE_API void enableSigHandler(bool interactive); + NVCORE_API void disableSigHandler(); + + NVCORE_API bool isDebuggerPresent(); + NVCORE_API bool attachToDebugger(); + + NVCORE_API void terminate(int code); + } } // nv namespace -#endif // NV_CORE_DEBUG_H +#endif // NV_CORE_DEBUG_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Debug.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Debug.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Debug.cpp @@ -1,489 +1,1030 @@ -// This code is in the public domain -- castanyo@yahoo.es +// This code is in the public domain -- Ignacio Castaño -#include -#include +#include "Debug.h" +#include "Array.inl" +#include "StrLib.h" // StringBuilder + +#include "StdStream.h" // fileOpen + +#include // Extern #if NV_OS_WIN32 //&& NV_CC_MSVC -# define WIN32_LEAN_AND_MEAN -# define VC_EXTRALEAN -# include -# include -# if NV_CC_MSVC -# include -# if _MSC_VER < 1300 -# define DECLSPEC_DEPRECATED - // VC6: change this path to your Platform SDK headers -# include // must be XP version of file -// include "M:\\dev7\\vs\\devtools\\common\\win32sdk\\include\\dbghelp.h" -# else - // VC7: ships with updated headers -# include -# endif -# endif +# define WIN32_LEAN_AND_MEAN +# define VC_EXTRALEAN +# include +# include +# if NV_CC_MSVC +# include +# if _MSC_VER < 1300 +# define DECLSPEC_DEPRECATED +// VC6: change this path to your Platform SDK headers +# include // must be XP version of file +// include "M:\\dev7\\vs\\devtools\\common\\win32sdk\\include\\dbghelp.h" +# else +// VC7: ships with updated headers +# include +# endif +# endif +# pragma comment(lib,"dbghelp.lib") #endif +#if NV_OS_XBOX +# include +# ifdef _DEBUG +# include +# endif //_DEBUG +#endif //NV_OS_XBOX + #if !NV_OS_WIN32 && defined(HAVE_SIGNAL_H) -# include +# include #endif -#if NV_OS_LINUX || NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD -# include // getpid +#if NV_OS_UNIX +# include // getpid #endif #if NV_OS_LINUX && defined(HAVE_EXECINFO_H) -# include // backtrace -# if NV_CC_GNUC // defined(HAVE_CXXABI_H) -# include -# endif -#endif - -#if NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD -# include -# include -# include // sysctl -# if !NV_OS_OPENBSD -# include -# endif -# undef HAVE_EXECINFO_H -# if defined(HAVE_EXECINFO_H) // only after OSX 10.5 -# include // backtrace -# if NV_CC_GNUC // defined(HAVE_CXXABI_H) -# include -# endif -# endif +# include // backtrace +# if NV_CC_GNUC // defined(HAVE_CXXABI_H) +# include +# endif #endif -#include // std::runtime_error -#undef assert // defined on mingw +#if NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_NETBSD || NV_OS_OPENBSD +# include +# include +# include // sysctl +# if !defined(NV_OS_OPENBSD) +# include +# endif +# if defined(HAVE_EXECINFO_H) // only after OSX 10.5 +# include // backtrace +# if NV_CC_GNUC // defined(HAVE_CXXABI_H) +# include +# endif +# endif +#endif + +#if NV_OS_ORBIS +#include +#endif + 
+#define NV_USE_SEPARATE_THREAD 1 + using namespace nv; namespace { - static MessageHandler * s_message_handler = NULL; - static AssertHandler * s_assert_handler = NULL; - - static bool s_sig_handler_enabled = false; + static MessageHandler * s_message_handler = NULL; + static AssertHandler * s_assert_handler = NULL; + + static bool s_sig_handler_enabled = false; + static bool s_interactive = true; #if NV_OS_WIN32 && NV_CC_MSVC - // Old exception filter. - static LPTOP_LEVEL_EXCEPTION_FILTER s_old_exception_filter = NULL; + // Old exception filter. + static LPTOP_LEVEL_EXCEPTION_FILTER s_old_exception_filter = NULL; #elif !NV_OS_WIN32 && defined(HAVE_SIGNAL_H) - // Old signal handlers. - struct sigaction s_old_sigsegv; - struct sigaction s_old_sigtrap; - struct sigaction s_old_sigfpe; - struct sigaction s_old_sigbus; - + // Old signal handlers. + struct sigaction s_old_sigsegv; + struct sigaction s_old_sigtrap; + struct sigaction s_old_sigfpe; + struct sigaction s_old_sigbus; + #endif #if NV_OS_WIN32 && NV_CC_MSVC - // TODO write minidump - - static LONG WINAPI nvTopLevelFilter( struct _EXCEPTION_POINTERS * pExceptionInfo) - { - NV_UNUSED(pExceptionInfo); - /* BOOL (WINAPI * Dump) (HANDLE, DWORD, HANDLE, MINIDUMP_TYPE, PMINIDUMP_EXCEPTION_INFORMATION, PMINIDUMP_USER_STREAM_INFORMATION, PMINIDUMP_CALLBACK_INFORMATION ); - - AutoString dbghelp_path(512); - getcwd(dbghelp_path, 512); - dbghelp_path.Append("\\DbgHelp.dll"); - nvTranslatePath(dbghelp_path); - - PiLibrary DbgHelp_lib(dbghelp_path, true); - - if( !DbgHelp_lib.IsValid() ) { - nvDebug("*** 'DbgHelp.dll' not found.\n"); - return EXCEPTION_CONTINUE_SEARCH; - } - - if( !DbgHelp_lib.BindSymbol( (void **)&Dump, "MiniDumpWriteDump" ) ) { - nvDebug("*** 'DbgHelp.dll' too old.\n"); - return EXCEPTION_CONTINUE_SEARCH; - } - - // create the file - HANDLE hFile = ::CreateFile( "nv.dmp", GENERIC_WRITE, FILE_SHARE_WRITE, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL ); - if( hFile == INVALID_HANDLE_VALUE ) { - nvDebug("*** Failed to create dump file.\n"); - return EXCEPTION_CONTINUE_SEARCH; - } - - - _MINIDUMP_EXCEPTION_INFORMATION ExInfo; - - ExInfo.ThreadId = ::GetCurrentThreadId(); - ExInfo.ExceptionPointers = pExceptionInfo; - ExInfo.ClientPointers = NULL; - - // write the dump - bool ok = Dump( GetCurrentProcess(), GetCurrentProcessId(), hFile, MiniDumpNormal, &ExInfo, NULL, NULL )!=0; - ::CloseHandle(hFile); - - if( !ok ) { - nvDebug("*** Failed to save dump file.\n"); - return EXCEPTION_CONTINUE_SEARCH; - } - - nvDebug("--- Dump file saved.\n"); - */ - return EXCEPTION_CONTINUE_SEARCH; - } + // We should try to simplify the top level filter as much as possible. + // http://www.nynaeve.net/?p=128 + +#if NV_USE_SEPARATE_THREAD + + // The critical section enforcing the requirement that only one exception be + // handled by a handler at a time. + static CRITICAL_SECTION s_handler_critical_section; + + // Semaphores used to move exception handling between the exception thread + // and the handler thread. handler_start_semaphore_ is signalled by the + // exception thread to wake up the handler thread when an exception occurs. + // handler_finish_semaphore_ is signalled by the handler thread to wake up + // the exception thread when handling is complete. + static HANDLE s_handler_start_semaphore = NULL; + static HANDLE s_handler_finish_semaphore = NULL; + + // The exception handler thread. 
+ static HANDLE s_handler_thread = NULL; + + static DWORD s_requesting_thread_id = 0; + static EXCEPTION_POINTERS * s_exception_info = NULL; + +#endif // NV_USE_SEPARATE_THREAD + + + struct MinidumpCallbackContext { + ULONG64 memory_base; + ULONG memory_size; + bool finished; + }; + + // static + static BOOL CALLBACK miniDumpWriteDumpCallback(PVOID context, const PMINIDUMP_CALLBACK_INPUT callback_input, PMINIDUMP_CALLBACK_OUTPUT callback_output) + { + switch (callback_input->CallbackType) + { + case MemoryCallback: { + MinidumpCallbackContext* callback_context = reinterpret_cast(context); + if (callback_context->finished) + return FALSE; + + // Include the specified memory region. + callback_output->MemoryBase = callback_context->memory_base; + callback_output->MemorySize = callback_context->memory_size; + callback_context->finished = true; + return TRUE; + } + + // Include all modules. + case IncludeModuleCallback: + case ModuleCallback: + return TRUE; + + // Include all threads. + case IncludeThreadCallback: + case ThreadCallback: + return TRUE; + + // Stop receiving cancel callbacks. + case CancelCallback: + callback_output->CheckCancel = FALSE; + callback_output->Cancel = FALSE; + return TRUE; + } + + // Ignore other callback types. + return FALSE; + } + + static bool writeMiniDump(EXCEPTION_POINTERS * pExceptionInfo) + { + // create the file + HANDLE hFile = CreateFileA("crash.dmp", GENERIC_WRITE, FILE_SHARE_WRITE, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); + if (hFile == INVALID_HANDLE_VALUE) { + //nvDebug("*** Failed to create dump file.\n"); + return false; + } + + MINIDUMP_EXCEPTION_INFORMATION * pExInfo = NULL; + MINIDUMP_CALLBACK_INFORMATION * pCallback = NULL; + + if (pExceptionInfo != NULL) { + MINIDUMP_EXCEPTION_INFORMATION ExInfo; + ExInfo.ThreadId = ::GetCurrentThreadId(); + ExInfo.ExceptionPointers = pExceptionInfo; + ExInfo.ClientPointers = NULL; + pExInfo = &ExInfo; + + MINIDUMP_CALLBACK_INFORMATION callback; + MinidumpCallbackContext context; + + // Find a memory region of 256 bytes centered on the + // faulting instruction pointer. + const ULONG64 instruction_pointer = + #if defined(_M_IX86) + pExceptionInfo->ContextRecord->Eip; + #elif defined(_M_AMD64) + pExceptionInfo->ContextRecord->Rip; + #else + #error Unsupported platform + #endif + + MEMORY_BASIC_INFORMATION info; + + if (VirtualQuery(reinterpret_cast(instruction_pointer), &info, sizeof(MEMORY_BASIC_INFORMATION)) != 0 && info.State == MEM_COMMIT) + { + // Attempt to get 128 bytes before and after the instruction + // pointer, but settle for whatever's available up to the + // boundaries of the memory region. 
+ const ULONG64 kIPMemorySize = 256; + context.memory_base = max(reinterpret_cast(info.BaseAddress), instruction_pointer - (kIPMemorySize / 2)); + ULONG64 end_of_range = min(instruction_pointer + (kIPMemorySize / 2), reinterpret_cast(info.BaseAddress) + info.RegionSize); + context.memory_size = static_cast(end_of_range - context.memory_base); + context.finished = false; + + callback.CallbackRoutine = miniDumpWriteDumpCallback; + callback.CallbackParam = reinterpret_cast(&context); + pCallback = &callback; + } + } + + MINIDUMP_TYPE miniDumpType = (MINIDUMP_TYPE)(MiniDumpNormal|MiniDumpWithHandleData|MiniDumpWithThreadInfo); + + // write the dump + BOOL ok = MiniDumpWriteDump(GetCurrentProcess(), GetCurrentProcessId(), hFile, miniDumpType, pExInfo, NULL, pCallback) != 0; + CloseHandle(hFile); + + if (ok == FALSE) { + //nvDebug("*** Failed to save dump file.\n"); + return false; + } + + //nvDebug("\nDump file saved.\n"); + + return true; + } + +#if NV_USE_SEPARATE_THREAD + + static DWORD WINAPI ExceptionHandlerThreadMain(void* lpParameter) { + nvDebugCheck(s_handler_start_semaphore != NULL); + nvDebugCheck(s_handler_finish_semaphore != NULL); + + while (true) { + if (WaitForSingleObject(s_handler_start_semaphore, INFINITE) == WAIT_OBJECT_0) { + writeMiniDump(s_exception_info); + + // Allow the requesting thread to proceed. + ReleaseSemaphore(s_handler_finish_semaphore, 1, NULL); + } + } + + // This statement is not reached when the thread is unconditionally + // terminated by the ExceptionHandler destructor. + return 0; + } + +#endif // NV_USE_SEPARATE_THREAD + + static bool hasStackTrace() { + return true; + } + + /*static NV_NOINLINE int backtrace(void * trace[], int maxcount) { + + // In Windows XP and Windows Server 2003, the sum of the FramesToSkip and FramesToCapture parameters must be less than 63. + int xp_maxcount = min(63-1, maxcount); + + int count = RtlCaptureStackBackTrace(1, xp_maxcount, trace, NULL); + nvDebugCheck(count <= maxcount); + + return count; + }*/ + + static NV_NOINLINE int backtraceWithSymbols(CONTEXT * ctx, void * trace[], int maxcount, int skip = 0) { + + // Init the stack frame for this function + STACKFRAME64 stackFrame = { 0 }; + + #if NV_CPU_X86_64 + DWORD dwMachineType = IMAGE_FILE_MACHINE_AMD64; + stackFrame.AddrPC.Offset = ctx->Rip; + stackFrame.AddrFrame.Offset = ctx->Rbp; + stackFrame.AddrStack.Offset = ctx->Rsp; + #elif NV_CPU_X86 + DWORD dwMachineType = IMAGE_FILE_MACHINE_I386; + stackFrame.AddrPC.Offset = ctx->Eip; + stackFrame.AddrFrame.Offset = ctx->Ebp; + stackFrame.AddrStack.Offset = ctx->Esp; + #else + #error "Platform not supported!" 
+ #endif + stackFrame.AddrPC.Mode = AddrModeFlat; + stackFrame.AddrFrame.Mode = AddrModeFlat; + stackFrame.AddrStack.Mode = AddrModeFlat; + + // Walk up the stack + const HANDLE hThread = GetCurrentThread(); + const HANDLE hProcess = GetCurrentProcess(); + int i; + for (i = 0; i < maxcount; i++) + { + // walking once first makes us skip self + if (!StackWalk64(dwMachineType, hProcess, hThread, &stackFrame, ctx, NULL, &SymFunctionTableAccess64, &SymGetModuleBase64, NULL)) { + break; + } + + /*if (stackFrame.AddrPC.Offset == stackFrame.AddrReturn.Offset || stackFrame.AddrPC.Offset == 0) { + break; + }*/ + + if (i >= skip) { + trace[i - skip] = (PVOID)stackFrame.AddrPC.Offset; + } + } + + return i - skip; + } + +#pragma warning(push) +#pragma warning(disable:4748) + static NV_NOINLINE int backtrace(void * trace[], int maxcount) { + CONTEXT ctx = { 0 }; +#if NV_CPU_X86 && !NV_CPU_X86_64 + ctx.ContextFlags = CONTEXT_CONTROL; + _asm { + call x + x: pop eax + mov ctx.Eip, eax + mov ctx.Ebp, ebp + mov ctx.Esp, esp + } +#else + RtlCaptureContext(&ctx); // Not implemented correctly in x86. +#endif + + return backtraceWithSymbols(&ctx, trace, maxcount, 1); + } +#pragma warning(pop) + + static NV_NOINLINE void writeStackTrace(void * trace[], int size, int start, Array & lines) + { + StringBuilder builder(512); + + HANDLE hProcess = GetCurrentProcess(); + + // Resolve PC to function names + for (int i = start; i < size; i++) + { + // Check for end of stack walk + DWORD64 ip = (DWORD64)trace[i]; + if (ip == NULL) + break; + + // Get function name + #define MAX_STRING_LEN (512) + unsigned char byBuffer[sizeof(IMAGEHLP_SYMBOL64) + MAX_STRING_LEN] = { 0 }; + IMAGEHLP_SYMBOL64 * pSymbol = (IMAGEHLP_SYMBOL64*)byBuffer; + pSymbol->SizeOfStruct = sizeof(IMAGEHLP_SYMBOL64); + pSymbol->MaxNameLength = MAX_STRING_LEN; + + DWORD64 dwDisplacement; + + if (SymGetSymFromAddr64(hProcess, ip, &dwDisplacement, pSymbol)) + { + pSymbol->Name[MAX_STRING_LEN-1] = 0; + + /* + // Make the symbol readable for humans + UnDecorateSymbolName( pSym->Name, lpszNonUnicodeUnDSymbol, BUFFERSIZE, + UNDNAME_COMPLETE | + UNDNAME_NO_THISTYPE | + UNDNAME_NO_SPECIAL_SYMS | + UNDNAME_NO_MEMBER_TYPE | + UNDNAME_NO_MS_KEYWORDS | + UNDNAME_NO_ACCESS_SPECIFIERS ); + */ + + // pSymbol->Name + const char * pFunc = pSymbol->Name; + + // Get file/line number + IMAGEHLP_LINE64 theLine = { 0 }; + theLine.SizeOfStruct = sizeof(theLine); + + DWORD dwDisplacement; + if (!SymGetLineFromAddr64(hProcess, ip, &dwDisplacement, &theLine)) + { + // Do not print unknown symbols anymore. + break; + //builder.format("unknown(%08X) : %s\n", (uint32)ip, pFunc); + } + else + { + /* + const char* pFile = strrchr(theLine.FileName, '\\'); + if ( pFile == NULL ) pFile = theLine.FileName; + else pFile++; + */ + const char * pFile = theLine.FileName; + + int line = theLine.LineNumber; + + builder.format("%s(%d) : %s\n", pFile, line, pFunc); + } + + lines.append(builder.release()); + + if (pFunc != NULL && strcmp(pFunc, "WinMain") == 0) { + break; + } + } + } + } + + + // Write mini dump and print stack trace. + static LONG WINAPI handleException(EXCEPTION_POINTERS * pExceptionInfo) + { + EnterCriticalSection(&s_handler_critical_section); +#if NV_USE_SEPARATE_THREAD + s_requesting_thread_id = GetCurrentThreadId(); + s_exception_info = pExceptionInfo; + + // This causes the handler thread to call writeMiniDump. + ReleaseSemaphore(s_handler_start_semaphore, 1, NULL); + + // Wait until WriteMinidumpWithException is done and collect its return value. 
+ WaitForSingleObject(s_handler_finish_semaphore, INFINITE); + //bool status = s_handler_return_value; + + // Clean up. + s_requesting_thread_id = 0; + s_exception_info = NULL; +#else + // First of all, write mini dump. + writeMiniDump(pExceptionInfo); +#endif + LeaveCriticalSection(&s_handler_critical_section); + + nvDebug("\nDump file saved.\n"); + + // Try to attach to debugger. + if (s_interactive && debug::attachToDebugger()) { + nvDebugBreak(); + return EXCEPTION_CONTINUE_EXECUTION; + } + + // If that fails, then try to pretty print a stack trace and terminate. + void * trace[64]; + + int size = backtraceWithSymbols(pExceptionInfo->ContextRecord, trace, 64); + + // @@ Use win32's CreateFile? + FILE * fp = fileOpen("crash.txt", "wb"); + if (fp != NULL) { + Array lines; + writeStackTrace(trace, size, 0, lines); + + for (uint i = 0; i < lines.count(); i++) { + fputs(lines[i], fp); + delete lines[i]; + } + + // @@ Add more info to crash.txt? + + fclose(fp); + } + + // This should terminate the process and set the error exit code. + TerminateProcess(GetCurrentProcess(), EXIT_FAILURE + 2); + + return EXCEPTION_EXECUTE_HANDLER; // Terminate app. In case terminate process did not succeed. + } + + static void handlePureVirtualCall() { + nvDebugBreak(); + TerminateProcess(GetCurrentProcess(), EXIT_FAILURE + 8); + } + + static void handleInvalidParameter(const wchar_t * wexpresion, const wchar_t * wfunction, const wchar_t * wfile, unsigned int line, uintptr_t reserved) { + + size_t convertedCharCount = 0; + + StringBuilder expresion; + if (wexpresion != NULL) { + uint size = U32(wcslen(wexpresion) + 1); + expresion.reserve(size); + wcstombs_s(&convertedCharCount, expresion.str(), size, wexpresion, _TRUNCATE); + } + + StringBuilder file; + if (wfile != NULL) { + uint size = U32(wcslen(wfile) + 1); + file.reserve(size); + wcstombs_s(&convertedCharCount, file.str(), size, wfile, _TRUNCATE); + } + + StringBuilder function; + if (wfunction != NULL) { + uint size = U32(wcslen(wfunction) + 1); + function.reserve(size); + wcstombs_s(&convertedCharCount, function.str(), size, wfunction, _TRUNCATE); + } + + int result = nvAbort(expresion.str(), file.str(), line, function.str()); + if (result == NV_ABORT_DEBUG) { + nvDebugBreak(); + } + } #elif !NV_OS_WIN32 && defined(HAVE_SIGNAL_H) // NV_OS_LINUX || NV_OS_DARWIN -#if defined(HAVE_EXECINFO_H) // NV_OS_LINUX +#if defined(HAVE_EXECINFO_H) + + static bool hasStackTrace() { + return true; + } + + + static void writeStackTrace(void * trace[], int size, int start, Array & lines) { + StringBuilder builder(512); + char ** string_array = backtrace_symbols(trace, size); + + for(int i = start; i < size-1; i++ ) { +# if NV_CC_GNUC // defined(HAVE_CXXABI_H) + // @@ Write a better parser for the possible formats. + char * begin = strchr(string_array[i], '('); + char * end = strrchr(string_array[i], '+'); + char * module = string_array[i]; + + if (begin == 0 && end != 0) { + *(end - 1) = '\0'; + begin = strrchr(string_array[i], ' '); + module = NULL; // Ignore module. 
+ } + + if (begin != 0 && begin < end) { + int stat; + *end = '\0'; + *begin = '\0'; + char * name = abi::__cxa_demangle(begin+1, 0, 0, &stat); + if (module == NULL) { + if (name == NULL || stat != 0) { + builder.format(" In: '%s'\n", begin+1); + } + else { + builder.format(" In: '%s'\n", name); + } + } + else { + if (name == NULL || stat != 0) { + builder.format(" In: [%s] '%s'\n", module, begin+1); + } + else { + builder.format(" In: [%s] '%s'\n", module, name); + } + } + free(name); + } + else { + builder.format(" In: '%s'\n", string_array[i]); + } +# else + builder.format(" In: '%s'\n", string_array[i]); +# endif + lines.append(builder.release()); + } + + free(string_array); + } + + static void printStackTrace(void * trace[], int size, int start=0) { + nvDebug( "\nDumping stacktrace:\n" ); + + Array lines; + writeStackTrace(trace, size, 1, lines); + + for (uint i = 0; i < lines.count(); i++) { + nvDebug("%s", lines[i]); + delete lines[i]; + } - static bool nvHasStackTrace() { + nvDebug("\n"); + } + +#endif // defined(HAVE_EXECINFO_H) + + static void * callerAddress(void * secret) + { #if NV_OS_DARWIN - return backtrace != NULL; +# if defined(_STRUCT_MCONTEXT) +# if NV_CPU_PPC + ucontext_t * ucp = (ucontext_t *)secret; + return (void *) ucp->uc_mcontext->__ss.__srr0; +# elif NV_CPU_X86_64 + ucontext_t * ucp = (ucontext_t *)secret; + return (void *) ucp->uc_mcontext->__ss.__rip; +# elif NV_CPU_X86 + ucontext_t * ucp = (ucontext_t *)secret; + return (void *) ucp->uc_mcontext->__ss.__eip; +# elif NV_CPU_ARM + ucontext_t * ucp = (ucontext_t *)secret; + return (void *) ucp->uc_mcontext->__ss.__pc; +# else +# error "Unknown CPU" +# endif +# else +# if NV_CPU_PPC + ucontext_t * ucp = (ucontext_t *)secret; + return (void *) ucp->uc_mcontext->ss.srr0; +# elif NV_CPU_X86 + ucontext_t * ucp = (ucontext_t *)secret; + return (void *) ucp->uc_mcontext->ss.eip; +# else +# error "Unknown CPU" +# endif +# endif +#elif NV_OS_FREEBSD +# if NV_CPU_X86_64 + ucontext_t * ucp = (ucontext_t *)secret; + return (void *)ucp->uc_mcontext.mc_rip; +# elif NV_CPU_X86 + ucontext_t * ucp = (ucontext_t *)secret; + return (void *)ucp->uc_mcontext.mc_eip; +# else +# error "Unknown CPU" +# endif +#elif NV_OS_NETBSD +# if NV_CPU_X86_64 + ucontext_t * ucp = (ucontext_t *)secret; + return (void *)ucp->uc_mcontext.__gregs[_REG_RIP]; +# elif NV_CPU_X86 + ucontext_t * ucp = (ucontext_t *)secret; + return (void *)ucp->uc_mcontext.__gregs[_REG_EIP]; +# elif NV_CPU_PPC + ucontext_t * ucp = (ucontext_t *)secret; + return (void *) ucp->uc_mcontext.__gregs[_REG_PC]; +# else +# error "Unknown CPU" +# endif +#elif NV_OS_OPENBSD +# if NV_CPU_X86_64 + ucontext_t * ucp = (ucontext_t *)secret; + return (void *)ucp->sc_rip; +# elif NV_CPU_X86 + ucontext_t * ucp = (ucontext_t *)secret; + return (void *)ucp->sc_eip; +# else +# error "Unknown CPU" +# endif #else - return true; +# if NV_CPU_X86_64 + // #define REG_RIP REG_INDEX(rip) // seems to be 16 + ucontext_t * ucp = (ucontext_t *)secret; + return (void *)ucp->uc_mcontext.gregs[REG_RIP]; +# elif NV_CPU_X86 + ucontext_t * ucp = (ucontext_t *)secret; + return (void *)ucp->uc_mcontext.gregs[14/*REG_EIP*/]; +# elif NV_CPU_PPC + ucontext_t * ucp = (ucontext_t *)secret; + return (void *) ucp->uc_mcontext.regs->nip; +# elif NV_CPU_AARCH64 + ucontext_t * ucp = (ucontext_t *)secret; + return (void *) ucp->uc_mcontext.pc; +# else +# error "Unknown CPU" +# endif #endif - } - static void nvPrintStackTrace(void * trace[], int size, int start=0) { - char ** string_array = backtrace_symbols(trace, size); - 
- nvDebug( "\nDumping stacktrace:\n" ); - for(int i = start; i < size-1; i++ ) { -# if NV_CC_GNUC // defined(HAVE_CXXABI_H) - char * begin = strchr(string_array[i], '('); - char * end = strchr(string_array[i], '+'); - if( begin != 0 && begin < end ) { - int stat; - *end = '\0'; - *begin = '\0'; - char * module = string_array[i]; - char * name = abi::__cxa_demangle(begin+1, 0, 0, &stat); - if( name == NULL || begin[1] != '_' || begin[2] != 'Z' ) { - nvDebug( " In: [%s] '%s'\n", module, begin+1 ); - } - else { - nvDebug( " In: [%s] '%s'\n", module, name ); - } - free(name); - } - else { - nvDebug( " In: '%s'\n", string_array[i] ); - } -# else - nvDebug( " In: '%s'\n", string_array[i] ); -# endif - } - nvDebug("\n"); - - free(string_array); - } + // How to obtain the instruction pointers in different platforms, from mlton's source code. + // http://mlton.org/ + // OpenBSD + // ucp->sc_eip + // FreeBSD: + // ucp->uc_mcontext.mc_eip + // HPUX: + // ucp->uc_link + // Solaris: + // ucp->uc_mcontext.gregs[REG_PC] + // Linux hppa: + // uc->uc_mcontext.sc_iaoq[0] & ~0x3UL + // Linux sparc: + // ((struct sigcontext*) secret)->sigc_regs.tpc + // Linux sparc64: + // ((struct sigcontext*) secret)->si_regs.pc + + // potentially correct for other archs: + // Linux alpha: ucp->m_context.sc_pc + // Linux arm: ucp->m_context.ctx.arm_pc + // Linux ia64: ucp->m_context.sc_ip & ~0x3UL + // Linux mips: ucp->m_context.sc_pc + // Linux s390: ucp->m_context.sregs->regs.psw.addr + } + + static void nvSigHandler(int sig, siginfo_t *info, void *secret) + { + void * pnt = callerAddress(secret); + + // Do something useful with siginfo_t + if (sig == SIGSEGV) { + if (pnt != NULL) nvDebug("Got signal %d, faulty address is %p, from %p\n", sig, info->si_addr, pnt); + else nvDebug("Got signal %d, faulty address is %p\n", sig, info->si_addr); + } + else if(sig == SIGTRAP) { + nvDebug("Breakpoint hit.\n"); + } + else { + nvDebug("Got signal %d\n", sig); + } + +#if defined(HAVE_EXECINFO_H) + if (hasStackTrace()) // in case of weak linking + { + void * trace[64]; + int size = backtrace(trace, 64); + + if (pnt != NULL) { + // Overwrite sigaction with caller's address. 
+ trace[1] = pnt; + } + printStackTrace(trace, size, 1); + } #endif // defined(HAVE_EXECINFO_H) - static void * callerAddress(void * secret) - { -# if NV_OS_DARWIN -# if defined(_STRUCT_MCONTEXT) -# if NV_CPU_PPC - ucontext_t * ucp = (ucontext_t *)secret; - return (void *) ucp->uc_mcontext->__ss.__srr0; -# elif NV_CPU_X86 - ucontext_t * ucp = (ucontext_t *)secret; - return (void *) ucp->uc_mcontext->__ss.__eip; -# endif -# else -# if NV_CPU_PPC - ucontext_t * ucp = (ucontext_t *)secret; - return (void *) ucp->uc_mcontext->ss.srr0; -# elif NV_CPU_X86 - ucontext_t * ucp = (ucontext_t *)secret; - return (void *) ucp->uc_mcontext->ss.eip; -# endif -# endif -# elif NV_OS_FREEBSD -# if NV_CPU_X86_64 - ucontext_t * ucp = (ucontext_t *)secret; - return (void *)ucp->uc_mcontext.mc_rip; -# elif NV_CPU_X86 - ucontext_t * ucp = (ucontext_t *)secret; - return (void *)ucp->uc_mcontext.mc_eip; -# endif -# elif NV_OS_OPENBSD -# if NV_CPU_X86_64 - ucontext_t * ucp = (ucontext_t *)secret; - return (void *)ucp->sc_rip; -# elif NV_CPU_X86 - ucontext_t * ucp = (ucontext_t *)secret; - return (void *)ucp->sc_eip; -# endif -# else -# if NV_CPU_X86_64 - // #define REG_RIP REG_INDEX(rip) // seems to be 16 - ucontext_t * ucp = (ucontext_t *)secret; - return (void *)ucp->uc_mcontext.gregs[REG_RIP]; -# elif NV_CPU_X86 - ucontext_t * ucp = (ucontext_t *)secret; - return (void *)ucp->uc_mcontext.gregs[14/*REG_EIP*/]; -# elif NV_CPU_PPC - ucontext_t * ucp = (ucontext_t *)secret; - return (void *) ucp->uc_mcontext.regs->nip; -# elif NV_CPU_AARCH64 - ucontext_t * ucp = (ucontext_t *)secret; - return (void *) ucp->uc_mcontext.pc; -# endif -# endif - - // How to obtain the instruction pointers in different platforms, from mlton's source code. - // http://mlton.org/ - // OpenBSD && NetBSD - // ucp->sc_eip - // FreeBSD: - // ucp->uc_mcontext.mc_eip - // HPUX: - // ucp->uc_link - // Solaris: - // ucp->uc_mcontext.gregs[REG_PC] - // Linux hppa: - // uc->uc_mcontext.sc_iaoq[0] & ~0x3UL - // Linux sparc: - // ((struct sigcontext*) secret)->sigc_regs.tpc - // Linux sparc64: - // ((struct sigcontext*) secret)->si_regs.pc - - // potentially correct for other archs: - // Linux alpha: ucp->m_context.sc_pc - // Linux arm: ucp->m_context.ctx.arm_pc - // Linux ia64: ucp->m_context.sc_ip & ~0x3UL - // Linux mips: ucp->m_context.sc_pc - // Linux s390: ucp->m_context.sregs->regs.psw.addr - } - - static void nvSigHandler(int sig, siginfo_t *info, void *secret) - { - void * pnt = callerAddress(secret); - - // Do something useful with siginfo_t - if (sig == SIGSEGV) { - if (pnt != NULL) nvDebug("Got signal %d, faulty address is %p, from %p\n", sig, info->si_addr, pnt); - else nvDebug("Got signal %d, faulty address is %p\n", sig, info->si_addr); - } - else if(sig == SIGTRAP) { - nvDebug("Breakpoint hit.\n"); - } - else { - nvDebug("Got signal %d\n", sig); - } - -# if defined(HAVE_EXECINFO_H) - if (nvHasStackTrace()) // in case of weak linking - { - void * trace[64]; - int size = backtrace(trace, 64); - - if (pnt != NULL) { - // Overwrite sigaction with caller's address. - trace[1] = pnt; - } - - nvPrintStackTrace(trace, size, 1); - } -# endif // defined(HAVE_EXECINFO_H) - - exit(0); - } + exit(0); + } #endif // defined(HAVE_SIGNAL_H) #if NV_OS_WIN32 //&& NV_CC_MSVC - - /** Win32 asset handler. */ - struct Win32AssertHandler : public AssertHandler - { - // Code from Daniel Vogel. 
- static bool isDebuggerPresent() - { - bool result = false; - - HINSTANCE kern_lib = LoadLibraryExA( "kernel32.dll", NULL, 0 ); - if( kern_lib ) { - FARPROC lIsDebuggerPresent = GetProcAddress( kern_lib, "IsDebuggerPresent" ); - if( lIsDebuggerPresent && lIsDebuggerPresent() ) { - result = true; - } - - FreeLibrary( kern_lib ); - } - return result; - } - - // Flush the message queue. This is necessary for the message box to show up. - static void flushMessageQueue() - { - MSG msg; - while( PeekMessage( &msg, NULL, 0, 0, PM_REMOVE ) ) { - if( msg.message == WM_QUIT ) break; - TranslateMessage( &msg ); - DispatchMessage( &msg ); - } - } - - // Assert handler method. - virtual int assert( const char * exp, const char * file, int line, const char * func/*=NULL*/ ) - { - int ret = NV_ABORT_EXIT; - - StringBuilder error_string; - if( func != NULL ) { - error_string.format( "*** Assertion failed: %s\n On file: %s\n On function: %s\n On line: %d\n ", exp, file, func, line ); - nvDebug( error_string ); - } - else { - error_string.format( "*** Assertion failed: %s\n On file: %s\n On line: %d\n ", exp, file, line ); - nvDebug( error_string ); - } - - #if _DEBUG - - if( isDebuggerPresent() ) { - return NV_ABORT_DEBUG; - } - - flushMessageQueue(); - int action = MessageBoxA(NULL, error_string, "Assertion failed", MB_ABORTRETRYIGNORE|MB_ICONERROR); - switch( action ) { - case IDRETRY: - ret = NV_ABORT_DEBUG; - break; - case IDIGNORE: - ret = NV_ABORT_IGNORE; - break; - case IDABORT: - default: - ret = NV_ABORT_EXIT; - break; - } - /*if( _CrtDbgReport( _CRT_ASSERT, file, line, module, exp ) == 1 ) { - return NV_ABORT_DEBUG; - }*/ - - #endif - - if( ret == NV_ABORT_EXIT ) { - // Exit cleanly. - throw std::runtime_error("Assertion failed"); - } - - return ret; - } - }; - + + /** Win32 assert handler. */ + struct Win32AssertHandler : public AssertHandler + { + // Flush the message queue. This is necessary for the message box to show up. + static void flushMessageQueue() + { + MSG msg; + while( PeekMessage( &msg, NULL, 0, 0, PM_REMOVE ) ) { + //if( msg.message == WM_QUIT ) break; + TranslateMessage( &msg ); + DispatchMessage( &msg ); + } + } + + // Assert handler method. + virtual int assertion(const char * exp, const char * file, int line, const char * func, const char * msg, va_list arg) + { + int ret = NV_ABORT_EXIT; + + StringBuilder error_string; + error_string.format("*** Assertion failed: %s\n On file: %s\n On line: %d\n", exp, file, line ); + if (func != NULL) { + error_string.appendFormat(" On function: %s\n", func); + } + if (msg != NULL) { + error_string.append(" Message: "); + va_list tmp; + va_copy(tmp, arg); + error_string.appendFormatList(msg, tmp); + va_end(tmp); + error_string.append("\n"); + } + nvDebug( error_string.str() ); + + // Print stack trace: + debug::dumpInfo(); + + if (debug::isDebuggerPresent()) { + return NV_ABORT_DEBUG; + } + + if (s_interactive) { + flushMessageQueue(); + int action = MessageBoxA(NULL, error_string.str(), "Assertion failed", MB_ABORTRETRYIGNORE | MB_ICONERROR | MB_TOPMOST); + switch( action ) { + case IDRETRY: + ret = NV_ABORT_DEBUG; + break; + case IDIGNORE: + ret = NV_ABORT_IGNORE; + break; + case IDABORT: + default: + ret = NV_ABORT_EXIT; + break; + } + /*if( _CrtDbgReport( _CRT_ASSERT, file, line, module, exp ) == 1 ) { + return NV_ABORT_DEBUG; + }*/ + } + + if (ret == NV_ABORT_EXIT) { + // Exit cleanly. + exit(EXIT_FAILURE + 1); + } + + return ret; + } + }; +#elif NV_OS_XBOX + + /** Xbox360 assert handler. 
*/ + struct Xbox360AssertHandler : public AssertHandler + { + // Assert handler method. + virtual int assertion(const char * exp, const char * file, int line, const char * func, const char * msg, va_list arg) + { + int ret = NV_ABORT_EXIT; + + StringBuilder error_string; + if( func != NULL ) { + error_string.format( "*** Assertion failed: %s\n On file: %s\n On function: %s\n On line: %d\n ", exp, file, func, line ); + nvDebug( error_string.str() ); + } + else { + error_string.format( "*** Assertion failed: %s\n On file: %s\n On line: %d\n ", exp, file, line ); + nvDebug( error_string.str() ); + } + + if (debug::isDebuggerPresent()) { + return NV_ABORT_DEBUG; + } + + if( ret == NV_ABORT_EXIT ) { + // Exit cleanly. + exit(EXIT_FAILURE + 1); + } + + return ret; + } + }; +#elif NV_OS_ORBIS + + /** Orbis assert handler. */ + struct OrbisAssertHandler : public AssertHandler + { + // Assert handler method. + virtual int assertion(const char * exp, const char * file, int line, const char * func, const char * msg, va_list arg) + { + if( func != NULL ) { + nvDebug( "*** Assertion failed: %s\n On file: %s\n On function: %s\n On line: %d\n ", exp, file, func, line ); + } + else { + nvDebug( "*** Assertion failed: %s\n On file: %s\n On line: %d\n ", exp, file, line ); + } + + //SBtodoORBIS print stack trace + /*if (hasStackTrace()) + { + void * trace[64]; + int size = backtrace(trace, 64); + printStackTrace(trace, size, 2); + }*/ + + if (debug::isDebuggerPresent()) + return NV_ABORT_DEBUG; + + return NV_ABORT_IGNORE; + } + }; + #else - - /** Unix asset handler. */ - struct UnixAssertHandler : public AssertHandler - { - bool isDebuggerPresent() - { -# if NV_OS_DARWIN - int mib[4]; - struct kinfo_proc info; - size_t size; - mib[0] = CTL_KERN; - mib[1] = KERN_PROC; - mib[2] = KERN_PROC_PID; - mib[3] = getpid(); - size = sizeof(info); - info.kp_proc.p_flag = 0; - sysctl(mib,4,&info,&size,NULL,0); - return ((info.kp_proc.p_flag & P_TRACED) == P_TRACED); -# else - // if ppid != sid, some process spawned our app, probably a debugger. - return getsid(getpid()) != getppid(); -# endif - } - - // Assert handler method. - virtual int assert(const char * exp, const char * file, int line, const char * func) - { - if( func != NULL ) { - nvDebug( "*** Assertion failed: %s\n On file: %s\n On function: %s\n On line: %d\n ", exp, file, func, line ); - } - else { - nvDebug( "*** Assertion failed: %s\n On file: %s\n On line: %d\n ", exp, file, line ); - } - -# if _DEBUG - if( isDebuggerPresent() ) { - return NV_ABORT_DEBUG; - } -# endif - -# if defined(HAVE_EXECINFO_H) - if (nvHasStackTrace()) - { - void * trace[64]; - int size = backtrace(trace, 64); - nvPrintStackTrace(trace, size, 2); - } -# endif - - // Exit cleanly. - throw std::runtime_error("Assertion failed"); - } - }; - + + /** Unix assert handler. */ + struct UnixAssertHandler : public AssertHandler + { + // Assert handler method. 
+ virtual int assertion(const char * exp, const char * file, int line, const char * func, const char * msg, va_list arg) + { + int ret = NV_ABORT_EXIT; + + if( func != NULL ) { + nvDebug( "*** Assertion failed: %s\n On file: %s\n On function: %s\n On line: %d\n ", exp, file, func, line ); + } + else { + nvDebug( "*** Assertion failed: %s\n On file: %s\n On line: %d\n ", exp, file, line ); + } + +#if _DEBUG + if (debug::isDebuggerPresent()) { + return NV_ABORT_DEBUG; + } +#endif + +#if defined(HAVE_EXECINFO_H) + if (hasStackTrace()) + { + void * trace[64]; + int size = backtrace(trace, 64); + printStackTrace(trace, size, 2); + } +#endif + + if( ret == NV_ABORT_EXIT ) { + // Exit cleanly. + exit(EXIT_FAILURE + 1); + } + + return ret; + } + }; + #endif } // namespace -/// Handle assertion through the asset handler. -int nvAbort(const char * exp, const char * file, int line, const char * func) +/// Handle assertion through the assert handler. +int nvAbort(const char * exp, const char * file, int line, const char * func/*=NULL*/, const char * msg/*= NULL*/, ...) { #if NV_OS_WIN32 //&& NV_CC_MSVC - static Win32AssertHandler s_default_assert_handler; + static Win32AssertHandler s_default_assert_handler; +#elif NV_OS_XBOX + static Xbox360AssertHandler s_default_assert_handler; +#elif NV_OS_ORBIS + static OrbisAssertHandler s_default_assert_handler; #else - static UnixAssertHandler s_default_assert_handler; + static UnixAssertHandler s_default_assert_handler; +#endif + + va_list arg; + va_start(arg,msg); + + AssertHandler * handler = s_assert_handler != NULL ? s_assert_handler : &s_default_assert_handler; + int result = handler->assertion(exp, file, line, func, msg, arg); + + va_end(arg); + + return result; +} + +// Abnormal termination. Create mini dump and output call stack. +void debug::terminate(int code) +{ +#if NV_OS_WIN32 + EnterCriticalSection(&s_handler_critical_section); + + writeMiniDump(NULL); + + const int max_stack_size = 64; + void * trace[max_stack_size]; + int size = backtrace(trace, max_stack_size); + + // @@ Use win32's CreateFile? + FILE * fp = fileOpen("crash.txt", "wb"); + if (fp != NULL) { + Array lines; + writeStackTrace(trace, size, 0, lines); + + for (uint i = 0; i < lines.count(); i++) { + fputs(lines[i], fp); + delete lines[i]; + } + + // @@ Add more info to crash.txt? + + fclose(fp); + } + + LeaveCriticalSection(&s_handler_critical_section); #endif - - if( s_assert_handler != NULL ) { - return s_assert_handler->assert( exp, file, line, func ); - } - else { - return s_default_assert_handler.assert( exp, file, line, func ); - } + + exit(code); } /// Shows a message through the message handler. -void NV_CDECL nvDebug(const char *msg, ...) +void NV_CDECL nvDebugPrint(const char *msg, ...) { - va_list arg; - va_start(arg,msg); - if( s_message_handler != NULL ) { - s_message_handler->log( msg, arg ); - } - va_end(arg); + va_list arg; + va_start(arg,msg); + if (s_message_handler != NULL) { + s_message_handler->log( msg, arg ); + } + va_end(arg); } /// Dump debug info. 
void debug::dumpInfo() { -#if !NV_OS_WIN32 && defined(HAVE_SIGNAL_H) && defined(HAVE_EXECINFO_H) - if (nvHasStackTrace()) - { - void * trace[64]; - int size = backtrace(trace, 64); - nvPrintStackTrace(trace, size, 1); - } +#if (NV_OS_WIN32 && NV_CC_MSVC) || (defined(HAVE_SIGNAL_H) && defined(HAVE_EXECINFO_H)) + if (hasStackTrace()) + { + void * trace[64]; + int size = backtrace(trace, 64); + + nvDebug( "\nDumping stacktrace:\n" ); + + Array lines; + writeStackTrace(trace, size, 1, lines); + + for (uint i = 0; i < lines.count(); i++) { + nvDebug("%s", lines[i]); + delete lines[i]; + } + } +#endif +} + +/// Dump callstack using the specified handler. +void debug::dumpCallstack(MessageHandler *messageHandler, int callstackLevelsToSkip /*= 0*/) +{ +#if (NV_OS_WIN32 && NV_CC_MSVC) || (defined(HAVE_SIGNAL_H) && defined(HAVE_EXECINFO_H)) + if (hasStackTrace()) + { + void * trace[64]; + int size = backtrace(trace, 64); + + Array lines; + writeStackTrace(trace, size, callstackLevelsToSkip + 1, lines); // + 1 to skip the call to dumpCallstack + + for (uint i = 0; i < lines.count(); i++) { + messageHandler->log(lines[i], NULL); + delete lines[i]; + } + } #endif } @@ -491,72 +1032,239 @@ /// Set the debug message handler. void debug::setMessageHandler(MessageHandler * message_handler) { - s_message_handler = message_handler; + s_message_handler = message_handler; } /// Reset the debug message handler. void debug::resetMessageHandler() { - s_message_handler = NULL; + s_message_handler = NULL; } /// Set the assert handler. void debug::setAssertHandler(AssertHandler * assert_handler) { - s_assert_handler = assert_handler; + s_assert_handler = assert_handler; } /// Reset the assert handler. void debug::resetAssertHandler() { - s_assert_handler = NULL; + s_assert_handler = NULL; +} + +#if NV_OS_WIN32 +#if NV_USE_SEPARATE_THREAD + +static void initHandlerThread() +{ + static const int kExceptionHandlerThreadInitialStackSize = 64 * 1024; + + // Set synchronization primitives and the handler thread. Each + // ExceptionHandler object gets its own handler thread because that's the + // only way to reliably guarantee sufficient stack space in an exception, + // and it allows an easy way to get a snapshot of the requesting thread's + // context outside of an exception. + InitializeCriticalSection(&s_handler_critical_section); + + s_handler_start_semaphore = CreateSemaphore(NULL, 0, 1, NULL); + nvDebugCheck(s_handler_start_semaphore != NULL); + + s_handler_finish_semaphore = CreateSemaphore(NULL, 0, 1, NULL); + nvDebugCheck(s_handler_finish_semaphore != NULL); + + // Don't attempt to create the thread if we could not create the semaphores. + if (s_handler_finish_semaphore != NULL && s_handler_start_semaphore != NULL) { + DWORD thread_id; + s_handler_thread = CreateThread(NULL, // lpThreadAttributes + kExceptionHandlerThreadInitialStackSize, + ExceptionHandlerThreadMain, + NULL, // lpParameter + 0, // dwCreationFlags + &thread_id); + nvDebugCheck(s_handler_thread != NULL); + } + + /* @@ We should avoid loading modules in the exception handler! + dbghelp_module_ = LoadLibrary(L"dbghelp.dll"); + if (dbghelp_module_) { + minidump_write_dump_ = reinterpret_cast(GetProcAddress(dbghelp_module_, "MiniDumpWriteDump")); + } + */ +} + +static void shutHandlerThread() { + // @@ Free stuff. Terminate thread. } +#endif // NV_USE_SEPARATE_THREAD +#endif // NV_OS_WIN32 + -/// Enable signal handler. -void debug::enableSigHandler() +// Enable signal handler. 
+void debug::enableSigHandler(bool interactive) { - nvCheck(s_sig_handler_enabled != true); - s_sig_handler_enabled = true; - + nvCheck(s_sig_handler_enabled != true); + s_sig_handler_enabled = true; + s_interactive = interactive; + #if NV_OS_WIN32 && NV_CC_MSVC - - s_old_exception_filter = ::SetUnhandledExceptionFilter( nvTopLevelFilter ); - + if (interactive) { + // Do not display message boxes on error. + // http://msdn.microsoft.com/en-us/library/windows/desktop/ms680621(v=vs.85).aspx + SetErrorMode(SEM_FAILCRITICALERRORS|SEM_NOGPFAULTERRORBOX|SEM_NOOPENFILEERRORBOX); + + // CRT reports errors to debug output only. + // http://msdn.microsoft.com/en-us/library/1y71x448(v=vs.80).aspx + _CrtSetReportMode(_CRT_WARN, _CRTDBG_MODE_DEBUG); + _CrtSetReportMode(_CRT_ERROR, _CRTDBG_MODE_DEBUG); + _CrtSetReportMode(_CRT_ASSERT, _CRTDBG_MODE_DEBUG); + } + + +#if NV_USE_SEPARATE_THREAD + initHandlerThread(); +#endif + + s_old_exception_filter = ::SetUnhandledExceptionFilter( handleException ); + +#if _MSC_VER >= 1400 // MSVC 2005/8 + _set_invalid_parameter_handler(handleInvalidParameter); +#endif // _MSC_VER >= 1400 + + _set_purecall_handler(handlePureVirtualCall); + + + // SYMOPT_DEFERRED_LOADS make us not take a ton of time unless we actual log traces + SymSetOptions(SYMOPT_DEFERRED_LOADS|SYMOPT_FAIL_CRITICAL_ERRORS|SYMOPT_LOAD_LINES|SYMOPT_UNDNAME); + + if (!SymInitialize(GetCurrentProcess(), NULL, TRUE)) { + DWORD error = GetLastError(); + nvDebug("SymInitialize returned error : %d\n", error); + } + #elif !NV_OS_WIN32 && defined(HAVE_SIGNAL_H) - - // Install our signal handler - struct sigaction sa; - sa.sa_sigaction = nvSigHandler; - sigemptyset (&sa.sa_mask); - sa.sa_flags = SA_ONSTACK | SA_RESTART | SA_SIGINFO; - - sigaction(SIGSEGV, &sa, &s_old_sigsegv); - sigaction(SIGTRAP, &sa, &s_old_sigtrap); - sigaction(SIGFPE, &sa, &s_old_sigfpe); - sigaction(SIGBUS, &sa, &s_old_sigbus); - + + // Install our signal handler + struct sigaction sa; + sa.sa_sigaction = nvSigHandler; + sigemptyset (&sa.sa_mask); + sa.sa_flags = SA_ONSTACK | SA_RESTART | SA_SIGINFO; + + sigaction(SIGSEGV, &sa, &s_old_sigsegv); + sigaction(SIGTRAP, &sa, &s_old_sigtrap); + sigaction(SIGFPE, &sa, &s_old_sigfpe); + sigaction(SIGBUS, &sa, &s_old_sigbus); + #endif } /// Disable signal handler. 
void debug::disableSigHandler() { - nvCheck(s_sig_handler_enabled == true); - s_sig_handler_enabled = false; + nvCheck(s_sig_handler_enabled == true); + s_sig_handler_enabled = false; #if NV_OS_WIN32 && NV_CC_MSVC - ::SetUnhandledExceptionFilter( s_old_exception_filter ); - s_old_exception_filter = NULL; + ::SetUnhandledExceptionFilter( s_old_exception_filter ); + s_old_exception_filter = NULL; + + SymCleanup(GetCurrentProcess()); #elif !NV_OS_WIN32 && defined(HAVE_SIGNAL_H) - - sigaction(SIGSEGV, &s_old_sigsegv, NULL); - sigaction(SIGTRAP, &s_old_sigtrap, NULL); - sigaction(SIGFPE, &s_old_sigfpe, NULL); - sigaction(SIGBUS, &s_old_sigbus, NULL); - + + sigaction(SIGSEGV, &s_old_sigsegv, NULL); + sigaction(SIGTRAP, &s_old_sigtrap, NULL); + sigaction(SIGFPE, &s_old_sigfpe, NULL); + sigaction(SIGBUS, &s_old_sigbus, NULL); + #endif } + +bool debug::isDebuggerPresent() +{ +#if NV_OS_WIN32 + HINSTANCE kernel32 = GetModuleHandleA("kernel32.dll"); + if (kernel32) { + FARPROC IsDebuggerPresent = GetProcAddress(kernel32, "IsDebuggerPresent"); + if (IsDebuggerPresent != NULL && IsDebuggerPresent()) { + return true; + } + } + return false; +#elif NV_OS_XBOX +#ifdef _DEBUG + return DmIsDebuggerPresent() == TRUE; +#else + return false; +#endif +#elif NV_OS_ORBIS + #if PS4_FINAL_REQUIREMENTS + return false; + #else + return sceDbgIsDebuggerAttached() == 1; + #endif +#elif NV_OS_DARWIN + int mib[4]; + struct kinfo_proc info; + size_t size; + mib[0] = CTL_KERN; + mib[1] = KERN_PROC; + mib[2] = KERN_PROC_PID; + mib[3] = getpid(); + size = sizeof(info); + info.kp_proc.p_flag = 0; + sysctl(mib,4,&info,&size,NULL,0); + return ((info.kp_proc.p_flag & P_TRACED) == P_TRACED); +#else + // if ppid != sid, some process spawned our app, probably a debugger. + return getsid(getpid()) != getppid(); +#endif +} + +bool debug::attachToDebugger() +{ +#if NV_OS_WIN32 + if (isDebuggerPresent() == FALSE) { + Path process(1024); + process.copy("\""); + GetSystemDirectoryA(process.str() + 1, 1024 - 1); + + process.appendSeparator(); + + process.appendFormat("VSJitDebugger.exe\" -p %lu", ::GetCurrentProcessId()); + + STARTUPINFOA sSi; + memset(&sSi, 0, sizeof(sSi)); + + PROCESS_INFORMATION sPi; + memset(&sPi, 0, sizeof(sPi)); + + BOOL b = CreateProcessA(NULL, process.str(), NULL, NULL, FALSE, 0, NULL, NULL, &sSi, &sPi); + if (b != FALSE) { + ::WaitForSingleObject(sPi.hProcess, INFINITE); + + DWORD dwExitCode; + ::GetExitCodeProcess(sPi.hProcess, &dwExitCode); + if (dwExitCode != 0) //if exit code is zero, a debugger was selected + b = FALSE; + } + + if (sPi.hThread != NULL) ::CloseHandle(sPi.hThread); + if (sPi.hProcess != NULL) ::CloseHandle(sPi.hProcess); + + if (b == FALSE) + return false; + + for (int i = 0; i < 5*60; i++) { + if (isDebuggerPresent()) + break; + ::Sleep(200); + } + } +#endif // NV_OS_WIN32 + + return true; +} Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/DefsGnucDarwin.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/DefsGnucDarwin.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/DefsGnucDarwin.h @@ -2,7 +2,8 @@ #error "Do not include this file directly." #endif -//#include // uint8_t, int8_t, ... +#include // uint8_t, int8_t, ... 
uintptr_t +#include // operator new, size_t, NULL // Function linkage #define DLL_IMPORT @@ -24,8 +25,9 @@ #endif #define NV_FASTCALL __attribute__((fastcall)) -#define NV_FORCEINLINE __attribute__((always_inline)) +#define NV_FORCEINLINE __attribute__((always_inline)) inline #define NV_DEPRECATED __attribute__((deprecated)) +#define NV_THREAD_LOCAL //ACS: there's no "__thread" or equivalent on iOS/OSX #if __GNUC__ > 2 #define NV_PURE __attribute__((pure)) @@ -35,6 +37,8 @@ #define NV_CONST #endif +#define NV_NOINLINE __attribute__((noinline)) + // Define __FUNC__ properly. #if __STDC_VERSION__ < 199901L # if __GNUC__ >= 2 @@ -47,21 +51,3 @@ #endif #define restrict __restrict__ - -/* -// Type definitions -typedef uint8_t uint8; -typedef int8_t int8; - -typedef uint16_t uint16; -typedef int16_t int16; - -typedef uint32_t uint32; -typedef int32_t int32; - -typedef uint64_t uint64; -typedef int64_t int64; - -// Aliases -typedef uint32 uint; -*/ Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/DefsGnucLinux.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/DefsGnucLinux.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/DefsGnucLinux.h @@ -2,29 +2,38 @@ #error "Do not include this file directly." #endif +#include // uint8_t, int8_t, ... uintptr_t +#include // operator new, size_t, NULL + // Function linkage #define DLL_IMPORT #if __GNUC__ >= 4 -# define DLL_EXPORT __attribute__((visibility("default"))) -# define DLL_EXPORT_CLASS DLL_EXPORT +# define DLL_EXPORT __attribute__((visibility("default"))) +# define DLL_EXPORT_CLASS DLL_EXPORT #else -# define DLL_EXPORT -# define DLL_EXPORT_CLASS +# define DLL_EXPORT +# define DLL_EXPORT_CLASS #endif // Function calling modes #if NV_CPU_X86 -# define NV_CDECL __attribute__((cdecl)) -# define NV_STDCALL __attribute__((stdcall)) +# define NV_CDECL __attribute__((cdecl)) +# define NV_STDCALL __attribute__((stdcall)) #else -# define NV_CDECL -# define NV_STDCALL +# define NV_CDECL +# define NV_STDCALL #endif #define NV_FASTCALL __attribute__((fastcall)) -#define NV_FORCEINLINE __attribute__((always_inline)) +//#if __GNUC__ > 3 +// It seems that GCC does not assume always_inline implies inline. I think this depends on the GCC version :( +#define NV_FORCEINLINE inline __attribute__((always_inline)) +//#else +// Some compilers complain that inline and always_inline are redundant. +//#define NV_FORCEINLINE __attribute__((always_inline)) +//#endif #define NV_DEPRECATED __attribute__((deprecated)) - +#define NV_THREAD_LOCAL __thread #if __GNUC__ > 2 #define NV_PURE __attribute__((pure)) @@ -34,33 +43,17 @@ #define NV_CONST #endif +#define NV_NOINLINE __attribute__((noinline)) + // Define __FUNC__ properly. 
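// Illustrative sketch (not part of the patch): the DefsGnucLinux.h comment above is why
// NV_FORCEINLINE now pairs the attribute with the inline keyword; on GCC/Clang
// __attribute__((always_inline)) does not by itself make the function inline.
// Minimal sketch with a made-up macro and function name:
#define EXAMPLE_FORCEINLINE inline __attribute__((always_inline))

EXAMPLE_FORCEINLINE int addOneExample(int x) { return x + 1; }   // expanded at every call site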
#if __STDC_VERSION__ < 199901L -# if __GNUC__ >= 2 -# define __FUNC__ __PRETTY_FUNCTION__ // __FUNCTION__ -# else -# define __FUNC__ "" -# endif +# if __GNUC__ >= 2 +# define __FUNC__ __PRETTY_FUNCTION__ // __FUNCTION__ +# else +# define __FUNC__ "" +# endif #else -# define __FUNC__ __PRETTY_FUNCTION__ +# define __FUNC__ __PRETTY_FUNCTION__ #endif #define restrict __restrict__ - -/* -// Type definitions -typedef unsigned char uint8; -typedef signed char int8; - -typedef unsigned short uint16; -typedef signed short int16; - -typedef unsigned int uint32; -typedef signed int int32; - -typedef unsigned long long uint64; -typedef signed long long int64; - -// Aliases -typedef uint32 uint; -*/ Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/DefsGnucWin32.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/DefsGnucWin32.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/DefsGnucWin32.h @@ -2,6 +2,8 @@ #error "Do not include this file directly." #endif +//#include // size_t, NULL + // Function linkage #define DLL_IMPORT __declspec(dllimport) #define DLL_EXPORT __declspec(dllexport) @@ -28,6 +30,8 @@ #define NV_CONST #endif +#define NV_NOINLINE __attribute__((noinline)) + // Define __FUNC__ properly. #if __STDC_VERSION__ < 199901L # if __GNUC__ >= 2 @@ -58,3 +62,4 @@ // Aliases typedef uint32 uint; */ + Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/DefsVcWin32.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/DefsVcWin32.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/DefsVcWin32.h @@ -1,3 +1,5 @@ +// This code is in the public domain -- Ignacio Castaño + #ifndef NV_CORE_H #error "Do not include this file directly." #endif @@ -11,22 +13,28 @@ #define NV_CDECL __cdecl #define NV_STDCALL __stdcall #define NV_FASTCALL __fastcall -#define NV_FORCEINLINE __forceinline #define NV_DEPRECATED #define NV_PURE #define NV_CONST // Set standard function names. -#define snprintf _snprintf +#if _MSC_VER < 1900 +# define snprintf _snprintf +#endif #if _MSC_VER < 1500 -# define vsnprintf _vsnprintf +# define vsnprintf _vsnprintf +#endif +#if _MSC_VER < 1700 +# define strtoll _strtoi64 +# define strtoull _strtoui64 #endif -#define vsscanf _vsscanf #define chdir _chdir #define getcwd _getcwd -#define va_copy(a, b) a = b +#if _MSC_VER < 1800 // Not sure what version introduced this. +#define va_copy(a, b) (a) = (b) +#endif #if !defined restrict #define restrict @@ -39,6 +47,13 @@ #define __FUNC__ __FUNCTION__ #endif +#define NV_NOINLINE __declspec(noinline) +#define NV_FORCEINLINE __forceinline + +#define NV_THREAD_LOCAL __declspec(thread) + +#include + /* // Type definitions typedef unsigned char uint8; @@ -59,20 +74,23 @@ // Unwanted VC++ warnings to disable. 
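// Illustrative sketch (not part of the patch): NV_THREAD_LOCAL is defined per platform in
// the headers above (__thread on Linux/GCC, __declspec(thread) on MSVC, empty on Darwin per
// the DefsGnucDarwin.h comment, where the variable silently falls back to being shared).
// Hedged usage sketch with a made-up counter:
static NV_THREAD_LOCAL int s_exampleAllocCount = 0;   // one instance per thread, except on Darwin

static void noteExampleAlloc() { s_exampleAllocCount++; }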
/* -#pragma warning(disable : 4244) // conversion to float, possible loss of data -#pragma warning(disable : 4245) // conversion from 'enum ' to 'unsigned long', signed/unsigned mismatch -#pragma warning(disable : 4100) // unreferenced formal parameter -#pragma warning(disable : 4514) // unreferenced inline function has been removed -#pragma warning(disable : 4710) // inline function not expanded -#pragma warning(disable : 4127) // Conditional expression is constant -#pragma warning(disable : 4305) // truncation from 'const double' to 'float' -#pragma warning(disable : 4505) // unreferenced local function has been removed - -#pragma warning(disable : 4702) // unreachable code in inline expanded function -#pragma warning(disable : 4711) // function selected for automatic inlining -#pragma warning(disable : 4725) // Pentium fdiv bug +#pragma warning(disable : 4244) // conversion to float, possible loss of data +#pragma warning(disable : 4245) // conversion from 'enum ' to 'unsigned long', signed/unsigned mismatch +#pragma warning(disable : 4100) // unreferenced formal parameter +#pragma warning(disable : 4514) // unreferenced inline function has been removed +#pragma warning(disable : 4710) // inline function not expanded +#pragma warning(disable : 4127) // Conditional expression is constant +#pragma warning(disable : 4305) // truncation from 'const double' to 'float' +#pragma warning(disable : 4505) // unreferenced local function has been removed + +#pragma warning(disable : 4702) // unreachable code in inline expanded function +#pragma warning(disable : 4711) // function selected for automatic inlining +#pragma warning(disable : 4725) // Pentium fdiv bug -#pragma warning(disable : 4786) // Identifier was truncated and cannot be debugged. +#pragma warning(disable : 4786) // Identifier was truncated and cannot be debugged. -#pragma warning(disable : 4675) // resolved overload was found by argument-dependent lookup +#pragma warning(disable : 4675) // resolved overload was found by argument-dependent lookup */ + +#pragma warning(1 : 4705) // Report unused local variables. +#pragma warning(1 : 4555) // Expression has no effect. Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/FileSystem.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/FileSystem.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/FileSystem.h @@ -0,0 +1,24 @@ +// This code is in the public domain -- castano@gmail.com + +#pragma once +#ifndef NV_CORE_FILESYSTEM_H +#define NV_CORE_FILESYSTEM_H + +#include "nvcore.h" + +namespace nv +{ + + namespace FileSystem + { + NVCORE_API bool exists(const char * path); + NVCORE_API bool createDirectory(const char * path); + NVCORE_API bool changeDirectory(const char * path); + NVCORE_API bool removeFile(const char * path); + + } // FileSystem namespace + +} // nv namespace + + +#endif // NV_CORE_FILESYSTEM_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/FileSystem.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/FileSystem.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/FileSystem.cpp @@ -0,0 +1,80 @@ +// This code is in the public domain -- castano@gmail.com + +#include "FileSystem.h" + +#if NV_OS_WIN32 +#define _CRT_NONSTDC_NO_WARNINGS // _chdir is defined deprecated, but that's a bug, chdir is deprecated, _chdir is *not*. 
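// Illustrative sketch (not part of the patch): the new FileSystem.h above declares thin
// portable wrappers over the native file APIs. Hedged usage sketch; the directory name
// and calling code are made up:
#include "FileSystem.h"

static bool ensureExampleOutputDir()
{
    if (nv::FileSystem::exists("output"))
        return true;
    return nv::FileSystem::createDirectory("output");
}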
+//#include // PathFileExists +#include // GetFileAttributes +#include // _mkdir +#elif NV_OS_XBOX +#include +#elif NV_OS_ORBIS +#include +#else +#include +#include +#include +#endif +#include // remove, unlink + +using namespace nv; + + +bool FileSystem::exists(const char * path) +{ +#if NV_OS_UNIX + return access(path, F_OK|R_OK) == 0; + //struct stat buf; + //return stat(path, &buf) == 0; +#elif NV_OS_WIN32 || NV_OS_XBOX + // PathFileExists requires linking to shlwapi.lib + //return PathFileExists(path) != 0; + return GetFileAttributesA(path) != INVALID_FILE_ATTRIBUTES; +#elif NV_OS_ORBIS + const int BUFFER_SIZE = 2048; + char file_fullpath[BUFFER_SIZE]; + snprintf(file_fullpath, BUFFER_SIZE, "/app0/%s", path); + return sceFiosExistsSync(NULL, file_fullpath); +#else + if (FILE * fp = fopen(path, "r")) + { + fclose(fp); + return true; + } + return false; +#endif +} + +bool FileSystem::createDirectory(const char * path) +{ +#if NV_OS_WIN32 || NV_OS_XBOX + return CreateDirectoryA(path, NULL) != 0; +#elif NV_OS_ORBIS + // not implemented + return false; +#else + return mkdir(path, 0777) != -1; +#endif +} + +bool FileSystem::changeDirectory(const char * path) +{ +#if NV_OS_WIN32 + return _chdir(path) != -1; +#elif NV_OS_XBOX + // Xbox doesn't support Current Working Directory! + return false; +#elif NV_OS_ORBIS + // Orbis doesn't support Current Working Directory! + return false; +#else + return chdir(path) != -1; +#endif +} + +bool FileSystem::removeFile(const char * path) +{ + // @@ Use unlink or remove? + return remove(path) == 0; +} Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/ForEach.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/ForEach.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/ForEach.h @@ -0,0 +1,68 @@ +// This code is in the public domain -- Ignacio Castaño + +#pragma once +#ifndef NV_CORE_FOREACH_H +#define NV_CORE_FOREACH_H + +/* +These foreach macros are very non-standard and somewhat confusing, but I like them. +*/ + +#include "nvcore.h" + +#if NV_CC_GNUC // If typeof or decltype is available: +#if !NV_CC_CPP11 +# define NV_DECLTYPE typeof // Using a non-standard extension over typeof that behaves as C++11 decltype +#else +# define NV_DECLTYPE decltype +#endif + +/* +Ideally we would like to write this: + +#define NV_FOREACH(i, container) \ + for(NV_DECLTYPE(container)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i)) + +But gcc versions prior to 4.7 required an intermediate type. See: +https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6709 +*/ + +#define NV_FOREACH(i, container) \ + typedef NV_DECLTYPE(container) NV_STRING_JOIN2(cont,__LINE__); \ + for(NV_STRING_JOIN2(cont,__LINE__)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i)) + +#else // If typeof not available: + +#include // placement new + +struct PseudoIndexWrapper { + template + PseudoIndexWrapper(const T & container) { + nvStaticCheck(sizeof(typename T::PseudoIndex) <= sizeof(memory)); + new (memory) typename T::PseudoIndex(container.start()); + } + // PseudoIndex cannot have a dtor! + + template typename T::PseudoIndex & operator()(const T * /*container*/) { + return *reinterpret_cast(memory); + } + template const typename T::PseudoIndex & operator()(const T * /*container*/) const { + return *reinterpret_cast(memory); + } + + uint8 memory[4]; // Increase the size if we have bigger enumerators. 
+}; + +#define NV_FOREACH(i, container) \ + for(PseudoIndexWrapper i(container); !(container).isDone(i(&(container))); (container).advance(i(&(container)))) + +#endif + +// Declare foreach keyword. +#if !defined NV_NO_USE_KEYWORDS +# define foreach NV_FOREACH +# define foreach_index NV_FOREACH +#endif + + +#endif // NV_CORE_FOREACH_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Hash.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Hash.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Hash.h @@ -0,0 +1,83 @@ +// This code is in the public domain -- Ignacio Castaño + +#pragma once +#ifndef NV_CORE_HASH_H +#define NV_CORE_HASH_H + +#include "nvcore.h" + +namespace nv +{ + inline uint sdbmHash(const void * data_in, uint size, uint h = 5381) + { + const uint8 * data = (const uint8 *) data_in; + uint i = 0; + while (i < size) { + h = (h << 16) + (h << 6) - h + (uint) data[i++]; + } + return h; + } + + // Note that this hash does not handle NaN properly. + inline uint sdbmFloatHash(const float * f, uint count, uint h = 5381) + { + for (uint i = 0; i < count; i++) { + //nvDebugCheck(nv::isFinite(*f)); + union { float f; uint32 i; } x = { f[i] }; + if (x.i == 0x80000000) x.i = 0; + h = sdbmHash(&x, 4, h); + } + return h; + } + + + template + inline uint hash(const T & t, uint h = 5381) + { + return sdbmHash(&t, sizeof(T), h); + } + + template <> + inline uint hash(const float & f, uint h) + { + return sdbmFloatHash(&f, 1, h); + } + + + // Functors for hash table: + template struct Hash + { + uint operator()(const Key & k) const { + return hash(k); + } + }; + + template struct Equal + { + bool operator()(const Key & k0, const Key & k1) const { + return k0 == k1; + } + }; + + + // @@ Move to Utils.h? 
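// Illustrative sketch (not part of the patch): usage of the sdbm hash helpers above;
// ExampleKey is a made-up POD type and no particular hash value is assumed.
struct ExampleKey { int a; int b; };

static unsigned int hashExamples()
{
    ExampleKey k = { 1, 2 };
    unsigned int h1 = nv::hash(k);              // sdbmHash over the raw bytes of k, seed 5381
    float f = -0.0f;
    unsigned int h2 = nv::sdbmFloatHash(&f, 1); // -0.0f is remapped so it hashes like +0.0f
    return h1 ^ h2;
}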
+ template + struct Pair { + T1 first; + T2 second; + }; + + template + bool operator==(const Pair & p0, const Pair & p1) { + return p0.first == p1.first && p0.second == p1.second; + } + + template + uint hash(const Pair & p, uint h = 5381) { + return hash(p.second, hash(p.first)); + } + + +} // nv namespace + +#endif // NV_CORE_HASH_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Library.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Library.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Library.h @@ -1,50 +0,0 @@ -// This code is in the public domain -- castano@gmail.com - -#ifndef NV_CORE_LIBRARY_H -#define NV_CORE_LIBRARY_H - -#include - -#if NV_OS_WIN32 -#define LIBRARY_NAME(name) #name ".dll" -#elif NV_OS_DARWIN -#define NV_LIBRARY_NAME(name) "lib" #name ".dylib" -#else -#define NV_LIBRARY_NAME(name) "lib" #name ".so" -#endif - -NVCORE_API void * nvLoadLibrary(const char * name); -NVCORE_API void nvUnloadLibrary(void * lib); -NVCORE_API void * nvBindSymbol(void * lib, const char * symbol); - -class NVCORE_CLASS Library -{ -public: - Library(const char * name) - { - handle = nvLoadLibrary(name); - } - ~Library() - { - if (isValid()) - { - nvUnloadLibrary(handle); - } - } - - bool isValid() const - { - return handle != NULL; - } - - void * bindSymbol(const char * symbol) - { - return nvBindSymbol(handle, symbol); - } - -private: - void * handle; -}; - - -#endif // NV_CORE_LIBRARY_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Library.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Library.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Library.cpp @@ -1,41 +0,0 @@ - -#include "Library.h" -#include "Debug.h" - -#if NV_OS_WIN32 -#define WIN32_LEAN_AND_MEAN -#define VC_EXTRALEAN -#include -#else -#include -#endif - - - -void * nvLoadLibrary(const char * name) -{ -#if NV_OS_WIN32 - return (void *)LoadLibraryExA( name, NULL, 0 ); -#else - return dlopen(name, RTLD_LAZY); -#endif -} - -void nvUnloadLibrary(void * handle) -{ - nvDebugCheck(handle != NULL); -#if NV_OS_WIN32 - FreeLibrary((HMODULE)handle); -#else - dlclose(handle); -#endif -} - -void * nvBindSymbol(void * handle, const char * symbol) -{ -#if NV_OS_WIN32 - return (void *)GetProcAddress((HMODULE)handle, symbol); -#else - return (void *)dlsym(handle, symbol); -#endif -} Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Memory.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Memory.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Memory.h @@ -7,9 +7,17 @@ #include "nvcore.h" #include // malloc(), realloc() and free() -#include // size_t +//#include // size_t + +//#include // new and delete + + +#if NV_CC_GNUC +# define NV_ALIGN_16 __attribute__ ((__aligned__ (16))) +#else +# define NV_ALIGN_16 __declspec(align(16)) +#endif -#include // new and delete #define NV_OVERRIDE_ALLOC 0 @@ -35,18 +43,22 @@ namespace nv { // C++ helpers. 
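// Illustrative sketch (not part of the patch): NV_ALIGN_16, added to Memory.h above,
// expands to __attribute__((__aligned__(16))) on GCC and __declspec(align(16)) otherwise.
// Hedged usage sketch with a made-up buffer, e.g. for SSE-friendly storage:
NV_ALIGN_16 static float s_exampleSimdBuffer[4] = { 0.0f, 0.0f, 0.0f, 0.0f };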
- template T * malloc(size_t count) { + template NV_FORCEINLINE T * malloc(size_t count) { return (T *)::malloc(sizeof(T) * count); } - template T * realloc(T * ptr, size_t count) { + template NV_FORCEINLINE T * realloc(T * ptr, size_t count) { return (T *)::realloc(ptr, sizeof(T) * count); } - template void free(const T * ptr) { + template NV_FORCEINLINE void free(const T * ptr) { ::free((void *)ptr); } + template NV_FORCEINLINE void zero(T & data) { + memset(&data, 0, sizeof(T)); + } + } // nv namespace #endif // NV_CORE_MEMORY_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Memory.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Memory.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Memory.cpp @@ -114,5 +114,6 @@ #endif // 0 - #endif // NV_OVERRIDE_ALLOC + + Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Prefetch.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Prefetch.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Prefetch.h @@ -1,31 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#ifndef NV_CORE_PREFETCH_H -#define NV_CORE_PREFETCH_H - -#include - -// nvPrefetch -#if NV_CC_GNUC - -#define nvPrefetch(ptr) __builtin_prefetch(ptr) - -#elif NV_CC_MSVC - -#if NV_CPU_X86 -__forceinline void nvPrefetch(const void * mem) -{ - __asm mov ecx, mem - __asm prefetcht0 [ecx]; -// __asm prefetchnta [ecx]; -} -#endif // NV_CPU_X86 - -#else // NV_CC_MSVC - -// do nothing in other case. -#define nvPrefetch(ptr) - -#endif // NV_CC_MSVC - -#endif // NV_CORE_PREFETCH_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Ptr.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Ptr.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Ptr.h @@ -1,363 +1,321 @@ -// This code is in the public domain -- castanyo@yahoo.es +// This code is in the public domain -- Ignacio Castaño #ifndef NV_CORE_PTR_H #define NV_CORE_PTR_H -#include -#include +#include "nvcore.h" +#include "Debug.h" -#include // NULL +#include "RefCounted.h" namespace nv { - -/** Simple auto pointer template class. - * - * This is very similar to the standard auto_ptr class, but with some - * additional limitations to make its use less error prone: - * - Copy constructor and assignment operator are disabled. - * - reset method is removed. - * - * The semantics of the standard auto_ptr are not clear and change depending - * on the std implementation. For a discussion of the problems of auto_ptr read: - * http://www.awprofessional.com/content/images/020163371X/autoptrupdate\auto_ptr_update.html - */ -template -class AutoPtr -{ - NV_FORBID_COPY(AutoPtr); - NV_FORBID_HEAPALLOC(); -public: - - /// Default ctor. - AutoPtr() : m_ptr(NULL) { } - - /// Ctor. - explicit AutoPtr( T * p ) : m_ptr(p) { } - - /** Dtor. Deletes owned pointer. */ - ~AutoPtr() { - delete m_ptr; - m_ptr = NULL; - } - - /** Delete owned pointer and assign new one. */ - void operator=( T * p ) { - if (p != m_ptr) - { - delete m_ptr; - m_ptr = p; - } - } - - /** Member access. */ - T * operator -> () const { - nvDebugCheck(m_ptr != NULL); - return m_ptr; - } - - /** Get reference. */ - T & operator*() const { - nvDebugCheck(m_ptr != NULL); - return *m_ptr; - } - - /** Get pointer. */ - T * ptr() const { return m_ptr; } - - /** Relinquish ownership of the underlying pointer and returns that pointer. 
*/ - T * release() { - T * tmp = m_ptr; - m_ptr = NULL; - return tmp; - } - - /** Const pointer equal comparation. */ - friend bool operator == (const AutoPtr & ap, const T * const p) { - return (ap.ptr() == p); - } - - /** Const pointer nequal comparation. */ - friend bool operator != (const AutoPtr & ap, const T * const p) { - return (ap.ptr() != p); - } - - /** Const pointer equal comparation. */ - friend bool operator == (const T * const p, const AutoPtr & ap) { - return (ap.ptr() == p); - } - - /** Const pointer nequal comparation. */ - friend bool operator != (const T * const p, const AutoPtr & ap) { - return (ap.ptr() != p); - } - -private: - T * m_ptr; -}; - -#if 0 -/** Reference counted base class to be used with Pointer. - * - * The only requirement of the Pointer class is that the RefCounted class implements the - * addRef and release methods. - */ -class RefCounted -{ - NV_FORBID_COPY(RefCounted); -public: + class WeakProxy; + + /** Simple auto pointer template class. + * + * This is very similar to the standard auto_ptr class, but with some + * additional limitations to make its use less error prone: + * - Copy constructor and assignment operator are disabled. + * - reset method is removed. + * + * The semantics of the standard auto_ptr are not clear and change depending + * on the std implementation. For a discussion of the problems of auto_ptr read: + * http://www.awprofessional.com/content/images/020163371X/autoptrupdate\auto_ptr_update.html + */ + template + class AutoPtr + { + NV_FORBID_COPY(AutoPtr); + NV_FORBID_HEAPALLOC(); + public: + + /// Ctor. + AutoPtr(T * p = NULL) : m_ptr(p) { } + + template + AutoPtr(Q * p) : m_ptr(static_cast(p)) { } + + /// Dtor. Deletes owned pointer. + ~AutoPtr() { + delete m_ptr; + m_ptr = NULL; + } + + /// Delete owned pointer and assign new one. + void operator=( T * p ) { + if (p != m_ptr) + { + delete m_ptr; + m_ptr = p; + } + } + + template + void operator=( Q * p ) { + if (p != m_ptr) + { + delete m_ptr; + m_ptr = static_cast(p); + } + } + + /// Member access. + T * operator -> () const { + nvDebugCheck(m_ptr != NULL); + return m_ptr; + } + + /// Get reference. + T & operator*() const { + nvDebugCheck(m_ptr != NULL); + return *m_ptr; + } + + /// Get pointer. + T * ptr() const { return m_ptr; } + + /// Relinquish ownership of the underlying pointer and returns that pointer. + T * release() { + T * tmp = m_ptr; + m_ptr = NULL; + return tmp; + } + + /// Const pointer equal comparation. + friend bool operator == (const AutoPtr & ap, const T * const p) { + return (ap.ptr() == p); + } + + /// Const pointer nequal comparation. + friend bool operator != (const AutoPtr & ap, const T * const p) { + return (ap.ptr() != p); + } + + /// Const pointer equal comparation. + friend bool operator == (const T * const p, const AutoPtr & ap) { + return (ap.ptr() == p); + } + + /// Const pointer nequal comparation. + friend bool operator != (const T * const p, const AutoPtr & ap) { + return (ap.ptr() != p); + } + + private: + T * m_ptr; + }; + + + /// Smart pointer template class. + template + class SmartPtr { + public: + + // BaseClass must implement addRef() and release(). + typedef SmartPtr ThisType; + + /// Default ctor. + SmartPtr() : m_ptr(NULL) + { + } + + /// Other type assignment. + template + SmartPtr( const SmartPtr & tc ) + { + m_ptr = static_cast( tc.ptr() ); + if (m_ptr) { + m_ptr->addRef(); + } + } + + /// Copy ctor. + SmartPtr( const ThisType & bc ) + { + m_ptr = bc.ptr(); + if (m_ptr) { + m_ptr->addRef(); + } + } + + /// Copy cast ctor. 
SmartPtr(NULL) is valid. + explicit SmartPtr( BaseClass * bc ) + { + m_ptr = bc; + if (m_ptr) { + m_ptr->addRef(); + } + } + + /// Dtor. + ~SmartPtr() + { + set(NULL); + } + + + /// -> operator. + BaseClass * operator -> () const + { + nvCheck( m_ptr != NULL ); + return m_ptr; + } + + /// * operator. + BaseClass & operator*() const + { + nvCheck( m_ptr != NULL ); + return *m_ptr; + } + + /// Get pointer. + BaseClass * ptr() const + { + return m_ptr; + } + + /// Other type assignment. + template + void operator = ( const SmartPtr & tc ) + { + set( static_cast(tc.ptr()) ); + } + + /// This type assignment. + void operator = ( const ThisType & bc ) + { + set( bc.ptr() ); + } + + /// Pointer assignment. + void operator = ( BaseClass * bc ) + { + set( bc ); + } + + + /// Other type equal comparation. + template + bool operator == ( const SmartPtr & other ) const + { + return m_ptr == other.ptr(); + } + + /// This type equal comparation. + bool operator == ( const ThisType & bc ) const + { + return m_ptr == bc.ptr(); + } + + /// Const pointer equal comparation. + bool operator == ( const BaseClass * const bc ) const + { + return m_ptr == bc; + } + + /// Other type not equal comparation. + template + bool operator != ( const SmartPtr & other ) const + { + return m_ptr != other.ptr(); + } + + /// Other type not equal comparation. + bool operator != ( const ThisType & bc ) const + { + return m_ptr != bc.ptr(); + } + + /// Const pointer not equal comparation. + bool operator != (const BaseClass * const bc) const + { + return m_ptr != bc; + } + + /// This type lower than comparation. + bool operator < (const ThisType & p) const + { + return m_ptr < p.ptr(); + } + + bool isValid() const { + return isValidPtr(m_ptr); + } + + private: + + // Set this pointer. + void set( BaseClass * p ) + { + if (p) p->addRef(); + if (m_ptr) m_ptr->release(); + m_ptr = p; + } + + private: + + BaseClass * m_ptr; + + }; + + + /// Smart pointer template class. + template + class WeakPtr { + public: + + WeakPtr() {} + + WeakPtr(T * p) { operator=(p); } + WeakPtr(const SmartPtr & p) { operator=(p.ptr()); } + + // Default constructor and assignment from weak_ptr are OK. + + void operator=(T * p) + { + if (p) { + m_proxy = p->getWeakProxy(); + nvDebugCheck(m_proxy != NULL); + nvDebugCheck(m_proxy->ptr() == p); + } + else { + m_proxy = NULL; + } + } + + void operator=(const SmartPtr & ptr) { operator=(ptr.ptr()); } + + bool operator==(const SmartPtr & p) const { return ptr() == p.ptr(); } + bool operator!=(const SmartPtr & p) const { return ptr() != p.ptr(); } + + bool operator==(const WeakPtr & p) const { return ptr() == p.ptr(); } + bool operator!=(const WeakPtr & p) const { return ptr() != p.ptr(); } + + bool operator==(T * p) const { return ptr() == p; } + bool operator!=(T * p) const { return ptr() != p; } + + T * operator->() const + { + T * p = ptr(); + nvDebugCheck(p != NULL); + return p; + } + + T * ptr() const + { + if (m_proxy != NULL) { + return static_cast(m_proxy->ptr()); + } + return NULL; + } - /// Ctor. - RefCounted() : m_count(0), m_weak_proxy(NULL) - { - s_total_obj_count++; - } - - /// Virtual dtor. - virtual ~RefCounted() - { - nvCheck( m_count == 0 ); - nvCheck( s_total_obj_count > 0 ); - s_total_obj_count--; - } - - - /// Increase reference count. - uint addRef() const - { - s_total_ref_count++; - m_count++; - return m_count; - } - - - /// Decrease reference count and remove when 0. 
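// Illustrative sketch (not part of the patch): how the SmartPtr/WeakPtr pair above
// cooperates with RefCounted and its WeakProxy (both defined further down in RefCounted.h);
// ExampleImage is a made-up type.
struct ExampleImage : public nv::RefCounted { int width, height; };

static void exampleOwnership()
{
    nv::SmartPtr<ExampleImage> strong(new ExampleImage);   // addRef -> count == 1
    nv::WeakPtr<ExampleImage>  weak(strong);               // observes it via the weak proxy
    strong = NULL;                                         // count drops to 0, object deleted
    // weak.ptr() now returns NULL: ~RefCounted notified the proxy that the object died.
}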
- uint release() const - { - nvCheck( m_count > 0 ); - - s_total_ref_count--; - m_count--; - if( m_count == 0 ) { - releaseWeakProxy(); - delete this; - return 0; - } - return m_count; - } - - /// Get weak proxy. - WeakProxy * getWeakProxy() const - { - if (m_weak_proxy == NULL) { - m_weak_proxy = new WeakProxy; - m_weak_proxy->AddRef(); - } - return m_weak_proxy; - } - - /// Release the weak proxy. - void releaseWeakProxy() const - { - if (m_weak_proxy != NULL) { - m_weak_proxy->NotifyObjectDied(); - m_weak_proxy->Release(); - m_weak_proxy = NULL; - } - } - - /** @name Debug methods: */ - //@{ - /// Get reference count. - int refCount() const - { - return m_count; - } - - /// Get total number of objects. - static int totalObjectCount() - { - return s_total_obj_count; - } - - /// Get total number of references. - static int totalReferenceCount() - { - return s_total_ref_count; - } - //@} - - -private: - - NVCORE_API static int s_total_ref_count; - NVCORE_API static int s_total_obj_count; - - mutable int m_count; - mutable WeakProxy * weak_proxy; - -}; -#endif - -/// Smart pointer template class. -template -class Pointer { -public: - - // BaseClass must implement addRef() and release(). - typedef Pointer ThisType; - - /// Default ctor. - Pointer() : m_ptr(NULL) - { - } - - /** Other type assignment. */ - template - Pointer( const Pointer & tc ) - { - m_ptr = static_cast( tc.ptr() ); - if( m_ptr ) { - m_ptr->addRef(); - } - } - - /** Copy ctor. */ - Pointer( const ThisType & bc ) - { - m_ptr = bc.ptr(); - if( m_ptr ) { - m_ptr->addRef(); - } - } - - /** Copy cast ctor. Pointer(NULL) is valid. */ - explicit Pointer( BaseClass * bc ) - { - m_ptr = bc; - if( m_ptr ) { - m_ptr->addRef(); - } - } - - /** Dtor. */ - ~Pointer() - { - set(NULL); - } - - - /** @name Accessors: */ - //@{ - /** -> operator. */ - BaseClass * operator -> () const - { - nvCheck( m_ptr != NULL ); - return m_ptr; - } - - /** * operator. */ - BaseClass & operator*() const - { - nvCheck( m_ptr != NULL ); - return *m_ptr; - } - - /** Get pointer. */ - BaseClass * ptr() const - { - return m_ptr; - } - //@} - - - /** @name Mutators: */ - //@{ - /** Other type assignment. */ - template - void operator = ( const Pointer & tc ) - { - set( static_cast(tc.ptr()) ); - } - - /** This type assignment. */ - void operator = ( const ThisType & bc ) - { - set( bc.ptr() ); - } - - /** Pointer assignment. */ - void operator = ( BaseClass * bc ) - { - set( bc ); - } - //@} - - - /** @name Comparators: */ - //@{ - /** Other type equal comparation. */ - template - bool operator == ( const Pointer & other ) const - { - return m_ptr == other.ptr(); - } - - /** This type equal comparation. */ - bool operator == ( const ThisType & bc ) const - { - return m_ptr == bc.ptr(); - } - - /** Const pointer equal comparation. */ - bool operator == ( const BaseClass * const bc ) const - { - return m_ptr == bc; - } - - /** Other type not equal comparation. */ - template - bool operator != ( const Pointer & other ) const - { - return m_ptr != other.ptr(); - } - - /** Other type not equal comparation. */ - bool operator != ( const ThisType & bc ) const - { - return m_ptr != bc.ptr(); - } - - /** Const pointer not equal comparation. */ - bool operator != (const BaseClass * const bc) const - { - return m_ptr != bc; - } - - /** This type lower than comparation. */ - bool operator < (const ThisType & p) const - { - return m_ptr < p.ptr(); - } - //@} - -private: - - /** Set this pointer. 
*/ - void set( BaseClass * p ) - { - if( m_ptr != p ) { - if( m_ptr ) m_ptr->release(); - if( p ) p->addRef(); - m_ptr = p; - } - } + private: -private: + mutable SmartPtr m_proxy; - BaseClass * m_ptr; + }; -}; } // nv namespace Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Radix.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Radix.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Radix.h @@ -1,69 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -/** - * Contains source code from the article "Radix Sort Revisited". - * \file Radix.h - * \author Pierre Terdiman - * \date April, 4, 2000 - */ -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Include Guard -#ifndef NV_CORE_RADIXSORT_H -#define NV_CORE_RADIXSORT_H - -#include - - -#define RADIX_LOCAL_RAM - - -class NVCORE_API RadixSort { - NV_FORBID_COPY(RadixSort); -public: - // Constructor/Destructor - RadixSort(); - ~RadixSort(); - - // Sorting methods - RadixSort & sort(const uint32* input, uint32 nb, bool signedvalues=true); - RadixSort & sort(const float* input, uint32 nb); - - //! Access to results. mIndices is a list of indices in sorted order, i.e. in the order you may further process your data - inline uint32 * indices() const { return mIndices; } - - //! mIndices2 gets trashed on calling the sort routine, but otherwise you can recycle it the way you want. - inline uint32 * recyclable() const { return mIndices2; } - - // Stats - uint32 usedRam() const; - - //! Returns the total number of calls to the radix sorter. - inline uint32 totalCalls() const { return mTotalCalls; } - - //! Returns the number of premature exits due to temporal coherence. - inline uint32 hits() const { return mNbHits; } - - - private: -#ifndef RADIX_LOCAL_RAM - uint32* mHistogram; //!< Counters for each byte - uint32* mOffset; //!< Offsets (nearly a cumulative distribution function) -#endif - uint32 mCurrentSize; //!< Current size of the indices list - uint32 mPreviousSize; //!< Size involved in previous call - uint32* mIndices; //!< Two lists, swapped each pass - uint32* mIndices2; - - // Stats - uint32 mTotalCalls; - uint32 mNbHits; - - // Internal methods - bool resize(uint32 nb); - void resetIndices(); - -}; - - -#endif // NV_CORE_RADIXSORT_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Radix.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Radix.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Radix.cpp @@ -1,429 +0,0 @@ -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -/** - * Contains source code from the article "Radix Sort Revisited". 
- * \file Radix.cpp - * \author Pierre Terdiman - * \date April, 4, 2000 - */ -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -/** - * Revisited Radix Sort. - * This is my new radix routine: - * - it uses indices and doesn't recopy the values anymore, hence wasting less ram - * - it creates all the histograms in one run instead of four - * - it sorts words faster than dwords and bytes faster than words - * - it correctly sorts negative floating-point values by patching the offsets - * - it automatically takes advantage of temporal coherence - * - multiple keys support is a side effect of temporal coherence - * - it may be worth recoding in asm... (mainly to use FCOMI, FCMOV, etc) [it's probably memory-bound anyway] - * - * History: - * - 08.15.98: very first version - * - 04.04.00: recoded for the radix article - * - 12.xx.00: code lifting - * - 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here) - * - 10.11.01: added local ram support - * - 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting...... - * - * \class RadixSort - * \author Pierre Terdiman - * \version 1.3 - * \date August, 15, 1998 - */ -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -/* -To do: - - add an offset parameter between two input values (avoid some data recopy sometimes) - - unroll ? asm ? -*/ - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// Header - -#include - -#include // memset - -//using namespace IceCore; - -#define DELETEARRAY(a) { delete [] a; a = NULL; } -#define CHECKALLOC(a) - - - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -/** - * Constructor. - */ -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -RadixSort::RadixSort() : mCurrentSize(0), mPreviousSize(0), mIndices(NULL), mIndices2(NULL), mTotalCalls(0), mNbHits(0) -{ -#ifndef RADIX_LOCAL_RAM - // Allocate input-independent ram - mHistogram = new uint32[256*4]; - mOffset = new uint32[256]; -#endif - // Initialize indices - resetIndices(); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -/** - * Destructor. 
- */ -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -RadixSort::~RadixSort() -{ - // Release everything -#ifndef RADIX_LOCAL_RAM - DELETEARRAY(mOffset); - DELETEARRAY(mHistogram); -#endif - DELETEARRAY(mIndices2); - DELETEARRAY(mIndices); -} - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -/** - * Resizes the inner lists. - * \param nb [in] new size (number of dwords) - * \return true if success - */ -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -bool RadixSort::resize(uint32 nb) -{ - // Free previously used ram - DELETEARRAY(mIndices2); - DELETEARRAY(mIndices); - - // Get some fresh one - mIndices = new uint32[nb]; CHECKALLOC(mIndices); - mIndices2 = new uint32[nb]; CHECKALLOC(mIndices2); - mCurrentSize = nb; - - // Initialize indices so that the input buffer is read in sequential order - resetIndices(); - - return true; -} - -#define CHECK_RESIZE(n) \ - if(n!=mPreviousSize) \ - { \ - if(n>mCurrentSize) resize(n); \ - else resetIndices(); \ - mPreviousSize = n; \ - } - -#define CREATE_HISTOGRAMS(type, buffer) \ - /* Clear counters */ \ - memset(mHistogram, 0, 256*4*sizeof(uint32)); \ - \ - /* Prepare for temporal coherence */ \ - type PrevVal = (type)buffer[mIndices[0]]; \ - bool AlreadySorted = true; /* Optimism... */ \ - uint32* Indices = mIndices; \ - \ - /* Prepare to count */ \ - uint8* p = (uint8*)input; \ - uint8* pe = &p[nb*4]; \ - uint32* h0= &mHistogram[0]; /* Histogram for first pass (LSB) */ \ - uint32* h1= &mHistogram[256]; /* Histogram for second pass */ \ - uint32* h2= &mHistogram[512]; /* Histogram for third pass */ \ - uint32* h3= &mHistogram[768]; /* Histogram for last pass (MSB) */ \ - \ - while(p!=pe) \ - { \ - /* Read input buffer in previous sorted order */ \ - type Val = (type)buffer[*Indices++]; \ - /* Check whether already sorted or not */ \ - if(Val>24; // Radix byte, same as above. AND is useless here (uint32). - // ### cmp to be killed. Not good. Later. - if(Radix<128) mIndices2[mOffset[Radix]++] = mIndices[i]; // Number is positive, same as above - else mIndices2[--mOffset[Radix]] = mIndices[i]; // Number is negative, flip the sorting order - } - // Swap pointers for next pass. Valid indices - the most recent ones - are in mIndices after the swap. - uint32* Tmp = mIndices; mIndices = mIndices2; mIndices2 = Tmp; - } - else - { - // The pass is useless, yet we still have to reverse the order of current list if all values are negative. - if(UniqueVal>=128) - { - for(i=0;i + +#ifndef NV_CORE_REFCOUNTED_H +#define NV_CORE_REFCOUNTED_H + +#include "nvcore.h" +#include "Debug.h" + +#define NV_DECLARE_PTR(Class) \ + template class SmartPtr; \ + typedef SmartPtr Class ## Ptr; \ + typedef SmartPtr Class ## ConstPtr + + +namespace nv +{ + /// Weak proxy. + class WeakProxy + { + NV_FORBID_COPY(WeakProxy); + public: + /// Ctor. + WeakProxy(void * ptr) : m_count(0), m_ptr(ptr) { } + + /// Dtor. + ~WeakProxy() + { + nvCheck( m_count == 0 ); + } + + /// Increase reference count. + uint addRef() const + { + m_count++; + return m_count; + } + + /// Decrease reference count and remove when 0. 
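// Illustrative sketch (not part of the patch): NV_DECLARE_PTR above forward-declares
// SmartPtr and stamps out two smart-pointer typedefs for a class; for a hypothetical
// class Foo the intent is roughly:
//     class Foo;
//     typedef nv::SmartPtr<Foo>       FooPtr;
//     typedef nv::SmartPtr<const Foo> FooConstPtr;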
+ uint release() const + { + nvCheck( m_count > 0 ); + + m_count--; + if( m_count == 0 ) { + delete this; + return 0; + } + return m_count; + } + + /// WeakPtr's call this to determine if their pointer is valid or not. + bool isAlive() const { + return m_ptr != NULL; + } + + /// Only the actual object should call this. + void notifyObjectDied() { + m_ptr = NULL; + } + + /// Return proxy pointer. + void * ptr() const { + return m_ptr; + } + + private: + mutable int m_count; + void * m_ptr; + }; + + + /// Reference counted base class to be used with SmartPtr and WeakPtr. + class RefCounted + { + NV_FORBID_COPY(RefCounted); + public: + + /// Ctor. + RefCounted() : m_count(0), m_weak_proxy(NULL) + { + } + + /// Virtual dtor. + virtual ~RefCounted() + { + nvCheck( m_count == 0 ); + releaseWeakProxy(); + } + + + /// Increase reference count. + uint addRef() const + { + m_count++; + return m_count; + } + + + /// Decrease reference count and remove when 0. + uint release() const + { + nvCheck( m_count > 0 ); + + m_count--; + if( m_count == 0 ) { + delete this; + return 0; + } + return m_count; + } + + /// Get weak proxy. + WeakProxy * getWeakProxy() const + { + if (m_weak_proxy == NULL) { + m_weak_proxy = new WeakProxy((void *)this); + m_weak_proxy->addRef(); + } + return m_weak_proxy; + } + + /// Release the weak proxy. + void releaseWeakProxy() const + { + if (m_weak_proxy != NULL) { + m_weak_proxy->notifyObjectDied(); + m_weak_proxy->release(); + m_weak_proxy = NULL; + } + } + + /// Get reference count. + int refCount() const + { + return m_count; + } + + + private: + + mutable int m_count; + mutable WeakProxy * m_weak_proxy; + + }; + +} // nv namespace + + +#endif // NV_CORE_REFCOUNTED_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/StdStream.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/StdStream.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/StdStream.h @@ -1,369 +1,463 @@ -#ifndef NV_STDSTREAM_H -#define NV_STDSTREAM_H +// This code is in the public domain -- Ignacio Castaño -#include +//#pragma once +//#ifndef NV_CORE_STDSTREAM_H +//#define NV_CORE_STDSTREAM_H + +#include "nvcore.h" +#include "Stream.h" +#include "Array.h" #include // fopen #include // memcpy -#include // std::exception namespace nv { -// Portable version of fopen. -inline FILE * fileOpen(const char * fileName, const char * mode) -{ - nvCheck(fileName != NULL); + // Portable version of fopen. + inline FILE * fileOpen(const char * fileName, const char * mode) + { + nvCheck(fileName != NULL); #if NV_CC_MSVC && _MSC_VER >= 1400 - FILE * fp; - if (fopen_s(&fp, fileName, mode) == 0) { - return fp; - } - return NULL; + FILE * fp; + if (fopen_s(&fp, fileName, mode) == 0) { + return fp; + } + return NULL; #else - return fopen(fileName, mode); + return fopen(fileName, mode); #endif -} - - -/// Base stdio stream. -class NVCORE_CLASS StdStream : public Stream -{ - NV_FORBID_COPY(StdStream); -public: - - /// Ctor. - StdStream( FILE * fp, bool autoclose=true ) : - m_fp(fp), m_autoclose(autoclose) { } - - /// Dtor. - virtual ~StdStream() - { - if( m_fp != NULL && m_autoclose ) { - fclose( m_fp ); - } - } - - - /** @name Stream implementation. 
*/ - //@{ - virtual void seek( uint pos ) - { - nvDebugCheck(m_fp != NULL); - nvDebugCheck(pos < size()); - fseek(m_fp, pos, SEEK_SET); - } - - virtual uint tell() const - { - nvDebugCheck(m_fp != NULL); - return ftell(m_fp); - } - - virtual uint size() const - { - nvDebugCheck(m_fp != NULL); - uint pos = ftell(m_fp); - fseek(m_fp, 0, SEEK_END); - uint end = ftell(m_fp); - fseek(m_fp, pos, SEEK_SET); - return end; - } - - virtual bool isError() const - { - return m_fp == NULL || ferror( m_fp ) != 0; - } - - virtual void clearError() - { - nvDebugCheck(m_fp != NULL); - clearerr(m_fp); - } - - virtual bool isAtEnd() const - { - nvDebugCheck(m_fp != NULL); - return feof( m_fp ) != 0; - } - - /// Always true. - virtual bool isSeekable() const { return true; } - //@} - -protected: - - FILE * m_fp; - bool m_autoclose; - -}; + } -/// Standard output stream. -class NVCORE_CLASS StdOutputStream : public StdStream -{ - NV_FORBID_COPY(StdOutputStream); -public: - - /// Construct stream by file name. - StdOutputStream( const char * name ) : - StdStream(fileOpen(name, "wb")) { } - - /// Construct stream by file handle. - StdOutputStream( FILE * fp, bool autoclose=true ) : StdStream(fp, autoclose) - { - } - - /** @name Stream implementation. */ - //@{ - /// Write data. - virtual uint serialize( void * data, uint len ) - { - nvDebugCheck(data != NULL); - nvDebugCheck(m_fp != NULL); - return (uint)fwrite(data, 1, len, m_fp); - } - - virtual bool isLoading() const - { - return false; - } - - virtual bool isSaving() const - { - return true; - } - //@} - -}; + /// Base stdio stream. + class NVCORE_CLASS StdStream : public Stream + { + NV_FORBID_COPY(StdStream); + public: + + /// Ctor. + StdStream( FILE * fp, bool autoclose ) : m_fp(fp), m_autoclose(autoclose) { } + + /// Dtor. + virtual ~StdStream() + { + if( m_fp != NULL && m_autoclose ) { +#if NV_OS_WIN32 + _fclose_nolock( m_fp ); +#else + fclose( m_fp ); +#endif + } + } -/// Standard input stream. -class NVCORE_CLASS StdInputStream : public StdStream -{ - NV_FORBID_COPY(StdInputStream); -public: + /** @name Stream implementation. */ + //@{ + virtual void seek( uint pos ) + { + nvDebugCheck(m_fp != NULL); + nvDebugCheck(pos <= size()); +#if NV_OS_WIN32 + _fseek_nolock(m_fp, pos, SEEK_SET); +#else + fseek(m_fp, pos, SEEK_SET); +#endif + } - /// Construct stream by file name. - StdInputStream( const char * name ) : - StdStream(fileOpen(name, "rb")) { } - - /// Construct stream by file handle. - StdInputStream( FILE * fp, bool autoclose=true ) : StdStream(fp, autoclose) - { - } - - /** @name Stream implementation. */ - //@{ - /// Read data. 
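// Illustrative sketch (not part of the patch): the isAtEnd() comment above is the
// behavioural point of the rewrite: feof() only reports EOF after a failed read past the
// end, while comparing ftell() against the stream size reports it as soon as the last
// byte has been consumed. Minimal standalone illustration; the file name is made up:
#include <stdio.h>

static void exampleEofProbe()
{
    FILE * fp = fopen("example.bin", "rb");
    if (fp == NULL) return;
    fseek(fp, 0, SEEK_END);
    long end = ftell(fp);          // size of the file
    fseek(fp, end, SEEK_SET);      // sit exactly at the end without reading past it
    // feof(fp) is still 0 here, but an ftell()-based check already reports end-of-stream.
    fclose(fp);
}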
- virtual uint serialize( void * data, uint len ) - { - nvDebugCheck(data != NULL); - nvDebugCheck(m_fp != NULL); - return (uint)fread(data, 1, len, m_fp); - } - - virtual bool isLoading() const - { - return true; - } - - virtual bool isSaving() const - { - return false; - } - //@} -}; + virtual uint tell() const + { + nvDebugCheck(m_fp != NULL); +#if NV_OS_WIN32 + return _ftell_nolock(m_fp); +#else + return (uint)ftell(m_fp); +#endif + } + virtual uint size() const + { + nvDebugCheck(m_fp != NULL); +#if NV_OS_WIN32 + uint pos = _ftell_nolock(m_fp); + _fseek_nolock(m_fp, 0, SEEK_END); + uint end = _ftell_nolock(m_fp); + _fseek_nolock(m_fp, pos, SEEK_SET); +#else + uint pos = (uint)ftell(m_fp); + fseek(m_fp, 0, SEEK_END); + uint end = (uint)ftell(m_fp); + fseek(m_fp, pos, SEEK_SET); +#endif + return end; + } + virtual bool isError() const + { + return m_fp == NULL || ferror( m_fp ) != 0; + } + + virtual void clearError() + { + nvDebugCheck(m_fp != NULL); + clearerr(m_fp); + } + + // @@ The original implementation uses feof, which only returns true when we attempt to read *past* the end of the stream. + // That is, if we read the last byte of a file, then isAtEnd would still return false, even though the stream pointer is at the file end. This is not the intent and was inconsistent with the implementation of the MemoryStream, a better + // implementation uses use ftell and fseek to determine our location within the file. + virtual bool isAtEnd() const + { + if (m_fp == NULL) return true; + //nvDebugCheck(m_fp != NULL); + //return feof( m_fp ) != 0; +#if NV_OS_WIN32 + uint pos = _ftell_nolock(m_fp); + _fseek_nolock(m_fp, 0, SEEK_END); + uint end = _ftell_nolock(m_fp); + _fseek_nolock(m_fp, pos, SEEK_SET); +#else + uint pos = (uint)ftell(m_fp); + fseek(m_fp, 0, SEEK_END); + uint end = (uint)ftell(m_fp); + fseek(m_fp, pos, SEEK_SET); +#endif + return pos == end; + } -/// Memory input stream. -class NVCORE_CLASS MemoryInputStream : public Stream -{ - NV_FORBID_COPY(MemoryInputStream); -public: + /// Always true. + virtual bool isSeekable() const { return true; } + //@} + + protected: + + FILE * m_fp; + bool m_autoclose; + + }; + + + /// Standard output stream. + class NVCORE_CLASS StdOutputStream : public StdStream + { + NV_FORBID_COPY(StdOutputStream); + public: + + /// Construct stream by file name. + StdOutputStream( const char * name ) : StdStream(fileOpen(name, "wb"), /*autoclose=*/true) { } + + /// Construct stream by file handle. + StdOutputStream( FILE * fp, bool autoclose ) : StdStream(fp, autoclose) + { + } + + /** @name Stream implementation. */ + //@{ + /// Write data. + virtual uint serialize( void * data, uint len ) + { + nvDebugCheck(data != NULL); + nvDebugCheck(m_fp != NULL); +#if NV_OS_WIN32 + return (uint)_fwrite_nolock(data, 1, len, m_fp); +#elif NV_OS_LINUX + return (uint)fwrite_unlocked(data, 1, len, m_fp); +#elif NV_OS_DARWIN + // @@ No error checking, always returns len. + for (uint i = 0; i < len; i++) { + putc_unlocked(((char *)data)[i], m_fp); + } + return len; +#else + return (uint)fwrite(data, 1, len, m_fp); +#endif + } - /// Ctor. - MemoryInputStream( const uint8 * mem, uint size ) : - m_mem(mem), m_ptr(mem), m_size(size) { } - - /** @name Stream implementation. */ - //@{ - /// Read data. 
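// Illustrative sketch (not part of the patch): usage of the rewritten stream classes
// above; the file name and payload are made up. On Win32, serialize() now goes through
// the _fwrite_nolock fast path shown above.
static void exampleWrite()
{
    nv::StdOutputStream out("example.bin");       // fopen(..., "wb"), closed automatically
    if (!out.isError()) {
        int value = 42;
        out.serialize(&value, sizeof(value));     // writes sizeof(int) bytes
    }
}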
- virtual uint serialize( void * data, uint len ) - { - nvDebugCheck(data != NULL); - nvDebugCheck(!isError()); - - uint left = m_size - tell(); - if (len > left) len = left; - - memcpy( data, m_ptr, len ); - m_ptr += len; - - return len; - } - - virtual void seek( uint pos ) - { - nvDebugCheck(!isError()); - m_ptr = m_mem + pos; - nvDebugCheck(!isError()); - } - - virtual uint tell() const - { - nvDebugCheck(m_ptr >= m_mem); - return uint(m_ptr - m_mem); - } - - virtual uint size() const - { - return m_size; - } - - virtual bool isError() const - { - return m_mem == NULL || m_ptr > m_mem + m_size || m_ptr < m_mem; - } - - virtual void clearError() - { - // Nothing to do. - } - - virtual bool isAtEnd() const - { - return m_ptr == m_mem + m_size; - } - - /// Always true. - virtual bool isSeekable() const - { - return true; - } - - virtual bool isLoading() const - { - return true; - } - - virtual bool isSaving() const - { - return false; - } - //@} - - -private: - - const uint8 * m_mem; - const uint8 * m_ptr; - uint m_size; + virtual bool isLoading() const + { + return false; + } + + virtual bool isSaving() const + { + return true; + } + //@} + + }; + + + /// Standard input stream. + class NVCORE_CLASS StdInputStream : public StdStream + { + NV_FORBID_COPY(StdInputStream); + public: + + /// Construct stream by file name. + StdInputStream( const char * name ) : StdStream(fileOpen(name, "rb"), /*autoclose=*/true) { } + + /// Construct stream by file handle. + StdInputStream( FILE * fp, bool autoclose=true ) : StdStream(fp, autoclose) + { + } + + /** @name Stream implementation. */ + //@{ + /// Read data. + virtual uint serialize( void * data, uint len ) + { + nvDebugCheck(data != NULL); + nvDebugCheck(m_fp != NULL); +#if NV_OS_WIN32 + return (uint)_fread_nolock(data, 1, len, m_fp); +#elif NV_OS_LINUX + return (uint)fread_unlocked(data, 1, len, m_fp); +#elif NV_OS_DARWIN + // @@ No error checking, always returns len. + for (uint i = 0; i < len; i++) { + ((char *)data)[i] = getc_unlocked(m_fp); + } + return len; +#else + return (uint)fread(data, 1, len, m_fp); +#endif + + } -}; + virtual bool isLoading() const + { + return true; + } + + virtual bool isSaving() const + { + return false; + } + //@} + }; + + + + /// Memory input stream. + class NVCORE_CLASS MemoryInputStream : public Stream + { + NV_FORBID_COPY(MemoryInputStream); + public: + + /// Ctor. + MemoryInputStream( const uint8 * mem, uint size ) : m_mem(mem), m_ptr(mem), m_size(size) { } + + /** @name Stream implementation. */ + //@{ + /// Read data. + virtual uint serialize( void * data, uint len ) + { + nvDebugCheck(data != NULL); + nvDebugCheck(!isError()); + + uint left = m_size - tell(); + if (len > left) len = left; + + memcpy( data, m_ptr, len ); + m_ptr += len; + + return len; + } + + virtual void seek( uint pos ) + { + nvDebugCheck(!isError()); + m_ptr = m_mem + pos; + nvDebugCheck(!isError()); + } + + virtual uint tell() const + { + nvDebugCheck(m_ptr >= m_mem); + return uint(m_ptr - m_mem); + } + + virtual uint size() const + { + return m_size; + } + + virtual bool isError() const + { + return m_mem == NULL || m_ptr > m_mem + m_size || m_ptr < m_mem; + } + + virtual void clearError() + { + // Nothing to do. + } + + virtual bool isAtEnd() const + { + return m_ptr == m_mem + m_size; + } + + /// Always true. 
+ virtual bool isSeekable() const + { + return true; + } + + virtual bool isLoading() const + { + return true; + } + + virtual bool isSaving() const + { + return false; + } + //@} + + const uint8 * ptr() const { return m_ptr; } + + + private: + + const uint8 * m_mem; + const uint8 * m_ptr; + uint m_size; + + }; + + + /// Buffer output stream. + class NVCORE_CLASS BufferOutputStream : public Stream + { + NV_FORBID_COPY(BufferOutputStream); + public: + + BufferOutputStream(Array & buffer) : m_buffer(buffer) { } + + virtual uint serialize( void * data, uint len ) + { + nvDebugCheck(data != NULL); + m_buffer.append((uint8 *)data, len); + return len; + } + + virtual void seek( uint /*pos*/ ) { /*Not implemented*/ } + virtual uint tell() const { return m_buffer.size(); } + virtual uint size() const { return m_buffer.size(); } + + virtual bool isError() const { return false; } + virtual void clearError() {} + + virtual bool isAtEnd() const { return true; } + virtual bool isSeekable() const { return false; } + virtual bool isLoading() const { return false; } + virtual bool isSaving() const { return true; } + + private: + Array & m_buffer; + }; + + + /// Protected input stream. + class NVCORE_CLASS ProtectedStream : public Stream + { + NV_FORBID_COPY(ProtectedStream); + public: + + /// Ctor. + ProtectedStream( Stream & s ) : m_s(&s), m_autodelete(false) + { + } + + /// Ctor. + ProtectedStream( Stream * s, bool autodelete = true ) : + m_s(s), m_autodelete(autodelete) + { + nvDebugCheck(m_s != NULL); + } + + /// Dtor. + virtual ~ProtectedStream() + { + if( m_autodelete ) { + delete m_s; + } + } + + /** @name Stream implementation. */ + //@{ + /// Read data. + virtual uint serialize( void * data, uint len ) + { + nvDebugCheck(data != NULL); + len = m_s->serialize( data, len ); + + if( m_s->isError() ) { + throw; + } + + return len; + } + + virtual void seek( uint pos ) + { + m_s->seek( pos ); + + if( m_s->isError() ) { + throw; + } + } + + virtual uint tell() const + { + return m_s->tell(); + } + + virtual uint size() const + { + return m_s->size(); + } + + virtual bool isError() const + { + return m_s->isError(); + } + + virtual void clearError() + { + m_s->clearError(); + } + + virtual bool isAtEnd() const + { + return m_s->isAtEnd(); + } + + virtual bool isSeekable() const + { + return m_s->isSeekable(); + } + + virtual bool isLoading() const + { + return m_s->isLoading(); + } + + virtual bool isSaving() const + { + return m_s->isSaving(); + } + //@} -/// Protected input stream. -class NVCORE_CLASS ProtectedStream : public Stream -{ - NV_FORBID_COPY(ProtectedStream); -public: + private: - /// Ctor. - ProtectedStream( Stream & s ) : m_s(&s), m_autodelete(false) - { - } - - /// Ctor. - ProtectedStream( Stream * s, bool autodelete = true ) : - m_s(s), m_autodelete(autodelete) - { - nvDebugCheck(m_s != NULL); - } - - /// Dtor. - virtual ~ProtectedStream() - { - if( m_autodelete ) { - delete m_s; - } - } - - /** @name Stream implementation. */ - //@{ - /// Read data. 
- virtual uint serialize( void * data, uint len ) - { - nvDebugCheck(data != NULL); - len = m_s->serialize( data, len ); - - if( m_s->isError() ) { - throw std::exception(); - } - - return len; - } - - virtual void seek( uint pos ) - { - m_s->seek( pos ); - - if( m_s->isError() ) { - throw std::exception(); - } - } - - virtual uint tell() const - { - return m_s->tell(); - } - - virtual uint size() const - { - return m_s->size(); - } - - virtual bool isError() const - { - return m_s->isError(); - } - - virtual void clearError() - { - m_s->clearError(); - } - - virtual bool isAtEnd() const - { - return m_s->isAtEnd(); - } - - virtual bool isSeekable() const - { - return m_s->isSeekable(); - } - - virtual bool isLoading() const - { - return m_s->isLoading(); - } - - virtual bool isSaving() const - { - return m_s->isSaving(); - } - //@} - - -private: - - Stream * const m_s; - bool const m_autodelete; + Stream * const m_s; + bool const m_autodelete; -}; + }; } // nv namespace -#endif // NV_STDSTREAM_H +//#endif // NV_CORE_STDSTREAM_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/StrLib.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/StrLib.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/StrLib.h @@ -1,354 +1,429 @@ -// This code is in the public domain -- castanyo@yahoo.es +// This code is in the public domain -- Ignacio Castaño +#pragma once #ifndef NV_CORE_STRING_H #define NV_CORE_STRING_H -#include -#include // swap +#include "Debug.h" +#include "Hash.h" // hash -#include // strlen, strcmp, etc. +//#include // strlen, etc. +#if NV_OS_WIN32 +#define NV_PATH_SEPARATOR '\\' +#else +#define NV_PATH_SEPARATOR '/' +#endif namespace nv { - uint strHash(const char * str, uint h) NV_PURE; + NVCORE_API uint strHash(const char * str, uint h) NV_PURE; - /// String hash based on Bernstein's hash. - inline uint strHash(const char * data, uint h = 5381) - { - uint i = 0; - while(data[i] != 0) { - h = (33 * h) ^ uint(data[i]); - i++; - } - return h; - } - - template <> struct hash { - uint operator()(const char * str) const { return strHash(str); } - }; - - NVCORE_API int strCaseCmp(const char * s1, const char * s2) NV_PURE; - NVCORE_API int strCmp(const char * s1, const char * s2) NV_PURE; - NVCORE_API void strCpy(char * dst, int size, const char * src); - NVCORE_API void strCpy(char * dst, int size, const char * src, int len); - NVCORE_API void strCat(char * dst, int size, const char * src); - - NVCORE_API bool strMatch(const char * str, const char * pat) NV_PURE; - - - /// String builder. - class NVCORE_CLASS StringBuilder - { - public: - - StringBuilder(); - explicit StringBuilder( int size_hint ); - StringBuilder( const char * str ); - StringBuilder( const StringBuilder & ); - - ~StringBuilder(); - - StringBuilder & format( const char * format, ... ) __attribute__((format (printf, 2, 3))); - StringBuilder & format( const char * format, va_list arg ); - - StringBuilder & append( const char * str ); - StringBuilder & appendFormat( const char * format, ... 
) __attribute__((format (printf, 2, 3))); - StringBuilder & appendFormat( const char * format, va_list arg ); - - StringBuilder & number( int i, int base = 10 ); - StringBuilder & number( uint i, int base = 10 ); - - StringBuilder & reserve( uint size_hint ); - StringBuilder & copy( const char * str ); - StringBuilder & copy( const StringBuilder & str ); - - StringBuilder & toLower(); - StringBuilder & toUpper(); - - void reset(); - bool isNull() const { return m_size == 0; } - - // const char * accessors - operator const char * () const { return m_str; } - operator char * () { return m_str; } - const char * str() const { return m_str; } - char * str() { return m_str; } - - /// Implement value semantics. - StringBuilder & operator=( const StringBuilder & s ) { - return copy(s); - } - - /// Implement value semantics. - StringBuilder & operator=( const char * s ) { - return copy(s); - } - - /// Equal operator. - bool operator==( const StringBuilder & s ) const { - if (s.isNull()) return isNull(); - else if (isNull()) return false; - else return strcmp(s.m_str, m_str) != 0; - } - - /// Return the exact length. - uint length() const { return isNull() ? 0 : uint(strlen(m_str)); } - - /// Return the size of the string container. - uint capacity() const { return m_size; } - - /// Return the hash of the string. - uint hash() const { return isNull() ? 0 : strHash(m_str); } - - /// Swap strings. - friend void swap(StringBuilder & a, StringBuilder & b) { - nv::swap(a.m_size, b.m_size); - nv::swap(a.m_str, b.m_str); - } - - protected: - - /// Size of the string container. - uint m_size; - - /// String. - char * m_str; - - }; - - - /// Path string. @@ This should be called PathBuilder. - class NVCORE_CLASS Path : public StringBuilder - { - public: - Path() : StringBuilder() {} - explicit Path(int size_hint) : StringBuilder(size_hint) {} - Path(const char * str) : StringBuilder(str) {} - Path(const Path & path) : StringBuilder(path) {} - - const char * fileName() const; - const char * extension() const; - - void translatePath(); - - void stripFileName(); - void stripExtension(); - - // statics - NVCORE_API static char separator(); - NVCORE_API static const char * fileName(const char *); - NVCORE_API static const char * extension(const char *); - }; - - - /// String class. - class NVCORE_CLASS String - { - public: - - /// Constructs a null string. @sa isNull() - String() - { - data = NULL; - } - - /// Constructs a shared copy of str. - String(const String & str) - { - data = str.data; - if (data != NULL) addRef(); - } - - /// Constructs a shared string from a standard string. - String(const char * str) - { - setString(str); - } - - /// Constructs a shared string from a standard string. - String(const char * str, int length) - { - setString(str, length); - } - - /// Constructs a shared string from a StringBuilder. - String(const StringBuilder & str) - { - setString(str); - } - - /// Dtor. - ~String() - { - release(); - } - - String clone() const; - - /// Release the current string and allocate a new one. - const String & operator=( const char * str ) - { - release(); - setString( str ); - return *this; - } - - /// Release the current string and allocate a new one. - const String & operator=( const StringBuilder & str ) - { - release(); - setString( str ); - return *this; - } - - /// Implement value semantics. - String & operator=( const String & str ) - { - if (str.data != data) - { - release(); - data = str.data; - addRef(); - } - return *this; - } - - /// Equal operator. 
- bool operator==( const String & str ) const - { - if( str.data == data ) { - return true; - } - if ((data == NULL) != (str.data == NULL)) { - return false; - } - return strcmp(data, str.data) == 0; - } - - /// Equal operator. - bool operator==( const char * str ) const - { - nvCheck(str != NULL); // Use isNull! - if (data == NULL) { - return false; - } - return strcmp(data, str) == 0; - } - - /// Not equal operator. - bool operator!=( const String & str ) const - { - if( str.data == data ) { - return false; - } - if ((data == NULL) != (str.data == NULL)) { - return true; - } - return strcmp(data, str.data) != 0; - } - - /// Not equal operator. - bool operator!=( const char * str ) const - { - nvCheck(str != NULL); // Use isNull! - if (data == NULL) { - return false; - } - return strcmp(data, str) != 0; - } - - /// Returns true if this string is the null string. - bool isNull() const { return data == NULL; } - - /// Return the exact length. - uint length() const { nvDebugCheck(data != NULL); return uint(strlen(data)); } - - /// Return the hash of the string. - uint hash() const { nvDebugCheck(data != NULL); return strHash(data); } - - /// const char * cast operator. - operator const char * () const { return data; } - - /// Get string pointer. - const char * str() const { return data; } - - - private: - - // Add reference count. - void addRef() - { - if (data != NULL) - { - setRefCount(getRefCount() + 1); - } - } - - // Decrease reference count. - void release() - { - if (data != NULL) - { - const uint16 count = getRefCount(); - setRefCount(count - 1); - if (count - 1 == 0) { - free(data - 2); - data = NULL; - } - } - } - - uint16 getRefCount() const - { - nvDebugCheck(data != NULL); - return *reinterpret_cast(data - 2); - } - - void setRefCount(uint16 count) { - nvDebugCheck(data != NULL); - nvCheck(count < 0xFFFF); - *reinterpret_cast(const_cast(data - 2)) = uint16(count); - } - - void setData(const char * str) { - data = str + 2; - } - - void allocString(const char * str) - { - allocString(str, (int)strlen(str)); - } - - void allocString(const char * str, int len) - { - const char * ptr = static_cast(::malloc(2 + len + 1)); - - setData( ptr ); - setRefCount( 0 ); - - // Copy string. - strCpy(const_cast(data), len+1, str, len); - - // Add terminating character. - const_cast(data)[len] = '\0'; - } - - void setString(const char * str); - void setString(const char * str, int length); - void setString(const StringBuilder & str); - - /// Swap strings. - friend void swap(String & a, String & b) { - swap(a.data, b.data); - } - - private: - - const char * data; - - }; + /// String hash based on Bernstein's hash. + inline uint strHash(const char * data, uint h = 5381) + { + uint i = 0; + while(data[i] != 0) { + h = (33 * h) ^ uint(data[i]); + i++; + } + return h; + } + + template <> struct Hash { + uint operator()(const char * str) const { return strHash(str); } + }; + + NVCORE_API uint strLen(const char * str) NV_PURE; // Asserts on NULL strings. + + NVCORE_API int strDiff(const char * s1, const char * s2) NV_PURE; // Asserts on NULL strings. + NVCORE_API int strCaseDiff(const char * s1, const char * s2) NV_PURE; // Asserts on NULL strings. + NVCORE_API bool strEqual(const char * s1, const char * s2) NV_PURE; // Accepts NULL strings. + NVCORE_API bool strCaseEqual(const char * s1, const char * s2) NV_PURE; // Accepts NULL strings. 
+ + template <> struct Equal { + bool operator()(const char * a, const char * b) const { return strEqual(a, b); } + }; + + NVCORE_API bool strBeginsWith(const char * dst, const char * prefix) NV_PURE; + NVCORE_API bool strEndsWith(const char * dst, const char * suffix) NV_PURE; + + + NVCORE_API void strCpy(char * dst, uint size, const char * src); + NVCORE_API void strCpy(char * dst, uint size, const char * src, uint len); + NVCORE_API void strCat(char * dst, uint size, const char * src); + + NVCORE_API const char * strSkipWhiteSpace(const char * str); + NVCORE_API char * strSkipWhiteSpace(char * str); + + NVCORE_API bool strMatch(const char * str, const char * pat) NV_PURE; + + NVCORE_API bool isNumber(const char * str) NV_PURE; + + /* @@ Implement these two functions and modify StringBuilder to use them? + NVCORE_API void strFormat(const char * dst, const char * fmt, ...); + NVCORE_API void strFormatList(const char * dst, const char * fmt, va_list arg); + + template void strFormatSafe(char (&buffer)[count], const char *fmt, ...) __attribute__((format (printf, 2, 3))); + template void strFormatSafe(char (&buffer)[count], const char *fmt, ...) { + va_list args; + va_start(args, fmt); + strFormatList(buffer, count, fmt, args); + va_end(args); + } + template void strFormatListSafe(char (&buffer)[count], const char *fmt, va_list arg) { + va_list tmp; + va_copy(tmp, args); + strFormatList(buffer, count, fmt, tmp); + va_end(tmp); + }*/ + + template void strCpySafe(char (&buffer)[count], const char *src) { + strCpy(buffer, count, src); + } + + template void strCatSafe(char (&buffer)[count], const char * src) { + strCat(buffer, count, src); + } + + + + /// String builder. + class NVCORE_CLASS StringBuilder + { + public: + + StringBuilder(); + explicit StringBuilder( uint size_hint ); + StringBuilder(const char * str); + StringBuilder(const char * str, uint len); + StringBuilder(const StringBuilder & other); + + ~StringBuilder(); + + StringBuilder & format( const char * format, ... ) __attribute__((format (printf, 2, 3))); + StringBuilder & formatList( const char * format, va_list arg ); + + StringBuilder & append(const char * str); + StringBuilder & append(const char * str, uint len); + StringBuilder & appendFormat(const char * format, ...) __attribute__((format (printf, 2, 3))); + StringBuilder & appendFormatList(const char * format, va_list arg); + + StringBuilder & appendSpace(uint n); + + StringBuilder & number( int i, int base = 10 ); + StringBuilder & number( uint i, int base = 10 ); + + StringBuilder & reserve(uint size_hint); + StringBuilder & copy(const char * str); + StringBuilder & copy(const char * str, uint len); + StringBuilder & copy(const StringBuilder & str); + + StringBuilder & toLower(); + StringBuilder & toUpper(); + + bool endsWith(const char * str) const; + bool beginsWith(const char * str) const; + + char * reverseFind(char c); + + void reset(); + bool isNull() const { return m_size == 0; } + + // const char * accessors + //operator const char * () const { return m_str; } + //operator char * () { return m_str; } + const char * str() const { return m_str; } + char * str() { return m_str; } + + char * release(); + + /// Implement value semantics. + StringBuilder & operator=( const StringBuilder & s ) { + return copy(s); + } + + /// Implement value semantics. + StringBuilder & operator=( const char * s ) { + return copy(s); + } + + /// Equal operator. + bool operator==( const StringBuilder & s ) const { + return strMatch(s.m_str, m_str); + } + + /// Return the exact length. 
+ uint length() const { return isNull() ? 0 : strLen(m_str); } + + /// Return the size of the string container. + uint capacity() const { return m_size; } + + /// Return the hash of the string. + uint hash() const { return isNull() ? 0 : strHash(m_str); } + + // Swap strings. + friend void swap(StringBuilder & a, StringBuilder & b); + + protected: + + /// Size of the string container. + uint m_size; + + /// String. + char * m_str; + + }; + + + /// Path string. @@ This should be called PathBuilder. + class NVCORE_CLASS Path : public StringBuilder + { + public: + Path() : StringBuilder() {} + explicit Path(int size_hint) : StringBuilder(size_hint) {} + Path(const char * str) : StringBuilder(str) {} + Path(const Path & path) : StringBuilder(path) {} + + const char * fileName() const; + const char * extension() const; + + void translatePath(char pathSeparator = NV_PATH_SEPARATOR); + + void appendSeparator(char pathSeparator = NV_PATH_SEPARATOR); + + void stripFileName(); + void stripExtension(); + + // statics + static char separator(); + static const char * fileName(const char *); + static const char * extension(const char *); + + static void translatePath(char * path, char pathSeparator = NV_PATH_SEPARATOR); + }; + + + /// String class. + class NVCORE_CLASS String + { + public: + + /// Constructs a null string. @sa isNull() + String() + { + data = NULL; + } + + /// Constructs a shared copy of str. + String(const String & str) + { + data = str.data; + if (data != NULL) addRef(); + } + + /// Constructs a shared string from a standard string. + String(const char * str) + { + setString(str); + } + + /// Constructs a shared string from a standard string. + String(const char * str, int length) + { + setString(str, length); + } + + /// Constructs a shared string from a StringBuilder. + String(const StringBuilder & str) + { + setString(str); + } + + /// Dtor. + ~String() + { + release(); + } + + String clone() const; + + /// Release the current string and allocate a new one. + const String & operator=( const char * str ) + { + release(); + setString( str ); + return *this; + } + + /// Release the current string and allocate a new one. + const String & operator=( const StringBuilder & str ) + { + release(); + setString( str ); + return *this; + } + + /// Implement value semantics. + String & operator=( const String & str ) + { + if (str.data != data) + { + release(); + data = str.data; + addRef(); + } + return *this; + } + + /// Equal operator. + bool operator==( const String & str ) const + { + return strMatch(str.data, data); + } + + /// Equal operator. + bool operator==( const char * str ) const + { + return strMatch(str, data); + } + + /// Not equal operator. + bool operator!=( const String & str ) const + { + return !strMatch(str.data, data); + } + + /// Not equal operator. + bool operator!=( const char * str ) const + { + return !strMatch(str, data); + } + + /// Returns true if this string is the null string. + bool isNull() const { return data == NULL; } + + /// Return the exact length. + uint length() const { nvDebugCheck(data != NULL); return strLen(data); } + + /// Return the hash of the string. + uint hash() const { nvDebugCheck(data != NULL); return strHash(data); } + + /// const char * cast operator. + operator const char * () const { return data; } + + /// Get string pointer. + const char * str() const { return data; } + + + private: + + // Add reference count. + void addRef(); + + // Decrease reference count. 
+ void release(); + + uint16 getRefCount() const + { + nvDebugCheck(data != NULL); + return *reinterpret_cast(data - 2); + } + + void setRefCount(uint16 count) { + nvDebugCheck(data != NULL); + nvCheck(count < 0xFFFF); + *reinterpret_cast(const_cast(data - 2)) = uint16(count); + } + + void setData(const char * str) { + data = str + 2; + } + + void allocString(const char * str) + { + allocString(str, strLen(str)); + } + + void allocString(const char * str, uint length); + + void setString(const char * str); + void setString(const char * str, uint length); + void setString(const StringBuilder & str); + + // Swap strings. + friend void swap(String & a, String & b); + + private: + + const char * data; + + }; + + template <> struct Hash { + uint operator()(const String & str) const { return str.hash(); } + }; + + + // Like AutoPtr, but for const char strings. + class AutoString + { + NV_FORBID_COPY(AutoString); + NV_FORBID_HEAPALLOC(); + public: + + // Ctor. + AutoString(const char * p = NULL) : m_ptr(p) { } + +#if NV_CC_CPP11 + // Move ctor. + AutoString(AutoString && ap) : m_ptr(ap.m_ptr) { ap.m_ptr = NULL; } +#endif + + // Dtor. Deletes owned pointer. + ~AutoString() { + delete [] m_ptr; + m_ptr = NULL; + } + + // Delete owned pointer and assign new one. + void operator=(const char * p) { + if (p != m_ptr) + { + delete [] m_ptr; + m_ptr = p; + } + } + + // Get pointer. + const char * ptr() const { return m_ptr; } + operator const char *() const { return m_ptr; } + + // Relinquish ownership of the underlying pointer and returns that pointer. + const char * release() { + const char * tmp = m_ptr; + m_ptr = NULL; + return tmp; + } + + // comparison operators. + friend bool operator == (const AutoString & ap, const char * const p) { + return (ap.ptr() == p); + } + friend bool operator != (const AutoString & ap, const char * const p) { + return (ap.ptr() != p); + } + friend bool operator == (const char * const p, const AutoString & ap) { + return (ap.ptr() == p); + } + friend bool operator != (const char * const p, const AutoString & ap) { + return (ap.ptr() != p); + } + + private: + const char * m_ptr; + }; } // nv namespace Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/StrLib.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/StrLib.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/StrLib.cpp @@ -1,137 +1,185 @@ -// This code is in the public domain -- castanyo@yahoo.es +// This code is in the public domain -- Ignacio Castaño -#include +#include "StrLib.h" -#include // log -#include // vsnprintf +#include "Memory.h" +#include "Utils.h" // swap + +#include // log +#include // vsnprintf +#include // strlen, strcmp, etc. #if NV_CC_MSVC #include // vsnprintf #endif -#if NV_OS_WIN32 -#define NV_PATH_SEPARATOR '\\' -#else -#define NV_PATH_SEPARATOR '/' -#endif - using namespace nv; namespace { - static char * strAlloc(uint size) - { - return static_cast(::malloc(size)); - } - - static char * strReAlloc(char * str, uint size) - { - return static_cast(::realloc(str, size)); - } - - static void strFree(const char * str) - { - return ::free(const_cast(str)); - } - - /*static char * strDup( const char * str ) - { - nvDebugCheck( str != NULL ); - uint len = uint(strlen( str ) + 1); - char * dup = strAlloc( len ); - memcpy( dup, str, len ); - return dup; - }*/ - - // helper function for integer to string conversion. 
- static char * i2a( uint i, char *a, uint r ) - { - if( i / r > 0 ) { - a = i2a( i / r, a, r ); - } - *a = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % r]; - return a + 1; - } - - // Locale independent functions. - static inline char toUpper( char c ) { - return (c<'a' || c>'z') ? (c) : (c+'A'-'a'); - } - static inline char toLower( char c ) { - return (c<'A' || c>'Z') ? (c) : (c+'a'-'A'); - } - static inline bool isAlpha( char c ) { - return (c>='a' && c<='z') || (c>='A' && c<='Z'); - } - static inline bool isDigit( char c ) { - return c>='0' && c<='9'; - } - static inline bool isAlnum( char c ) { - return (c>='a' && c<='z') || (c>='A' && c<='Z') || (c>='0' && c<='9'); - } - -} - -int nv::strCmp(const char * s1, const char * s2) -{ - nvDebugCheck(s1 != NULL); - nvDebugCheck(s2 != NULL); - return strcmp(s1, s2); + static char * strAlloc(uint size) + { + return malloc(size); + } + + static char * strReAlloc(char * str, uint size) + { + return realloc(str, size); + } + + static void strFree(const char * str) + { + return free(str); + } + + /*static char * strDup( const char * str ) + { + nvDebugCheck( str != NULL ); + uint len = uint(strlen( str ) + 1); + char * dup = strAlloc( len ); + memcpy( dup, str, len ); + return dup; + }*/ + + // helper function for integer to string conversion. + static char * i2a( uint i, char *a, uint r ) + { + if( i / r > 0 ) { + a = i2a( i / r, a, r ); + } + *a = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % r]; + return a + 1; + } + + // Locale independent functions. + static inline char toUpper( char c ) { + return (c<'a' || c>'z') ? (c) : (c+'A'-'a'); + } + static inline char toLower( char c ) { + return (c<'A' || c>'Z') ? (c) : (c+'a'-'A'); + } + static inline bool isAlpha( char c ) { + return (c>='a' && c<='z') || (c>='A' && c<='Z'); + } + static inline bool isDigit( char c ) { + return c>='0' && c<='9'; + } + static inline bool isAlnum( char c ) { + return (c>='a' && c<='z') || (c>='A' && c<='Z') || (c>='0' && c<='9'); + } + +} + +uint nv::strLen(const char * str) +{ + nvDebugCheck(str != NULL); + return U32(strlen(str)); } -int nv::strCaseCmp(const char * s1, const char * s2) +int nv::strDiff(const char * s1, const char * s2) { - nvDebugCheck(s1 != NULL); - nvDebugCheck(s1 != NULL); + nvDebugCheck(s1 != NULL); + nvDebugCheck(s2 != NULL); + return strcmp(s1, s2); +} + +int nv::strCaseDiff(const char * s1, const char * s2) +{ + nvDebugCheck(s1 != NULL); + nvDebugCheck(s1 != NULL); #if NV_CC_MSVC - return _stricmp(s1, s2); + return _stricmp(s1, s2); #else - return strcasecmp(s1, s2); + return strcasecmp(s1, s2); #endif } -void nv::strCpy(char * dst, int size, const char * src) +bool nv::strEqual(const char * s1, const char * s2) +{ + if (s1 == s2) return true; + if (s1 == NULL || s2 == NULL) return false; + return strcmp(s1, s2) == 0; +} + +bool nv::strCaseEqual(const char * s1, const char * s2) { - nvDebugCheck(dst != NULL); - nvDebugCheck(src != NULL); + if (s1 == s2) return true; + if (s1 == NULL || s2 == NULL) return false; + return strCaseDiff(s1, s2) == 0; +} + +bool nv::strBeginsWith(const char * str, const char * prefix) +{ + //return strstr(str, prefix) == dst; + return strncmp(str, prefix, strlen(prefix)) == 0; +} + +bool nv::strEndsWith(const char * str, const char * suffix) +{ + uint ml = strLen(str); + uint sl = strLen(suffix); + if (ml < sl) return false; + return strncmp(str + ml - sl, suffix, sl) == 0; +} + +// @@ Add asserts to detect overlap between dst and src? 
+void nv::strCpy(char * dst, uint size, const char * src) +{ + nvDebugCheck(dst != NULL); + nvDebugCheck(src != NULL); #if NV_CC_MSVC && _MSC_VER >= 1400 - strcpy_s(dst, size, src); + strcpy_s(dst, size, src); #else - NV_UNUSED(size); - strcpy(dst, src); + NV_UNUSED(size); + strcpy(dst, src); #endif } -void nv::strCpy(char * dst, int size, const char * src, int len) +void nv::strCpy(char * dst, uint size, const char * src, uint len) { - nvDebugCheck(dst != NULL); - nvDebugCheck(src != NULL); + nvDebugCheck(dst != NULL); + nvDebugCheck(src != NULL); #if NV_CC_MSVC && _MSC_VER >= 1400 - strncpy_s(dst, size, src, len); + strncpy_s(dst, size, src, len); #else - NV_UNUSED(size); - strncpy(dst, src, len); + int n = min(len+1, size); + strncpy(dst, src, n); + dst[n-1] = '\0'; #endif } -void nv::strCat(char * dst, int size, const char * src) +void nv::strCat(char * dst, uint size, const char * src) { - nvDebugCheck(dst != NULL); - nvDebugCheck(src != NULL); + nvDebugCheck(dst != NULL); + nvDebugCheck(src != NULL); #if NV_CC_MSVC && _MSC_VER >= 1400 - strcat_s(dst, size, src); + strcat_s(dst, size, src); #else - NV_UNUSED(size); - strcat(dst, src); + NV_UNUSED(size); + strcat(dst, src); #endif } +NVCORE_API const char * nv::strSkipWhiteSpace(const char * str) +{ + nvDebugCheck(str != NULL); + while (*str == ' ') str++; + return str; +} + +NVCORE_API char * nv::strSkipWhiteSpace(char * str) +{ + nvDebugCheck(str != NULL); + while (*str == ' ') str++; + return str; +} + /** Pattern matching routine. I don't remember where did I get this. */ bool nv::strMatch(const char * str, const char * pat) { - nvDebugCheck(str != NULL); - nvDebugCheck(pat != NULL); + nvDebugCheck(str != NULL); + nvDebugCheck(pat != NULL); char c2; @@ -187,6 +235,13 @@ } } +bool nv::isNumber(const char * str) { + while(*str != '\0') { + if (!isDigit(*str)) return false; + str++; + } + return true; +} /** Empty string. */ @@ -195,313 +250,405 @@ } /** Preallocate space. */ -StringBuilder::StringBuilder( int size_hint ) : m_size(size_hint) +StringBuilder::StringBuilder( uint size_hint ) : m_size(size_hint) { - nvDebugCheck(m_size > 0); - m_str = strAlloc(m_size); - *m_str = '\0'; + nvDebugCheck(m_size > 0); + m_str = strAlloc(m_size); + *m_str = '\0'; } /** Copy ctor. */ StringBuilder::StringBuilder( const StringBuilder & s ) : m_size(0), m_str(NULL) { - copy(s); + copy(s); } /** Copy string. */ -StringBuilder::StringBuilder( const char * s ) : m_size(0), m_str(NULL) +StringBuilder::StringBuilder(const char * s) : m_size(0), m_str(NULL) { - copy(s); + if (s != NULL) { + copy(s); + } +} + +/** Copy string. */ +StringBuilder::StringBuilder(const char * s, uint len) : m_size(0), m_str(NULL) +{ + copy(s, len); } /** Delete the string. */ StringBuilder::~StringBuilder() { - m_size = 0; - strFree(m_str); - m_str = NULL; + strFree(m_str); } /** Format a string safely. */ StringBuilder & StringBuilder::format( const char * fmt, ... ) { - nvDebugCheck(fmt != NULL); - va_list arg; - va_start( arg, fmt ); + nvDebugCheck(fmt != NULL); + va_list arg; + va_start( arg, fmt ); - format( fmt, arg ); + formatList( fmt, arg ); - va_end( arg ); + va_end( arg ); - return *this; + return *this; } /** Format a string safely. 
*/ -StringBuilder & StringBuilder::format( const char * fmt, va_list arg ) +StringBuilder & StringBuilder::formatList( const char * fmt, va_list arg ) { - nvDebugCheck(fmt != NULL); + nvDebugCheck(fmt != NULL); - if( m_size == 0 ) { - m_size = 64; - m_str = strAlloc( m_size ); - } + if (m_size == 0) { + m_size = 64; + m_str = strAlloc( m_size ); + } - va_list tmp; - va_copy(tmp, arg); + va_list tmp; + va_copy(tmp, arg); #if NV_CC_MSVC && _MSC_VER >= 1400 - int n = vsnprintf_s(m_str, m_size, _TRUNCATE, fmt, tmp); + int n = vsnprintf_s(m_str, m_size, _TRUNCATE, fmt, tmp); #else - int n = vsnprintf(m_str, m_size, fmt, tmp); + int n = vsnprintf(m_str, m_size, fmt, tmp); #endif - va_end(tmp); + va_end(tmp); - while( n < 0 || n >= int(m_size) ) { - if( n > -1 ) { - m_size = n + 1; - } - else { - m_size *= 2; - } + while( n < 0 || n >= int(m_size) ) { + if( n > -1 ) { + m_size = n + 1; + } + else { + m_size *= 2; + } - m_str = strReAlloc(m_str, m_size); + m_str = strReAlloc(m_str, m_size); - va_copy(tmp, arg); + va_copy(tmp, arg); #if NV_CC_MSVC && _MSC_VER >= 1400 - n = vsnprintf_s(m_str, m_size, _TRUNCATE, fmt, tmp); + n = vsnprintf_s(m_str, m_size, _TRUNCATE, fmt, tmp); #else - n = vsnprintf(m_str, m_size, fmt, tmp); + n = vsnprintf(m_str, m_size, fmt, tmp); #endif - va_end(tmp); - } - - nvDebugCheck(n < int(m_size)); - - // Make sure it's null terminated. - nvDebugCheck(m_str[n] == '\0'); - //str[n] = '\0'; + va_end(tmp); + } + + nvDebugCheck(n < int(m_size)); + + // Make sure it's null terminated. + nvDebugCheck(m_str[n] == '\0'); + //str[n] = '\0'; - return *this; + return *this; } /** Append a string. */ StringBuilder & StringBuilder::append( const char * s ) { - nvDebugCheck(s != NULL); + return append(s, U32(strlen( s ))); +} - const uint slen = uint(strlen( s )); - if( m_str == NULL ) { - m_size = slen + 1; - m_str = strAlloc(m_size); - strCpy( m_str, m_size, s ); - } - else { - - const uint len = uint(strlen( m_str )); - - if( m_size < len + slen + 1 ) { - m_size = len + slen + 1; - m_str = strReAlloc(m_str, m_size); - } - - strCat( m_str, m_size, s ); - } +/** Append a string. */ +StringBuilder & StringBuilder::append(const char * s, uint len) +{ + nvDebugCheck(s != NULL); + + uint offset = length(); + const uint size = offset + len + 1; + reserve(size); + strCpy(m_str + offset, len + 1, s, len); - return *this; + return *this; } /** Append a formatted string. */ -StringBuilder & StringBuilder::appendFormat( const char * format, ... ) +StringBuilder & StringBuilder::appendFormat( const char * fmt, ... ) { - nvDebugCheck( format != NULL ); + nvDebugCheck( fmt != NULL ); - va_list arg; - va_start( arg, format ); + va_list arg; + va_start( arg, fmt ); - appendFormat( format, arg ); + appendFormatList( fmt, arg ); - va_end( arg ); + va_end( arg ); - return *this; + return *this; } /** Append a formatted string. */ -StringBuilder & StringBuilder::appendFormat( const char * format, va_list arg ) +StringBuilder & StringBuilder::appendFormatList( const char * fmt, va_list arg ) { - nvDebugCheck( format != NULL ); - - va_list tmp; - va_copy(tmp, arg); - - StringBuilder tmp_str; - tmp_str.format( format, tmp ); - append( tmp_str ); - - va_end(tmp); + nvDebugCheck( fmt != NULL ); + + va_list tmp; + va_copy(tmp, arg); + + if (m_size == 0) { + formatList(fmt, arg); + } + else { + StringBuilder tmp_str; + tmp_str.formatList( fmt, tmp ); + append( tmp_str.str() ); + } + + va_end(tmp); + + return *this; +} + +// Append n spaces. 
+StringBuilder & StringBuilder::appendSpace(uint n) +{ + if (m_str == NULL) { + m_size = n + 1; + m_str = strAlloc(m_size); + memset(m_str, ' ', m_size); + m_str[n] = '\0'; + } + else { + const uint len = strLen(m_str); + if (m_size < len + n + 1) { + m_size = len + n + 1; + m_str = strReAlloc(m_str, m_size); + } + memset(m_str + len, ' ', n); + m_str[len+n] = '\0'; + } - return *this; + return *this; } /** Convert number to string in the given base. */ StringBuilder & StringBuilder::number( int i, int base ) { - nvCheck( base >= 2 ); - nvCheck( base <= 36 ); + nvCheck( base >= 2 ); + nvCheck( base <= 36 ); - // @@ This needs to be done correctly. - // length = floor(log(i, base)); - uint len = uint(log(float(i)) / log(float(base)) + 2); // one more if negative - reserve(len); - - if( i < 0 ) { - *m_str = '-'; - *i2a(uint(-i), m_str+1, base) = 0; - } - else { - *i2a(i, m_str, base) = 0; - } + // @@ This needs to be done correctly. + // length = floor(log(i, base)); + uint len = uint(log(float(i)) / log(float(base)) + 2); // one more if negative + reserve(len); + + if( i < 0 ) { + *m_str = '-'; + *i2a(uint(-i), m_str+1, base) = 0; + } + else { + *i2a(i, m_str, base) = 0; + } - return *this; + return *this; } /** Convert number to string in the given base. */ StringBuilder & StringBuilder::number( uint i, int base ) { - nvCheck( base >= 2 ); - nvCheck( base <= 36 ); + nvCheck( base >= 2 ); + nvCheck( base <= 36 ); - // @@ This needs to be done correctly. - // length = floor(log(i, base)); - uint len = uint(log(float(i)) / log(float(base)) - 0.5f + 1); - reserve(len); + // @@ This needs to be done correctly. + // length = floor(log(i, base)); + uint len = uint(log(float(i)) / log(float(base)) - 0.5f + 1); + reserve(len); - *i2a(i, m_str, base) = 0; + *i2a(i, m_str, base) = 0; - return *this; + return *this; } /** Resize the string preserving the contents. */ StringBuilder & StringBuilder::reserve( uint size_hint ) { - nvCheck(size_hint != 0); - if( size_hint > m_size ) { - m_str = strReAlloc(m_str, size_hint); - m_size = size_hint; - } - return *this; + nvCheck(size_hint != 0); + if (size_hint > m_size) { + m_str = strReAlloc(m_str, size_hint); + m_size = size_hint; + } + return *this; } /** Copy a string safely. */ -StringBuilder & StringBuilder::copy( const char * s ) +StringBuilder & StringBuilder::copy(const char * s) +{ + nvCheck( s != NULL ); + const uint str_size = uint(strlen( s )) + 1; + reserve(str_size); + memcpy(m_str, s, str_size); + return *this; +} + +/** Copy a string safely. */ +StringBuilder & StringBuilder::copy(const char * s, uint len) { - nvCheck( s != NULL ); - uint str_size = uint(strlen( s )) + 1; - reserve(str_size); - strCpy( m_str, str_size, s ); - return *this; + nvCheck( s != NULL ); + const uint str_size = len + 1; + reserve(str_size); + strCpy(m_str, str_size, s, len); + return *this; } /** Copy an StringBuilder. 
*/ StringBuilder & StringBuilder::copy( const StringBuilder & s ) { - if( s.m_str == NULL ) { - nvCheck( s.m_size == 0 ); - m_size = 0; - strFree( m_str ); - m_str = NULL; - } - else { - reserve( s.m_size ); - strCpy( m_str, s.m_size, s.m_str ); - } - return *this; + if (s.m_str == NULL) { + nvCheck( s.m_size == 0 ); + reset(); + } + else { + reserve( s.m_size ); + strCpy( m_str, s.m_size, s.m_str ); + } + return *this; +} + +bool StringBuilder::endsWith(const char * str) const +{ + uint l = uint(strlen(str)); + uint ml = uint(strlen(m_str)); + if (ml < l) return false; + return strncmp(m_str + ml - l, str, l) == 0; +} + +bool StringBuilder::beginsWith(const char * str) const +{ + size_t l = strlen(str); + return strncmp(m_str, str, l) == 0; +} + +// Find given char starting from the end. +char * StringBuilder::reverseFind(char c) +{ + int length = (int)strlen(m_str) - 1; + while (length >= 0 && m_str[length] != c) { + length--; + } + if (length >= 0) { + return m_str + length; + } + else { + return NULL; + } } + /** Reset the string. */ void StringBuilder::reset() { - m_size = 0; - strFree( m_str ); - m_str = NULL; + m_size = 0; + strFree( m_str ); + m_str = NULL; +} + +/** Release the allocated string. */ +char * StringBuilder::release() +{ + char * str = m_str; + m_size = 0; + m_str = NULL; + return str; +} + +// Swap strings. +void nv::swap(StringBuilder & a, StringBuilder & b) { + swap(a.m_size, b.m_size); + swap(a.m_str, b.m_str); } /// Get the file name from a path. const char * Path::fileName() const { - return fileName(m_str); + return fileName(m_str); } /// Get the extension from a file path. const char * Path::extension() const { - return extension(m_str); + return extension(m_str); } +/*static */void Path::translatePath(char * path, char pathSeparator/*= NV_PATH_SEPARATOR*/) { + nvCheck(path != NULL); + + for (int i = 0;; i++) { + if (path[i] == '\0') break; + if (path[i] == '\\' || path[i] == '/') path[i] = pathSeparator; + } +} + /// Toggles path separators (ie. \\ into /). -void Path::translatePath() +void Path::translatePath(char pathSeparator/*=NV_PATH_SEPARATOR*/) { - nvCheck( m_str != NULL ); + nvCheck(!isNull()); + translatePath(m_str, pathSeparator); +} - for(int i = 0; ; i++) { - if( m_str[i] == '\0' ) break; -#if NV_PATH_SEPARATOR == '/' - if( m_str[i] == '\\' ) m_str[i] = NV_PATH_SEPARATOR; -#else - if( m_str[i] == '/' ) m_str[i] = NV_PATH_SEPARATOR; -#endif - } +void Path::appendSeparator(char pathSeparator/*=NV_PATH_SEPARATOR*/) +{ + nvCheck(!isNull()); + + const uint l = length(); + + if (m_str[l] != '\\' && m_str[l] != '/') { + char separatorString[] = { pathSeparator, '\0' }; + append(separatorString); + } } /** - * Strip the file name from a path. - * @warning path cannot end with '/' o '\\', can't it? - */ +* Strip the file name from a path. +* @warning path cannot end with '/' o '\\', can't it? +*/ void Path::stripFileName() { - nvCheck( m_str != NULL ); + nvCheck( m_str != NULL ); - int length = (int)strlen(m_str) - 1; - while (length > 0 && m_str[length] != '/' && m_str[length] != '\\'){ - length--; - } - if( length ) { - m_str[length+1] = 0; - } - else { - m_str[0] = 0; - } + int length = (int)strlen(m_str) - 1; + while (length > 0 && m_str[length] != '/' && m_str[length] != '\\'){ + length--; + } + if( length ) { + m_str[length+1] = 0; + } + else { + m_str[0] = 0; + } } /// Strip the extension from a path name. void Path::stripExtension() { - nvCheck( m_str != NULL ); - - int length = (int)strlen(m_str) - 1; - while( length > 0 && m_str[length] != '.' 
) { - length--; - if( m_str[length] == NV_PATH_SEPARATOR ) { - return; // no extension - } - } - if( length ) { - m_str[length] = 0; - } + nvCheck( m_str != NULL ); + + int length = (int)strlen(m_str) - 1; + while (length > 0 && m_str[length] != '.') { + length--; + if( m_str[length] == NV_PATH_SEPARATOR ) { + return; // no extension + } + } + if (length > 0) { + m_str[length] = 0; + } } @@ -509,39 +656,39 @@ // static char Path::separator() { - return NV_PATH_SEPARATOR; + return NV_PATH_SEPARATOR; } // static const char * Path::fileName(const char * str) { - nvCheck( str != NULL ); + nvCheck( str != NULL ); - int length = (int)strlen(str) - 1; - while( length >= 0 && str[length] != separator() ) { - length--; - } + int length = (int)strlen(str) - 1; + while (length >= 0 && str[length] != '\\' && str[length] != '/') { + length--; + } - return &str[length+1]; + return &str[length+1]; } // static const char * Path::extension(const char * str) { - nvCheck( str != NULL ); + nvCheck( str != NULL ); - int length, l; - l = length = (int)strlen( str ); - while( length > 0 && str[length] != '.' ) { - length--; - if( str[length] == separator() ) { - return &str[l]; // no extension - } - } - if( length == 0 ) { - return &str[l]; - } - return &str[length]; + int length, l; + l = length = (int)strlen( str ); + while (length > 0 && str[length] != '.') { + length--; + if (str[length] == '\\' || str[length] == '/') { + return &str[l]; // no extension + } + } + if (length == 0) { + return &str[l]; + } + return &str[length]; } @@ -549,36 +696,77 @@ /// Clone this string String String::clone() const { - String str(data); - return str; + String str(data); + return str; } void String::setString(const char * str) { - if (str == NULL) { - data = NULL; - } - else { - allocString( str ); - addRef(); - } + if (str == NULL) { + data = NULL; + } + else { + allocString( str ); + addRef(); + } } -void String::setString(const char * str, int length) +void String::setString(const char * str, uint length) { - nvDebugCheck(str != NULL); + nvDebugCheck(str != NULL); - allocString(str, length); - addRef(); + allocString(str, length); + addRef(); } void String::setString(const StringBuilder & str) { - if (str.str() == NULL) { - data = NULL; - } - else { - allocString(str); - addRef(); - } + if (str.str() == NULL) { + data = NULL; + } + else { + allocString(str.str()); + addRef(); + } } + +// Add reference count. +void String::addRef() +{ + if (data != NULL) + { + setRefCount(getRefCount() + 1); + } +} + +// Decrease reference count. +void String::release() +{ + if (data != NULL) + { + const uint16 count = getRefCount(); + setRefCount(count - 1); + if (count - 1 == 0) { + free(data - 2); + data = NULL; + } + } +} + +void String::allocString(const char * str, uint len) +{ + const char * ptr = malloc(2 + len + 1); + + setData( ptr ); + setRefCount( 0 ); + + // Copy string. + strCpy(const_cast(data), len+1, str, len); + + // Add terminating character. 
+ const_cast(data)[len] = '\0'; +} + +void nv::swap(String & a, String & b) { + swap(a.data, b.data); +} Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Stream.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Stream.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Stream.h @@ -1,160 +1,164 @@ -// This code is in the public domain -- castanyo@yahoo.es +// This code is in the public domain -- Ignacio Castaño -#ifndef NVCORE_STREAM_H -#define NVCORE_STREAM_H +#pragma once +#ifndef NV_CORE_STREAM_H +#define NV_CORE_STREAM_H -#include -#include +#include "nvcore.h" +#include "Debug.h" namespace nv { -/// Base stream class. -class NVCORE_CLASS Stream { -public: - - enum ByteOrder { - LittleEndian = false, - BigEndian = true, - }; - - /// Get the byte order of the system. - static ByteOrder getSystemByteOrder() { -# if NV_LITTLE_ENDIAN - return LittleEndian; -# else - return BigEndian; -# endif - } - - - /// Ctor. - Stream() : m_byteOrder(LittleEndian) { } - - /// Virtual destructor. - virtual ~Stream() {} - - /// Set byte order. - void setByteOrder(ByteOrder bo) { m_byteOrder = bo; } - - /// Get byte order. - ByteOrder byteOrder() const { return m_byteOrder; } - - - /// Serialize the given data. - virtual uint serialize( void * data, uint len ) = 0; - - /// Move to the given position in the archive. - virtual void seek( uint pos ) = 0; - - /// Return the current position in the archive. - virtual uint tell() const = 0; - - /// Return the current size of the archive. - virtual uint size() const = 0; - - /// Determine if there has been any error. - virtual bool isError() const = 0; - - /// Clear errors. - virtual void clearError() = 0; - - /// Return true if the stream is at the end. - virtual bool isAtEnd() const = 0; - - /// Return true if the stream is seekable. - virtual bool isSeekable() const = 0; - - /// Return true if this is an input stream. - virtual bool isLoading() const = 0; - - /// Return true if this is an output stream. - virtual bool isSaving() const = 0; - - - // friends - friend Stream & operator<<( Stream & s, bool & c ) { -# if NV_OS_DARWIN - nvStaticCheck(sizeof(bool) == 4); - uint8 b = c ? 
1 : 0; - s.serialize( &b, 1 ); - c = (b == 1); -# else - nvStaticCheck(sizeof(bool) == 1); - s.serialize( &c, 1 ); -# endif - return s; - } - friend Stream & operator<<( Stream & s, char & c ) { - nvStaticCheck(sizeof(char) == 1); - s.serialize( &c, 1 ); - return s; - } - friend Stream & operator<<( Stream & s, uint8 & c ) { - nvStaticCheck(sizeof(uint8) == 1); - s.serialize( &c, 1 ); - return s; - } - friend Stream & operator<<( Stream & s, int8 & c ) { - nvStaticCheck(sizeof(int8) == 1); - s.serialize( &c, 1 ); - return s; - } - friend Stream & operator<<( Stream & s, uint16 & c ) { - nvStaticCheck(sizeof(uint16) == 2); - return s.byteOrderSerialize( &c, 2 ); - } - friend Stream & operator<<( Stream & s, int16 & c ) { - nvStaticCheck(sizeof(int16) == 2); - return s.byteOrderSerialize( &c, 2 ); - } - friend Stream & operator<<( Stream & s, uint32 & c ) { - nvStaticCheck(sizeof(uint32) == 4); - return s.byteOrderSerialize( &c, 4 ); - } - friend Stream & operator<<( Stream & s, int32 & c ) { - nvStaticCheck(sizeof(int32) == 4); - return s.byteOrderSerialize( &c, 4 ); - } - friend Stream & operator<<( Stream & s, uint64 & c ) { - nvStaticCheck(sizeof(uint64) == 8); - return s.byteOrderSerialize( &c, 8 ); - } - friend Stream & operator<<( Stream & s, int64 & c ) { - nvStaticCheck(sizeof(int64) == 8); - return s.byteOrderSerialize( &c, 8 ); - } - friend Stream & operator<<( Stream & s, float & c ) { - nvStaticCheck(sizeof(float) == 4); - return s.byteOrderSerialize( &c, 4 ); - } - friend Stream & operator<<( Stream & s, double & c ) { - nvStaticCheck(sizeof(double) == 8); - return s.byteOrderSerialize( &c, 8 ); - } - -protected: - - /// Serialize in the stream byte order. - Stream & byteOrderSerialize( void * v, uint len ) { - if( m_byteOrder == getSystemByteOrder() ) { - serialize( v, len ); - } - else { - for( uint i = len; i > 0; i-- ) { - serialize( (uint8 *)v + i - 1, 1 ); - } - } - return *this; - } + /// Base stream class. + class NVCORE_CLASS Stream { + public: + + enum ByteOrder { + LittleEndian = false, + BigEndian = true, + }; + + /// Get the byte order of the system. + static ByteOrder getSystemByteOrder() { +#if NV_LITTLE_ENDIAN + return LittleEndian; +#else + return BigEndian; +#endif + } + + + /// Ctor. + Stream() : m_byteOrder(LittleEndian) { } + + /// Virtual destructor. + virtual ~Stream() {} + + /// Set byte order. + void setByteOrder(ByteOrder bo) { m_byteOrder = bo; } + + /// Get byte order. + ByteOrder byteOrder() const { return m_byteOrder; } + + + /// Serialize the given data. + virtual uint serialize( void * data, uint len ) = 0; + + /// Move to the given position in the archive. + virtual void seek( uint pos ) = 0; + + /// Return the current position in the archive. + virtual uint tell() const = 0; + + /// Return the current size of the archive. + virtual uint size() const = 0; + + /// Determine if there has been any error. + virtual bool isError() const = 0; + + /// Clear errors. + virtual void clearError() = 0; + + /// Return true if the stream is at the end. + virtual bool isAtEnd() const = 0; + + /// Return true if the stream is seekable. + virtual bool isSeekable() const = 0; + + /// Return true if this is an input stream. + virtual bool isLoading() const = 0; + + /// Return true if this is an output stream. + virtual bool isSaving() const = 0; + + + void advance(uint offset) { seek(tell() + offset); } + + + // friends + friend Stream & operator<<( Stream & s, bool & c ) { +#if NV_OS_DARWIN && !NV_CC_CPP11 + nvStaticCheck(sizeof(bool) == 4); + uint8 b = c ? 
1 : 0; + s.serialize( &b, 1 ); + c = (b == 1); +#else + nvStaticCheck(sizeof(bool) == 1); + s.serialize( &c, 1 ); +#endif + return s; + } + friend Stream & operator<<( Stream & s, char & c ) { + nvStaticCheck(sizeof(char) == 1); + s.serialize( &c, 1 ); + return s; + } + friend Stream & operator<<( Stream & s, uint8 & c ) { + nvStaticCheck(sizeof(uint8) == 1); + s.serialize( &c, 1 ); + return s; + } + friend Stream & operator<<( Stream & s, int8 & c ) { + nvStaticCheck(sizeof(int8) == 1); + s.serialize( &c, 1 ); + return s; + } + friend Stream & operator<<( Stream & s, uint16 & c ) { + nvStaticCheck(sizeof(uint16) == 2); + return s.byteOrderSerialize( &c, 2 ); + } + friend Stream & operator<<( Stream & s, int16 & c ) { + nvStaticCheck(sizeof(int16) == 2); + return s.byteOrderSerialize( &c, 2 ); + } + friend Stream & operator<<( Stream & s, uint32 & c ) { + nvStaticCheck(sizeof(uint32) == 4); + return s.byteOrderSerialize( &c, 4 ); + } + friend Stream & operator<<( Stream & s, int32 & c ) { + nvStaticCheck(sizeof(int32) == 4); + return s.byteOrderSerialize( &c, 4 ); + } + friend Stream & operator<<( Stream & s, uint64 & c ) { + nvStaticCheck(sizeof(uint64) == 8); + return s.byteOrderSerialize( &c, 8 ); + } + friend Stream & operator<<( Stream & s, int64 & c ) { + nvStaticCheck(sizeof(int64) == 8); + return s.byteOrderSerialize( &c, 8 ); + } + friend Stream & operator<<( Stream & s, float & c ) { + nvStaticCheck(sizeof(float) == 4); + return s.byteOrderSerialize( &c, 4 ); + } + friend Stream & operator<<( Stream & s, double & c ) { + nvStaticCheck(sizeof(double) == 8); + return s.byteOrderSerialize( &c, 8 ); + } + + protected: + + /// Serialize in the stream byte order. + Stream & byteOrderSerialize( void * v, uint len ) { + if( m_byteOrder == getSystemByteOrder() ) { + serialize( v, len ); + } + else { + for( uint i = len; i > 0; i-- ) { + serialize( (uint8 *)v + i - 1, 1 ); + } + } + return *this; + } -private: + private: - ByteOrder m_byteOrder; + ByteOrder m_byteOrder; -}; + }; } // nv namespace -#endif // NV_STREAM_H +#endif // NV_CORE_STREAM_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/TextReader.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/TextReader.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/TextReader.h @@ -1,38 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#ifndef NVCORE_TEXTREADER_H -#define NVCORE_TEXTREADER_H - -#include -#include -#include - -namespace nv -{ - -/// Text reader. -class NVCORE_CLASS TextReader { -public: - - /// Ctor. - TextReader(Stream * stream) : m_stream(stream), m_text(512) { - nvCheck(stream != NULL); - nvCheck(stream->isLoading()); - } - - char peek(); - char read(); - - const char *readToEnd(); - - // Returns a temporary string. - const char * readLine(); - -private: - Stream * m_stream; - Array m_text; -}; - -} // nv namespace - -#endif // NVCORE_TEXTREADER_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/TextReader.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/TextReader.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/TextReader.cpp @@ -1,86 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#include - -using namespace nv; - -/// Peek next character. 
-char TextReader::peek() -{ - nvDebugCheck(m_stream != NULL); - nvDebugCheck(m_stream->isSeekable()); - - if (m_stream->isAtEnd()) { - return 0; - } - - uint pos = m_stream->tell(); - - char c; - m_stream->serialize(&c, 1); - m_stream->seek(pos); - return c; -} - -/// Read a single char. -char TextReader::read() -{ - nvDebugCheck(m_stream != NULL); - - char c; - m_stream->serialize(&c, 1); - - if( m_stream->isAtEnd() ) { - return 0; - } - - return c; -} - -/// Read from the current location to the end of the stream. -const char * TextReader::readToEnd() -{ - nvDebugCheck(m_stream != NULL); - const int size = m_stream->size(); - - m_text.clear(); - - m_text.reserve(size + 1); - m_text.resize(size); - - m_stream->serialize(m_text.unsecureBuffer(), size); - m_text.pushBack('\0'); - - return m_text.buffer(); -} - -/// Read from the current location to the end of the line. -const char * TextReader::readLine() -{ - m_text.clear(); - - if (m_stream->isAtEnd()) { - return NULL; - } - - while (true) { - char c = read(); - - if (c == 0 || c == '\n') { - break; - } - else if (c == '\r') { - if( peek() == '\n' ) { - read(); - } - break; - } - - m_text.pushBack(c); - } - - m_text.pushBack('\0'); - return m_text.buffer(); -} - - Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/TextWriter.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/TextWriter.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/TextWriter.h @@ -1,65 +1,62 @@ -// This code is in the public domain -- castanyo@yahoo.es +// This code is in the public domain -- Ignacio Castaño +#pragma once #ifndef NVCORE_TEXTWRITER_H #define NVCORE_TEXTWRITER_H -#include -#include -#include +#include "nvcore.h" +#include "Stream.h" +#include "StrLib.h" namespace nv { - /// Text writer. - class NVCORE_CLASS TextWriter - { - public: - - TextWriter(Stream * s); - - void writeString(const char * str); - void writeString(const char * str, uint len); - void write(const char * format, ...) __attribute__((format (printf, 2, 3))); - void write(const char * format, va_list arg); - - private: - - Stream * s; - - // Temporary string. - StringBuilder str; - - }; - - - inline TextWriter & operator<<( TextWriter & tw, int i) - { - tw.write("%d", i); - return tw; - } - - inline TextWriter & operator<<( TextWriter & tw, uint i) - { - tw.write("%u", i); - return tw; - } - - inline TextWriter & operator<<( TextWriter & tw, float f) - { - tw.write("%f", f); - return tw; - } - - inline TextWriter & operator<<( TextWriter & tw, const char * str) - { - tw.writeString(str); - return tw; - } + /// Text writer. + class NVCORE_CLASS TextWriter + { + public: + + TextWriter(Stream * s); + + void writeString(const char * str); + void writeString(const char * str, uint len); + void format(const char * format, ...) __attribute__((format (printf, 2, 3))); + void formatList(const char * format, va_list arg); + + private: + + Stream * s; + + // Temporary string. 
+ StringBuilder str; + + }; + + + inline TextWriter & operator<<( TextWriter & tw, int i) + { + tw.format("%d", i); + return tw; + } + + inline TextWriter & operator<<( TextWriter & tw, uint i) + { + tw.format("%u", i); + return tw; + } + + inline TextWriter & operator<<( TextWriter & tw, float f) + { + tw.format("%f", f); + return tw; + } + + inline TextWriter & operator<<( TextWriter & tw, const char * str) + { + tw.writeString(str); + return tw; + } } // nv namespace - - - - #endif // NVCORE_TEXTWRITER_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/TextWriter.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/TextWriter.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/TextWriter.cpp @@ -1,45 +1,45 @@ -// This code is in the public domain -- castanyo@yahoo.es +// This code is in the public domain -- Ignacio Castaño -#include +#include "TextWriter.h" using namespace nv; /// Constructor TextWriter::TextWriter(Stream * s) : - s(s), - str(1024) + s(s), + str(1024) { - nvCheck(s != NULL); - nvCheck(s->isSaving()); + nvCheck(s != NULL); + nvCheck(s->isSaving()); } void TextWriter::writeString(const char * str) { - nvDebugCheck(s != NULL); - s->serialize(const_cast(str), (int)strlen(str)); + nvDebugCheck(s != NULL); + s->serialize(const_cast(str), strLen(str)); } void TextWriter::writeString(const char * str, uint len) { - nvDebugCheck(s != NULL); - s->serialize(const_cast(str), len); + nvDebugCheck(s != NULL); + s->serialize(const_cast(str), len); } -void TextWriter::write(const char * format, ...) +void TextWriter::format(const char * format, ...) { - va_list arg; - va_start(arg,format); - str.format(format, arg); - writeString(str.str(), str.length()); - va_end(arg); + va_list arg; + va_start(arg,format); + str.formatList(format, arg); + writeString(str.str(), str.length()); + va_end(arg); } -void TextWriter::write(const char * format, va_list arg) +void TextWriter::formatList(const char * format, va_list arg) { - va_list tmp; - va_copy(tmp, arg); - str.format(format, arg); - writeString(str.str(), str.length()); - va_end(tmp); + va_list tmp; + va_copy(tmp, arg); + str.formatList(format, arg); + writeString(str.str(), str.length()); + va_end(tmp); } Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Timer.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Timer.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Timer.h @@ -0,0 +1,53 @@ +// This code is in the public domain -- castano@gmail.com + +#pragma once +#ifndef NV_CORE_TIMER_H +#define NV_CORE_TIMER_H + +#include "nvcore.h" + +#if NV_CC_MSVC +#include +#endif + +namespace nv { + +#if NV_CC_MSVC + NV_FORCEINLINE uint64 fastCpuClock() { return __rdtsc(); } +#elif NV_CC_GNUC && NV_CPU_X86 + NV_FORCEINLINE uint64 fastCpuClock() { + uint64 val; + __asm__ volatile (".byte 0x0f, 0x31" : "=A" (val)); + return val; + } +#elif NV_CC_GNUC && NV_CPU_X86_64 + NV_FORCEINLINE uint64 fastCpuClock() { + uint hi, lo; + __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); + return uint64(lo) | (uint64(hi) << 32); + } +#else + NV_FORCEINLINE uint64 fastCpuClock() { return 0; } +#endif + + uint64 systemClockFrequency(); + uint64 systemClock(); + + class NVCORE_CLASS Timer + { + public: + Timer() {} + + void start() { m_start = systemClock(); } + void stop() { m_stop = systemClock(); } + + float elapsed() const { return float(m_stop - m_start) / systemClockFrequency(); } + + private: + uint64 m_start; + 
uint64 m_stop; + }; + +} // nv namespace + +#endif // NV_CORE_TIMER_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Timer.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Timer.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Timer.cpp @@ -0,0 +1,44 @@ +// This code is in the public domain -- castano@gmail.com + +#include "Timer.h" + +using namespace nv; + + +#if NV_OS_WIN32 + +#define WINDOWS_LEAN_AND_MEAN +#define VC_EXTRALEAN +#define NOMINMAX +#include // QueryPerformanceFrequency, QueryPerformanceCounter + + +uint64 nv::systemClockFrequency() +{ + uint64 frequency; + QueryPerformanceFrequency((LARGE_INTEGER*) &frequency); + return frequency; +} + +uint64 nv::systemClock() +{ + uint64 counter; + QueryPerformanceCounter((LARGE_INTEGER*) &counter); + return counter; +} + +#else + +#include // clock + +uint64 nv::systemClockFrequency() +{ + return CLOCKS_PER_SEC; +} + +uint64 nv::systemClock() +{ + return clock(); +} + +#endif Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Tokenizer.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Tokenizer.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Tokenizer.h @@ -1,99 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#ifndef NV_CORE_TOKENIZER_H -#define NV_CORE_TOKENIZER_H - -#include -#include -#include -#include - -namespace nv -{ - /// A token produced by the Tokenizer. - class NVCORE_CLASS Token - { - public: - Token(); - Token(const Token & token); - Token(const char * str, int len); - - bool operator==(const char * str) const; - bool operator!=(const char * str) const; - - bool isNull(); - - float toFloat() const; - int toInt() const; - uint toUnsignedInt() const; - String toString() const; - - bool parse(const char * format, int count, ...) const __attribute__((format (scanf, 2, 4))); - - private: - const char * m_str; - int m_len; - }; - - /// Exception thrown by the tokenizer. - class TokenizerException - { - public: - TokenizerException(int line, int column) : m_line(line), m_column(column) {} - - int line() const { return m_line; } - int column() const { return m_column; } - - private: - int m_line; - int m_column; - }; - - // @@ Use enums instead of bools for clarity! - //enum SkipEmptyLines { skipEmptyLines, noSkipEmptyLines }; - //enum SkipEndOfLine { skipEndOfLine, noSkipEndOfLine }; - - /// A simple stream tokenizer. 
- class NVCORE_CLASS Tokenizer - { - public: - Tokenizer(Stream * stream); - - bool nextLine(bool skipEmptyLines = true); - bool nextToken(bool skipEndOfLine = false); - - const Token & token() const { return m_token; } - - int lineNumber() const { return m_lineNumber; } - int columnNumber() const { return m_columnNumber; } - - void setDelimiters(const char * str) { m_delimiters = str; } - const char * delimiters() const { return m_delimiters; } - - void setSpaces(const char * str) { m_spaces = str; } - const char * spaces() const { return m_spaces; } - - private: - char readChar(); - bool readLine(); - bool readToken(); - void skipSpaces(); - bool isSpace(char c); - bool isDelimiter(char c); - - private: - TextReader m_reader; - const char * m_line; - Token m_token; - - int m_lineNumber; - int m_columnNumber; - - const char * m_delimiters; - const char * m_spaces; - }; - -} // nv namespace - - -#endif // NV_CORE_TOKENIZER_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Tokenizer.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Tokenizer.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Tokenizer.cpp @@ -1,229 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#include -#include - -#include // vsscanf -#include // va_list -#include // atof, atoi - -#if NV_CC_MSVC -#if 0 // This doesn't work on MSVC for x64 -/* vsscanf for Win32 - * Written 5/2003 by - * This code is in the Public Domain - */ - -#include // alloca -//#include - -static int vsscanf(const char * buffer, const char * format, va_list argPtr) -{ - // Get an upper bound for the # of args - size_t count = 0; - const char *p = format; - while(1) { - char c = *(p++); - if(c==0) break; - if(c=='%' && (p[0]!='*' && p[0]!='%')) ++count; - } - - // Make a local stack - size_t stackSize = (2+count)*sizeof(void*); - void **newStack = (void**)alloca(stackSize); - - // Fill local stack the way sscanf likes it - newStack[0] = (void*)buffer; - newStack[1] = (void*)format; - memcpy(newStack+2, argPtr, count*sizeof(void*)); - - // @@ Use: CALL DWORD PTR [sscanf] - - // Warp into system sscanf with new stack - int result; - void *savedESP; - __asm - { - mov savedESP, esp - mov esp, newStack -#if _MSC_VER >= 1400 - call DWORD PTR [sscanf_s] -#else - call DWORD PTR [sscanf] -#endif - mov esp, savedESP - mov result, eax - } - return result; -} -#endif -#endif - -using namespace nv; - -Token::Token() : - m_str(""), m_len(0) -{ -} - -Token::Token(const Token & token) : - m_str(token.m_str), m_len(token.m_len) -{ -} - -Token::Token(const char * str, int len) : - m_str(str), m_len(len) -{ -} - -bool Token::operator==(const char * str) const -{ - return strncmp(m_str, str, m_len) == 0; -} -bool Token::operator!=(const char * str) const -{ - return strncmp(m_str, str, m_len) != 0; -} - -bool Token::isNull() -{ - return m_len != 0; -} - -float Token::toFloat() const -{ - return float(atof(m_str)); -} - -int Token::toInt() const -{ - return atoi(m_str); -} - -uint Token::toUnsignedInt() const -{ - // @@ TBD - return uint(atoi(m_str)); -} - -String Token::toString() const -{ - return String(m_str, m_len); -} - -bool Token::parse(const char * format, int count, ...) 
const -{ - va_list arg; - va_start(arg, count); - - int readCount = vsscanf(m_str, format, arg); - - va_end(arg); - - return readCount == count; -} - - -Tokenizer::Tokenizer(Stream * stream) : - m_reader(stream), m_lineNumber(0), m_columnNumber(0), m_delimiters("{}()="), m_spaces(" \t") -{ -} - -bool Tokenizer::nextLine(bool skipEmptyLines /*= true*/) -{ - do { - if (!readLine()) { - return false; - } - } - while (!readToken() && skipEmptyLines); - - return true; -} - -bool Tokenizer::nextToken(bool skipEndOfLine /*= false*/) -{ - if (!readToken()) { - if (!skipEndOfLine) { - return false; - } - else { - return nextLine(true); - } - } - return true; -} - -bool Tokenizer::readToken() -{ - skipSpaces(); - - const char * begin = m_line + m_columnNumber; - - if (*begin == '\0') { - return false; - } - - char c = readChar(); - if (isDelimiter(c)) { - m_token = Token(begin, 1); - return true; - } - - // @@ Add support for quoted tokens "", '' - - int len = 0; - while (!isDelimiter(c) && !isSpace(c) && c != '\0') { - c = readChar(); - len++; - } - m_columnNumber--; - - m_token = Token(begin, len); - - return true; -} - -char Tokenizer::readChar() -{ - return m_line[m_columnNumber++]; -} - -bool Tokenizer::readLine() -{ - m_lineNumber++; - m_columnNumber = 0; - m_line = m_reader.readLine(); - return m_line != NULL; -} - -void Tokenizer::skipSpaces() -{ - while (isSpace(readChar())) {} - m_columnNumber--; -} - -bool Tokenizer::isSpace(char c) -{ - uint i = 0; - while (m_spaces[i] != '\0') { - if (c == m_spaces[i]) { - return true; - } - i++; - } - return false; -} - -bool Tokenizer::isDelimiter(char c) -{ - uint i = 0; - while (m_delimiters[i] != '\0') { - if (c == m_delimiters[i]) { - return true; - } - i++; - } - return false; -} - Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/Utils.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/Utils.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/Utils.h @@ -0,0 +1,282 @@ +// This code is in the public domain -- Ignacio Castaño + +#pragma once +#ifndef NV_CORE_UTILS_H +#define NV_CORE_UTILS_H + +#include "Debug.h" // nvDebugCheck + +#include // for placement new + + +// Just in case. Grrr. +#undef min +#undef max + +#define NV_INT8_MIN (-128) +#define NV_INT8_MAX 127 +#define NV_UINT8_MAX 255 +#define NV_INT16_MIN (-32767-1) +#define NV_INT16_MAX 32767 +#define NV_UINT16_MAX 0xffff +#define NV_INT32_MIN (-2147483647-1) +#define NV_INT32_MAX 2147483647 +#define NV_UINT32_MAX 0xffffffff +#define NV_INT64_MAX POSH_I64(9223372036854775807) +#define NV_INT64_MIN (-POSH_I64(9223372036854775807)-1) +#define NV_UINT64_MAX POSH_U64(0xffffffffffffffff) + +#define NV_HALF_MAX 65504.0F +#define NV_FLOAT_MAX 3.402823466e+38F + +#define NV_INTEGER_TO_FLOAT_MAX 16777217 // Largest integer such that it and all smaller integers can be stored in a 32bit float. + + +namespace nv +{ + // Less error prone than casting. From CB: + // http://cbloomrants.blogspot.com/2011/06/06-17-11-c-casting-is-devil.html + + // These intentionally look like casts. 
+ + // uint32 casts: + template inline uint32 U32(T x) { return x; } + template <> inline uint32 U32(uint64 x) { nvDebugCheck(x <= NV_UINT32_MAX); return (uint32)x; } + template <> inline uint32 U32(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT32_MAX); return (uint32)x; } + //template <> inline uint32 U32(uint32 x) { return x; } + template <> inline uint32 U32(int32 x) { nvDebugCheck(x >= 0); return (uint32)x; } + //template <> inline uint32 U32(uint16 x) { return x; } + template <> inline uint32 U32(int16 x) { nvDebugCheck(x >= 0); return (uint32)x; } + //template <> inline uint32 U32(uint8 x) { return x; } + template <> inline uint32 U32(int8 x) { nvDebugCheck(x >= 0); return (uint32)x; } + + // int32 casts: + template inline int32 I32(T x) { return x; } + template <> inline int32 I32(uint64 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; } + template <> inline int32 I32(int64 x) { nvDebugCheck(x >= NV_INT32_MIN && x <= NV_UINT32_MAX); return (int32)x; } + template <> inline int32 I32(uint32 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; } + //template <> inline int32 I32(int32 x) { return x; } + //template <> inline int32 I32(uint16 x) { return x; } + //template <> inline int32 I32(int16 x) { return x; } + //template <> inline int32 I32(uint8 x) { return x; } + //template <> inline int32 I32(int8 x) { return x; } + + // uint16 casts: + template inline uint16 U16(T x) { return x; } + template <> inline uint16 U16(uint64 x) { nvDebugCheck(x <= NV_UINT16_MAX); return (uint16)x; } + template <> inline uint16 U16(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX); return (uint16)x; } + template <> inline uint16 U16(uint32 x) { nvDebugCheck(x <= NV_UINT16_MAX); return (uint16)x; } + template <> inline uint16 U16(int32 x) { nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX); return (uint16)x; } + //template <> inline uint16 U16(uint16 x) { return x; } + template <> inline uint16 U16(int16 x) { nvDebugCheck(x >= 0); return (uint16)x; } + //template <> inline uint16 U16(uint8 x) { return x; } + template <> inline uint16 U16(int8 x) { nvDebugCheck(x >= 0); return (uint16)x; } + + // int16 casts: + template inline int16 I16(T x) { return x; } + template <> inline int16 I16(uint64 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; } + template <> inline int16 I16(int64 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_UINT16_MAX); return (int16)x; } + template <> inline int16 I16(uint32 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; } + template <> inline int16 I16(int32 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_UINT16_MAX); return (int16)x; } + template <> inline int16 I16(uint16 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; } + //template <> inline int16 I16(int16 x) { return x; } + //template <> inline int16 I16(uint8 x) { return x; } + //template <> inline int16 I16(int8 x) { return x; } + + // uint8 casts: + template inline uint8 U8(T x) { return x; } + template <> inline uint8 U8(uint64 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; } + template <> inline uint8 U8(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; } + template <> inline uint8 U8(uint32 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; } + template <> inline uint8 U8(int32 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; } + template <> inline uint8 U8(uint16 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; } + template <> inline uint8 U8(int16 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; } + //template <> inline uint8 
U8(uint8 x) { return x; } + template <> inline uint8 U8(int8 x) { nvDebugCheck(x >= 0); return (uint8)x; } + //template <> inline uint8 U8(int8 x) { nvDebugCheck(x >= 0.0f && x <= 255.0f); return (uint8)x; } + + // int8 casts: + template inline int8 I8(T x) { return x; } + template <> inline int8 I8(uint64 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; } + template <> inline int8 I8(int64 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; } + template <> inline int8 I8(uint32 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; } + template <> inline int8 I8(int32 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; } + template <> inline int8 I8(uint16 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; } + template <> inline int8 I8(int16 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; } + template <> inline int8 I8(uint8 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; } + //template <> inline int8 I8(int8 x) { return x; } + + // float casts: + template inline float F32(T x) { return x; } + template <> inline float F32(uint64 x) { nvDebugCheck(x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; } + template <> inline float F32(int64 x) { nvDebugCheck(x >= -NV_INTEGER_TO_FLOAT_MAX && x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; } + template <> inline float F32(uint32 x) { nvDebugCheck(x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; } + template <> inline float F32(int32 x) { nvDebugCheck(x >= -NV_INTEGER_TO_FLOAT_MAX && x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; } + // The compiler should not complain about these conversions: + //template <> inline float F32(uint16 x) { nvDebugCheck(return (float)x; } + //template <> inline float F32(int16 x) { nvDebugCheck(return (float)x; } + //template <> inline float F32(uint8 x) { nvDebugCheck(return (float)x; } + //template <> inline float F32(int8 x) { nvDebugCheck(return (float)x; } + + + /// Swap two values. + template + inline void swap(T & a, T & b) + { + T temp(a); + a = b; + b = temp; + } + + /// Return the maximum of the two arguments. For floating point values, it returns the second value if the first is NaN. + template + //inline const T & max(const T & a, const T & b) + inline T max(const T & a, const T & b) + { + return (b < a) ? a : b; + } + + /// Return the maximum of the four arguments. + template + //inline const T & max4(const T & a, const T & b, const T & c) + inline T max4(const T & a, const T & b, const T & c, const T & d) + { + return max(max(a, b), max(c, d)); + } + + /// Return the maximum of the three arguments. + template + //inline const T & max3(const T & a, const T & b, const T & c) + inline T max3(const T & a, const T & b, const T & c) + { + return max(a, max(b, c)); + } + + /// Return the minimum of two values. + template + //inline const T & min(const T & a, const T & b) + inline T min(const T & a, const T & b) + { + return (a < b) ? a : b; + } + + /// Return the maximum of the three arguments. + template + //inline const T & min3(const T & a, const T & b, const T & c) + inline T min3(const T & a, const T & b, const T & c) + { + return min(a, min(b, c)); + } + + /// Clamp between two values. + template + //inline const T & clamp(const T & x, const T & a, const T & b) + inline T clamp(const T & x, const T & a, const T & b) + { + return min(max(x, a), b); + } + + /** Return the next power of two. + * @see http://graphics.stanford.edu/~seander/bithacks.html + * @warning Behaviour for 0 is undefined. 
+ * @note isPowerOfTwo(x) == true -> nextPowerOfTwo(x) == x + * @note nextPowerOfTwo(x) = 2 << log2(x-1) + */ + inline uint nextPowerOfTwo( uint x ) + { + nvDebugCheck( x != 0 ); +#if 1 // On modern CPUs this is supposed to be as fast as using the bsr instruction. + x--; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return x+1; +#else + uint p = 1; + while( x > p ) { + p += p; + } + return p; +#endif + } + + /// Return true if @a n is a power of two. + inline bool isPowerOfTwo( uint n ) + { + return (n & (n-1)) == 0; + } + + + // @@ Move this to utils? + /// Delete all the elements of a container. + template + void deleteAll(T & container) + { + for (typename T::PseudoIndex i = container.start(); !container.isDone(i); container.advance(i)) + { + delete container[i]; + } + } + + + + // @@ Specialize these methods for numeric, pointer, and pod types. + + template + void construct_range(T * restrict ptr, uint new_size, uint old_size) { + for (uint i = old_size; i < new_size; i++) { + new(ptr+i) T; // placement new + } + } + + template + void construct_range(T * restrict ptr, uint new_size, uint old_size, const T & elem) { + for (uint i = old_size; i < new_size; i++) { + new(ptr+i) T(elem); // placement new + } + } + + template + void construct_range(T * restrict ptr, uint new_size, uint old_size, const T * src) { + for (uint i = old_size; i < new_size; i++) { + new(ptr+i) T(src[i]); // placement new + } + } + + template + void destroy_range(T * restrict ptr, uint new_size, uint old_size) { + for (uint i = new_size; i < old_size; i++) { + (ptr+i)->~T(); // Explicit call to the destructor + } + } + + template + void fill(T * restrict dst, uint count, const T & value) { + for (uint i = 0; i < count; i++) { + dst[i] = value; + } + } + + template + void copy_range(T * restrict dst, const T * restrict src, uint count) { + for (uint i = 0; i < count; i++) { + dst[i] = src[i]; + } + } + + template + bool find(const T & element, const T * restrict ptr, uint begin, uint end, uint * index) { + for (uint i = begin; i < end; i++) { + if (ptr[i] == element) { + if (index != NULL) *index = i; + return true; + } + } + return false; + } + +} // nv namespace + +#endif // NV_CORE_UTILS_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/nvcore.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/nvcore.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/nvcore.h @@ -1,11 +1,9 @@ -// This code is in the public domain -- castanyo@yahoo.es +// This code is in the public domain -- Ignacio Castaño +#pragma once #ifndef NV_CORE_H #define NV_CORE_H -// cmake config -#include - // Function linkage #if NVCORE_SHARED #ifdef NVCORE_EXPORTS @@ -22,7 +20,7 @@ // Platform definitions -#include "poshlib/posh.h" +#include // OS: // NV_OS_WIN32 @@ -32,34 +30,64 @@ // NV_OS_LINUX // NV_OS_UNIX // NV_OS_DARWIN +// NV_OS_XBOX +// NV_OS_ORBIS +// NV_OS_IOS -#define NV_OS_STRING POSH_OS_STRING +#define NV_OS_STRING POSH_OS_STRING #if defined POSH_OS_LINUX -# define NV_OS_LINUX 1 -# define NV_OS_UNIX 1 +# define NV_OS_LINUX 1 +# define NV_OS_UNIX 1 +#elif defined POSH_OS_ORBIS +# define NV_OS_ORBIS 1 #elif defined POSH_OS_FREEBSD -# define NV_OS_FREEBSD 1 -# define NV_OS_UNIX 1 +# define NV_OS_FREEBSD 1 +# define NV_OS_UNIX 1 +#elif defined POSH_OS_NETBSD +# define NV_OS_NETBSD 1 +# define NV_OS_UNIX 1 #elif defined POSH_OS_OPENBSD -# define NV_OS_OPENBSD 1 -# define NV_OS_UNIX 1 +# define NV_OS_OPENBSD 1 +# define NV_OS_UNIX 1 #elif 
defined POSH_OS_CYGWIN32 -# define NV_OS_CYGWIN 1 +# define NV_OS_CYGWIN 1 #elif defined POSH_OS_MINGW -# define NV_OS_MINGW 1 -# define NV_OS_WIN32 1 +# define NV_OS_MINGW 1 +# define NV_OS_WIN32 1 #elif defined POSH_OS_OSX -# define NV_OS_DARWIN 1 -# define NV_OS_UNIX 1 +# define NV_OS_DARWIN 1 +# define NV_OS_UNIX 1 +#elif defined POSH_OS_IOS +# define NV_OS_DARWIN 1 //ACS should we keep this on IOS? +# define NV_OS_UNIX 1 +# define NV_OS_IOS 1 #elif defined POSH_OS_UNIX -# define NV_OS_UNIX 1 -#elif defined POSH_OS_WIN32 -# define NV_OS_WIN32 1 +# define NV_OS_UNIX 1 #elif defined POSH_OS_WIN64 -# define NV_OS_WIN64 1 +# define NV_OS_WIN32 1 +# define NV_OS_WIN64 1 +#elif defined POSH_OS_WIN32 +# define NV_OS_WIN32 1 +#elif defined POSH_OS_XBOX +# define NV_OS_XBOX 1 #else -# error "Unsupported OS" +# error "Unsupported OS" +#endif + + +// Threading: +// some platforms don't implement __thread or similar for thread-local-storage +#if NV_OS_UNIX || NV_OS_ORBIS || NV_OS_IOS //ACStodoIOS darwin instead of ios? +# define NV_OS_USE_PTHREAD 1 +# if NV_OS_DARWIN || NV_OS_IOS +# define NV_OS_HAS_TLS_QUALIFIER 0 +# else +# define NV_OS_HAS_TLS_QUALIFIER 1 +# endif +#else +# define NV_OS_USE_PTHREAD 0 +# define NV_OS_HAS_TLS_QUALIFIER 1 #endif @@ -70,45 +98,71 @@ // NV_CPU_ARM // NV_CPU_AARCH64 -#define NV_CPU_STRING POSH_CPU_STRING +#define NV_CPU_STRING POSH_CPU_STRING #if defined POSH_CPU_X86_64 -# define NV_CPU_X86_64 1 +//# define NV_CPU_X86 1 +# define NV_CPU_X86_64 1 #elif defined POSH_CPU_X86 -# define NV_CPU_X86 1 +# define NV_CPU_X86 1 #elif defined POSH_CPU_PPC -# define NV_CPU_PPC 1 +# define NV_CPU_PPC 1 #elif defined POSH_CPU_STRONGARM -# define NV_CPU_ARM 1 +# define NV_CPU_ARM 1 #elif defined POSH_CPU_AARCH64 -# define NV_CPU_AARCH64 1 +# define NV_CPU_AARCH64 1 #else -# error "Unsupported CPU" +# error "Unsupported CPU" #endif // Compiler: // NV_CC_GNUC // NV_CC_MSVC -// @@ NV_CC_MSVC6 -// @@ NV_CC_MSVC7 -// @@ NV_CC_MSVC8 - -#if defined POSH_COMPILER_GCC -# define NV_CC_GNUC 1 -# define NV_CC_STRING "gcc" +// NV_CC_CLANG + +#if defined POSH_COMPILER_CLANG +# define NV_CC_CLANG 1 +# define NV_CC_GNUC 1 // Clang is compatible with GCC. +# define NV_CC_STRING "clang" +#elif defined POSH_COMPILER_GCC +# define NV_CC_GNUC 1 +# define NV_CC_STRING "gcc" #elif defined POSH_COMPILER_MSVC -# define NV_CC_MSVC 1 -# define NV_CC_STRING "msvc" +# define NV_CC_MSVC 1 +# define NV_CC_STRING "msvc" #else -# error "Unsupported compiler" +# error "Unsupported compiler" #endif +#if NV_CC_MSVC +#define NV_CC_CPP11 (__cplusplus > 199711L || _MSC_VER >= 1800) // Visual Studio 2013 has all the features we use, but doesn't advertise full C++11 support yet. +#else +// @@ IC: This works in CLANG, about GCC? +// @@ ES: Doesn't work in gcc. These 3 features are available in GCC >= 4.4. 
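[Editor's illustration, not part of the patch: NV_CC_CPP11, defined just below, is what lets macros such as NV_FORBID_COPY (further down in this hunk) expand to "= delete" on C++11 compilers and to private declarations otherwise. A minimal usage sketch, with an illustrative class name:]

    class MipmapCache
    {
        NV_FORBID_COPY(MipmapCache);    // deleted copy ctor/assignment under NV_CC_CPP11,
                                        // private undefined declarations otherwise
    public:
        MipmapCache() {}
    };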
+#ifdef __clang__ +#define NV_CC_CPP11 (__has_feature(cxx_deleted_functions) && __has_feature(cxx_rvalue_references) && __has_feature(cxx_static_assert)) +#elif defined __GNUC__ +#define NV_CC_CPP11 ( __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) +#endif +#endif // Endiannes: -#define NV_LITTLE_ENDIAN POSH_LITTLE_ENDIAN -#define NV_BIG_ENDIAN POSH_BIG_ENDIAN -#define NV_ENDIAN_STRING POSH_ENDIAN_STRING +#define NV_LITTLE_ENDIAN POSH_LITTLE_ENDIAN +#define NV_BIG_ENDIAN POSH_BIG_ENDIAN +#define NV_ENDIAN_STRING POSH_ENDIAN_STRING + + +// Define the right printf prefix for size_t arguments: +#if POSH_64BIT_POINTER +# define NV_SIZET_PRINTF_PREFIX POSH_I64_PRINTF_PREFIX +#else +# define NV_SIZET_PRINTF_PREFIX +#endif + + +// cmake config +#include "nvconfig.h" // Type definitions: @@ -130,72 +184,124 @@ // Version string: #define NV_VERSION_STRING \ - NV_OS_STRING "/" NV_CC_STRING "/" NV_CPU_STRING"/" \ - NV_ENDIAN_STRING"-endian - " __DATE__ "-" __TIME__ + NV_OS_STRING "/" NV_CC_STRING "/" NV_CPU_STRING"/" \ + NV_ENDIAN_STRING"-endian - " __DATE__ "-" __TIME__ -/// Disable copy constructor and assignment operator. -/// @hideinitializer +// Disable copy constructor and assignment operator. +#if NV_CC_CPP11 +#define NV_FORBID_COPY(C) \ + C( const C & ) = delete; \ + C &operator=( const C & ) = delete +#else #define NV_FORBID_COPY(C) \ private: \ C( const C & ); \ - C &operator=( const C & ); - + C &operator=( const C & ) +#endif -/// Disable dynamic allocation on the heap. -/// See Prohibiting Heap-Based Objects in More Effective C++. -/// @hideinitializer +// Disable dynamic allocation on the heap. +// See Prohibiting Heap-Based Objects in More Effective C++. #define NV_FORBID_HEAPALLOC() \ - private: \ - static void *operator new(size_t size); \ - static void *operator new[](size_t size); + private: \ + void *operator new(size_t size); \ + void *operator new[](size_t size) + //static void *operator new(size_t size); \ + //static void *operator new[](size_t size); // String concatenation macros. #define NV_STRING_JOIN2(arg1, arg2) NV_DO_STRING_JOIN2(arg1, arg2) #define NV_DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2 #define NV_STRING_JOIN3(arg1, arg2, arg3) NV_DO_STRING_JOIN3(arg1, arg2, arg3) #define NV_DO_STRING_JOIN3(arg1, arg2, arg3) arg1 ## arg2 ## arg3 +#define NV_STRING2(x) #x +#define NV_STRING(x) NV_STRING2(x) + +#if NV_CC_MSVC +#define NV_MULTI_LINE_MACRO_BEGIN do { +#define NV_MULTI_LINE_MACRO_END \ + __pragma(warning(push)) \ + __pragma(warning(disable:4127)) \ + } while(false) \ + __pragma(warning(pop)) +#else +#define NV_MULTI_LINE_MACRO_BEGIN do { +#define NV_MULTI_LINE_MACRO_END } while(false) +#endif + +#if NV_CC_CPP11 +#define nvStaticCheck(x) static_assert((x), "Static assert "#x" failed") +#else +#define nvStaticCheck(x) typedef char NV_STRING_JOIN2(__static_assert_,__LINE__)[(x)] +#endif +#define NV_COMPILER_CHECK(x) nvStaticCheck(x) // I like this name best. + +// Make sure type definitions are fine. +NV_COMPILER_CHECK(sizeof(int8) == 1); +NV_COMPILER_CHECK(sizeof(uint8) == 1); +NV_COMPILER_CHECK(sizeof(int16) == 2); +NV_COMPILER_CHECK(sizeof(uint16) == 2); +NV_COMPILER_CHECK(sizeof(int32) == 4); +NV_COMPILER_CHECK(sizeof(uint32) == 4); +NV_COMPILER_CHECK(sizeof(int32) == 4); +NV_COMPILER_CHECK(sizeof(uint32) == 4); + + +#define NV_ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) + +#if 0 // Disabled in The Witness. 
+#if NV_CC_MSVC +#define NV_MESSAGE(x) message(__FILE__ "(" NV_STRING(__LINE__) ") : " x) +#else +#define NV_MESSAGE(x) message(x) +#endif +#else +#define NV_MESSAGE(x) +#endif + // Startup initialization macro. #define NV_AT_STARTUP(some_code) \ - namespace { \ - static struct NV_STRING_JOIN2(AtStartup_, __LINE__) { \ - NV_STRING_JOIN2(AtStartup_, __LINE__)() { some_code; } \ - } \ - NV_STRING_JOIN3(AtStartup_, __LINE__, Instance); \ - }; + namespace { \ + static struct NV_STRING_JOIN2(AtStartup_, __LINE__) { \ + NV_STRING_JOIN2(AtStartup_, __LINE__)() { some_code; } \ + } \ + NV_STRING_JOIN3(AtStartup_, __LINE__, Instance); \ + } -/// Indicate the compiler that the parameter is not used to suppress compier warnings. -/// @hideinitializer +// Indicate the compiler that the parameter is not used to suppress compier warnings. #define NV_UNUSED(a) ((a)=(a)) -/// Null index. @@ Move this somewhere else... This could have collisions with other definitions! -#define NIL uint(~0) +// Null index. @@ Move this somewhere else... it's only used by nvmesh. +//const unsigned int NIL = unsigned int(~0); +//#define NIL uint(~0) -/// Null pointer. +// Null pointer. #ifndef NULL #define NULL 0 #endif // Platform includes #if NV_CC_MSVC -# if NV_OS_WIN32 -# include "DefsVcWin32.h" -# else -# error "MSVC: Platform not supported" -# endif +# if NV_OS_WIN32 +# include "DefsVcWin32.h" +# elif NV_OS_XBOX +# include "DefsVcXBox.h" +# else +# error "MSVC: Platform not supported" +# endif #elif NV_CC_GNUC -# if NV_OS_LINUX -# include "DefsGnucLinux.h" -# elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD -# include "DefsGnucDarwin.h" -# elif NV_OS_MINGW -# include "DefsGnucWin32.h" -# elif NV_OS_CYGWIN -# error "GCC: Cygwin not supported" -# else -# error "GCC: Platform not supported" -# endif +# if NV_OS_LINUX +# include "DefsGnucLinux.h" +# elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_NETBSD || NV_OS_OPENBSD +# include "DefsGnucDarwin.h" +# elif NV_OS_MINGW +# include "DefsGnucWin32.h" +# elif NV_OS_CYGWIN +# error "GCC: Cygwin not supported" +# else +# error "GCC: Platform not supported" +# endif #endif #endif // NV_CORE_H Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/poshlib/CMakeLists.txt =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/poshlib/CMakeLists.txt +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/poshlib/CMakeLists.txt @@ -1,7 +0,0 @@ - -SET(POSHLIB_SRCS - posh.c - posh.h) - -ADD_LIBRARY(posh STATIC ${POSHLIB_SRCS}) - Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/poshlib/posh.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/poshlib/posh.h +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/poshlib/posh.h @@ -1,1022 +0,0 @@ -/** -@file posh.h -@author Brian Hook -@version 1.3.001 - -Header file for POSH, the Portable Open Source Harness project. - -NOTE: Unlike most header files, this one is designed to be included -multiple times, which is why it does not have the @#ifndef/@#define -preamble. - -POSH relies on environment specified preprocessor symbols in order -to infer as much as possible about the target OS/architecture and -the host compiler capabilities. - -NOTE: POSH is simple and focused. It attempts to provide basic -functionality and information, but it does NOT attempt to emulate -missing functionality. 
I am also not willing to make POSH dirty -and hackish to support truly ancient and/or outmoded and/or bizarre -technologies such as non-ANSI compilers, systems with non-IEEE -floating point formats, segmented 16-bit operating systems, etc. - -Please refer to the accompanying HTML documentation or visit -http://www.poshlib.org for more information on how to use POSH. - -LICENSE: - -Copyright (c) 2004, Brian Hook -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - * The names of this package'ss contributors contributors may not - be used to endorse or promote products derived from this - software without specific prior written permission. - - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -REVISION: - -I've been lax about revision histories, so this starts at, um, 1.3.001. -Sorry for any inconveniences. - -1.3.001 - 2/23/2006 - Incorporated fix for bug reported by Bill Cary, - where I was not detecting Visual Studio - compilation on x86-64 systems. Added check for - _M_X64 which should fix that. 
- -*/ -/* -I have yet to find an authoritative reference on preprocessor -symbols, but so far this is what I've gleaned: - -GNU GCC/G++: - - __GNUC__: GNU C version - - __GNUG__: GNU C++ compiler - - __sun__ : on Sun platforms - - __svr4__: on Solaris and other SysV R4 platforms - - __mips__: on MIPS processor platforms - - __sparc_v9__: on Sparc 64-bit CPUs - - __sparcv9: 64-bit Solaris - - __MIPSEL__: mips processor, compiled for little endian - - __MIPSEB__: mips processor, compiled for big endian - - _R5900: MIPS/Sony/Toshiba R5900 (PS2) - - mc68000: 68K - - m68000: 68K - - m68k: 68K - - __palmos__: PalmOS - -Intel C/C++ Compiler: - - __ECC : compiler version, IA64 only - - __EDG__ - - __ELF__ - - __GXX_ABI_VERSION - - __i386 : IA-32 only - - __i386__ : IA-32 only - - i386 : IA-32 only - - __ia64 : IA-64 only - - __ia64__ : IA-64 only - - ia64 : IA-64 only - - __ICC : IA-32 only - - __INTEL_COMPILER : IA-32 or IA-64, newer versions only - -Apple's C/C++ Compiler for OS X: - - __APPLE_CC__ - - __APPLE__ - - __BIG_ENDIAN__ - - __APPLE__ - - __ppc__ - - __MACH__ - -DJGPP: - - __MSDOS__ - - __unix__ - - __unix - - __GNUC__ - - __GO32 - - DJGPP - - __i386, __i386, i386 - -Cray's C compiler: - - _ADDR64: if 64-bit pointers - - _UNICOS: - - __unix: - -SGI's CC compiler predefines the following (and more) with -ansi: - - __sgi - - __unix - - __host_mips - - _SYSTYPE_SVR4 - - __mips - - _MIPSEB - - anyone know if there is a predefined symbol for the compiler?! - -MinGW: - - as GnuC but also defines _WIN32, __WIN32, WIN32, _X86_, __i386, __i386__, and several others - - __MINGW32__ - -Cygwin: - - as Gnu C, but also - - __unix__ - - __CYGWIN32__ - -Microsoft Visual Studio predefines the following: - - _MSC_VER - - _WIN32: on Win32 - - _M_IX6 (on x86 systems) - - _M_X64: on x86-64 systems - - _M_ALPHA (on DEC AXP systems) - - _SH3: WinCE, Hitachi SH-3 - - _MIPS: WinCE, MIPS - - _ARM: WinCE, ARM - -Sun's C Compiler: - - sun and _sun - - unix and _unix - - sparc and _sparc (SPARC systems only) - - i386 and _i386 (x86 systems only) - - __SVR4 (Solaris only) - - __sparcv9: 64-bit solaris - - __SUNPRO_C - - _LP64: defined in 64-bit LP64 mode, but only if is included - -Borland C/C++ predefines the following: - - __BORLANDC__: - -DEC/Compaq C/C++ on Alpha: - - __alpha - - __arch64__ - - __unix__ (on Tru64 Unix) - - __osf__ - - __DECC - - __DECCXX (C++ compilation) - - __DECC_VER - - __DECCXX_VER - -IBM's AIX compiler: - - __64BIT__ if 64-bit mode - - _AIX - - __IBMC__: C compiler version - - __IBMCPP__: C++ compiler version - - _LONG_LONG: compiler allows long long - -Watcom: - - __WATCOMC__ - - __DOS__ : if targeting DOS - - __386__ : if 32-bit support - - __WIN32__ : if targetin 32-bit Windows - -HP-UX C/C++ Compiler: - - __hpux - - __unix - - __hppa (on PA-RISC) - - __LP64__: if compiled in 64-bit mode - -Metrowerks: - - __MWERKS__ - - __powerpc__ - - _powerc - - __MC68K__ - - macintosh when compiling for MacOS - - __INTEL__ for x86 targets - - __POWERPC__ - -*/ - -/* -** ---------------------------------------------------------------------------- -** Include optionally -** ---------------------------------------------------------------------------- -*/ -#ifdef POSH_USE_LIMITS_H -# include -#endif - -/* -** ---------------------------------------------------------------------------- -** Determine compilation environment -** ---------------------------------------------------------------------------- -*/ -#if defined __ECC || defined __ICC || defined __INTEL_COMPILER -# define POSH_COMPILER_STRING 
"Intel C/C++" -# define POSH_COMPILER_INTEL 1 -#endif - -#if ( defined __host_mips || defined __sgi ) && !defined __GNUC__ -# define POSH_COMPILER_STRING "MIPSpro C/C++" -# define POSH_COMPILER_MIPSPRO 1 -#endif - -#if defined __hpux && !defined __GNUC__ -# define POSH_COMPILER_STRING "HP-UX CC" -# define POSH_COMPILER_HPCC 1 -#endif - -#if defined __GNUC__ -# define POSH_COMPILER_STRING "Gnu GCC" -# define POSH_COMPILER_GCC 1 -#endif - -#if defined __APPLE_CC__ - /* we don't define the compiler string here, let it be GNU */ -# define POSH_COMPILER_APPLECC 1 -#endif - -#if defined __IBMC__ || defined __IBMCPP__ -# define POSH_COMPILER_STRING "IBM C/C++" -# define POSH_COMPILER_IBM 1 -#endif - -#if defined _MSC_VER -# define POSH_COMPILER_STRING "Microsoft Visual C++" -# define POSH_COMPILER_MSVC 1 -#endif - -#if defined __SUNPRO_C -# define POSH_COMPILER_STRING "Sun Pro" -# define POSH_COMPILER_SUN 1 -#endif - -#if defined __BORLANDC__ -# define POSH_COMPILER_STRING "Borland C/C++" -# define POSH_COMPILER_BORLAND 1 -#endif - -#if defined __MWERKS__ -# define POSH_COMPILER_STRING "MetroWerks CodeWarrior" -# define POSH_COMPILER_METROWERKS 1 -#endif - -#if defined __DECC || defined __DECCXX -# define POSH_COMPILER_STRING "Compaq/DEC C/C++" -# define POSH_COMPILER_DEC 1 -#endif - -#if defined __WATCOMC__ -# define POSH_COMPILER_STRING "Watcom C/C++" -# define POSH_COMPILER_WATCOM 1 -#endif - -#if !defined POSH_COMPILER_STRING -# define POSH_COMPILER_STRING "Unknown compiler" -#endif - -/* -** ---------------------------------------------------------------------------- -** Determine target operating system -** ---------------------------------------------------------------------------- -*/ -#if defined linux || defined __linux__ -# define POSH_OS_LINUX 1 -# define POSH_OS_STRING "Linux" -#endif - -#if defined __FreeBSD__ -# define POSH_OS_FREEBSD 1 -# define POSH_OS_STRING "FreeBSD" -#endif - -#if defined __OpenBSD__ -# define POSH_OS_OPENBSD 1 -# define POSH_OS_STRING "OpenBSD" -#endif - -#if defined __CYGWIN32__ -# define POSH_OS_CYGWIN32 1 -# define POSH_OS_STRING "Cygwin" -#endif - -#if defined GEKKO -# define POSH_OS_GAMECUBE -# define __powerpc__ -# define POSH_OS_STRING "GameCube" -#endif - -#if defined __MINGW32__ -# define POSH_OS_MINGW 1 -# define POSH_OS_STRING "MinGW" -#endif - -#if defined GO32 && defined DJGPP && defined __MSDOS__ -# define POSH_OS_GO32 1 -# define POSH_OS_STRING "GO32/MS-DOS" -#endif - -/* NOTE: make sure you use /bt=DOS if compiling for 32-bit DOS, - otherwise Watcom assumes host=target */ -#if defined __WATCOMC__ && defined __386__ && defined __DOS__ -# define POSH_OS_DOS32 1 -# define POSH_OS_STRING "DOS/32-bit" -#endif - -#if defined _UNICOS -# define POSH_OS_UNICOS 1 -# define POSH_OS_STRING "UNICOS" -#endif - -#if ( defined __MWERKS__ && defined __powerc && !defined macintosh ) || defined __APPLE_CC__ || defined macosx -# define POSH_OS_OSX 1 -# define POSH_OS_STRING "MacOS X" -#endif - -#if defined __sun__ || defined sun || defined __sun || defined __solaris__ -# if defined __SVR4 || defined __svr4__ || defined __solaris__ -# define POSH_OS_STRING "Solaris" -# define POSH_OS_SOLARIS 1 -# endif -# if !defined POSH_OS_STRING -# define POSH_OS_STRING "SunOS" -# define POSH_OS_SUNOS 1 -# endif -#endif - -#if defined __sgi__ || defined sgi || defined __sgi -# define POSH_OS_IRIX 1 -# define POSH_OS_STRING "Irix" -#endif - -#if defined __hpux__ || defined __hpux -# define POSH_OS_HPUX 1 -# define POSH_OS_STRING "HP-UX" -#endif - -#if defined _AIX -# define 
POSH_OS_AIX 1 -# define POSH_OS_STRING "AIX" -#endif - -#if ( defined __alpha && defined __osf__ ) -# define POSH_OS_TRU64 1 -# define POSH_OS_STRING "Tru64" -#endif - -#if defined __BEOS__ || defined __beos__ -# define POSH_OS_BEOS 1 -# define POSH_OS_STRING "BeOS" -#endif - -#if defined amiga || defined amigados || defined AMIGA || defined _AMIGA -# define POSH_OS_AMIGA 1 -# define POSH_OS_STRING "Amiga" -#endif - -#if defined __unix__ -# define POSH_OS_UNIX 1 -# if !defined POSH_OS_STRING -# define POSH_OS_STRING "Unix-like(generic)" -# endif -#endif - -#if defined _WIN32_WCE -# define POSH_OS_WINCE 1 -# define POSH_OS_STRING "Windows CE" -#endif - -#if defined _XBOX -# define POSH_OS_XBOX 1 -# define POSH_OS_STRING "XBOX" -#endif - -#if defined _WIN32 || defined WIN32 || defined __NT__ || defined __WIN32__ -# define POSH_OS_WIN32 1 -# if !defined POSH_OS_XBOX -# if defined _WIN64 -# define POSH_OS_WIN64 1 -# define POSH_OS_STRING "Win64" -# else -# if !defined POSH_OS_STRING -# define POSH_OS_STRING "Win32" -# endif -# endif -# endif -#endif - -#if defined __palmos__ -# define POSH_OS_PALM 1 -# define POSH_OS_STRING "PalmOS" -#endif - -#if defined THINK_C || defined macintosh -# define POSH_OS_MACOS 1 -# define POSH_OS_STRING "MacOS" -#endif - -/* -** ----------------------------------------------------------------------------- -** Determine target CPU -** ----------------------------------------------------------------------------- -*/ - -#if defined GEKKO -# define POSH_CPU_PPC750 1 -# define POSH_CPU_STRING "IBM PowerPC 750 (NGC)" -#endif - -#if defined mc68000 || defined m68k || defined __MC68K__ || defined m68000 -# define POSH_CPU_68K 1 -# define POSH_CPU_STRING "MC68000" -#endif - -#if defined __PPC__ || defined __POWERPC__ || defined powerpc || defined _POWER || defined __ppc__ || defined __powerpc__ -# define POSH_CPU_PPC 1 -# if !defined POSH_CPU_STRING -# if defined __powerpc64__ -# define POSH_CPU_STRING "PowerPC64" -# else -# define POSH_CPU_STRING "PowerPC" -# endif -# endif -#endif - -#if defined _CRAYT3E || defined _CRAYMPP -# define POSH_CPU_CRAYT3E 1 /* target processor is a DEC Alpha 21164 used in a Cray T3E*/ -# define POSH_CPU_STRING "Cray T3E (Alpha 21164)" -#endif - -#if defined CRAY || defined _CRAY && !defined _CRAYT3E -# error Non-AXP Cray systems not supported -#endif - -#if defined _SH3 -# define POSH_CPU_SH3 1 -# define POSH_CPU_STRING "Hitachi SH-3" -#endif - -#if defined __sh4__ || defined __SH4__ -# define POSH_CPU_SH3 1 -# define POSH_CPU_SH4 1 -# define POSH_CPU_STRING "Hitachi SH-4" -#endif - -#if defined __sparc__ || defined __sparc -# if defined __arch64__ || defined __sparcv9 || defined __sparc_v9__ -# define POSH_CPU_SPARC64 1 -# define POSH_CPU_STRING "Sparc/64" -# else -# define POSH_CPU_STRING "Sparc/32" -# endif -# define POSH_CPU_SPARC 1 -#endif - -#if defined ARM || defined __arm__ || defined _ARM -# define POSH_CPU_STRONGARM 1 -# define POSH_CPU_STRING "ARM" -#endif - -#if defined __aarch64__ -# define POSH_CPU_AARCH64 1 -# define POSH_CPU_STRING "ARM64" -#endif - -#if defined mips || defined __mips__ || defined __MIPS__ || defined _MIPS -# define POSH_CPU_MIPS 1 -# if defined _R5900 -# define POSH_CPU_STRING "MIPS R5900 (PS2)" -# else -# define POSH_CPU_STRING "MIPS" -# endif -#endif - -#if defined __ia64 || defined _M_IA64 || defined __ia64__ -# define POSH_CPU_IA64 1 -# define POSH_CPU_STRING "IA64" -#endif - -#if defined __X86__ || defined __i386__ || defined i386 || defined _M_IX86 || defined __386__ || defined __x86_64__ || defined 
_M_X64 -# define POSH_CPU_X86 1 -# if defined __x86_64__ || defined _M_X64 -# define POSH_CPU_X86_64 1 -# endif -# if defined POSH_CPU_X86_64 -# define POSH_CPU_STRING "AMD x86-64" -# else -# define POSH_CPU_STRING "Intel 386+" -# endif -#endif - -#if defined __alpha || defined alpha || defined _M_ALPHA || defined __alpha__ -# define POSH_CPU_AXP 1 -# define POSH_CPU_STRING "AXP" -#endif - -#if defined __hppa || defined hppa -# define POSH_CPU_HPPA 1 -# define POSH_CPU_STRING "PA-RISC" -#endif - -#if !defined POSH_CPU_STRING -# error POSH cannot determine target CPU -# define POSH_CPU_STRING "Unknown" /* this is here for Doxygen's benefit */ -#endif - -/* -** ----------------------------------------------------------------------------- -** Attempt to autodetect building for embedded on Sony PS2 -** ----------------------------------------------------------------------------- -*/ -#if !defined POSH_OS_STRING -# if !defined FORCE_DOXYGEN -# define POSH_OS_EMBEDDED 1 -# endif -# if defined _R5900 -# define POSH_OS_STRING "Sony PS2(embedded)" -# else -# define POSH_OS_STRING "Embedded/Unknown" -# endif -#endif - -/* -** --------------------------------------------------------------------------- -** Handle cdecl, stdcall, fastcall, etc. -** --------------------------------------------------------------------------- -*/ -#if defined POSH_CPU_X86 && !defined POSH_CPU_X86_64 -# if defined __GNUC__ -# define POSH_CDECL __attribute__((cdecl)) -# define POSH_STDCALL __attribute__((stdcall)) -# define POSH_FASTCALL __attribute__((fastcall)) -# elif ( defined _MSC_VER || defined __WATCOMC__ || defined __BORLANDC__ || defined __MWERKS__ ) -# define POSH_CDECL __cdecl -# define POSH_STDCALL __stdcall -# define POSH_FASTCALL __fastcall -# endif -#else -# define POSH_CDECL -# define POSH_STDCALL -# define POSH_FASTCALL -#endif - -/* -** --------------------------------------------------------------------------- -** Define POSH_IMPORTEXPORT signature based on POSH_DLL and POSH_BUILDING_LIB -** --------------------------------------------------------------------------- -*/ - -/* -** We undefine this so that multiple inclusions will work -*/ -#if defined POSH_IMPORTEXPORT -# undef POSH_IMPORTEXPORT -#endif - -#if defined POSH_DLL -# if defined POSH_OS_WIN32 -# if defined _MSC_VER -# if ( _MSC_VER >= 800 ) -# if defined POSH_BUILDING_LIB -# define POSH_IMPORTEXPORT __declspec( dllexport ) -# else -# define POSH_IMPORTEXPORT __declspec( dllimport ) -# endif -# else -# if defined POSH_BUILDING_LIB -# define POSH_IMPORTEXPORT __export -# else -# define POSH_IMPORTEXPORT -# endif -# endif -# endif /* defined _MSC_VER */ -# if defined __BORLANDC__ -# if ( __BORLANDC__ >= 0x500 ) -# if defined POSH_BUILDING_LIB -# define POSH_IMPORTEXPORT __declspec( dllexport ) -# else -# define POSH_IMPORTEXPORT __declspec( dllimport ) -# endif -# else -# if defined POSH_BUILDING_LIB -# define POSH_IMPORTEXPORT __export -# else -# define POSH_IMPORTEXPORT -# endif -# endif -# endif /* defined __BORLANDC__ */ - /* for all other compilers, we're just making a blanket assumption */ -# if defined __GNUC__ || defined __WATCOMC__ || defined __MWERKS__ -# if defined POSH_BUILDING_LIB -# define POSH_IMPORTEXPORT __declspec( dllexport ) -# else -# define POSH_IMPORTEXPORT __declspec( dllimport ) -# endif -# endif /* all other compilers */ -# if !defined POSH_IMPORTEXPORT -# error Building DLLs not supported on this compiler (poshlib@poshlib.org if you know how) -# endif -# endif /* defined POSH_OS_WIN32 */ -#endif - -/* On pretty much 
everything else, we can thankfully just ignore this */ -#if !defined POSH_IMPORTEXPORT -# define POSH_IMPORTEXPORT -#endif - -#if defined FORCE_DOXYGEN -# define POSH_DLL -# define POSH_BUILDING_LIB -# undef POSH_DLL -# undef POSH_BUILDING_LIB -#endif - -/* -** ---------------------------------------------------------------------------- -** (Re)define POSH_PUBLIC_API export signature -** ---------------------------------------------------------------------------- -*/ -#ifdef POSH_PUBLIC_API -# undef POSH_PUBLIC_API -#endif - -#if ( ( defined _MSC_VER ) && ( _MSC_VER < 800 ) ) || ( defined __BORLANDC__ && ( __BORLANDC__ < 0x500 ) ) -# define POSH_PUBLIC_API(rtype) extern rtype POSH_IMPORTEXPORT -#else -# define POSH_PUBLIC_API(rtype) extern POSH_IMPORTEXPORT rtype -#endif - -/* -** ---------------------------------------------------------------------------- -** Try to infer endianess. Basically we just go through the CPUs we know are -** little endian, and assume anything that isn't one of those is big endian. -** As a sanity check, we also do this with operating systems we know are -** little endian, such as Windows. Some processors are bi-endian, such as -** the MIPS series, so we have to be careful about those. -** ---------------------------------------------------------------------------- -*/ -#if defined POSH_CPU_X86 || defined POSH_CPU_AXP || defined POSH_CPU_STRONGARM || defined POSH_CPU_AARCH64 || defined POSH_OS_WIN32 || defined POSH_OS_WINCE || defined __MIPSEL__ -# define POSH_ENDIAN_STRING "little" -# define POSH_LITTLE_ENDIAN 1 -#else -# define POSH_ENDIAN_STRING "big" -# define POSH_BIG_ENDIAN 1 -#endif - -#if defined FORCE_DOXYGEN -# define POSH_LITTLE_ENDIAN -#endif - -/* -** ---------------------------------------------------------------------------- -** Cross-platform compile time assertion macro -** ---------------------------------------------------------------------------- -*/ -#define POSH_COMPILE_TIME_ASSERT(name, x) typedef int _POSH_dummy_ ## name[(x) ? 1 : -1 ] - -/* -** ---------------------------------------------------------------------------- -** 64-bit Integer -** -** We don't require 64-bit support, nor do we emulate its functionality, we -** simply export it if it's available. Since we can't count on -** for 64-bit support, we ignore the POSH_USE_LIMITS_H directive. 
-** ---------------------------------------------------------------------------- -*/ -#if defined ( __LP64__ ) || defined ( __powerpc64__ ) || defined POSH_CPU_SPARC64 -# define POSH_64BIT_INTEGER 1 -typedef long posh_i64_t; -typedef unsigned long posh_u64_t; -# define POSH_I64( x ) ((posh_i64_t)x) -# define POSH_U64( x ) ((posh_u64_t)x) -# define POSH_I64_PRINTF_PREFIX "l" -#elif defined _MSC_VER || defined __BORLANDC__ || defined __WATCOMC__ || ( defined __alpha && defined __DECC ) -# define POSH_64BIT_INTEGER 1 -typedef __int64 posh_i64_t; -typedef unsigned __int64 posh_u64_t; -# define POSH_I64( x ) ((posh_i64_t)x) -# define POSH_U64( x ) ((posh_u64_t)x) -# define POSH_I64_PRINTF_PREFIX "I64" -#elif defined __GNUC__ || defined __MWERKS__ || defined __SUNPRO_C || defined __SUNPRO_CC || defined __APPLE_CC__ || defined POSH_OS_IRIX || defined _LONG_LONG || defined _CRAYC -# define POSH_64BIT_INTEGER 1 -typedef long long posh_i64_t; -typedef unsigned long long posh_u64_t; -# define POSH_U64( x ) ((posh_u64_t)(x##LL)) -# define POSH_I64( x ) ((posh_i64_t)(x##LL)) -# define POSH_I64_PRINTF_PREFIX "ll" -#endif - -/* hack */ -/*#ifdef __MINGW32__ -#undef POSH_I64 -#undef POSH_U64 -#undef POSH_I64_PRINTF_PREFIX -#define POSH_I64( x ) ((posh_i64_t)x) -#define POSH_U64( x ) ((posh_u64_t)x) -#define POSH_I64_PRINTF_PREFIX "I64" -#endif*/ - -#ifdef FORCE_DOXYGEN -typedef long long posh_i64_t; -typedef unsigned long posh_u64_t; -# define POSH_64BIT_INTEGER -# define POSH_I64_PRINTF_PREFIX -# define POSH_I64(x) -# define POSH_U64(x) -#endif - -/** Minimum value for a 64-bit signed integer */ -#define POSH_I64_MIN POSH_I64(0x8000000000000000) -/** Maximum value for a 64-bit signed integer */ -#define POSH_I64_MAX POSH_I64(0x7FFFFFFFFFFFFFFF) -/** Minimum value for a 64-bit unsigned integer */ -#define POSH_U64_MIN POSH_U64(0) -/** Maximum value for a 64-bit unsigned integer */ -#define POSH_U64_MAX POSH_U64(0xFFFFFFFFFFFFFFFF) - -/* ---------------------------------------------------------------------------- -** Basic Sized Types -** -** These types are expected to be EXACTLY sized so you can use them for -** serialization. -** ---------------------------------------------------------------------------- -*/ -#define POSH_FALSE 0 -#define POSH_TRUE 1 - -typedef int posh_bool_t; -typedef unsigned char posh_byte_t; - -/* NOTE: These assume that CHAR_BIT is 8!! */ -typedef unsigned char posh_u8_t; -typedef signed char posh_i8_t; - -#if defined POSH_USE_LIMITS_H -# if CHAR_BITS > 8 -# error This machine uses 9-bit characters. This is a warning, you can comment this out now. 
-# endif /* CHAR_BITS > 8 */ - -/* 16-bit */ -# if ( USHRT_MAX == 65535 ) - typedef unsigned short posh_u16_t; - typedef short posh_i16_t; -# else - /* Yes, in theory there could still be a 16-bit character type and shorts are - 32-bits in size...if you find such an architecture, let me know =P */ -# error No 16-bit type found -# endif - -/* 32-bit */ -# if ( INT_MAX == 2147483647 ) - typedef unsigned posh_u32_t; - typedef int posh_i32_t; -# elif ( LONG_MAX == 2147483647 ) - typedef unsigned long posh_u32_t; - typedef long posh_i32_t; -# else - error No 32-bit type found -# endif - -#else /* POSH_USE_LIMITS_H */ - - typedef unsigned short posh_u16_t; - typedef short posh_i16_t; - -# if !defined POSH_OS_PALM - typedef unsigned posh_u32_t; - typedef int posh_i32_t; -# else - typedef unsigned long posh_u32_t; - typedef long posh_i32_t; -# endif -#endif - -/** Minimum value for a byte */ -#define POSH_BYTE_MIN 0 -/** Maximum value for an 8-bit unsigned value */ -#define POSH_BYTE_MAX 255 -/** Minimum value for a byte */ -#define POSH_I16_MIN ( ( posh_i16_t ) 0x8000 ) -/** Maximum value for a 16-bit signed value */ -#define POSH_I16_MAX ( ( posh_i16_t ) 0x7FFF ) -/** Minimum value for a 16-bit unsigned value */ -#define POSH_U16_MIN 0 -/** Maximum value for a 16-bit unsigned value */ -#define POSH_U16_MAX ( ( posh_u16_t ) 0xFFFF ) -/** Minimum value for a 32-bit signed value */ -#define POSH_I32_MIN ( ( posh_i32_t ) 0x80000000 ) -/** Maximum value for a 32-bit signed value */ -#define POSH_I32_MAX ( ( posh_i32_t ) 0x7FFFFFFF ) -/** Minimum value for a 32-bit unsigned value */ -#define POSH_U32_MIN 0 -/** Maximum value for a 32-bit unsigned value */ -#define POSH_U32_MAX ( ( posh_u32_t ) 0xFFFFFFFF ) - -/* -** ---------------------------------------------------------------------------- -** Sanity checks on expected sizes -** ---------------------------------------------------------------------------- -*/ -#if !defined FORCE_DOXYGEN - -POSH_COMPILE_TIME_ASSERT(posh_byte_t, sizeof(posh_byte_t) == 1); -POSH_COMPILE_TIME_ASSERT(posh_u8_t, sizeof(posh_u8_t) == 1); -POSH_COMPILE_TIME_ASSERT(posh_i8_t, sizeof(posh_i8_t) == 1); -POSH_COMPILE_TIME_ASSERT(posh_u16_t, sizeof(posh_u16_t) == 2); -POSH_COMPILE_TIME_ASSERT(posh_i16_t, sizeof(posh_i16_t) == 2); -POSH_COMPILE_TIME_ASSERT(posh_u32_t, sizeof(posh_u32_t) == 4); -POSH_COMPILE_TIME_ASSERT(posh_i32_t, sizeof(posh_i32_t) == 4); - -#if !defined POSH_NO_FLOAT - POSH_COMPILE_TIME_ASSERT(posh_testfloat_t, sizeof(float)==4 ); - POSH_COMPILE_TIME_ASSERT(posh_testdouble_t, sizeof(double)==8); -#endif - -#if defined POSH_64BIT_INTEGER - POSH_COMPILE_TIME_ASSERT(posh_u64_t, sizeof(posh_u64_t) == 8); - POSH_COMPILE_TIME_ASSERT(posh_i64_t, sizeof(posh_i64_t) == 8); -#endif - -#endif - -/* -** ---------------------------------------------------------------------------- -** 64-bit pointer support -** ---------------------------------------------------------------------------- -*/ -#if defined POSH_CPU_AXP && ( defined POSH_OS_TRU64 || defined POSH_OS_LINUX ) -# define POSH_64BIT_POINTER 1 -#endif - -#if defined POSH_CPU_X86_64 && defined POSH_OS_LINUX -# define POSH_64BIT_POINTER 1 -#endif - -#if defined POSH_CPU_SPARC64 || defined POSH_OS_WIN64 || defined __64BIT__ || defined __LP64 || defined _LP64 || defined __LP64__ || defined _ADDR64 || defined _CRAYC -# define POSH_64BIT_POINTER 1 -#endif - -#if defined POSH_64BIT_POINTER - POSH_COMPILE_TIME_ASSERT( posh_64bit_pointer, sizeof( void * ) == 8 ); -#elif !defined FORCE_DOXYGEN -/* if this assertion is hit then 
you're on a system that either has 64-bit - addressing and we didn't catch it, or you're on a system with 16-bit - pointers. In the latter case, POSH doesn't actually care, we're just - triggering this assertion to make sure you're aware of the situation, - so feel free to delete it. - - If this assertion is triggered on a known 32 or 64-bit platform, - please let us know (poshlib@poshlib.org) */ - POSH_COMPILE_TIME_ASSERT( posh_32bit_pointer, sizeof( void * ) == 4 ); -#endif - -#if defined FORCE_DOXYGEN -# define POSH_64BIT_POINTER -#endif - -/* -** ---------------------------------------------------------------------------- -** POSH Utility Functions -** -** These are optional POSH utility functions that are not required if you don't -** need anything except static checking of your host and target environment. -** -** These functions are NOT wrapped with POSH_PUBLIC_API because I didn't want -** to enforce their export if your own library is only using them internally. -** ---------------------------------------------------------------------------- -*/ -#ifdef __cplusplus -extern "C" { -#endif - -const char *POSH_GetArchString( void ); - -#if !defined POSH_NO_FLOAT - -posh_u32_t POSH_LittleFloatBits( float f ); -posh_u32_t POSH_BigFloatBits( float f ); -float POSH_FloatFromLittleBits( posh_u32_t bits ); -float POSH_FloatFromBigBits( posh_u32_t bits ); - -void POSH_DoubleBits( double d, posh_byte_t dst[ 8 ] ); -double POSH_DoubleFromBits( const posh_byte_t src[ 8 ] ); - -/* unimplemented -float *POSH_WriteFloatToLittle( void *dst, float f ); -float *POSH_WriteFloatToBig( void *dst, float f ); -float POSH_ReadFloatFromLittle( const void *src ); -float POSH_ReadFloatFromBig( const void *src ); - -double *POSH_WriteDoubleToLittle( void *dst, double d ); -double *POSH_WriteDoubleToBig( void *dst, double d ); -double POSH_ReadDoubleFromLittle( const void *src ); -double POSH_ReadDoubleFromBig( const void *src ); -*/ -#endif /* !defined POSH_NO_FLOAT */ - -#if defined FORCE_DOXYGEN -# define POSH_NO_FLOAT -# undef POSH_NO_FLOAT -#endif - -extern posh_u16_t POSH_SwapU16( posh_u16_t u ); -extern posh_i16_t POSH_SwapI16( posh_i16_t u ); -extern posh_u32_t POSH_SwapU32( posh_u32_t u ); -extern posh_i32_t POSH_SwapI32( posh_i32_t u ); - -#if defined POSH_64BIT_INTEGER - -extern posh_u64_t POSH_SwapU64( posh_u64_t u ); -extern posh_i64_t POSH_SwapI64( posh_i64_t u ); - -#endif /*POSH_64BIT_INTEGER */ - -extern posh_u16_t *POSH_WriteU16ToLittle( void *dst, posh_u16_t value ); -extern posh_i16_t *POSH_WriteI16ToLittle( void *dst, posh_i16_t value ); -extern posh_u32_t *POSH_WriteU32ToLittle( void *dst, posh_u32_t value ); -extern posh_i32_t *POSH_WriteI32ToLittle( void *dst, posh_i32_t value ); - -extern posh_u16_t *POSH_WriteU16ToBig( void *dst, posh_u16_t value ); -extern posh_i16_t *POSH_WriteI16ToBig( void *dst, posh_i16_t value ); -extern posh_u32_t *POSH_WriteU32ToBig( void *dst, posh_u32_t value ); -extern posh_i32_t *POSH_WriteI32ToBig( void *dst, posh_i32_t value ); - -extern posh_u16_t POSH_ReadU16FromLittle( const void *src ); -extern posh_i16_t POSH_ReadI16FromLittle( const void *src ); -extern posh_u32_t POSH_ReadU32FromLittle( const void *src ); -extern posh_i32_t POSH_ReadI32FromLittle( const void *src ); - -extern posh_u16_t POSH_ReadU16FromBig( const void *src ); -extern posh_i16_t POSH_ReadI16FromBig( const void *src ); -extern posh_u32_t POSH_ReadU32FromBig( const void *src ); -extern posh_i32_t POSH_ReadI32FromBig( const void *src ); - -#if defined POSH_64BIT_INTEGER -extern 
posh_u64_t *POSH_WriteU64ToLittle( void *dst, posh_u64_t value ); -extern posh_i64_t *POSH_WriteI64ToLittle( void *dst, posh_i64_t value ); -extern posh_u64_t *POSH_WriteU64ToBig( void *dst, posh_u64_t value ); -extern posh_i64_t *POSH_WriteI64ToBig( void *dst, posh_i64_t value ); - -extern posh_u64_t POSH_ReadU64FromLittle( const void *src ); -extern posh_i64_t POSH_ReadI64FromLittle( const void *src ); -extern posh_u64_t POSH_ReadU64FromBig( const void *src ); -extern posh_i64_t POSH_ReadI64FromBig( const void *src ); -#endif /* POSH_64BIT_INTEGER */ - -#if defined POSH_LITTLE_ENDIAN - -# define POSH_LittleU16(x) (x) -# define POSH_LittleU32(x) (x) -# define POSH_LittleI16(x) (x) -# define POSH_LittleI32(x) (x) -# if defined POSH_64BIT_INTEGER -# define POSH_LittleU64(x) (x) -# define POSH_LittleI64(x) (x) -# endif /* defined POSH_64BIT_INTEGER */ - -# define POSH_BigU16(x) POSH_SwapU16(x) -# define POSH_BigU32(x) POSH_SwapU32(x) -# define POSH_BigI16(x) POSH_SwapI16(x) -# define POSH_BigI32(x) POSH_SwapI32(x) -# if defined POSH_64BIT_INTEGER -# define POSH_BigU64(x) POSH_SwapU64(x) -# define POSH_BigI64(x) POSH_SwapI64(x) -# endif /* defined POSH_64BIT_INTEGER */ - -#else - -# define POSH_BigU16(x) (x) -# define POSH_BigU32(x) (x) -# define POSH_BigI16(x) (x) -# define POSH_BigI32(x) (x) - -# if defined POSH_64BIT_INTEGER -# define POSH_BigU64(x) (x) -# define POSH_BigI64(x) (x) -# endif /* POSH_64BIT_INTEGER */ - -# define POSH_LittleU16(x) POSH_SwapU16(x) -# define POSH_LittleU32(x) POSH_SwapU32(x) -# define POSH_LittleI16(x) POSH_SwapI16(x) -# define POSH_LittleI32(x) POSH_SwapI32(x) - -# if defined POSH_64BIT_INTEGER -# define POSH_LittleU64(x) POSH_SwapU64(x) -# define POSH_LittleI64(x) POSH_SwapI64(x) -# endif /* POSH_64BIT_INTEGER */ - -#endif - -#ifdef __cplusplus -} -#endif - - Index: ps/trunk/libraries/source/nvtt/src/src/nvcore/poshlib/posh.c =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvcore/poshlib/posh.c +++ ps/trunk/libraries/source/nvtt/src/src/nvcore/poshlib/posh.c @@ -1,1006 +0,0 @@ -/* -LICENSE: - -Copyright (c) 2004, Brian Hook -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - * The names of this package'ss contributors contributors may not - be used to endorse or promote products derived from this - software without specific prior written permission. - - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ -/** - @file posh.c - @author Brian Hook - @date 2002 - @brief Portable Open Source Harness primary source file -*/ -#include "posh.h" - -#if !defined FORCE_DOXYGEN - -#if !defined POSH_NO_FLOAT -# define POSH_FLOAT_STRING "enabled" -#else -# define POSH_FLOAT_STRING "disabled" -#endif - -#if defined POSH_64BIT_INTEGER -# define POSH_64BIT_INTEGER_STRING "yes" -#else -# define POSH_64BIT_INTEGER_STRING "no" -#endif - -#if defined POSH_64BIT_POINTER -# define POSH_POINTER_STRING "64-bits" -#else -# define POSH_POINTER_STRING "32-bits" -#endif - -#if defined POSH_LITTLE_ENDIAN -# define IS_BIG_ENDIAN 0 - -# define NATIVE16 POSH_LittleU16 -# define NATIVE32 POSH_LittleU32 -# define NATIVE64 POSH_LittleU64 -# define FOREIGN16 POSH_BigU16 -# define FOREIGN32 POSH_BigU32 -# define FOREIGN64 POSH_BigU64 -#else -# define IS_BIG_ENDIAN 1 - -# define NATIVE16 POSH_BigU16 -# define NATIVE32 POSH_BigU32 -# define NATIVE64 POSH_BigU64 -# define FOREIGN16 POSH_LittleU16 -# define FOREIGN32 POSH_LittleU32 -# define FOREIGN64 POSH_LittleU64 -#endif /* POSH_LITTLE_ENDIAN */ - -static -int -s_testBigEndian( void ) -{ - union - { - posh_byte_t c[ 4 ]; - posh_u32_t i; - } u; - - u.i= 1; - - if ( u.c[ 0 ] == 1 ) - { - return 0; - } - return 1; -} - -static -const char * -s_testSerialization( void ) -{ - posh_byte_t serbuf[ 8 ]; - posh_u16_t tmp16; - posh_u32_t tmp32; - - /* 16-bit serialization */ - POSH_WriteU16ToLittle( serbuf, 0xABCD ); - if ( ( tmp16 = POSH_ReadU16FromLittle( serbuf ) ) != 0xABCD ) - { - return "*ERROR: failed little-endian 16-bit serialization test"; - } - - POSH_WriteU16ToBig( serbuf, 0xABCD ); - if ( ( tmp16 = POSH_ReadU16FromBig( serbuf ) ) != 0xABCD ) - { - return "*ERROR: failed big-endian 16-bit serialization test"; - } - - /* 32-bit serialization */ - POSH_WriteU32ToLittle( serbuf, 0xABCD1234L ); - if ( ( tmp32 = POSH_ReadU32FromLittle( serbuf ) ) != 0xABCD1234 ) - { - return "*ERROR: failed little-endian 32-bit serialization test"; - } - - POSH_WriteU32ToBig( serbuf, 0xABCD1234L ); - if ( ( tmp32 = POSH_ReadU32FromBig( serbuf ) ) != 0xABCD1234 ) - { - return "*ERROR: failed big-endian 32-bit serialization test"; - } - -#if defined POSH_64BIT_INTEGER - { -#define REF64 POSH_U64(0xFEDCBA9876543210) - - posh_u64_t tmp64; - - POSH_WriteU64ToLittle( serbuf, REF64 ); - - if ( ( tmp64 = POSH_ReadU64FromLittle( serbuf ) ) != REF64 ) - { - return "*ERROR: failed little-endian 64-bit serialization test"; - } - - POSH_WriteU64ToBig( serbuf, REF64 ); - - if ( ( tmp64 = POSH_ReadU64FromBig( serbuf ) ) != REF64 ) - { - return "*ERROR: failed big-endian 64-bit serialization test"; - } - } -#endif - - return 0; -} - -#if !defined POSH_NO_FLOAT -static -const char * -s_testFloatingPoint( void ) -{ - float fRef = 10.0f/30.0f; - double dRef = 10.0/30.0; - posh_byte_t dbuf[ 8 ]; - float fTmp; - double dTmp; - - fTmp = POSH_FloatFromLittleBits( POSH_LittleFloatBits( fRef ) ); - - if ( fTmp != fRef ) - { - return "*ERROR: POSH little endian floating point conversion failed. 
Please report this to poshlib@poshlib.org!\n"; - } - - fTmp = POSH_FloatFromBigBits( POSH_BigFloatBits( fRef ) ); - if ( fTmp != fRef ) - { - return "*ERROR: POSH big endian floating point conversion failed. Please report this to poshlib@poshlib.org!\n"; - } - - POSH_DoubleBits( dRef, dbuf ); - - dTmp = POSH_DoubleFromBits( dbuf ); - - if ( dTmp != dRef ) - { - return "*ERROR: POSH double precision floating point serialization failed. Please report this to poshlib@poshlib.org!\n"; - } - - return 0; -} -#endif /* !defined POSH_NO_FLOAT */ - -static -const char * -s_testEndianess( void ) -{ - /* check endianess */ - if ( s_testBigEndian() != IS_BIG_ENDIAN ) - { - return "*ERROR: POSH compile time endianess does not match run-time endianess verification. Please report this to poshlib@poshlib.org!\n"; - } - - /* make sure our endian swap routines work */ - if ( ( NATIVE32( 0x11223344L ) != 0x11223344L ) || - ( FOREIGN32( 0x11223344L ) != 0x44332211L ) || - ( NATIVE16( 0x1234 ) != 0x1234 ) || - ( FOREIGN16( 0x1234 ) != 0x3412 ) ) - { - return "*ERROR: POSH endianess macro selection failed. Please report this to poshlib@poshlib.org!\n"; - } - - /* test serialization routines */ - - return 0; -} -#endif /* !defined FORCE_DOXYGEN */ - -/** - Returns a string describing this platform's basic attributes. - - POSH_GetArchString() reports on an architecture's statically determined - attributes. In addition, it will perform run-time verification checks - to make sure the various platform specific functions work. If an error - occurs, please contact me at poshlib@poshlib.org so we can try to resolve - what the specific failure case is. - @returns a string describing this platform on success, or a string in the - form "*ERROR: [text]" on failure. You can simply check to see if - the first character returned is '*' to verify an error condition. -*/ -const char * -POSH_GetArchString( void ) -{ - const char *err; - const char *s = "OS:.............."POSH_OS_STRING"\n" - "CPU:............."POSH_CPU_STRING"\n" - "endian:.........."POSH_ENDIAN_STRING"\n" - "ptr size:........"POSH_POINTER_STRING"\n" - "64-bit ints......"POSH_64BIT_INTEGER_STRING"\n" - "floating point..."POSH_FLOAT_STRING"\n" - "compiler........."POSH_COMPILER_STRING"\n"; - - /* test endianess */ - err = s_testEndianess(); - - if ( err != 0 ) - { - return err; - } - - /* test serialization */ - err = s_testSerialization(); - - if ( err != 0 ) - { - return err; - } - -#if !defined POSH_NO_FLOAT - /* check that our floating point support is correct */ - err = s_testFloatingPoint(); - - if ( err != 0 ) - { - return err; - } - -#endif - - return s; -} - -/* ---------------------------------------------------------------------------*/ -/* BYTE SWAPPING SUPPORT */ -/* ---------------------------------------------------------------------------*/ -/** - * Byte swaps a 16-bit unsigned value - * - @ingroup ByteSwapFunctions - @param v [in] unsigned 16-bit input value to swap - @returns a byte swapped version of v - */ -posh_u16_t -POSH_SwapU16( posh_u16_t v ) -{ - posh_u16_t swapped; - - swapped = v << 8; - swapped |= v >> 8; - - return swapped; -} - -/** - * Byte swaps a 16-bit signed value - * - @ingroup ByteSwapFunctions - @param v [in] signed 16-bit input value to swap - @returns a byte swapped version of v - @remarks This just calls back to the unsigned version, since byte swapping - is independent of sign. However, we still provide this function to - avoid signed/unsigned mismatch compiler warnings. 
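The removed posh.c pairs its compile-time endianness macros with run-time self-checks: a union-style probe (s_testBigEndian) plus serialization and float round-trips that POSH_GetArchString runs before reporting the platform string, and the signed swap helpers simply cast through the unsigned ones because byte swapping does not depend on sign. A minimal standalone sketch of those two ideas, with illustrative names that are not part of the library:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Swap the two bytes of a 16-bit value; sign is irrelevant, so a signed
    // variant can just cast through the unsigned one, as posh.c does.
    static std::uint16_t swap_u16(std::uint16_t v)
    {
        return static_cast<std::uint16_t>((v << 8) | (v >> 8));
    }

    static std::int16_t swap_i16(std::int16_t v)
    {
        return static_cast<std::int16_t>(swap_u16(static_cast<std::uint16_t>(v)));
    }

    // Run-time endianness probe, same idea as s_testBigEndian(): store 1 and
    // look at the first byte in memory (memcpy keeps it well-defined in C++).
    static bool is_big_endian()
    {
        const std::uint32_t one = 1;
        unsigned char first = 0;
        std::memcpy(&first, &one, 1);
        return first == 0;
    }

    int main()
    {
        std::printf("big endian: %d\n", is_big_endian() ? 1 : 0);
        std::printf("swap(0x1234) = 0x%04X\n",
                    static_cast<unsigned>(swap_u16(0x1234))); // 0x3412
    }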
- */ -posh_i16_t -POSH_SwapI16( posh_i16_t v ) -{ - return ( posh_i16_t ) POSH_SwapU16( v ); -} - -/** - * Byte swaps a 32-bit unsigned value - * - @ingroup ByteSwapFunctions - @param v [in] unsigned 32-bit input value to swap - @returns a byte swapped version of v - */ -posh_u32_t -POSH_SwapU32( posh_u32_t v ) -{ - posh_u32_t swapped; - - swapped = ( v & 0xFF ) << 24; - swapped |= ( v & 0xFF00 ) << 8; - swapped |= ( v >> 8 ) & 0xFF00; - swapped |= ( v >> 24 ); - - return swapped; -} - -/** - * Byte swaps a 32-bit signed value - * - @ingroup ByteSwapFunctions - @param v [in] signed 32-bit input value to swap - @returns a byte swapped version of v - @remarks This just calls back to the unsigned version, since byte swapping - is independent of sign. However, we still provide this function to - avoid signed/unsigned mismatch compiler warnings. - */ -posh_i32_t -POSH_SwapI32( posh_i32_t v ) -{ - return ( posh_i32_t ) POSH_SwapU32( ( posh_u32_t ) v ); -} - -#if defined POSH_64BIT_INTEGER -/** - * Byte swaps a 64-bit unsigned value - - @param v [in] a 64-bit input value to swap - @ingroup SixtyFourBit - @returns a byte swapped version of v -*/ -posh_u64_t -POSH_SwapU64( posh_u64_t v ) -{ - posh_byte_t tmp; - union { - posh_byte_t bytes[ 8 ]; - posh_u64_t u64; - } u; - - u.u64 = v; - - tmp = u.bytes[ 0 ]; u.bytes[ 0 ] = u.bytes[ 7 ]; u.bytes[ 7 ] = tmp; - tmp = u.bytes[ 1 ]; u.bytes[ 1 ] = u.bytes[ 6 ]; u.bytes[ 6 ] = tmp; - tmp = u.bytes[ 2 ]; u.bytes[ 2 ] = u.bytes[ 5 ]; u.bytes[ 5 ] = tmp; - tmp = u.bytes[ 3 ]; u.bytes[ 3 ] = u.bytes[ 4 ]; u.bytes[ 4 ] = tmp; - - return u.u64; -} - -/** - * Byte swaps a 64-bit signed value - - @param v [in] a 64-bit input value to swap - @ingroup SixtyFourBit - @returns a byte swapped version of v -*/ -posh_i64_t -POSH_SwapI64( posh_i64_t v ) -{ - return ( posh_i64_t ) POSH_SwapU64( ( posh_u64_t ) v ); -} - -#endif /* defined POSH_64BIT_INTEGER */ - -/* ---------------------------------------------------------------------------*/ -/* IN-MEMORY SERIALIZATION */ -/* ---------------------------------------------------------------------------*/ - -/** - * Writes an unsigned 16-bit value to a little endian buffer - - @ingroup MemoryBuffer - @param dst [out] pointer to the destination buffer, may not be NULL. Alignment doesn't matter. - @param value [in] host-endian unsigned 16-bit value - @returns a pointer to the location two bytes after dst - @remarks does no validation of the inputs -*/ -posh_u16_t * -POSH_WriteU16ToLittle( void *dst, posh_u16_t value ) -{ - posh_u16_t *p16 = ( posh_u16_t * ) dst; - posh_byte_t *p = ( posh_byte_t * ) dst; - - p[ 0 ] = value & 0xFF; - p[ 1 ] = ( value & 0xFF00) >> 8; - - return p16 + 1; -} - -/** - * Writes a signed 16-bit value to a little endian buffer - - @ingroup MemoryBuffer - @param dst [out] pointer to the destination buffer, may not be NULL - @param value [in] host-endian signed 16-bit value - @returns a pointer to the location two bytes after dst - @remarks does no validation of the inputs. This simply calls - POSH_WriteU16ToLittle() with appropriate casting. 
-*/ -posh_i16_t * -POSH_WriteI16ToLittle( void *dst, posh_i16_t value ) -{ - return ( posh_i16_t * ) POSH_WriteU16ToLittle( dst, ( posh_u16_t ) value ); -} - -/** - * Writes an unsigned 32-bit value to a little endian buffer - - @ingroup MemoryBuffer - @param dst [out] pointer to the destination buffer, may not be NULL - @param value [in] host-endian signed 32-bit value - @returns a pointer to the location four bytes after dst - @remarks does no validation of the inputs. -*/ -posh_u32_t * -POSH_WriteU32ToLittle( void *dst, posh_u32_t value ) -{ - posh_u32_t *p32 = ( posh_u32_t * ) dst; - posh_byte_t *p = ( posh_byte_t * ) dst; - - p[ 0 ] = ( value & 0xFF ); - p[ 1 ] = ( value & 0xFF00 ) >> 8; - p[ 2 ] = ( value & 0xFF0000 ) >> 16; - p[ 3 ] = ( value & 0xFF000000 ) >> 24; - - return p32 + 1; -} - -/** - * Writes a signed 32-bit value to a little endian buffer - - @ingroup MemoryBuffer - @param dst [out] pointer to the destination buffer, may not be NULL - @param value [in] host-endian signed 32-bit value - @returns a pointer to the location four bytes after dst - @remarks does no validation of the inputs. This simply calls - POSH_WriteU32ToLittle() with appropriate casting. -*/ -posh_i32_t * -POSH_WriteI32ToLittle( void *dst, posh_i32_t value ) -{ - return ( posh_i32_t * ) POSH_WriteU32ToLittle( dst, ( posh_u32_t ) value ); -} - -/** - * Writes an unsigned 16-bit value to a big endian buffer - - @ingroup MemoryBuffer - @param dst [out] pointer to the destination buffer, may not be NULL - @param value [in] host-endian unsigned 16-bit value - @returns a pointer to the location two bytes after dst - @remarks does no validation of the inputs -*/ -posh_u16_t * -POSH_WriteU16ToBig( void *dst, posh_u16_t value ) -{ - posh_u16_t *p16 = ( posh_u16_t * ) dst; - posh_byte_t *p = ( posh_byte_t * ) dst; - - p[ 1 ] = ( value & 0xFF ); - p[ 0 ] = ( value & 0xFF00 ) >> 8; - - return p16 + 1; -} - -/** - * Writes a signed 16-bit value to a big endian buffer - - @ingroup MemoryBuffer - @param dst [out] pointer to the destination buffer, may not be NULL - @param value [in] host-endian signed 16-bit value - @returns a pointer to the location two bytes after dst - @remarks does no validation of the inputs. This simply calls - POSH_WriteU16ToLittle() with appropriate casting. -*/ -posh_i16_t * -POSH_WriteI16ToBig( void *dst, posh_i16_t value ) -{ - return ( posh_i16_t * ) POSH_WriteU16ToBig( dst, ( posh_u16_t ) value ); -} - -/** - * Writes an unsigned 32-bit value to a big endian buffer - - @ingroup MemoryBuffer - @param dst [out] pointer to the destination buffer, may not be NULL - @param value [in] host-endian unsigned 32-bit value - @returns a pointer to the location four bytes after dst - @remarks does no validation of the inputs. -*/ -posh_u32_t * -POSH_WriteU32ToBig( void *dst, posh_u32_t value ) -{ - posh_u32_t *p32 = ( posh_u32_t * ) dst; - posh_byte_t *p = ( posh_byte_t * ) dst; - - p[ 3 ] = ( value & 0xFF ); - p[ 2 ] = ( value & 0xFF00 ) >> 8; - p[ 1 ] = ( value & 0xFF0000 ) >> 16; - p[ 0 ] = ( value & 0xFF000000 ) >> 24; - - return p32 + 1; -} - -/** - * Writes a signed 32-bit value to a big endian buffer - - @ingroup MemoryBuffer - @param dst [out] pointer to the destination buffer, may not be NULL - @param value [in] host-endian signed 32-bit value - @returns a pointer to the location four bytes after dst - @remarks does no validation of the inputs. This simply calls - POSH_WriteU32ToBig() with appropriate casting. 
-*/ -posh_i32_t * -POSH_WriteI32ToBig( void *dst, posh_i32_t value ) -{ - return ( posh_i32_t * ) POSH_WriteU32ToBig( dst, ( posh_u32_t ) value ); -} - -#if defined POSH_64BIT_INTEGER -/** - * Writes an unsigned 64-bit value to a little-endian buffer - - @ingroup SixtyFourBit - @param dst [out] pointer to the destination buffer, may not be NULL - @param value [in] host-endian unsigned 64-bit value - @returns a pointer to the location eight bytes after dst - @remarks does no validation of the inputs. -*/ -posh_u64_t * -POSH_WriteU64ToLittle( void *dst, posh_u64_t value ) -{ - posh_u64_t *p64 = ( posh_u64_t * ) dst; - posh_byte_t *p = ( posh_byte_t * ) dst; - int i; - - for ( i = 0; i < 8; i++, value >>= 8 ) - { - p[ i ] = ( posh_byte_t ) ( value & 0xFF ); - } - - return p64 + 1; -} - -/** - * Writes a signed 64-bit value to a little-endian buffer - - @ingroup SixtyFourBit - @param dst [out] pointer to the destination buffer, may not be NULL - @param value [in] host-endian unsigned 64-bit value - @returns a pointer to the location eight bytes after dst - @remarks does no validation of the inputs. -*/ -posh_i64_t * -POSH_WriteI64ToLittle( void *dst, posh_i64_t value ) -{ - return ( posh_i64_t * ) POSH_WriteU64ToLittle( dst, ( posh_u64_t ) value ); -} - -/** - * Writes an unsigned 64-bit value to a big-endian buffer - - @ingroup SixtyFourBit - @param dst [out] pointer to the destination buffer, may not be NULL - @param value [in] host-endian unsigned 64-bit value - @returns a pointer to the location eight bytes after dst - @remarks does no validation of the inputs. -*/ -posh_u64_t * -POSH_WriteU64ToBig( void *dst, posh_u64_t value ) -{ - posh_u64_t *p64 = ( posh_u64_t * ) dst; - posh_byte_t *p = ( posh_byte_t * ) dst; - int i; - - for ( i = 0; i < 8; i++, value >>= 8 ) - { - p[ 7-i ] = ( posh_byte_t ) ( value & 0xFF ); - } - - return p64 + 8; -} - -/** - * Writes a signed 64-bit value to a big-endian buffer - - @ingroup SixtyFourBit - @param dst [out] pointer to the destination buffer, may not be NULL - @param value [in] host-endian signed 64-bit value - @returns a pointer to the location eight bytes after dst - @remarks does no validation of the inputs. 
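All of these writers follow one pattern: store the value a byte at a time at fixed offsets, so the buffer ends up in the requested byte order regardless of the host, and return a pointer just past the written bytes so calls can be chained. A small self-contained sketch of that pattern for the 32-bit little-endian case; the names are illustrative, not the library's API:

    #include <cstdint>
    #include <cstdio>

    // Write v into dst in little-endian order, one byte at a time, so the
    // result is identical on big- and little-endian hosts.
    static unsigned char* write_u32_le(unsigned char* dst, std::uint32_t v)
    {
        dst[0] = static_cast<unsigned char>(v & 0xFF);
        dst[1] = static_cast<unsigned char>((v >> 8) & 0xFF);
        dst[2] = static_cast<unsigned char>((v >> 16) & 0xFF);
        dst[3] = static_cast<unsigned char>((v >> 24) & 0xFF);
        return dst + 4; // points just past the written bytes
    }

    static std::uint32_t read_u32_le(const unsigned char* src)
    {
        return static_cast<std::uint32_t>(src[0])
             | (static_cast<std::uint32_t>(src[1]) << 8)
             | (static_cast<std::uint32_t>(src[2]) << 16)
             | (static_cast<std::uint32_t>(src[3]) << 24);
    }

    int main()
    {
        unsigned char buf[4];
        write_u32_le(buf, 0xABCD1234u);
        std::printf("%08lX\n",
                    static_cast<unsigned long>(read_u32_le(buf))); // ABCD1234
    }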
-*/ -posh_i64_t * -POSH_WriteI64ToBig( void *dst, posh_i64_t value ) -{ - return ( posh_i64_t * ) POSH_WriteU64ToBig( dst, ( posh_u64_t ) value ); -} - -#endif /* POSH_64BIT_INTEGER */ - -/* ---------------------------------------------------------------------------*/ -/* IN-MEMORY DESERIALIZATION */ -/* ---------------------------------------------------------------------------*/ - -/** - * Reads an unsigned 16-bit value from a little-endian buffer - @ingroup MemoryBuffer - @param src [in] source buffer - @returns host-endian unsigned 16-bit value -*/ -posh_u16_t -POSH_ReadU16FromLittle( const void *src ) -{ - posh_u16_t v = 0; - posh_byte_t *p = ( posh_byte_t * ) src; - - v |= p[ 0 ]; - v |= ( ( posh_u16_t ) p[ 1 ] ) << 8; - - return v; -} - -/** - * Reads a signed 16-bit value from a little-endian buffer - @ingroup MemoryBuffer - @param src [in] source buffer - @returns host-endian signed 16-bit value -*/ -posh_i16_t -POSH_ReadI16FromLittle( const void *src ) -{ - return ( posh_i16_t ) POSH_ReadU16FromLittle( src ); -} - -/** - * Reads an unsigned 32-bit value from a little-endian buffer - @ingroup MemoryBuffer - @param src [in] source buffer - @returns host-endian unsigned 32-bit value -*/ -posh_u32_t -POSH_ReadU32FromLittle( const void *src ) -{ - posh_u32_t v = 0; - posh_byte_t *p = ( posh_byte_t * ) src; - - v |= p[ 0 ]; - v |= ( ( posh_u32_t ) p[ 1 ] ) << 8; - v |= ( ( posh_u32_t ) p[ 2 ] ) << 16; - v |= ( ( posh_u32_t ) p[ 3 ] ) << 24; - - return v; -} - -/** - * Reads a signed 32-bit value from a little-endian buffer - @ingroup MemoryBuffer - @param src [in] source buffer - @returns host-endian signed 32-bit value -*/ -posh_i32_t -POSH_ReadI32FromLittle( const void *src ) -{ - return ( posh_i32_t ) POSH_ReadU32FromLittle( src ); -} - - -/** - * Reads an unsigned 16-bit value from a big-endian buffer - @ingroup MemoryBuffer - @param src [in] source buffer - @returns host-endian unsigned 16-bit value -*/ -posh_u16_t -POSH_ReadU16FromBig( const void *src ) -{ - posh_u16_t v = 0; - posh_byte_t *p = ( posh_byte_t * ) src; - - v |= p[ 1 ]; - v |= ( ( posh_u16_t ) p[ 0 ] ) << 8; - - return v; -} - -/** - * Reads a signed 16-bit value from a big-endian buffer - @ingroup MemoryBuffer - @param src [in] source buffer - @returns host-endian signed 16-bit value -*/ -posh_i16_t -POSH_ReadI16FromBig( const void *src ) -{ - return ( posh_i16_t ) POSH_ReadU16FromBig( src ); -} - -/** - * Reads an unsigned 32-bit value from a big-endian buffer - @ingroup MemoryBuffer - @param src [in] source buffer - @returns host-endian unsigned 32-bit value -*/ -posh_u32_t -POSH_ReadU32FromBig( const void *src ) -{ - posh_u32_t v = 0; - posh_byte_t *p = ( posh_byte_t * ) src; - - v |= p[ 3 ]; - v |= ( ( posh_u32_t ) p[ 2 ] ) << 8; - v |= ( ( posh_u32_t ) p[ 1 ] ) << 16; - v |= ( ( posh_u32_t ) p[ 0 ] ) << 24; - - return v; -} - -/** - * Reads a signed 32-bit value from a big-endian buffer - @ingroup MemoryBuffer - @param src [in] source buffer - @returns host-endian signed 32-bit value -*/ -posh_i32_t -POSH_ReadI32FromBig( const void *src ) -{ - return POSH_BigI32( (*(const posh_i32_t*)src ) ); -} - -#if defined POSH_64BIT_INTEGER - -/** - * Reads an unsigned 64-bit value from a little-endian buffer - @param src [in] source buffer - @returns host-endian unsigned 32-bit value -*/ -posh_u64_t -POSH_ReadU64FromLittle( const void *src ) -{ - posh_u64_t v = 0; - posh_byte_t *p = ( posh_byte_t * ) src; - int i; - - for ( i = 0; i < 8; i++ ) - { - v |= ( ( posh_u64_t ) p[ i ] ) << (i*8); - } - - return v; -} - -/** - * 
Reads a signed 64-bit value from a little-endian buffer - @param src [in] source buffer - @returns host-endian signed 32-bit value -*/ -posh_i64_t -POSH_ReadI64FromLittle( const void *src ) -{ - return ( posh_i64_t ) POSH_ReadU64FromLittle( src ); -} - -/** - * Reads an unsigned 64-bit value from a big-endian buffer - @param src [in] source buffer - @returns host-endian unsigned 32-bit value -*/ -posh_u64_t -POSH_ReadU64FromBig( const void *src ) -{ - posh_u64_t v = 0; - posh_byte_t *p = ( posh_byte_t * ) src; - int i; - - for ( i = 0; i < 8; i++ ) - { - v |= ( ( posh_u64_t ) p[ 7-i ] ) << (i*8); - } - - return v; -} - -/** - * Reads an signed 64-bit value from a big-endian buffer - @param src [in] source buffer - @returns host-endian signed 32-bit value -*/ -posh_i64_t -POSH_ReadI64FromBig( const void *src ) -{ - return ( posh_i64_t ) POSH_ReadU64FromBig( src ); -} - -#endif /* POSH_64BIT_INTEGER */ - -/* ---------------------------------------------------------------------------*/ -/* FLOATING POINT SUPPORT */ -/* ---------------------------------------------------------------------------*/ - -#if !defined POSH_NO_FLOAT - -/** @ingroup FloatingPoint - @param[in] f floating point value - @returns a little-endian bit representation of f - */ -posh_u32_t -POSH_LittleFloatBits( float f ) -{ - union - { - float f32; - posh_u32_t u32; - } u; - - u.f32 = f; - - return POSH_LittleU32( u.u32 ); -} - -/** - * Extracts raw big-endian bits from a 32-bit floating point value - * - @ingroup FloatingPoint - @param f [in] floating point value - @returns a big-endian bit representation of f - */ -posh_u32_t -POSH_BigFloatBits( float f ) -{ - union - { - float f32; - posh_u32_t u32; - } u; - - u.f32 = f; - - return POSH_BigU32( u.u32 ); -} - -/** - * Extracts raw, little-endian bit representation from a 64-bit double. - * - @param d [in] 64-bit double precision value - @param dst [out] 8-byte storage buffer - @ingroup FloatingPoint - @returns the raw bits used to represent the value 'd', in the form dst[0]=LSB - */ -void -POSH_DoubleBits( double d, posh_byte_t dst[ 8 ] ) -{ - union - { - double d64; - posh_byte_t bytes[ 8 ]; - } u; - - u.d64 = d; - -#if defined POSH_LITTLE_ENDIAN - dst[ 0 ] = u.bytes[ 0 ]; - dst[ 1 ] = u.bytes[ 1 ]; - dst[ 2 ] = u.bytes[ 2 ]; - dst[ 3 ] = u.bytes[ 3 ]; - dst[ 4 ] = u.bytes[ 4 ]; - dst[ 5 ] = u.bytes[ 5 ]; - dst[ 6 ] = u.bytes[ 6 ]; - dst[ 7 ] = u.bytes[ 7 ]; -#else - dst[ 0 ] = u.bytes[ 7 ]; - dst[ 1 ] = u.bytes[ 6 ]; - dst[ 2 ] = u.bytes[ 5 ]; - dst[ 3 ] = u.bytes[ 4 ]; - dst[ 4 ] = u.bytes[ 3 ]; - dst[ 5 ] = u.bytes[ 2 ]; - dst[ 6 ] = u.bytes[ 1 ]; - dst[ 7 ] = u.bytes[ 0 ]; -#endif -} - -/** - * Creates a double-precision, 64-bit floating point value from a set of raw, - * little-endian bits - - @ingroup FloatingPoint - @param src [in] little-endian byte representation of 64-bit double precision - floating point value - @returns double precision floating point representation of the raw bits - @remarks No error checking is performed, so there are no guarantees that the - result is a valid number, nor is there any check to ensure that src is - non-NULL. BE CAREFUL USING THIS. 
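The float helpers reinterpret the value's bit pattern (the removed code does this through a union) and then reuse the integer byte swap when the requested order differs from the host order; the self-test earlier only checks that the round trip is lossless. A minimal sketch of that round trip, spelled with memcpy, which is the strictly portable C++ form of the same reinterpretation (names are illustrative):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Raw bit pattern of a 32-bit float.
    static std::uint32_t float_bits(float f)
    {
        std::uint32_t u;
        std::memcpy(&u, &f, sizeof u);
        return u; // swap here if a foreign byte order is wanted
    }

    static float float_from_bits(std::uint32_t u)
    {
        float f;
        std::memcpy(&f, &u, sizeof f);
        return f;
    }

    int main()
    {
        const float ref = 10.0f / 30.0f;
        // Lossless round trip, which is what s_testFloatingPoint() verified.
        std::printf("round-trip ok: %d\n", float_from_bits(float_bits(ref)) == ref);
    }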
- */ -double -POSH_DoubleFromBits( const posh_byte_t src[ 8 ] ) -{ - union - { - double d64; - posh_byte_t bytes[ 8 ]; - } u; - -#if defined POSH_LITTLE_ENDIAN - u.bytes[ 0 ] = src[ 0 ]; - u.bytes[ 1 ] = src[ 1 ]; - u.bytes[ 2 ] = src[ 2 ]; - u.bytes[ 3 ] = src[ 3 ]; - u.bytes[ 4 ] = src[ 4 ]; - u.bytes[ 5 ] = src[ 5 ]; - u.bytes[ 6 ] = src[ 6 ]; - u.bytes[ 7 ] = src[ 7 ]; -#else - u.bytes[ 0 ] = src[ 7 ]; - u.bytes[ 1 ] = src[ 6 ]; - u.bytes[ 2 ] = src[ 5 ]; - u.bytes[ 3 ] = src[ 4 ]; - u.bytes[ 4 ] = src[ 3 ]; - u.bytes[ 5 ] = src[ 2 ]; - u.bytes[ 6 ] = src[ 1 ]; - u.bytes[ 7 ] = src[ 0 ]; -#endif - - return u.d64; -} - -/** - * Creates a floating point number from little endian bits - * - @ingroup FloatingPoint - @param bits [in] raw floating point bits in little-endian form - @returns a floating point number based on the given bit representation - @remarks No error checking is performed, so there are no guarantees that the - result is a valid number. BE CAREFUL USING THIS. - */ -float -POSH_FloatFromLittleBits( posh_u32_t bits ) -{ - union - { - float f32; - posh_u32_t u32; - } u; - - u.u32 = bits; -#if defined POSH_BIG_ENDIAN - u.u32 = POSH_SwapU32( u.u32 ); -#endif - - return u.f32; -} - -/** - * Creates a floating point number from big-endian bits - * - @ingroup FloatingPoint - @param bits [in] raw floating point bits in big-endian form - @returns a floating point number based on the given bit representation - @remarks No error checking is performed, so there are no guarantees that the - result is a valid number. BE CAREFUL USING THIS. - */ -float -POSH_FloatFromBigBits( posh_u32_t bits ) -{ - union - { - float f32; - posh_u32_t u32; - } u; - - u.u32 = bits; -#if defined POSH_LITTLE_ENDIAN - u.u32 = POSH_SwapU32( u.u32 ); -#endif - - return u.f32; -} - -#endif /* !defined POSH_NO_FLOAT */ Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/BlockDXT.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/BlockDXT.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/BlockDXT.h @@ -21,202 +21,228 @@ // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // OTHER DEALINGS IN THE SOFTWARE. +#pragma once #ifndef NV_IMAGE_BLOCKDXT_H #define NV_IMAGE_BLOCKDXT_H -#include +#include "nvimage.h" -#include +#include "nvmath/Color.h" namespace nv { - struct ColorBlock; - class Stream; - - - /// DXT1 block. - struct BlockDXT1 - { - Color16 col0; - Color16 col1; - union { - uint8 row[4]; - uint indices; - }; - - bool isFourColorMode() const; - - uint evaluatePalette(Color32 color_array[4]) const; - uint evaluatePaletteFast(Color32 color_array[4]) const; - void evaluatePalette3(Color32 color_array[4]) const; - void evaluatePalette4(Color32 color_array[4]) const; - - void decodeBlock(ColorBlock * block) const; - - void setIndices(int * idx); - - void flip4(); - void flip2(); - }; - - /// Return true if the block uses four color mode, false otherwise. - inline bool BlockDXT1::isFourColorMode() const - { - return col0.u > col1.u; - } - - - /// DXT3 alpha block with explicit alpha. 
- struct AlphaBlockDXT3 - { - union { - struct { - uint alpha0 : 4; - uint alpha1 : 4; - uint alpha2 : 4; - uint alpha3 : 4; - uint alpha4 : 4; - uint alpha5 : 4; - uint alpha6 : 4; - uint alpha7 : 4; - uint alpha8 : 4; - uint alpha9 : 4; - uint alphaA : 4; - uint alphaB : 4; - uint alphaC : 4; - uint alphaD : 4; - uint alphaE : 4; - uint alphaF : 4; - }; - uint16 row[4]; - }; - - void decodeBlock(ColorBlock * block) const; - - void flip4(); - void flip2(); - }; - - - /// DXT3 block. - struct BlockDXT3 - { - AlphaBlockDXT3 alpha; - BlockDXT1 color; - - void decodeBlock(ColorBlock * block) const; - - void flip4(); - void flip2(); - }; - - - /// DXT5 alpha block. - struct AlphaBlockDXT5 - { - union { - struct { - uint64 alpha0 : 8; // 8 - uint64 alpha1 : 8; // 16 - uint64 bits0 : 3; // 3 - 19 - uint64 bits1 : 3; // 6 - 22 - uint64 bits2 : 3; // 9 - 25 - uint64 bits3 : 3; // 12 - 28 - uint64 bits4 : 3; // 15 - 31 - uint64 bits5 : 3; // 18 - 34 - uint64 bits6 : 3; // 21 - 37 - uint64 bits7 : 3; // 24 - 40 - uint64 bits8 : 3; // 27 - 43 - uint64 bits9 : 3; // 30 - 46 - uint64 bitsA : 3; // 33 - 49 - uint64 bitsB : 3; // 36 - 52 - uint64 bitsC : 3; // 39 - 55 - uint64 bitsD : 3; // 42 - 58 - uint64 bitsE : 3; // 45 - 61 - uint64 bitsF : 3; // 48 - 64 - }; - uint64 u; - }; - - void evaluatePalette(uint8 alpha[8]) const; - void evaluatePalette8(uint8 alpha[8]) const; - void evaluatePalette6(uint8 alpha[8]) const; - void indices(uint8 index_array[16]) const; - - uint index(uint index) const; - void setIndex(uint index, uint value); - - void decodeBlock(ColorBlock * block) const; - - void flip4(); - void flip2(); - }; - - - /// DXT5 block. - struct BlockDXT5 - { - AlphaBlockDXT5 alpha; - BlockDXT1 color; - - void decodeBlock(ColorBlock * block) const; - - void flip4(); - void flip2(); - }; - - /// ATI1 block. - struct BlockATI1 - { - AlphaBlockDXT5 alpha; - - void decodeBlock(ColorBlock * block) const; - - void flip4(); - void flip2(); - }; - - /// ATI2 block. - struct BlockATI2 - { - AlphaBlockDXT5 x; - AlphaBlockDXT5 y; - + struct ColorBlock; + struct ColorSet; + struct AlphaBlock4x4; + class Stream; + class Vector3; + + + /// DXT1 block. + struct NVIMAGE_CLASS BlockDXT1 + { + Color16 col0; + Color16 col1; + union { + uint8 row[4]; + uint indices; + }; + + bool isFourColorMode() const; + + uint evaluatePalette(Color32 color_array[4], bool d3d9) const; + uint evaluatePaletteNV5x(Color32 color_array[4]) const; + + void evaluatePalette3(Color32 color_array[4], bool d3d9) const; + void evaluatePalette4(Color32 color_array[4], bool d3d9) const; + + void decodeBlock(ColorBlock * block, bool d3d9 = false) const; + void decodeBlockNV5x(ColorBlock * block) const; + + void setIndices(int * idx); + + void flip4(); + void flip2(); + }; + + /// Return true if the block uses four color mode, false otherwise. + inline bool BlockDXT1::isFourColorMode() const + { + return col0.u > col1.u; + } + + + /// DXT3 alpha block with explicit alpha. + struct AlphaBlockDXT3 + { + union { + struct { + uint alpha0 : 4; + uint alpha1 : 4; + uint alpha2 : 4; + uint alpha3 : 4; + uint alpha4 : 4; + uint alpha5 : 4; + uint alpha6 : 4; + uint alpha7 : 4; + uint alpha8 : 4; + uint alpha9 : 4; + uint alphaA : 4; + uint alphaB : 4; + uint alphaC : 4; + uint alphaD : 4; + uint alphaE : 4; + uint alphaF : 4; + }; + uint16 row[4]; + }; + + void decodeBlock(ColorBlock * block, bool d3d9 = false) const; + + void flip4(); + void flip2(); + }; + + + /// DXT3 block. 
+ struct NVIMAGE_CLASS BlockDXT3 + { + AlphaBlockDXT3 alpha; + BlockDXT1 color; + + void decodeBlock(ColorBlock * block, bool d3d9 = false) const; + void decodeBlockNV5x(ColorBlock * block) const; + + void flip4(); + void flip2(); + }; + + + /// DXT5 alpha block. + struct NVIMAGE_CLASS AlphaBlockDXT5 + { + union { + struct { + uint64 alpha0 : 8; // 8 + uint64 alpha1 : 8; // 16 + uint64 bits0 : 3; // 3 - 19 + uint64 bits1 : 3; // 6 - 22 + uint64 bits2 : 3; // 9 - 25 + uint64 bits3 : 3; // 12 - 28 + uint64 bits4 : 3; // 15 - 31 + uint64 bits5 : 3; // 18 - 34 + uint64 bits6 : 3; // 21 - 37 + uint64 bits7 : 3; // 24 - 40 + uint64 bits8 : 3; // 27 - 43 + uint64 bits9 : 3; // 30 - 46 + uint64 bitsA : 3; // 33 - 49 + uint64 bitsB : 3; // 36 - 52 + uint64 bitsC : 3; // 39 - 55 + uint64 bitsD : 3; // 42 - 58 + uint64 bitsE : 3; // 45 - 61 + uint64 bitsF : 3; // 48 - 64 + }; + uint64 u; + }; + + void evaluatePalette(uint8 alpha[8], bool d3d9) const; + void evaluatePalette8(uint8 alpha[8], bool d3d9) const; + void evaluatePalette6(uint8 alpha[8], bool d3d9) const; + void indices(uint8 index_array[16]) const; + + uint index(uint index) const; + void setIndex(uint index, uint value); + + void decodeBlock(ColorBlock * block, bool d3d9 = false) const; + void decodeBlock(AlphaBlock4x4 * block, bool d3d9 = false) const; + + void flip4(); + void flip2(); + }; + + + /// DXT5 block. + struct NVIMAGE_CLASS BlockDXT5 + { + AlphaBlockDXT5 alpha; + BlockDXT1 color; + + void decodeBlock(ColorBlock * block, bool d3d9 = false) const; + void decodeBlockNV5x(ColorBlock * block) const; + + void flip4(); + void flip2(); + }; + + /// ATI1 block. + struct NVIMAGE_CLASS BlockATI1 + { + AlphaBlockDXT5 alpha; + + void decodeBlock(ColorBlock * block, bool d3d9 = false) const; + + void flip4(); + void flip2(); + }; + + /// ATI2 block. + struct NVIMAGE_CLASS BlockATI2 + { + AlphaBlockDXT5 x; + AlphaBlockDXT5 y; + + void decodeBlock(ColorBlock * block, bool d3d9 = false) const; + + void flip4(); + void flip2(); + }; + + /// CTX1 block. + struct BlockCTX1 + { + uint8 col0[2]; + uint8 col1[2]; + union { + uint8 row[4]; + uint indices; + }; + + void evaluatePalette(Color32 color_array[4]) const; + void setIndices(int * idx); + + void decodeBlock(ColorBlock * block) const; + + void flip4(); + void flip2(); + }; + + /// BC6 block. + struct NVIMAGE_CLASS BlockBC6 + { + uint8 data[16]; // Not even going to try to write a union for this thing. + void decodeBlock(Vector3 colors[16]) const; + }; + + /// BC7 block. + struct NVIMAGE_CLASS BlockBC7 + { + uint8 data[16]; // Not even going to try to write a union for this thing. void decodeBlock(ColorBlock * block) const; - - void flip4(); - void flip2(); }; - /// CTX1 block. - struct BlockCTX1 - { - uint8 col0[2]; - uint8 col1[2]; - union { - uint8 row[4]; - uint indices; - }; - - void evaluatePalette(Color32 color_array[4]) const; - void setIndices(int * idx); - - void decodeBlock(ColorBlock * block) const; - - void flip4(); - void flip2(); - }; - // Serialization functions. 
- NVIMAGE_API Stream & operator<<(Stream & stream, BlockDXT1 & block); - NVIMAGE_API Stream & operator<<(Stream & stream, AlphaBlockDXT3 & block); - NVIMAGE_API Stream & operator<<(Stream & stream, BlockDXT3 & block); - NVIMAGE_API Stream & operator<<(Stream & stream, AlphaBlockDXT5 & block); - NVIMAGE_API Stream & operator<<(Stream & stream, BlockDXT5 & block); - NVIMAGE_API Stream & operator<<(Stream & stream, BlockATI1 & block); - NVIMAGE_API Stream & operator<<(Stream & stream, BlockATI2 & block); - NVIMAGE_API Stream & operator<<(Stream & stream, BlockCTX1 & block); + // Serialization functions. + NVIMAGE_API Stream & operator<<(Stream & stream, BlockDXT1 & block); + NVIMAGE_API Stream & operator<<(Stream & stream, AlphaBlockDXT3 & block); + NVIMAGE_API Stream & operator<<(Stream & stream, BlockDXT3 & block); + NVIMAGE_API Stream & operator<<(Stream & stream, AlphaBlockDXT5 & block); + NVIMAGE_API Stream & operator<<(Stream & stream, BlockDXT5 & block); + NVIMAGE_API Stream & operator<<(Stream & stream, BlockATI1 & block); + NVIMAGE_API Stream & operator<<(Stream & stream, BlockATI2 & block); + NVIMAGE_API Stream & operator<<(Stream & stream, BlockCTX1 & block); + NVIMAGE_API Stream & operator<<(Stream & stream, BlockBC6 & block); + NVIMAGE_API Stream & operator<<(Stream & stream, BlockBC7 & block); } // nv namespace Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/BlockDXT.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/BlockDXT.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/BlockDXT.cpp @@ -21,584 +21,654 @@ // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // OTHER DEALINGS IN THE SOFTWARE. -#include - -#include "ColorBlock.h" #include "BlockDXT.h" +#include "ColorBlock.h" + +#include "nvcore/Stream.h" +#include "nvcore/Utils.h" // swap +#include "nvmath/Half.h" +#include "nvmath/Vector.inl" + +#include "bc6h/zoh.h" +#include "bc7/avpcl.h" + using namespace nv; /*---------------------------------------------------------------------------- - BlockDXT1 +BlockDXT1 ----------------------------------------------------------------------------*/ -uint BlockDXT1::evaluatePalette(Color32 color_array[4]) const +uint BlockDXT1::evaluatePalette(Color32 color_array[4], bool d3d9/*= false*/) const { - // Does bit expansion before interpolation. - color_array[0].b = (col0.b << 3) | (col0.b >> 2); - color_array[0].g = (col0.g << 2) | (col0.g >> 4); - color_array[0].r = (col0.r << 3) | (col0.r >> 2); - color_array[0].a = 0xFF; - - // @@ Same as above, but faster? -// Color32 c; -// c.u = ((col0.u << 3) & 0xf8) | ((col0.u << 5) & 0xfc00) | ((col0.u << 8) & 0xf80000); -// c.u |= (c.u >> 5) & 0x070007; -// c.u |= (c.u >> 6) & 0x000300; -// color_array[0].u = c.u; - - color_array[1].r = (col1.r << 3) | (col1.r >> 2); - color_array[1].g = (col1.g << 2) | (col1.g >> 4); - color_array[1].b = (col1.b << 3) | (col1.b >> 2); - color_array[1].a = 0xFF; - - // @@ Same as above, but faster? -// c.u = ((col1.u << 3) & 0xf8) | ((col1.u << 5) & 0xfc00) | ((col1.u << 8) & 0xf80000); -// c.u |= (c.u >> 5) & 0x070007; -// c.u |= (c.u >> 6) & 0x000300; -// color_array[1].u = c.u; - - if( col0.u > col1.u ) { - // Four-color block: derive the other two colors. 
- color_array[2].r = (2 * color_array[0].r + color_array[1].r) / 3; - color_array[2].g = (2 * color_array[0].g + color_array[1].g) / 3; - color_array[2].b = (2 * color_array[0].b + color_array[1].b) / 3; - color_array[2].a = 0xFF; - - color_array[3].r = (2 * color_array[1].r + color_array[0].r) / 3; - color_array[3].g = (2 * color_array[1].g + color_array[0].g) / 3; - color_array[3].b = (2 * color_array[1].b + color_array[0].b) / 3; - color_array[3].a = 0xFF; - - return 4; - } - else { - // Three-color block: derive the other color. - color_array[2].r = (color_array[0].r + color_array[1].r) / 2; - color_array[2].g = (color_array[0].g + color_array[1].g) / 2; - color_array[2].b = (color_array[0].b + color_array[1].b) / 2; - color_array[2].a = 0xFF; - - // Set all components to 0 to match DXT specs. - color_array[3].r = 0x00; // color_array[2].r; - color_array[3].g = 0x00; // color_array[2].g; - color_array[3].b = 0x00; // color_array[2].b; - color_array[3].a = 0x00; - - return 3; - } + // Does bit expansion before interpolation. + color_array[0].b = (col0.b << 3) | (col0.b >> 2); + color_array[0].g = (col0.g << 2) | (col0.g >> 4); + color_array[0].r = (col0.r << 3) | (col0.r >> 2); + color_array[0].a = 0xFF; + + // @@ Same as above, but faster? + // Color32 c; + // c.u = ((col0.u << 3) & 0xf8) | ((col0.u << 5) & 0xfc00) | ((col0.u << 8) & 0xf80000); + // c.u |= (c.u >> 5) & 0x070007; + // c.u |= (c.u >> 6) & 0x000300; + // color_array[0].u = c.u; + + color_array[1].r = (col1.r << 3) | (col1.r >> 2); + color_array[1].g = (col1.g << 2) | (col1.g >> 4); + color_array[1].b = (col1.b << 3) | (col1.b >> 2); + color_array[1].a = 0xFF; + + // @@ Same as above, but faster? + // c.u = ((col1.u << 3) & 0xf8) | ((col1.u << 5) & 0xfc00) | ((col1.u << 8) & 0xf80000); + // c.u |= (c.u >> 5) & 0x070007; + // c.u |= (c.u >> 6) & 0x000300; + // color_array[1].u = c.u; + + if( col0.u > col1.u ) { + int bias = 0; + if (d3d9) bias = 1; + + // Four-color block: derive the other two colors. + color_array[2].r = (2 * color_array[0].r + color_array[1].r + bias) / 3; + color_array[2].g = (2 * color_array[0].g + color_array[1].g + bias) / 3; + color_array[2].b = (2 * color_array[0].b + color_array[1].b + bias) / 3; + color_array[2].a = 0xFF; + + color_array[3].r = (2 * color_array[1].r + color_array[0].r + bias) / 3; + color_array[3].g = (2 * color_array[1].g + color_array[0].g + bias) / 3; + color_array[3].b = (2 * color_array[1].b + color_array[0].b + bias) / 3; + color_array[3].a = 0xFF; + + return 4; + } + else { + // Three-color block: derive the other color. + color_array[2].r = (color_array[0].r + color_array[1].r) / 2; + color_array[2].g = (color_array[0].g + color_array[1].g) / 2; + color_array[2].b = (color_array[0].b + color_array[1].b) / 2; + color_array[2].a = 0xFF; + + // Set all components to 0 to match DXT specs. + color_array[3].r = 0x00; // color_array[2].r; + color_array[3].g = 0x00; // color_array[2].g; + color_array[3].b = 0x00; // color_array[2].b; + color_array[3].a = 0x00; + + return 3; + } +} + + +uint BlockDXT1::evaluatePaletteNV5x(Color32 color_array[4]) const +{ + // Does bit expansion before interpolation. 
+ color_array[0].b = (3 * col0.b * 22) / 8; + color_array[0].g = (col0.g << 2) | (col0.g >> 4); + color_array[0].r = (3 * col0.r * 22) / 8; + color_array[0].a = 0xFF; + + color_array[1].r = (3 * col1.r * 22) / 8; + color_array[1].g = (col1.g << 2) | (col1.g >> 4); + color_array[1].b = (3 * col1.b * 22) / 8; + color_array[1].a = 0xFF; + + int gdiff = color_array[1].g - color_array[0].g; + + if( col0.u > col1.u ) { + // Four-color block: derive the other two colors. + color_array[2].r = ((2 * col0.r + col1.r) * 22) / 8; + color_array[2].g = (256 * color_array[0].g + gdiff / 4 + 128 + gdiff * 80) / 256; + color_array[2].b = ((2 * col0.b + col1.b) * 22) / 8; + color_array[2].a = 0xFF; + + color_array[3].r = ((2 * col1.r + col0.r) * 22) / 8; + color_array[3].g = (256 * color_array[1].g - gdiff / 4 + 128 - gdiff * 80) / 256; + color_array[3].b = ((2 * col1.b + col0.b) * 22) / 8; + color_array[3].a = 0xFF; + + return 4; + } + else { + // Three-color block: derive the other color. + color_array[2].r = ((col0.r + col1.r) * 33) / 8; + color_array[2].g = (256 * color_array[0].g + gdiff / 4 + 128 + gdiff * 128) / 256; + color_array[2].b = ((col0.b + col1.b) * 33) / 8; + color_array[2].a = 0xFF; + + // Set all components to 0 to match DXT specs. + color_array[3].r = 0x00; + color_array[3].g = 0x00; + color_array[3].b = 0x00; + color_array[3].a = 0x00; + + return 3; + } } // Evaluate palette assuming 3 color block. -void BlockDXT1::evaluatePalette3(Color32 color_array[4]) const +void BlockDXT1::evaluatePalette3(Color32 color_array[4], bool d3d9) const { - color_array[0].b = (col0.b << 3) | (col0.b >> 2); - color_array[0].g = (col0.g << 2) | (col0.g >> 4); - color_array[0].r = (col0.r << 3) | (col0.r >> 2); - color_array[0].a = 0xFF; - - color_array[1].r = (col1.r << 3) | (col1.r >> 2); - color_array[1].g = (col1.g << 2) | (col1.g >> 4); - color_array[1].b = (col1.b << 3) | (col1.b >> 2); - color_array[1].a = 0xFF; - - // Three-color block: derive the other color. - color_array[2].r = (color_array[0].r + color_array[1].r) / 2; - color_array[2].g = (color_array[0].g + color_array[1].g) / 2; - color_array[2].b = (color_array[0].b + color_array[1].b) / 2; - color_array[2].a = 0xFF; - - // Set all components to 0 to match DXT specs. - color_array[3].r = 0x00; // color_array[2].r; - color_array[3].g = 0x00; // color_array[2].g; - color_array[3].b = 0x00; // color_array[2].b; - color_array[3].a = 0x00; + color_array[0].b = (col0.b << 3) | (col0.b >> 2); + color_array[0].g = (col0.g << 2) | (col0.g >> 4); + color_array[0].r = (col0.r << 3) | (col0.r >> 2); + color_array[0].a = 0xFF; + + color_array[1].r = (col1.r << 3) | (col1.r >> 2); + color_array[1].g = (col1.g << 2) | (col1.g >> 4); + color_array[1].b = (col1.b << 3) | (col1.b >> 2); + color_array[1].a = 0xFF; + + // Three-color block: derive the other color. + color_array[2].r = (color_array[0].r + color_array[1].r) / 2; + color_array[2].g = (color_array[0].g + color_array[1].g) / 2; + color_array[2].b = (color_array[0].b + color_array[1].b) / 2; + color_array[2].a = 0xFF; + + // Set all components to 0 to match DXT specs. + color_array[3].r = 0x00; + color_array[3].g = 0x00; + color_array[3].b = 0x00; + color_array[3].a = 0x00; } // Evaluate palette assuming 4 color block. 
-void BlockDXT1::evaluatePalette4(Color32 color_array[4]) const +void BlockDXT1::evaluatePalette4(Color32 color_array[4], bool d3d9) const { - color_array[0].b = (col0.b << 3) | (col0.b >> 2); - color_array[0].g = (col0.g << 2) | (col0.g >> 4); - color_array[0].r = (col0.r << 3) | (col0.r >> 2); - color_array[0].a = 0xFF; - - color_array[1].r = (col1.r << 3) | (col1.r >> 2); - color_array[1].g = (col1.g << 2) | (col1.g >> 4); - color_array[1].b = (col1.b << 3) | (col1.b >> 2); - color_array[1].a = 0xFF; - - // Four-color block: derive the other two colors. - color_array[2].r = (2 * color_array[0].r + color_array[1].r) / 3; - color_array[2].g = (2 * color_array[0].g + color_array[1].g) / 3; - color_array[2].b = (2 * color_array[0].b + color_array[1].b) / 3; - color_array[2].a = 0xFF; - - color_array[3].r = (2 * color_array[1].r + color_array[0].r) / 3; - color_array[3].g = (2 * color_array[1].g + color_array[0].g) / 3; - color_array[3].b = (2 * color_array[1].b + color_array[0].b) / 3; - color_array[3].a = 0xFF; -} - + color_array[0].b = (col0.b << 3) | (col0.b >> 2); + color_array[0].g = (col0.g << 2) | (col0.g >> 4); + color_array[0].r = (col0.r << 3) | (col0.r >> 2); + color_array[0].a = 0xFF; -/* Jason Dorie's code. -// ---------------------------------------------------------------------------- -// Build palette for a 3 color + traparent black block -// ---------------------------------------------------------------------------- -void DXTCGen::BuildCodes3(cbVector *pVects, cbVector &v1, cbVector &v2) -{ - //pVects[0] = v1; - //pVects[2] = v2; - //pVects[1][0] = v1[0]; - //pVects[1][1] = (BYTE)( ((long)v1[1] + (long)v2[1]) / 2 ); - //pVects[1][2] = (BYTE)( ((long)v1[2] + (long)v2[2]) / 2 ); - //pVects[1][3] = (BYTE)( ((long)v1[3] + (long)v2[3]) / 2 ); - - __asm { - mov ecx, dword ptr pVects - mov eax, dword ptr v1 - mov ebx, dword ptr v2 - - movd mm0, [eax] - movd mm1, [ebx] - pxor mm2, mm2 - nop - - movd [ecx], mm0 - movd [ecx+8], mm1 + color_array[1].r = (col1.r << 3) | (col1.r >> 2); + color_array[1].g = (col1.g << 2) | (col1.g >> 4); + color_array[1].b = (col1.b << 3) | (col1.b >> 2); + color_array[1].a = 0xFF; - punpcklbw mm0, mm2 - punpcklbw mm1, mm2 + int bias = 0; + if (d3d9) bias = 1; - paddw mm0, mm1 - psrlw mm0, 1 + // Four-color block: derive the other two colors. 
+ color_array[2].r = (2 * color_array[0].r + color_array[1].r + bias) / 3; + color_array[2].g = (2 * color_array[0].g + color_array[1].g + bias) / 3; + color_array[2].b = (2 * color_array[0].b + color_array[1].b + bias) / 3; + color_array[2].a = 0xFF; - packuswb mm0, mm0 - movd [ecx+4], mm0 - } - // *(long *)&pVects[1] = r1; + color_array[3].r = (2 * color_array[1].r + color_array[0].r + bias) / 3; + color_array[3].g = (2 * color_array[1].g + color_array[0].g + bias) / 3; + color_array[3].b = (2 * color_array[1].b + color_array[0].b + bias) / 3; + color_array[3].a = 0xFF; } -__int64 ScaleOneThird = 0x5500550055005500; -// ---------------------------------------------------------------------------- -// Build palette for a 4 color block -// ---------------------------------------------------------------------------- -void DXTCGen::BuildCodes4(cbVector *pVects, cbVector &v1, cbVector &v2) +void BlockDXT1::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const { -// pVects[0] = v1; -// pVects[3] = v2; -// -// pVects[1][0] = v1[0]; -// pVects[1][1] = (BYTE)( ((long)v1[1] * 2 + (long)v2[1]) / 3 ); -// pVects[1][2] = (BYTE)( ((long)v1[2] * 2 + (long)v2[2]) / 3 ); -// pVects[1][3] = (BYTE)( ((long)v1[3] * 2 + (long)v2[3]) / 3 ); -// -// pVects[2][0] = v1[0]; -// pVects[2][1] = (BYTE)( ((long)v2[1] * 2 + (long)v1[1]) / 3 ); -// pVects[2][2] = (BYTE)( ((long)v2[2] * 2 + (long)v1[2]) / 3 ); -// pVects[2][3] = (BYTE)( ((long)v2[3] * 2 + (long)v1[3]) / 3 ); - - __asm { - mov ecx, dword ptr pVects - mov eax, dword ptr v1 - mov ebx, dword ptr v2 - - movd mm0, [eax] - movd mm1, [ebx] - - pxor mm2, mm2 - movd [ecx], mm0 - movd [ecx+12], mm1 - - punpcklbw mm0, mm2 - punpcklbw mm1, mm2 - movq mm3, mm0 // mm3 = v0 - - paddw mm0, mm1 // mm0 = v0 + v1 - paddw mm3, mm3 // mm3 = v0*2 - - paddw mm0, mm1 // mm0 = v0 + v1*2 - paddw mm1, mm3 // mm1 = v0*2 + v1 - - pmulhw mm0, ScaleOneThird - pmulhw mm1, ScaleOneThird - packuswb mm1, mm0 + nvDebugCheck(block != NULL); - movq [ecx+4], mm1 - } + // Decode color block. + Color32 color_array[4]; + evaluatePalette(color_array, d3d9); + + // Write color block. + for( uint j = 0; j < 4; j++ ) { + for( uint i = 0; i < 4; i++ ) { + uint idx = (row[j] >> (2 * i)) & 3; + block->color(i, j) = color_array[idx]; + } + } } -*/ -void BlockDXT1::decodeBlock(ColorBlock * block) const +void BlockDXT1::decodeBlockNV5x(ColorBlock * block) const { - nvDebugCheck(block != NULL); - - // Decode color block. - Color32 color_array[4]; - evaluatePalette(color_array); - - // Write color block. - for( uint j = 0; j < 4; j++ ) { - for( uint i = 0; i < 4; i++ ) { - uint idx = (row[j] >> (2 * i)) & 3; - block->color(i, j) = color_array[idx]; - } - } + nvDebugCheck(block != NULL); + + // Decode color block. + Color32 color_array[4]; + evaluatePaletteNV5x(color_array); + + // Write color block. + for( uint j = 0; j < 4; j++ ) { + for( uint i = 0; i < 4; i++ ) { + uint idx = (row[j] >> (2 * i)) & 3; + block->color(i, j) = color_array[idx]; + } + } } void BlockDXT1::setIndices(int * idx) { - indices = 0; - for(uint i = 0; i < 16; i++) { - indices |= (idx[i] & 3) << (2 * i); - } + indices = 0; + for(uint i = 0; i < 16; i++) { + indices |= (idx[i] & 3) << (2 * i); + } } /// Flip DXT1 block vertically. inline void BlockDXT1::flip4() { - swap(row[0], row[3]); - swap(row[1], row[2]); + swap(row[0], row[3]); + swap(row[1], row[2]); } /// Flip half DXT1 block vertically. 
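The reworked palette code keeps the original recipe: expand the two 5:6:5 endpoints to 8 bits per channel by replicating their top bits, then derive the middle entries by 2:1 interpolation when the block is in four-color mode (col0.u > col1.u), or by averaging plus transparent black in three-color mode. The only behavioural addition is the d3d9 flag, which feeds a +1 rounding bias into the /3 divisions. A short self-contained sketch of that arithmetic; expand5, expand6 and four_color_mid are illustrative names, not functions from this patch:

    #include <cstdio>

    // Replicate the top bits to expand 5- and 6-bit channels to 8 bits,
    // exactly as BlockDXT1::evaluatePalette does before interpolating.
    static int expand5(int v) { return (v << 3) | (v >> 2); }
    static int expand6(int v) { return (v << 2) | (v >> 4); }

    // Derive the two interpolated entries of a four-color block for one
    // channel; bias is 0 for the exact rule and 1 for the D3D9 variant.
    static void four_color_mid(int c0, int c1, int bias, int& p2, int& p3)
    {
        p2 = (2 * c0 + c1 + bias) / 3;
        p3 = (2 * c1 + c0 + bias) / 3;
    }

    int main()
    {
        // Endpoints 0xF800 (red) and 0x001F (blue), red channel only.
        const int r0 = expand5(31), r1 = expand5(0);
        int p2 = 0, p3 = 0;
        four_color_mid(r0, r1, /*bias=*/0, p2, p3);
        std::printf("red palette: %d %d %d %d\n", r0, p2, p3, r1); // 255 170 85 0
        std::printf("green 0x3F expands to %d\n", expand6(63));    // 255
    }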
inline void BlockDXT1::flip2() { - swap(row[0], row[1]); + swap(row[0], row[1]); } /*---------------------------------------------------------------------------- - BlockDXT3 +BlockDXT3 ----------------------------------------------------------------------------*/ -void BlockDXT3::decodeBlock(ColorBlock * block) const +void BlockDXT3::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const { - nvDebugCheck(block != NULL); - - // Decode color. - color.decodeBlock(block); - - // Decode alpha. - alpha.decodeBlock(block); -} - -void AlphaBlockDXT3::decodeBlock(ColorBlock * block) const -{ - nvDebugCheck(block != NULL); - - block->color(0x0).a = (alpha0 << 4) | alpha0; - block->color(0x1).a = (alpha1 << 4) | alpha1; - block->color(0x2).a = (alpha2 << 4) | alpha2; - block->color(0x3).a = (alpha3 << 4) | alpha3; - block->color(0x4).a = (alpha4 << 4) | alpha4; - block->color(0x5).a = (alpha5 << 4) | alpha5; - block->color(0x6).a = (alpha6 << 4) | alpha6; - block->color(0x7).a = (alpha7 << 4) | alpha7; - block->color(0x8).a = (alpha8 << 4) | alpha8; - block->color(0x9).a = (alpha9 << 4) | alpha9; - block->color(0xA).a = (alphaA << 4) | alphaA; - block->color(0xB).a = (alphaB << 4) | alphaB; - block->color(0xC).a = (alphaC << 4) | alphaC; - block->color(0xD).a = (alphaD << 4) | alphaD; - block->color(0xE).a = (alphaE << 4) | alphaE; - block->color(0xF).a = (alphaF << 4) | alphaF; + nvDebugCheck(block != NULL); + + // Decode color. + color.decodeBlock(block, d3d9); + + // Decode alpha. + alpha.decodeBlock(block, d3d9); +} + +void BlockDXT3::decodeBlockNV5x(ColorBlock * block) const +{ + nvDebugCheck(block != NULL); + + color.decodeBlockNV5x(block); + alpha.decodeBlock(block); +} + +void AlphaBlockDXT3::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const +{ + nvDebugCheck(block != NULL); + + block->color(0x0).a = (alpha0 << 4) | alpha0; + block->color(0x1).a = (alpha1 << 4) | alpha1; + block->color(0x2).a = (alpha2 << 4) | alpha2; + block->color(0x3).a = (alpha3 << 4) | alpha3; + block->color(0x4).a = (alpha4 << 4) | alpha4; + block->color(0x5).a = (alpha5 << 4) | alpha5; + block->color(0x6).a = (alpha6 << 4) | alpha6; + block->color(0x7).a = (alpha7 << 4) | alpha7; + block->color(0x8).a = (alpha8 << 4) | alpha8; + block->color(0x9).a = (alpha9 << 4) | alpha9; + block->color(0xA).a = (alphaA << 4) | alphaA; + block->color(0xB).a = (alphaB << 4) | alphaB; + block->color(0xC).a = (alphaC << 4) | alphaC; + block->color(0xD).a = (alphaD << 4) | alphaD; + block->color(0xE).a = (alphaE << 4) | alphaE; + block->color(0xF).a = (alphaF << 4) | alphaF; } /// Flip DXT3 alpha block vertically. void AlphaBlockDXT3::flip4() { - swap(row[0], row[3]); - swap(row[1], row[2]); + swap(row[0], row[3]); + swap(row[1], row[2]); } /// Flip half DXT3 alpha block vertically. void AlphaBlockDXT3::flip2() { - swap(row[0], row[1]); + swap(row[0], row[1]); } /// Flip DXT3 block vertically. void BlockDXT3::flip4() { - alpha.flip4(); - color.flip4(); + alpha.flip4(); + color.flip4(); } /// Flip half DXT3 block vertically. 
void BlockDXT3::flip2() { - alpha.flip2(); - color.flip2(); + alpha.flip2(); + color.flip2(); } /*---------------------------------------------------------------------------- - BlockDXT5 +BlockDXT5 ----------------------------------------------------------------------------*/ -void AlphaBlockDXT5::evaluatePalette(uint8 alpha[8]) const +void AlphaBlockDXT5::evaluatePalette(uint8 alpha[8], bool d3d9) const { - if (alpha0 > alpha1) { - evaluatePalette8(alpha); - } - else { - evaluatePalette6(alpha); - } -} - -void AlphaBlockDXT5::evaluatePalette8(uint8 alpha[8]) const -{ - // 8-alpha block: derive the other six alphas. - // Bit code 000 = alpha0, 001 = alpha1, others are interpolated. - alpha[0] = alpha0; - alpha[1] = alpha1; - alpha[2] = (6 * alpha[0] + 1 * alpha[1]) / 7; // bit code 010 - alpha[3] = (5 * alpha[0] + 2 * alpha[1]) / 7; // bit code 011 - alpha[4] = (4 * alpha[0] + 3 * alpha[1]) / 7; // bit code 100 - alpha[5] = (3 * alpha[0] + 4 * alpha[1]) / 7; // bit code 101 - alpha[6] = (2 * alpha[0] + 5 * alpha[1]) / 7; // bit code 110 - alpha[7] = (1 * alpha[0] + 6 * alpha[1]) / 7; // bit code 111 -} - -void AlphaBlockDXT5::evaluatePalette6(uint8 alpha[8]) const -{ - // 6-alpha block. - // Bit code 000 = alpha0, 001 = alpha1, others are interpolated. - alpha[0] = alpha0; - alpha[1] = alpha1; - alpha[2] = (4 * alpha[0] + 1 * alpha[1]) / 5; // Bit code 010 - alpha[3] = (3 * alpha[0] + 2 * alpha[1]) / 5; // Bit code 011 - alpha[4] = (2 * alpha[0] + 3 * alpha[1]) / 5; // Bit code 100 - alpha[5] = (1 * alpha[0] + 4 * alpha[1]) / 5; // Bit code 101 - alpha[6] = 0x00; // Bit code 110 - alpha[7] = 0xFF; // Bit code 111 + if (alpha0 > alpha1) { + evaluatePalette8(alpha, d3d9); + } + else { + evaluatePalette6(alpha, d3d9); + } +} + +void AlphaBlockDXT5::evaluatePalette8(uint8 alpha[8], bool d3d9) const +{ + int bias = 0; + if (d3d9) bias = 3; + + // 8-alpha block: derive the other six alphas. + // Bit code 000 = alpha0, 001 = alpha1, others are interpolated. + alpha[0] = alpha0; + alpha[1] = alpha1; + alpha[2] = (6 * alpha[0] + 1 * alpha[1] + bias) / 7; // bit code 010 + alpha[3] = (5 * alpha[0] + 2 * alpha[1] + bias) / 7; // bit code 011 + alpha[4] = (4 * alpha[0] + 3 * alpha[1] + bias) / 7; // bit code 100 + alpha[5] = (3 * alpha[0] + 4 * alpha[1] + bias) / 7; // bit code 101 + alpha[6] = (2 * alpha[0] + 5 * alpha[1] + bias) / 7; // bit code 110 + alpha[7] = (1 * alpha[0] + 6 * alpha[1] + bias) / 7; // bit code 111 +} + +void AlphaBlockDXT5::evaluatePalette6(uint8 alpha[8], bool d3d9) const +{ + int bias = 0; + if (d3d9) bias = 2; + + // 6-alpha block. + // Bit code 000 = alpha0, 001 = alpha1, others are interpolated. 
+ alpha[0] = alpha0; + alpha[1] = alpha1; + alpha[2] = (4 * alpha[0] + 1 * alpha[1] + bias) / 5; // Bit code 010 + alpha[3] = (3 * alpha[0] + 2 * alpha[1] + bias) / 5; // Bit code 011 + alpha[4] = (2 * alpha[0] + 3 * alpha[1] + bias) / 5; // Bit code 100 + alpha[5] = (1 * alpha[0] + 4 * alpha[1] + bias) / 5; // Bit code 101 + alpha[6] = 0x00; // Bit code 110 + alpha[7] = 0xFF; // Bit code 111 } void AlphaBlockDXT5::indices(uint8 index_array[16]) const { - index_array[0x0] = bits0; - index_array[0x1] = bits1; - index_array[0x2] = bits2; - index_array[0x3] = bits3; - index_array[0x4] = bits4; - index_array[0x5] = bits5; - index_array[0x6] = bits6; - index_array[0x7] = bits7; - index_array[0x8] = bits8; - index_array[0x9] = bits9; - index_array[0xA] = bitsA; - index_array[0xB] = bitsB; - index_array[0xC] = bitsC; - index_array[0xD] = bitsD; - index_array[0xE] = bitsE; - index_array[0xF] = bitsF; + index_array[0x0] = bits0; + index_array[0x1] = bits1; + index_array[0x2] = bits2; + index_array[0x3] = bits3; + index_array[0x4] = bits4; + index_array[0x5] = bits5; + index_array[0x6] = bits6; + index_array[0x7] = bits7; + index_array[0x8] = bits8; + index_array[0x9] = bits9; + index_array[0xA] = bitsA; + index_array[0xB] = bitsB; + index_array[0xC] = bitsC; + index_array[0xD] = bitsD; + index_array[0xE] = bitsE; + index_array[0xF] = bitsF; } uint AlphaBlockDXT5::index(uint index) const { - nvDebugCheck(index < 16); + nvDebugCheck(index < 16); - int offset = (3 * index + 16); - return uint((this->u >> offset) & 0x7); + int offset = (3 * index + 16); + return uint((this->u >> offset) & 0x7); } void AlphaBlockDXT5::setIndex(uint index, uint value) { - nvDebugCheck(index < 16); - nvDebugCheck(value < 8); + nvDebugCheck(index < 16); + nvDebugCheck(value < 8); - int offset = (3 * index + 16); - uint64 mask = uint64(0x7) << offset; - this->u = (this->u & ~mask) | (uint64(value) << offset); -} - -void AlphaBlockDXT5::decodeBlock(ColorBlock * block) const -{ - nvDebugCheck(block != NULL); - - uint8 alpha_array[8]; - evaluatePalette(alpha_array); - - uint8 index_array[16]; - indices(index_array); - - for(uint i = 0; i < 16; i++) { - block->color(i).a = alpha_array[index_array[i]]; - } + int offset = (3 * index + 16); + uint64 mask = uint64(0x7) << offset; + this->u = (this->u & ~mask) | (uint64(value) << offset); +} + +void AlphaBlockDXT5::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const +{ + nvDebugCheck(block != NULL); + + uint8 alpha_array[8]; + evaluatePalette(alpha_array, d3d9); + + uint8 index_array[16]; + indices(index_array); + + for(uint i = 0; i < 16; i++) { + block->color(i).a = alpha_array[index_array[i]]; + } +} + +void AlphaBlockDXT5::decodeBlock(AlphaBlock4x4 * block, bool d3d9/*= false*/) const +{ + nvDebugCheck(block != NULL); + + uint8 alpha_array[8]; + evaluatePalette(alpha_array, d3d9); + + uint8 index_array[16]; + indices(index_array); + + for(uint i = 0; i < 16; i++) { + block->alpha[i] = alpha_array[index_array[i]]; + } } + void AlphaBlockDXT5::flip4() { - uint64 * b = (uint64 *)this; - - // @@ The masks might have to be byte swapped. - uint64 tmp = (*b & POSH_U64(0x000000000000FFFF)); - tmp |= (*b & POSH_U64(0x000000000FFF0000)) << 36; - tmp |= (*b & POSH_U64(0x000000FFF0000000)) << 12; - tmp |= (*b & POSH_U64(0x000FFF0000000000)) >> 12; - tmp |= (*b & POSH_U64(0xFFF0000000000000)) >> 36; - - *b = tmp; + uint64 * b = (uint64 *)this; + + // @@ The masks might have to be byte swapped. 
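The DXT5 alpha palette gets the same treatment: alpha0 > alpha1 selects the 8-entry palette interpolated with /7 weights (bias 3 in D3D9 mode), otherwise the 6-entry palette with /5 weights (bias 2) plus the constant 0x00 and 0xFF codes. A compact sketch equivalent to evaluatePalette8/evaluatePalette6 above; the loop form and the function name are illustrative, not the patch's code:

    #include <cstdio>

    // Build the 8-entry DXT5/BC3 alpha palette from the two endpoint alphas.
    static void alpha_palette(int a0, int a1, bool d3d9, int out[8])
    {
        out[0] = a0;
        out[1] = a1;
        if (a0 > a1) {
            const int bias = d3d9 ? 3 : 0;
            for (int i = 1; i <= 6; ++i)              // bit codes 010..111
                out[i + 1] = ((7 - i) * a0 + i * a1 + bias) / 7;
        } else {
            const int bias = d3d9 ? 2 : 0;
            for (int i = 1; i <= 4; ++i)              // bit codes 010..101
                out[i + 1] = ((5 - i) * a0 + i * a1 + bias) / 5;
            out[6] = 0x00;                            // bit code 110
            out[7] = 0xFF;                            // bit code 111
        }
    }

    int main()
    {
        int p[8];
        alpha_palette(255, 0, /*d3d9=*/false, p);
        for (int i = 0; i < 8; ++i)
            std::printf("%d ", p[i]);                 // 255 0 218 182 145 109 72 36
        std::printf("\n");
    }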
+ uint64 tmp = (*b & POSH_U64(0x000000000000FFFF)); + tmp |= (*b & POSH_U64(0x000000000FFF0000)) << 36; + tmp |= (*b & POSH_U64(0x000000FFF0000000)) << 12; + tmp |= (*b & POSH_U64(0x000FFF0000000000)) >> 12; + tmp |= (*b & POSH_U64(0xFFF0000000000000)) >> 36; + + *b = tmp; } void AlphaBlockDXT5::flip2() { - uint * b = (uint *)this; - - // @@ The masks might have to be byte swapped. - uint tmp = (*b & 0xFF000000); - tmp |= (*b & 0x00000FFF) << 12; - tmp |= (*b & 0x00FFF000) >> 12; - - *b = tmp; -} - -void BlockDXT5::decodeBlock(ColorBlock * block) const -{ - nvDebugCheck(block != NULL); - - // Decode color. - color.decodeBlock(block); - - // Decode alpha. - alpha.decodeBlock(block); + uint * b = (uint *)this; + + // @@ The masks might have to be byte swapped. + uint tmp = (*b & 0xFF000000); + tmp |= (*b & 0x00000FFF) << 12; + tmp |= (*b & 0x00FFF000) >> 12; + + *b = tmp; +} + +void BlockDXT5::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const +{ + nvDebugCheck(block != NULL); + + // Decode color. + color.decodeBlock(block, d3d9); + // Decode alpha. + alpha.decodeBlock(block, d3d9); +} + +void BlockDXT5::decodeBlockNV5x(ColorBlock * block) const +{ + nvDebugCheck(block != NULL); + + // Decode color. + color.decodeBlockNV5x(block); + + // Decode alpha. + alpha.decodeBlock(block); } /// Flip DXT5 block vertically. void BlockDXT5::flip4() { - alpha.flip4(); - color.flip4(); + alpha.flip4(); + color.flip4(); } /// Flip half DXT5 block vertically. void BlockDXT5::flip2() { - alpha.flip2(); - color.flip2(); + alpha.flip2(); + color.flip2(); } /// Decode ATI1 block. -void BlockATI1::decodeBlock(ColorBlock * block) const +void BlockATI1::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const { - uint8 alpha_array[8]; - alpha.evaluatePalette(alpha_array); - - uint8 index_array[16]; - alpha.indices(index_array); - - for(uint i = 0; i < 16; i++) { - Color32 & c = block->color(i); - c.b = c.g = c.r = alpha_array[index_array[i]]; - c.a = 255; - } + uint8 alpha_array[8]; + alpha.evaluatePalette(alpha_array, d3d9); + + uint8 index_array[16]; + alpha.indices(index_array); + + for(uint i = 0; i < 16; i++) { + Color32 & c = block->color(i); + c.b = c.g = c.r = alpha_array[index_array[i]]; + c.a = 255; + } } /// Flip ATI1 block vertically. void BlockATI1::flip4() { - alpha.flip4(); + alpha.flip4(); } /// Flip half ATI1 block vertically. void BlockATI1::flip2() { - alpha.flip2(); + alpha.flip2(); } /// Decode ATI2 block. -void BlockATI2::decodeBlock(ColorBlock * block) const +void BlockATI2::decodeBlock(ColorBlock * block, bool d3d9/*= false*/) const { - uint8 alpha_array[8]; - uint8 index_array[16]; - - x.evaluatePalette(alpha_array); - x.indices(index_array); - - for(uint i = 0; i < 16; i++) { - Color32 & c = block->color(i); - c.r = alpha_array[index_array[i]]; - } + uint8 alpha_array[8]; + uint8 index_array[16]; - y.evaluatePalette(alpha_array); - y.indices(index_array); - - for(uint i = 0; i < 16; i++) { - Color32 & c = block->color(i); - c.g = alpha_array[index_array[i]]; - c.b = 0; - c.a = 255; - } + x.evaluatePalette(alpha_array, d3d9); + x.indices(index_array); + + for(uint i = 0; i < 16; i++) { + Color32 & c = block->color(i); + c.r = alpha_array[index_array[i]]; + } + + y.evaluatePalette(alpha_array, d3d9); + y.indices(index_array); + + for(uint i = 0; i < 16; i++) { + Color32 & c = block->color(i); + c.g = alpha_array[index_array[i]]; + c.b = 0; + c.a = 255; + } } /// Flip ATI2 block vertically. 
void BlockATI2::flip4() { - x.flip4(); - y.flip4(); + x.flip4(); + y.flip4(); } /// Flip half ATI2 block vertically. void BlockATI2::flip2() { - x.flip2(); - y.flip2(); + x.flip2(); + y.flip2(); } void BlockCTX1::evaluatePalette(Color32 color_array[4]) const { - // Does bit expansion before interpolation. - color_array[0].b = 0x00; - color_array[0].g = col0[1]; - color_array[0].r = col0[0]; - color_array[0].a = 0xFF; - - color_array[1].r = 0x00; - color_array[1].g = col0[1]; - color_array[1].b = col1[0]; - color_array[1].a = 0xFF; - - color_array[2].r = 0x00; - color_array[2].g = (2 * color_array[0].g + color_array[1].g) / 3; - color_array[2].b = (2 * color_array[0].b + color_array[1].b) / 3; - color_array[2].a = 0xFF; - - color_array[3].r = 0x00; - color_array[3].g = (2 * color_array[1].g + color_array[0].g) / 3; - color_array[3].b = (2 * color_array[1].b + color_array[0].b) / 3; - color_array[3].a = 0xFF; + // Does bit expansion before interpolation. + color_array[0].b = 0x00; + color_array[0].g = col0[1]; + color_array[0].r = col0[0]; + color_array[0].a = 0xFF; + + color_array[1].r = 0x00; + color_array[1].g = col0[1]; + color_array[1].b = col1[0]; + color_array[1].a = 0xFF; + + color_array[2].r = 0x00; + color_array[2].g = (2 * color_array[0].g + color_array[1].g) / 3; + color_array[2].b = (2 * color_array[0].b + color_array[1].b) / 3; + color_array[2].a = 0xFF; + + color_array[3].r = 0x00; + color_array[3].g = (2 * color_array[1].g + color_array[0].g) / 3; + color_array[3].b = (2 * color_array[1].b + color_array[0].b) / 3; + color_array[3].a = 0xFF; } void BlockCTX1::decodeBlock(ColorBlock * block) const { - nvDebugCheck(block != NULL); - - // Decode color block. - Color32 color_array[4]; - evaluatePalette(color_array); - - // Write color block. - for( uint j = 0; j < 4; j++ ) { - for( uint i = 0; i < 4; i++ ) { - uint idx = (row[j] >> (2 * i)) & 3; - block->color(i, j) = color_array[idx]; - } - } + nvDebugCheck(block != NULL); + + // Decode color block. + Color32 color_array[4]; + evaluatePalette(color_array); + + // Write color block. + for( uint j = 0; j < 4; j++ ) { + for( uint i = 0; i < 4; i++ ) { + uint idx = (row[j] >> (2 * i)) & 3; + block->color(i, j) = color_array[idx]; + } + } } void BlockCTX1::setIndices(int * idx) { - indices = 0; - for(uint i = 0; i < 16; i++) { - indices |= (idx[i] & 3) << (2 * i); + indices = 0; + for(uint i = 0; i < 16; i++) { + indices |= (idx[i] & 3) << (2 * i); + } +} + + +/// Decode BC6 block. +void BlockBC6::decodeBlock(Vector3 colors[16]) const +{ + ZOH::Tile tile(4, 4); + ZOH::decompress((const char *)data, tile); + + // Convert ZOH's tile struct to Vector3, and convert half to float. + for (uint y = 0; y < 4; ++y) + { + for (uint x = 0; x < 4; ++x) + { + uint16 rHalf = ZOH::Tile::float2half(tile.data[y][x].x); + uint16 gHalf = ZOH::Tile::float2half(tile.data[y][x].y); + uint16 bHalf = ZOH::Tile::float2half(tile.data[y][x].z); + colors[y * 4 + x].x = to_float(rHalf); + colors[y * 4 + x].y = to_float(gHalf); + colors[y * 4 + x].z = to_float(bHalf); + } + } +} + + +/// Decode BC7 block. +void BlockBC7::decodeBlock(ColorBlock * block) const +{ + AVPCL::Tile tile(4, 4); + AVPCL::decompress((const char *)data, tile); + + // Convert AVPCL's tile struct back to NVTT's. + for (uint y = 0; y < 4; ++y) + { + for (uint x = 0; x < 4; ++x) + { + Vector4 rgba = tile.data[y][x]; + // Note: decoded rgba values are in [0, 255] range and should be an integer, + // because BC7 never uses more than 8 bits per channel. So no need to round. 
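The two decoders above route BC6H and BC7 blocks through the bundled bc6h (ZOH) and bc7 (AVPCL) libraries while keeping the decodeBlock() pattern of the other block types. A minimal usage sketch, assuming the headers are reachable as "nvimage/BlockDXT.h" and "nvimage/ColorBlock.h" and that BlockBC7 is a plain 16-byte block (consistent with the stream operator further below, which serializes it with sizeof(block)):

#include "nvimage/BlockDXT.h"
#include "nvimage/ColorBlock.h"
#include <cstring>

// Decode one 128-bit BC7 block into an uncompressed 4x4 ColorBlock.
void decodeOneBC7Block(const unsigned char compressed[16], nv::ColorBlock & rgba)
{
    nv::BlockBC7 block;
    std::memcpy(&block, compressed, 16); // BC7 blocks are always 128 bits
    block.decodeBlock(&rgba);            // AVPCL::decompress() under the hood
}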
+ block->color(x, y).setRGBA(uint8(rgba.x), uint8(rgba.y), uint8(rgba.z), uint8(rgba.w)); + } } } @@ -606,14 +676,14 @@ /// Flip CTX1 block vertically. inline void BlockCTX1::flip4() { - swap(row[0], row[3]); - swap(row[1], row[2]); + swap(row[0], row[3]); + swap(row[1], row[2]); } /// Flip half CTX1 block vertically. inline void BlockCTX1::flip2() { - swap(row[0], row[1]); + swap(row[0], row[1]); } @@ -621,46 +691,57 @@ Stream & nv::operator<<(Stream & stream, BlockDXT1 & block) { - stream << block.col0.u << block.col1.u; - stream.serialize(&block.indices, sizeof(block.indices)); - return stream; + stream << block.col0.u << block.col1.u; + stream.serialize(&block.indices, sizeof(block.indices)); + return stream; } Stream & nv::operator<<(Stream & stream, AlphaBlockDXT3 & block) { - stream.serialize(&block, sizeof(block)); - return stream; + stream.serialize(&block, sizeof(block)); + return stream; } Stream & nv::operator<<(Stream & stream, BlockDXT3 & block) { - return stream << block.alpha << block.color; + return stream << block.alpha << block.color; } Stream & nv::operator<<(Stream & stream, AlphaBlockDXT5 & block) { - stream.serialize(&block, sizeof(block)); - return stream; + stream.serialize(&block, sizeof(block)); + return stream; } Stream & nv::operator<<(Stream & stream, BlockDXT5 & block) { - return stream << block.alpha << block.color; + return stream << block.alpha << block.color; } Stream & nv::operator<<(Stream & stream, BlockATI1 & block) { - return stream << block.alpha; + return stream << block.alpha; } Stream & nv::operator<<(Stream & stream, BlockATI2 & block) { - return stream << block.x << block.y; + return stream << block.x << block.y; } Stream & nv::operator<<(Stream & stream, BlockCTX1 & block) { - stream.serialize(&block, sizeof(block)); - return stream; + stream.serialize(&block, sizeof(block)); + return stream; } +Stream & nv::operator<<(Stream & stream, BlockBC6 & block) +{ + stream.serialize(&block, sizeof(block)); + return stream; +} + +Stream & nv::operator<<(Stream & stream, BlockBC7 & block) +{ + stream.serialize(&block, sizeof(block)); + return stream; +} Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/CMakeLists.txt =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/CMakeLists.txt +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/CMakeLists.txt @@ -1,68 +1,62 @@ PROJECT(nvimage) SET(IMAGE_SRCS - nvimage.h - FloatImage.h - FloatImage.cpp - Filter.h - Filter.cpp - Image.h - Image.cpp - ImageIO.h - ImageIO.cpp - ColorBlock.h - ColorBlock.cpp - BlockDXT.h - BlockDXT.cpp - HoleFilling.h - HoleFilling.cpp - DirectDrawSurface.h - DirectDrawSurface.cpp - Quantize.h - Quantize.cpp - NormalMap.h - NormalMap.cpp - NormalMipmap.h - NormalMipmap.cpp - PsdFile.h - TgaFile.h) + nvimage.h + BlockDXT.h BlockDXT.cpp + ColorBlock.h ColorBlock.cpp + DirectDrawSurface.h DirectDrawSurface.cpp + ErrorMetric.h ErrorMetric.cpp + Filter.h Filter.cpp + FloatImage.h FloatImage.cpp + Image.h Image.cpp + ImageIO.h ImageIO.cpp + #KtxFile.h KtxFile.cpp + NormalMap.h NormalMap.cpp + PixelFormat.h + PsdFile.h + TgaFile.h) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) IF(PNG_FOUND) - SET(LIBS ${LIBS} ${PNG_LIBRARIES}) - INCLUDE_DIRECTORIES(${PNG_INCLUDE_DIR}) + SET(LIBS ${LIBS} ${PNG_LIBRARIES}) + INCLUDE_DIRECTORIES(${PNG_INCLUDE_DIR}) ENDIF(PNG_FOUND) IF(JPEG_FOUND) - SET(LIBS ${LIBS} ${JPEG_LIBRARIES}) - INCLUDE_DIRECTORIES(${JPEG_INCLUDE_DIR}) + SET(LIBS ${LIBS} ${JPEG_LIBRARIES}) + 
INCLUDE_DIRECTORIES(${JPEG_INCLUDE_DIR}) ENDIF(JPEG_FOUND) IF(TIFF_FOUND) - SET(LIBS ${LIBS} ${TIFF_LIBRARIES}) - INCLUDE_DIRECTORIES(${TIFF_INCLUDE_DIR}) + SET(LIBS ${LIBS} ${TIFF_LIBRARIES}) + INCLUDE_DIRECTORIES(${TIFF_INCLUDE_DIR}) ENDIF(TIFF_FOUND) IF(OPENEXR_FOUND) - SET(LIBS ${LIBS} ${OPENEXR_LIBRARIES}) - INCLUDE_DIRECTORIES(${OPENEXR_INCLUDE_PATHS}) + SET(LIBS ${LIBS} ${OPENEXR_LIBRARIES}) + INCLUDE_DIRECTORIES(${OPENEXR_INCLUDE_PATHS}) ENDIF(OPENEXR_FOUND) +IF(FREEIMAGE_FOUND) + SET(LIBS ${LIBS} ${FREEIMAGE_LIBRARIES}) + INCLUDE_DIRECTORIES(${FREEIMAGE_INCLUDE_PATH}) +ENDIF(FREEIMAGE_FOUND) + # targets ADD_DEFINITIONS(-DNVIMAGE_EXPORTS) -IF(NVIMAGE_SHARED) - ADD_DEFINITIONS(-DNVIMAGE_SHARED=1) - ADD_LIBRARY(nvimage SHARED ${IMAGE_SRCS}) +IF(NVIMAGE_SHARED) + ADD_DEFINITIONS(-DNVIMAGE_SHARED=1) + ADD_LIBRARY(nvimage SHARED ${IMAGE_SRCS}) ELSE(NVIMAGE_SHARED) - ADD_LIBRARY(nvimage ${IMAGE_SRCS}) + ADD_LIBRARY(nvimage ${IMAGE_SRCS}) ENDIF(NVIMAGE_SHARED) -TARGET_LINK_LIBRARIES(nvimage ${LIBS} nvcore nvmath posh) +TARGET_LINK_LIBRARIES(nvimage ${LIBS} nvcore posh bc6h bc7 nvmath) INSTALL(TARGETS nvimage - RUNTIME DESTINATION ${BINDIR} - LIBRARY DESTINATION ${LIBDIR} - ARCHIVE DESTINATION ${LIBDIR}) + RUNTIME DESTINATION ${BINDIR} + LIBRARY DESTINATION ${LIBDIR} + ARCHIVE DESTINATION ${LIBDIR}) Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/ColorBlock.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/ColorBlock.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/ColorBlock.h @@ -1,95 +1,163 @@ // This code is in the public domain -- castanyo@yahoo.es +#pragma once #ifndef NV_IMAGE_COLORBLOCK_H #define NV_IMAGE_COLORBLOCK_H -#include +#include "nvimage.h" + +#include "nvmath/Color.h" +#include "nvmath/Vector.h" namespace nv { - class Image; + class Image; + class FloatImage; + + + /// Uncompressed 4x4 color block. + struct NVIMAGE_CLASS ColorBlock + { + ColorBlock(); + ColorBlock(const uint * linearImage); + ColorBlock(const ColorBlock & block); + ColorBlock(const Image * img, uint x, uint y); + + void init(const Image * img, uint x, uint y); + void init(uint w, uint h, const uint * data, uint x, uint y); + void init(uint w, uint h, const float * data, uint x, uint y); + + void swizzle(uint x, uint y, uint z, uint w); // 0=r, 1=g, 2=b, 3=a, 4=0xFF, 5=0 + + bool isSingleColor(Color32 mask = Color32(0xFF, 0xFF, 0xFF, 0x00)) const; + bool hasAlpha() const; + + + // Accessors + const Color32 * colors() const; + + Color32 color(uint i) const; + Color32 & color(uint i); + + Color32 color(uint x, uint y) const; + Color32 & color(uint x, uint y); + + private: + + Color32 m_color[4*4]; + + }; + + + /// Get pointer to block colors. + inline const Color32 * ColorBlock::colors() const + { + return m_color; + } + + /// Get block color. + inline Color32 ColorBlock::color(uint i) const + { + nvDebugCheck(i < 16); + return m_color[i]; + } + + /// Get block color. + inline Color32 & ColorBlock::color(uint i) + { + nvDebugCheck(i < 16); + return m_color[i]; + } + + /// Get block color. + inline Color32 ColorBlock::color(uint x, uint y) const + { + nvDebugCheck(x < 4 && y < 4); + return m_color[y * 4 + x]; + } + + /// Get block color. 
+ inline Color32 & ColorBlock::color(uint x, uint y) + { + nvDebugCheck(x < 4 && y < 4); + return m_color[y * 4 + x]; + } + + /* + struct ColorSet + { + ColorSet() : colorCount(0), indexCount(0), w(0), h(0) {} + //~ColorSet() {} + + void allocate(uint w, uint h); + + void setColors(const float * data, uint img_w, uint img_h, uint img_x, uint img_y); + void setColors(const Vector3 colors[16], const float weights[16]); + void setColors(const Vector4 colors[16], const float weights[16]); + + void setAlphaWeights(); + void setUniformWeights(); + + void createMinimalSet(bool ignoreTransparent); + void wrapIndices(); + + void swizzle(uint x, uint y, uint z, uint w); // 0=r, 1=g, 2=b, 3=a, 4=0xFF, 5=0 + + bool isSingleColor(bool ignoreAlpha) const; + bool hasAlpha() const; + + // These methods require indices to be set: + Vector4 color(uint x, uint y) const { nvDebugCheck(x < w && y < h); return colors[indices[y * 4 + x]]; } + Vector4 & color(uint x, uint y) { nvDebugCheck(x < w && y < h); return colors[indices[y * 4 + x]]; } + + Vector4 color(uint i) const { nvDebugCheck(i < indexCount); return colors[indices[i]]; } + Vector4 & color(uint i) { nvDebugCheck(i < indexCount); return colors[indices[i]]; } + + float weight(uint i) const { nvDebugCheck(i < indexCount); return weights[indices[i]]; } + + bool isValidIndex(uint i) const { return i < indexCount && indices[i] >= 0; } + + uint colorCount; + uint indexCount; // Fixed to 16 + uint w, h; // Fixed to 4x4 + + // Allocate color set dynamically and add support for sets larger than 4x4. + Vector4 colors[16]; + float weights[16]; // @@ Add mask to indicate what color components are weighted? + int indices[16]; + }; + */ + + + /// Uncompressed 4x4 alpha block. + struct NVIMAGE_CLASS AlphaBlock4x4 + { + void init(uint8 value); + void init(const ColorBlock & src, uint channel); + //void init(const ColorSet & src, uint channel); + + //void initMaxRGB(const ColorSet & src, float threshold); + //void initWeights(const ColorSet & src); + + uint8 alpha[4*4]; + float weights[16]; + }; + + + struct FloatAlphaBlock4x4 + { + float alphas[4 * 4]; + float weights[4 * 4]; + }; + + struct FloatColorBlock4x4 + { + Vector4 colors[4 * 4]; + float weights[4 * 4]; + }; + + - /// Uncompressed 4x4 color block. - struct ColorBlock - { - ColorBlock(); - ColorBlock(const uint * linearImage); - ColorBlock(const ColorBlock & block); - ColorBlock(const Image * img, uint x, uint y); - - void init(const Image * img, uint x, uint y); - - void swizzleDXT5n(); - void splatX(); - void splatY(); - - bool isSingleColor() const; - uint countUniqueColors() const; - Color32 averageColor() const; - bool hasAlpha() const; - - void diameterRange(Color32 * start, Color32 * end) const; - void luminanceRange(Color32 * start, Color32 * end) const; - void boundsRange(Color32 * start, Color32 * end) const; - void boundsRangeAlpha(Color32 * start, Color32 * end) const; - - void sortColorsByAbsoluteValue(); - - void computeRange(const Vector3 & axis, Color32 * start, Color32 * end) const; - void sortColors(const Vector3 & axis); - - float volume() const; - - // Accessors - const Color32 * colors() const; - - Color32 color(uint i) const; - Color32 & color(uint i); - - Color32 color(uint x, uint y) const; - Color32 & color(uint x, uint y); - - private: - - Color32 m_color[4*4]; - - }; - - - /// Get pointer to block colors. - inline const Color32 * ColorBlock::colors() const - { - return m_color; - } - - /// Get block color. 
- inline Color32 ColorBlock::color(uint i) const - { - nvDebugCheck(i < 16); - return m_color[i]; - } - - /// Get block color. - inline Color32 & ColorBlock::color(uint i) - { - nvDebugCheck(i < 16); - return m_color[i]; - } - - /// Get block color. - inline Color32 ColorBlock::color(uint x, uint y) const - { - nvDebugCheck(x < 4 && y < 4); - return m_color[y * 4 + x]; - } - - /// Get block color. - inline Color32 & ColorBlock::color(uint x, uint y) - { - nvDebugCheck(x < 4 && y < 4); - return m_color[y * 4 + x]; - } - } // nv namespace #endif // NV_IMAGE_COLORBLOCK_H Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/ColorBlock.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/ColorBlock.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/ColorBlock.cpp @@ -1,25 +1,33 @@ // This code is in the public domain -- castanyo@yahoo.es -#include -#include -#include +#include "ColorBlock.h" +#include "Image.h" +#include "FloatImage.h" + +#include "nvmath/Box.h" +#include "nvmath/Vector.inl" +#include "nvmath/ftoi.h" + +#include "nvcore/Utils.h" // swap + +#include // memcpy using namespace nv; namespace { - - // Get approximate luminance. - inline static uint colorLuminance(Color32 c) - { - return c.r + c.g + c.b; - } - - // Get the euclidean distance between the given colors. - inline static uint colorDistance(Color32 c0, Color32 c1) - { - return (c0.r - c1.r) * (c0.r - c1.r) + (c0.g - c1.g) * (c0.g - c1.g) + (c0.b - c1.b) * (c0.b - c1.b); - } - + + // Get approximate luminance. + inline static uint colorLuminance(Color32 c) + { + return c.r + c.g + c.b; + } + + // Get the euclidean distance between the given colors. + inline static uint colorDistance(Color32 c0, Color32 c1) + { + return (c0.r - c1.r) * (c0.r - c1.r) + (c0.g - c1.g) * (c0.g - c1.g) + (c0.b - c1.b) * (c0.b - c1.b); + } + } // namespace` @@ -31,374 +39,701 @@ /// Init the color block from an array of colors. ColorBlock::ColorBlock(const uint * linearImage) { - for(uint i = 0; i < 16; i++) { - color(i) = Color32(linearImage[i]); - } + for(uint i = 0; i < 16; i++) { + color(i) = Color32(linearImage[i]); + } } /// Init the color block with the contents of the given block. ColorBlock::ColorBlock(const ColorBlock & block) { - for(uint i = 0; i < 16; i++) { - color(i) = block.color(i); - } + for(uint i = 0; i < 16; i++) { + color(i) = block.color(i); + } } /// Initialize this color block. ColorBlock::ColorBlock(const Image * img, uint x, uint y) { - init(img, x, y); + init(img, x, y); } void ColorBlock::init(const Image * img, uint x, uint y) { - nvDebugCheck(img != NULL); - - const uint bw = min(img->width() - x, 4U); - const uint bh = min(img->height() - y, 4U); + init(img->width(), img->height(), (const uint *)img->pixels(), x, y); +} - nvDebugCheck(bw != 0); - nvDebugCheck(bh != 0); +void ColorBlock::init(uint w, uint h, const uint * data, uint x, uint y) +{ + nvDebugCheck(data != NULL); - static int remainder[] = { - 0, 0, 0, 0, - 0, 1, 0, 1, - 0, 1, 2, 0, - 0, 1, 2, 3, - }; + const uint bw = min(w - x, 4U); + const uint bh = min(h - y, 4U); + nvDebugCheck(bw != 0 && bh != 0); - // Blocks that are smaller than 4x4 are handled by repeating the pixels. - // @@ Thats only correct when block size is 1, 2 or 4, but not with 3. :( + // Blocks that are smaller than 4x4 are handled by repeating the pixels. + // @@ Thats only correct when block size is 1, 2 or 4, but not with 3. :( + // @@ Ideally we should zero the weights of the pixels out of range. 
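Concretely, the wrap used below maps destination column e to source column e % bw (and destination row i to i % bh), which for the possible block widths gives:

// bw = 1:  source columns 0 0 0 0
// bw = 2:  source columns 0 1 0 1
// bw = 3:  source columns 0 1 2 0   (the case the @@ note flags as not strictly correct)
// bw = 4:  source columns 0 1 2 3

The removed remainder[] table encoded exactly the same mapping; the modulo form is just more direct.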
- for(uint i = 0; i < 4; i++) { - //const int by = i % bh; - const int by = remainder[(bh - 1) * 4 + i]; - for(uint e = 0; e < 4; e++) { - //const int bx = e % bw; - const int bx = remainder[(bw - 1) * 4 + e]; - color(e, i) = img->pixel(x + bx, y + by); - } - } -} + for (uint i = 0; i < 4; i++) + { + const int by = i % bh; + for (uint e = 0; e < 4; e++) + { + const int bx = e % bw; + const uint idx = (y + by) * w + x + bx; -void ColorBlock::swizzleDXT5n() + color(e, i).u = data[idx]; + } + } +} + +void ColorBlock::init(uint w, uint h, const float * data, uint x, uint y) { - for(int i = 0; i < 16; i++) - { - Color32 c = m_color[i]; - m_color[i] = Color32(0xFF, c.g, 0, c.r); - } + nvDebugCheck(data != NULL); + + const uint bw = min(w - x, 4U); + const uint bh = min(h - y, 4U); + nvDebugCheck(bw != 0 && bh != 0); + + // Blocks that are smaller than 4x4 are handled by repeating the pixels. + // @@ Thats only correct when block size is 1, 2 or 4, but not with 3. :( + // @@ Ideally we should zero the weights of the pixels out of range. + + uint srcPlane = w * h; + + for (uint i = 0; i < 4; i++) + { + const uint by = i % bh; + + for (uint e = 0; e < 4; e++) + { + const uint bx = e % bw; + const uint idx = ((y + by) * w + x + bx); + + Color32 & c = color(e, i); + c.r = uint8(255 * clamp(data[idx + 0 * srcPlane], 0.0f, 1.0f)); // @@ Is this the right way to quantize floats to bytes? + c.g = uint8(255 * clamp(data[idx + 1 * srcPlane], 0.0f, 1.0f)); + c.b = uint8(255 * clamp(data[idx + 2 * srcPlane], 0.0f, 1.0f)); + c.a = uint8(255 * clamp(data[idx + 3 * srcPlane], 0.0f, 1.0f)); + } + } } -void ColorBlock::splatX() +static inline uint8 component(Color32 c, uint i) { - for(int i = 0; i < 16; i++) - { - uint8 x = m_color[i].r; - m_color[i] = Color32(x, x, x, x); - } + if (i == 0) return c.r; + if (i == 1) return c.g; + if (i == 2) return c.b; + if (i == 3) return c.a; + if (i == 4) return 0xFF; + return 0; } -void ColorBlock::splatY() +void ColorBlock::swizzle(uint x, uint y, uint z, uint w) { - for(int i = 0; i < 16; i++) - { - uint8 y = m_color[i].g; - m_color[i] = Color32(y, y, y, y); - } + for (int i = 0; i < 16; i++) + { + Color32 c = m_color[i]; + m_color[i].r = component(c, x); + m_color[i].g = component(c, y); + m_color[i].b = component(c, z); + m_color[i].a = component(c, w); + } } + /// Returns true if the block has a single color. -bool ColorBlock::isSingleColor() const +bool ColorBlock::isSingleColor(Color32 mask/*= Color32(0xFF, 0xFF, 0xFF, 0x00)*/) const { - Color32 mask(0xFF, 0xFF, 0xFF, 0x00); - uint u = m_color[0].u & mask.u; - - for (int i = 1; i < 16; i++) - { - if (u != (m_color[i].u & mask.u)) - { - return false; - } - } - - return true; + uint u = m_color[0].u & mask.u; + + for (int i = 1; i < 16; i++) + { + if (u != (m_color[i].u & mask.u)) + { + return false; + } + } + + return true; +} + +/* +/// Returns true if the block has a single color, ignoring transparent pixels. +bool ColorBlock::isSingleColorNoAlpha() const +{ + Color32 c; + int i; + for(i = 0; i < 16; i++) + { + if (m_color[i].a != 0) c = m_color[i]; + } + + Color32 mask(0xFF, 0xFF, 0xFF, 0x00); + uint u = c.u & mask.u; + + for(; i < 16; i++) + { + if (u != (m_color[i].u & mask.u)) + { + return false; + } + } + + return true; } +*/ /// Count number of unique colors in this color block. 
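The generic swizzle() above replaces the removed splatX(), splatY() and swizzleDXT5n() helpers; its channel codes are 0=r, 1=g, 2=b, 3=a, 4=0xFF, 5=0. A small sketch of the equivalences, assuming the header is reachable as "nvimage/ColorBlock.h":

#include "nvimage/ColorBlock.h"

void dxt5nSwizzle(nv::ColorBlock & block)
{
    block.swizzle(4, 1, 5, 0);      // old swizzleDXT5n(): r = 0xFF, g = g, b = 0, a = r
    // block.swizzle(0, 0, 0, 0);   // old splatX(): replicate red into every channel
    // block.swizzle(1, 1, 1, 1);   // old splatY(): replicate green into every channel
}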
-uint ColorBlock::countUniqueColors() const +/*uint ColorBlock::countUniqueColors() const { - uint count = 0; + uint count = 0; - // @@ This does not have to be o(n^2) - for(int i = 0; i < 16; i++) - { - bool unique = true; - for(int j = 0; j < i; j++) { - if( m_color[i] != m_color[j] ) { - unique = false; - } - } - - if( unique ) { - count++; - } - } - - return count; -} + // @@ This does not have to be o(n^2) + for(int i = 0; i < 16; i++) + { + bool unique = true; + for(int j = 0; j < i; j++) { + if( m_color[i] != m_color[j] ) { + unique = false; + } + } + + if( unique ) { + count++; + } + } -/// Get average color of the block. + return count; +}*/ + +/*/// Get average color of the block. Color32 ColorBlock::averageColor() const { - uint r, g, b, a; - r = g = b = a = 0; + uint r, g, b, a; + r = g = b = a = 0; - for(uint i = 0; i < 16; i++) { - r += m_color[i].r; - g += m_color[i].g; - b += m_color[i].b; - a += m_color[i].a; - } - - return Color32(uint8(r / 16), uint8(g / 16), uint8(b / 16), uint8(a / 16)); -} + for(uint i = 0; i < 16; i++) { + r += m_color[i].r; + g += m_color[i].g; + b += m_color[i].b; + a += m_color[i].a; + } + + return Color32(uint8(r / 16), uint8(g / 16), uint8(b / 16), uint8(a / 16)); +}*/ /// Return true if the block is not fully opaque. bool ColorBlock::hasAlpha() const { - for (uint i = 0; i < 16; i++) - { - if (m_color[i].a != 255) return true; - } - return false; + for (uint i = 0; i < 16; i++) + { + if (m_color[i].a != 255) return true; + } + return false; } +#if 0 /// Get diameter color range. void ColorBlock::diameterRange(Color32 * start, Color32 * end) const { - nvDebugCheck(start != NULL); - nvDebugCheck(end != NULL); - - Color32 c0, c1; - uint best_dist = 0; - - for(int i = 0; i < 16; i++) { - for (int j = i+1; j < 16; j++) { - uint dist = colorDistance(m_color[i], m_color[j]); - if( dist > best_dist ) { - best_dist = dist; - c0 = m_color[i]; - c1 = m_color[j]; - } - } - } - - *start = c0; - *end = c1; + nvDebugCheck(start != NULL); + nvDebugCheck(end != NULL); + + Color32 c0, c1; + uint best_dist = 0; + + for(int i = 0; i < 16; i++) { + for (int j = i+1; j < 16; j++) { + uint dist = colorDistance(m_color[i], m_color[j]); + if( dist > best_dist ) { + best_dist = dist; + c0 = m_color[i]; + c1 = m_color[j]; + } + } + } + + *start = c0; + *end = c1; } /// Get luminance color range. void ColorBlock::luminanceRange(Color32 * start, Color32 * end) const { - nvDebugCheck(start != NULL); - nvDebugCheck(end != NULL); - - Color32 minColor, maxColor; - uint minLuminance, maxLuminance; - - maxLuminance = minLuminance = colorLuminance(m_color[0]); - - for(uint i = 1; i < 16; i++) - { - uint luminance = colorLuminance(m_color[i]); - - if (luminance > maxLuminance) { - maxLuminance = luminance; - maxColor = m_color[i]; - } - else if (luminance < minLuminance) { - minLuminance = luminance; - minColor = m_color[i]; - } - } + nvDebugCheck(start != NULL); + nvDebugCheck(end != NULL); + + Color32 minColor, maxColor; + uint minLuminance, maxLuminance; + + maxLuminance = minLuminance = colorLuminance(m_color[0]); - *start = minColor; - *end = maxColor; + for(uint i = 1; i < 16; i++) + { + uint luminance = colorLuminance(m_color[i]); + + if (luminance > maxLuminance) { + maxLuminance = luminance; + maxColor = m_color[i]; + } + else if (luminance < minLuminance) { + minLuminance = luminance; + minColor = m_color[i]; + } + } + + *start = minColor; + *end = maxColor; } /// Get color range based on the bounding box. 
void ColorBlock::boundsRange(Color32 * start, Color32 * end) const { - nvDebugCheck(start != NULL); - nvDebugCheck(end != NULL); + nvDebugCheck(start != NULL); + nvDebugCheck(end != NULL); - Color32 minColor(255, 255, 255); - Color32 maxColor(0, 0, 0); + Color32 minColor(255, 255, 255); + Color32 maxColor(0, 0, 0); - for(uint i = 0; i < 16; i++) - { - if (m_color[i].r < minColor.r) { minColor.r = m_color[i].r; } - if (m_color[i].g < minColor.g) { minColor.g = m_color[i].g; } - if (m_color[i].b < minColor.b) { minColor.b = m_color[i].b; } - if (m_color[i].r > maxColor.r) { maxColor.r = m_color[i].r; } - if (m_color[i].g > maxColor.g) { maxColor.g = m_color[i].g; } - if (m_color[i].b > maxColor.b) { maxColor.b = m_color[i].b; } - } - - // Offset range by 1/16 of the extents - Color32 inset; - inset.r = (maxColor.r - minColor.r) >> 4; - inset.g = (maxColor.g - minColor.g) >> 4; - inset.b = (maxColor.b - minColor.b) >> 4; - - minColor.r = (minColor.r + inset.r <= 255) ? minColor.r + inset.r : 255; - minColor.g = (minColor.g + inset.g <= 255) ? minColor.g + inset.g : 255; - minColor.b = (minColor.b + inset.b <= 255) ? minColor.b + inset.b : 255; - - maxColor.r = (maxColor.r >= inset.r) ? maxColor.r - inset.r : 0; - maxColor.g = (maxColor.g >= inset.g) ? maxColor.g - inset.g : 0; - maxColor.b = (maxColor.b >= inset.b) ? maxColor.b - inset.b : 0; + for(uint i = 0; i < 16; i++) + { + if (m_color[i].r < minColor.r) { minColor.r = m_color[i].r; } + if (m_color[i].g < minColor.g) { minColor.g = m_color[i].g; } + if (m_color[i].b < minColor.b) { minColor.b = m_color[i].b; } + if (m_color[i].r > maxColor.r) { maxColor.r = m_color[i].r; } + if (m_color[i].g > maxColor.g) { maxColor.g = m_color[i].g; } + if (m_color[i].b > maxColor.b) { maxColor.b = m_color[i].b; } + } + + // Offset range by 1/16 of the extents + Color32 inset; + inset.r = (maxColor.r - minColor.r) >> 4; + inset.g = (maxColor.g - minColor.g) >> 4; + inset.b = (maxColor.b - minColor.b) >> 4; + + minColor.r = (minColor.r + inset.r <= 255) ? minColor.r + inset.r : 255; + minColor.g = (minColor.g + inset.g <= 255) ? minColor.g + inset.g : 255; + minColor.b = (minColor.b + inset.b <= 255) ? minColor.b + inset.b : 255; + + maxColor.r = (maxColor.r >= inset.r) ? maxColor.r - inset.r : 0; + maxColor.g = (maxColor.g >= inset.g) ? maxColor.g - inset.g : 0; + maxColor.b = (maxColor.b >= inset.b) ? maxColor.b - inset.b : 0; - *start = minColor; - *end = maxColor; + *start = minColor; + *end = maxColor; } /// Get color range based on the bounding box. 
void ColorBlock::boundsRangeAlpha(Color32 * start, Color32 * end) const { - nvDebugCheck(start != NULL); - nvDebugCheck(end != NULL); + nvDebugCheck(start != NULL); + nvDebugCheck(end != NULL); - Color32 minColor(255, 255, 255, 255); - Color32 maxColor(0, 0, 0, 0); + Color32 minColor(255, 255, 255, 255); + Color32 maxColor(0, 0, 0, 0); - for(uint i = 0; i < 16; i++) - { - if (m_color[i].r < minColor.r) { minColor.r = m_color[i].r; } - if (m_color[i].g < minColor.g) { minColor.g = m_color[i].g; } - if (m_color[i].b < minColor.b) { minColor.b = m_color[i].b; } - if (m_color[i].a < minColor.a) { minColor.a = m_color[i].a; } - if (m_color[i].r > maxColor.r) { maxColor.r = m_color[i].r; } - if (m_color[i].g > maxColor.g) { maxColor.g = m_color[i].g; } - if (m_color[i].b > maxColor.b) { maxColor.b = m_color[i].b; } - if (m_color[i].a > maxColor.a) { maxColor.a = m_color[i].a; } - } - - // Offset range by 1/16 of the extents - Color32 inset; - inset.r = (maxColor.r - minColor.r) >> 4; - inset.g = (maxColor.g - minColor.g) >> 4; - inset.b = (maxColor.b - minColor.b) >> 4; - inset.a = (maxColor.a - minColor.a) >> 4; - - minColor.r = (minColor.r + inset.r <= 255) ? minColor.r + inset.r : 255; - minColor.g = (minColor.g + inset.g <= 255) ? minColor.g + inset.g : 255; - minColor.b = (minColor.b + inset.b <= 255) ? minColor.b + inset.b : 255; - minColor.a = (minColor.a + inset.a <= 255) ? minColor.a + inset.a : 255; - - maxColor.r = (maxColor.r >= inset.r) ? maxColor.r - inset.r : 0; - maxColor.g = (maxColor.g >= inset.g) ? maxColor.g - inset.g : 0; - maxColor.b = (maxColor.b >= inset.b) ? maxColor.b - inset.b : 0; - maxColor.a = (maxColor.a >= inset.a) ? maxColor.a - inset.a : 0; - - *start = minColor; - *end = maxColor; -} + for(uint i = 0; i < 16; i++) + { + if (m_color[i].r < minColor.r) { minColor.r = m_color[i].r; } + if (m_color[i].g < minColor.g) { minColor.g = m_color[i].g; } + if (m_color[i].b < minColor.b) { minColor.b = m_color[i].b; } + if (m_color[i].a < minColor.a) { minColor.a = m_color[i].a; } + if (m_color[i].r > maxColor.r) { maxColor.r = m_color[i].r; } + if (m_color[i].g > maxColor.g) { maxColor.g = m_color[i].g; } + if (m_color[i].b > maxColor.b) { maxColor.b = m_color[i].b; } + if (m_color[i].a > maxColor.a) { maxColor.a = m_color[i].a; } + } + + // Offset range by 1/16 of the extents + Color32 inset; + inset.r = (maxColor.r - minColor.r) >> 4; + inset.g = (maxColor.g - minColor.g) >> 4; + inset.b = (maxColor.b - minColor.b) >> 4; + inset.a = (maxColor.a - minColor.a) >> 4; + + minColor.r = (minColor.r + inset.r <= 255) ? minColor.r + inset.r : 255; + minColor.g = (minColor.g + inset.g <= 255) ? minColor.g + inset.g : 255; + minColor.b = (minColor.b + inset.b <= 255) ? minColor.b + inset.b : 255; + minColor.a = (minColor.a + inset.a <= 255) ? minColor.a + inset.a : 255; + + maxColor.r = (maxColor.r >= inset.r) ? maxColor.r - inset.r : 0; + maxColor.g = (maxColor.g >= inset.g) ? maxColor.g - inset.g : 0; + maxColor.b = (maxColor.b >= inset.b) ? maxColor.b - inset.b : 0; + maxColor.a = (maxColor.a >= inset.a) ? maxColor.a - inset.a : 0; + *start = minColor; + *end = maxColor; +} +#endif -/// Sort colors by abosolute value in their 16 bit representation. +/*/// Sort colors by abosolute value in their 16 bit representation. void ColorBlock::sortColorsByAbsoluteValue() { - // Dummy selection sort. 
- for( uint a = 0; a < 16; a++ ) { - uint max = a; - Color16 cmax(m_color[a]); - - for( uint b = a+1; b < 16; b++ ) { - Color16 cb(m_color[b]); - - if( cb.u > cmax.u ) { - max = b; - cmax = cb; - } - } - swap( m_color[a], m_color[max] ); - } -} + // Dummy selection sort. + for( uint a = 0; a < 16; a++ ) { + uint max = a; + Color16 cmax(m_color[a]); + + for( uint b = a+1; b < 16; b++ ) { + Color16 cb(m_color[b]); + + if( cb.u > cmax.u ) { + max = b; + cmax = cb; + } + } + swap( m_color[a], m_color[max] ); + } +}*/ -/// Find extreme colors in the given axis. +/*/// Find extreme colors in the given axis. void ColorBlock::computeRange(Vector3::Arg axis, Color32 * start, Color32 * end) const { - nvDebugCheck(start != NULL); - nvDebugCheck(end != NULL); - - int mini, maxi; - mini = maxi = 0; - - float min, max; - min = max = dot(Vector3(m_color[0].r, m_color[0].g, m_color[0].b), axis); - - for(uint i = 1; i < 16; i++) - { - const Vector3 vec(m_color[i].r, m_color[i].g, m_color[i].b); - - float val = dot(vec, axis); - if( val < min ) { - mini = i; - min = val; - } - else if( val > max ) { - maxi = i; - max = val; - } - } - - *start = m_color[mini]; - *end = m_color[maxi]; -} + nvDebugCheck(start != NULL); + nvDebugCheck(end != NULL); + + int mini, maxi; + mini = maxi = 0; + + float min, max; + min = max = dot(Vector3(m_color[0].r, m_color[0].g, m_color[0].b), axis); + for(uint i = 1; i < 16; i++) + { + const Vector3 vec(m_color[i].r, m_color[i].g, m_color[i].b); + + float val = dot(vec, axis); + if( val < min ) { + mini = i; + min = val; + } + else if( val > max ) { + maxi = i; + max = val; + } + } + + *start = m_color[mini]; + *end = m_color[maxi]; +}*/ -/// Sort colors in the given axis. + +/*/// Sort colors in the given axis. void ColorBlock::sortColors(const Vector3 & axis) { - float luma_array[16]; - - for(uint i = 0; i < 16; i++) { - const Vector3 vec(m_color[i].r, m_color[i].g, m_color[i].b); - luma_array[i] = dot(vec, axis); - } - - // Dummy selection sort. - for( uint a = 0; a < 16; a++ ) { - uint min = a; - for( uint b = a+1; b < 16; b++ ) { - if( luma_array[b] < luma_array[min] ) { - min = b; - } - } - swap( luma_array[a], luma_array[min] ); - swap( m_color[a], m_color[min] ); - } -} + float luma_array[16]; + for(uint i = 0; i < 16; i++) { + const Vector3 vec(m_color[i].r, m_color[i].g, m_color[i].b); + luma_array[i] = dot(vec, axis); + } + + // Dummy selection sort. + for( uint a = 0; a < 16; a++ ) { + uint min = a; + for( uint b = a+1; b < 16; b++ ) { + if( luma_array[b] < luma_array[min] ) { + min = b; + } + } + swap( luma_array[a], luma_array[min] ); + swap( m_color[a], m_color[min] ); + } +}*/ -/// Get the volume of the color block. + +/*/// Get the volume of the color block. 
float ColorBlock::volume() const { - Box bounds; - bounds.clearBounds(); - - for(int i = 0; i < 16; i++) { - const Vector3 point(m_color[i].r, m_color[i].g, m_color[i].b); - bounds.addPointToBounds(point); - } - - return bounds.volume(); + Box bounds; + bounds.clearBounds(); + + for(int i = 0; i < 16; i++) { + const Vector3 point(m_color[i].r, m_color[i].g, m_color[i].b); + bounds.addPointToBounds(point); + } + + return bounds.volume(); +}*/ + +#if 0 +void ColorSet::allocate(uint w, uint h) +{ + nvDebugCheck(w <= 4 && h <= 4); + + this->colorCount = w * h; + this->indexCount = 16; + this->w = 4; + this->h = 4; + + //colors = new Vector4[colorCount]; + //weights = new float[colorCount]; + //indices = new int[indexCount]; +} + +// Allocate 4x4 block and fill with +void ColorSet::setColors(const float * data, uint img_w, uint img_h, uint img_x, uint img_y) +{ + nvDebugCheck(img_x < img_w && img_y < img_h); + + const uint block_w = min(4U, img_w - img_x); + const uint block_h = min(4U, img_h - img_y); + nvDebugCheck(block_w != 0 && block_h != 0); + + allocate(block_w, block_h); + + const float * r = data + img_w * img_h * 0; + const float * g = data + img_w * img_h * 1; + const float * b = data + img_w * img_h * 2; + const float * a = data + img_w * img_h * 3; + + // Set colors. + for (uint y = 0, i = 0; y < block_h; y++) + { + for (uint x = 0; x < block_w; x++, i++) + { + uint idx = x + img_x + (y + img_y) * img_w; + colors[i].x = r[idx]; + colors[i].y = g[idx]; + colors[i].z = b[idx]; + colors[i].w = a[idx]; + } + } + + // Set default indices. + for (uint y = 0, i = 0; y < 4; y++) + { + for (uint x = 0; x < 4; x++) + { + if (x < block_w && y < block_h) { + indices[y*4+x] = i++; + } + else { + indices[y*4+x] = -1; + } + } + } } +void ColorSet::setColors(const Vector3 colors[16], const float weights[16]) +{ + +} + +void ColorSet::setColors(const Vector4 colors[16], const float weights[16]) +{ + +} + + + +void ColorSet::setAlphaWeights() +{ + for (uint i = 0; i < colorCount; i++) + { + //weights[i] = max(colors[i].w, 0.001f); // Avoid division by zero. + weights[i] = max(colors[i].w, 0.0f); + } +} + +void ColorSet::setUniformWeights() +{ + for (uint i = 0; i < colorCount; i++) + { + weights[i] = 1.0f; + } +} + + +// @@ Handle complex blocks (not 4x4). +void ColorSet::createMinimalSet(bool ignoreTransparent) +{ + nvDebugCheck(indexCount == 16); + nvDebugCheck(colorCount <= 16); + + Vector4 C[16]; + float W[16]; + memcpy(C, colors, sizeof(Vector4)*colorCount); + memcpy(W, weights, sizeof(float)*colorCount); + + uint n = 0; + for (uint i = 0; i < indexCount; i++) + { + if (indices[i] < 0) { + continue; + } + + Vector4 ci = C[indices[i]]; + float wi = W[indices[i]]; + + if (ignoreTransparent && wi == 0) { + indices[i] = -1; + continue; + } + + // Find matching color. + uint j; + for (j = 0; j < n; j++) { + bool colorMatch = equal(colors[j].x, ci.x) && equal(colors[j].y, ci.y) && equal(colors[j].z, ci.z); + //bool alphaMatch = equal(colors[j].w, ci.w); + + if (colorMatch) { + weights[j] += wi; + indices[i] = j; + break; + } + } + + // No match found. Add new color. + if (j == n) { + colors[n] = ci; + weights[n] = wi; + indices[i] = n; + n++; + } + } + //nvDebugCheck(n != 0); // Fully transparent blocks are OK. + + for (uint i = n; i < colorCount; i++) { + colors[i] = Vector4(0); + weights[i] = 0; + } + + colorCount = n; + + // Avoid empty blocks. 
+ if (colorCount == 0) { + colorCount = 1; + indices[0] = 0; + //colors[0] = Vector4(0); + weights[0] = 1; + } +} + + +// Fill blocks that are smaller than (4,4) by wrapping indices. +void ColorSet::wrapIndices() +{ + for (uint y = h; y < 4; y++) + { + uint base = (y % h) * w; + for (uint x = w; x < 4; x++) + { + indices[y*4+3] = indices[base + (x % w)]; + } + } +} + +bool ColorSet::isSingleColor(bool ignoreAlpha) const +{ + Vector4 v = colors[0]; + if (ignoreAlpha) v.w = 1.0f; + + for (uint i = 1; i < colorCount; i++) + { + Vector4 c = colors[i]; + if (ignoreAlpha) c.w = 1.0f; + + if (v != c) { + return false; + } + } + + return true; +} + + +// 0=r, 1=g, 2=b, 3=a, 4=0xFF, 5=0 +static inline float component(Vector4::Arg c, uint i) +{ + if (i == 0) return c.x; + if (i == 1) return c.y; + if (i == 2) return c.z; + if (i == 3) return c.w; + if (i == 4) return 0xFF; + return 0; +} + +void ColorSet::swizzle(uint x, uint y, uint z, uint w) +{ + for (uint i = 0; i < colorCount; i++) + { + Vector4 c = colors[i]; + colors[i].x = component(c, x); + colors[i].y = component(c, y); + colors[i].z = component(c, z); + colors[i].w = component(c, w); + } +} + +bool ColorSet::hasAlpha() const +{ + for (uint i = 0; i < colorCount; i++) + { + if (colors[i].w != 0.0f) return true; + } + return false; +} +#endif // 0 + + +void AlphaBlock4x4::init(uint8 a) +{ + for (int i = 0; i < 16; i++) { + alpha[i] = a; + weights[i] = 1.0f; + } +} + +void AlphaBlock4x4::init(const ColorBlock & src, uint channel) +{ + nvCheck(channel >= 0 && channel < 4); + + // Colors are in BGRA format. + if (channel == 0) channel = 2; + else if (channel == 2) channel = 0; + + for (int i = 0; i < 16; i++) { + alpha[i] = src.color(i).component[channel]; + weights[i] = 1.0f; + } +} + + + + +/*void AlphaBlock4x4::init(const ColorSet & src, uint channel) +{ + nvCheck(channel >= 0 && channel < 4); + + for (int i = 0; i < 16; i++) { + float f = src.color(i).component[channel]; + alpha[i] = unitFloatToFixed8(f); + weights[i] = 1.0f; + } +} + +void AlphaBlock4x4::initMaxRGB(const ColorSet & src, float threshold) +{ + for (int i = 0; i < 16; i++) { + float x = src.color(i).x; + float y = src.color(i).y; + float z = src.color(i).z; + alpha[i] = unitFloatToFixed8(max(max(x, y), max(z, threshold))); + weights[i] = 1.0f; + } +}*/ + +/*void AlphaBlock4x4::initWeights(const ColorSet & src) +{ + for (int i = 0; i < 16; i++) { + weights[i] = src.weight(i); + } +}*/ Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/ColorSpace.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/ColorSpace.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/ColorSpace.h @@ -0,0 +1,22 @@ +// This code is in the public domain -- jim@tilander.org + +#pragma once +#ifndef NV_IMAGE_COLORSPACE_H +#define NV_IMAGE_COLORSPACE_H + +namespace nv +{ + class Image; + + // Defines simple mappings between different color spaces and encodes them in the + // input image. 
+ namespace ColorSpace + { + void RGBtoYCoCg_R(Image* img); + void YCoCg_RtoRGB(Image* img); + } +} + + + +#endif Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/ColorSpace.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/ColorSpace.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/ColorSpace.cpp @@ -0,0 +1,69 @@ +// This code is in the public domain -- jim@tilander.org + +#include "ColorSpace.h" + +#include "nvimage/Image.h" +#include "nvmath/Color.h" + + +namespace nv +{ + void ColorSpace::RGBtoYCoCg_R(Image* img) + { + const uint w = img->width(); + const uint h = img->height(); + + for( uint y=0; y < h; y++ ) + { + for( uint x=0; x < w; x++ ) + { + Color32 pixel = img->pixel(x, y); + + const int r = pixel.r; + const int g = pixel.g; + const int b = pixel.b; + + const int Co = r - b; + const int t = b + Co/2; + const int Cg = g - t; + const int Y = t + Cg/2; + + // Just saturate the chroma here (we loose out of one bit in each channel) + // this just means that we won't have as high dynamic range. Perhaps a better option + // is to loose the least significant bit instead? + pixel.r = clamp(Co + 128, 0, 255); + pixel.g = clamp(Cg + 128, 0, 255); + pixel.b = 0; + pixel.a = Y; + } + } + } + + void ColorSpace::YCoCg_RtoRGB(Image* img) + { + const uint w = img->width(); + const uint h = img->height(); + + for( uint y=0; y < h; y++ ) + { + for( uint x=0; x < w; x++ ) + { + Color32 pixel = img->pixel(x, y); + + const int Co = (int)pixel.r - 128; + const int Cg = (int)pixel.g - 128; + const int Y = pixel.a; + + const int t = Y - Cg/2; + const int g = Cg + t; + const int b = t - Co/2; + const int r = b + Co; + + pixel.r = r; + pixel.g = g; + pixel.b = b; + pixel.a = 1; + } + } + } +} Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/ConeMap.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/ConeMap.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/ConeMap.h @@ -1,39 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
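The YCoCg_R pair above is a lifting transform: as long as Co and Cg are kept exactly, the inverse recovers r, g and b bit-for-bit regardless of integer rounding, because each step subtracts the same Co/2 and Cg/2 terms it previously added. Only the clamp to [0, 255] in RGBtoYCoCg_R is lossy (the one-bit dynamic range trade-off the in-code comment mentions). A minimal sketch of the round trip on plain ints, without the clamp:

#include <cassert>

// Mirror of the forward/inverse arithmetic above; exact for any 8-bit r, g, b.
void roundTripYCoCgR(int r, int g, int b)
{
    const int Co = r - b;          // forward (RGBtoYCoCg_R)
    const int t  = b + Co / 2;
    const int Cg = g - t;
    const int Y  = t + Cg / 2;

    const int t2 = Y - Cg / 2;     // inverse (YCoCg_RtoRGB)
    const int g2 = Cg + t2;
    const int b2 = t2 - Co / 2;
    const int r2 = b2 + Co;

    assert(r2 == r && g2 == g && b2 == b);
}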
- -#ifndef NV_IMAGE_CONEMAP_H -#define NV_IMAGE_CONEMAP_H - -#include -#include - -namespace nv -{ - class Image; - class FloatImage; - - FloatImage * createConeMap(const Image * img, Vector4::Arg heightWeights); - -} // nv namespace - -#endif // NV_IMAGE_CONEMAP_H Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/ConeMap.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/ConeMap.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/ConeMap.cpp @@ -1,122 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include - -#include - -#include -#include -#include -#include - -using namespace nv; - - -static float processPixel(const FloatImage * img, uint x, uint y) -{ - nvDebugCheck(img != NULL); - - const uint w = img->width(); - const uint h = img->height(); - - float d = img->pixel(x, y, 0); - - float fx0 = (float) x / w; - float fy0 = (float) y / h; - - float best_ratio = INF; - uint best_x = w; - uint best_y = h; - - for (uint yy = 0; yy < h; yy++) - { - for (uint xx = 0; xx < w; xx++) - { - float ch = d - img->pixel(xx, yy, 0); - - if (ch > 0) - { - float dx = float(xx - x); - float dy = float(yy - y); - - float ratio = (dx * dx + dy * dy) / ch; - - if (ratio < best_ratio) - { - best_x = xx; - best_y = yy; - } - } - } - } - - if (best_x != w) - { - nvDebugCheck(best_y !=h); - - float dx = float(best_x - x) / w; - float dy = float(best_y - y) / h; - - float cw = sqrtf(dx*dx + dy*dy); - float ch = d - img->pixel(xx, yy, 0); - - return min(1, sqrtf(cw / ch)); - } - - return 1; -} - - -// Create cone map using the given kernels. 
-FloatImage * createConeMap(const Image * img, Vector4::Arg heightWeights) -{ - nvCheck(img != NULL); - - const uint w = img->width(); - const uint h = img->height(); - - AutoPtr fimage(new FloatImage()); - //fimage->allocate(2, w, h); - fimage->allocate(4, w, h); - - // Compute height and store in red channel: - float * heightChannel = fimage->channel(0); - for(uint i = 0; i < w*h; i++) - { - Vector4 color = toVector4(img->pixel(i)); - heightChannel[i] = dot(color, heightWeights); - } - - // Compute cones: - for(uint y = 0; y < h; y++) - { - for(uint x = 0; x < w; x++) - { - processPixel(fimage, x, y); - } - } - - return fimage.release(); -} - Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/DirectDrawSurface.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/DirectDrawSurface.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/DirectDrawSurface.h @@ -21,134 +21,410 @@ // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // OTHER DEALINGS IN THE SOFTWARE. +#pragma once #ifndef NV_IMAGE_DIRECTDRAWSURFACE_H #define NV_IMAGE_DIRECTDRAWSURFACE_H -#include +#include "nvimage.h" + +#if !defined(MAKEFOURCC) +#define MAKEFOURCC(ch0, ch1, ch2, ch3) \ + (uint(uint8(ch0)) | (uint(uint8(ch1)) << 8) | \ + (uint(uint8(ch2)) << 16) | (uint(uint8(ch3)) << 24 )) +#endif namespace nv { - class Image; - class Stream; - struct ColorBlock; - - struct NVIMAGE_CLASS DDSPixelFormat - { - uint size; - uint flags; - uint fourcc; - uint bitcount; - uint rmask; - uint gmask; - uint bmask; - uint amask; - }; - - struct NVIMAGE_CLASS DDSCaps - { - uint caps1; - uint caps2; - uint caps3; - uint caps4; - }; - - /// DDS file header for DX10. - struct NVIMAGE_CLASS DDSHeader10 - { - uint dxgiFormat; - uint resourceDimension; - uint miscFlag; - uint arraySize; - uint reserved; - }; - - /// DDS file header. - struct NVIMAGE_CLASS DDSHeader - { - uint fourcc; - uint size; - uint flags; - uint height; - uint width; - uint pitch; - uint depth; - uint mipmapcount; - uint reserved[11]; - DDSPixelFormat pf; - DDSCaps caps; - uint notused; - DDSHeader10 header10; - - - // Helper methods. - DDSHeader(); - - void setWidth(uint w); - void setHeight(uint h); - void setDepth(uint d); - void setMipmapCount(uint count); - void setTexture2D(); - void setTexture3D(); - void setTextureCube(); - void setLinearSize(uint size); - void setPitch(uint pitch); - void setFourCC(uint8 c0, uint8 c1, uint8 c2, uint8 c3); - void setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask); - void setDX10Format(uint format); - void setNormalFlag(bool b); - - void swapBytes(); - - bool hasDX10Header() const; - }; - - NVIMAGE_API Stream & operator<< (Stream & s, DDSHeader & header); - - - /// DirectDraw Surface. 
(DDS) - class NVIMAGE_CLASS DirectDrawSurface - { - public: - DirectDrawSurface(const char * file); - ~DirectDrawSurface(); - - bool isValid() const; - bool isSupported() const; - - uint mipmapCount() const; - uint width() const; - uint height() const; - uint depth() const; - bool isTexture1D() const; - bool isTexture2D() const; - bool isTexture3D() const; - bool isTextureCube() const; - - void setNormalFlag(bool b); - - void mipmap(Image * img, uint f, uint m); - // void mipmap(FloatImage * img, uint f, uint m); - - void printInfo() const; - - private: - - uint blockSize() const; - uint faceSize() const; - uint mipmapSize(uint m) const; - - uint offset(uint f, uint m); - - void readLinearImage(Image * img); - void readBlockImage(Image * img); - void readBlock(ColorBlock * rgba); - - - private: - Stream * const stream; - DDSHeader header; - DDSHeader10 header10; - }; + class Image; + class Stream; + struct ColorBlock; + + enum DDPF + { + DDPF_ALPHAPIXELS = 0x00000001U, + DDPF_ALPHA = 0x00000002U, + DDPF_FOURCC = 0x00000004U, + DDPF_RGB = 0x00000040U, + DDPF_PALETTEINDEXED1 = 0x00000800U, + DDPF_PALETTEINDEXED2 = 0x00001000U, + DDPF_PALETTEINDEXED4 = 0x00000008U, + DDPF_PALETTEINDEXED8 = 0x00000020U, + DDPF_LUMINANCE = 0x00020000U, + DDPF_ALPHAPREMULT = 0x00008000U, + + // Custom NVTT flags. + DDPF_NORMAL = 0x80000000U, + DDPF_SRGB = 0x40000000U, + }; + + + enum D3DFORMAT + { + // 32 bit RGB formats. + D3DFMT_R8G8B8 = 20, + D3DFMT_A8R8G8B8 = 21, + D3DFMT_X8R8G8B8 = 22, + D3DFMT_R5G6B5 = 23, + D3DFMT_X1R5G5B5 = 24, + D3DFMT_A1R5G5B5 = 25, + D3DFMT_A4R4G4B4 = 26, + D3DFMT_R3G3B2 = 27, + D3DFMT_A8 = 28, + D3DFMT_A8R3G3B2 = 29, + D3DFMT_X4R4G4B4 = 30, + D3DFMT_A2B10G10R10 = 31, + D3DFMT_A8B8G8R8 = 32, + D3DFMT_X8B8G8R8 = 33, + D3DFMT_G16R16 = 34, + D3DFMT_A2R10G10B10 = 35, + + D3DFMT_A16B16G16R16 = 36, + + // Palette formats. + D3DFMT_A8P8 = 40, + D3DFMT_P8 = 41, + + // Luminance formats. + D3DFMT_L8 = 50, + D3DFMT_A8L8 = 51, + D3DFMT_A4L4 = 52, + D3DFMT_L16 = 81, + + // Floating point formats + D3DFMT_R16F = 111, + D3DFMT_G16R16F = 112, + D3DFMT_A16B16G16R16F = 113, + D3DFMT_R32F = 114, + D3DFMT_G32R32F = 115, + D3DFMT_A32B32G32R32F = 116, + }; + + enum FOURCC + { + FOURCC_NVTT = MAKEFOURCC('N', 'V', 'T', 'T'), + FOURCC_DDS = MAKEFOURCC('D', 'D', 'S', ' '), + FOURCC_DXT1 = MAKEFOURCC('D', 'X', 'T', '1'), + FOURCC_DXT2 = MAKEFOURCC('D', 'X', 'T', '2'), + FOURCC_DXT3 = MAKEFOURCC('D', 'X', 'T', '3'), + FOURCC_DXT4 = MAKEFOURCC('D', 'X', 'T', '4'), + FOURCC_DXT5 = MAKEFOURCC('D', 'X', 'T', '5'), + FOURCC_RXGB = MAKEFOURCC('R', 'X', 'G', 'B'), + FOURCC_ATI1 = MAKEFOURCC('A', 'T', 'I', '1'), + FOURCC_ATI2 = MAKEFOURCC('A', 'T', 'I', '2'), + FOURCC_A2XY = MAKEFOURCC('A', '2', 'X', 'Y'), + FOURCC_DX10 = MAKEFOURCC('D', 'X', '1', '0'), + FOURCC_UVER = MAKEFOURCC('U', 'V', 'E', 'R'), + }; + + + // D3D1x resource dimensions. + enum DDS_DIMENSION // D3D10_RESOURCE_DIMENSION + { + DDS_DIMENSION_UNKNOWN = 0, + DDS_DIMENSION_BUFFER = 1, + DDS_DIMENSION_TEXTURE1D = 2, + DDS_DIMENSION_TEXTURE2D = 3, + DDS_DIMENSION_TEXTURE3D = 4, + }; + + enum DDS_MISC_FLAG + { + DDS_MISC_TEXTURECUBE = 0x4, + }; + + // DXGI formats. 
+ enum DXGI_FORMAT + { + DXGI_FORMAT_UNKNOWN = 0, + + DXGI_FORMAT_R32G32B32A32_TYPELESS = 1, + DXGI_FORMAT_R32G32B32A32_FLOAT = 2, + DXGI_FORMAT_R32G32B32A32_UINT = 3, + DXGI_FORMAT_R32G32B32A32_SINT = 4, + + DXGI_FORMAT_R32G32B32_TYPELESS = 5, + DXGI_FORMAT_R32G32B32_FLOAT = 6, + DXGI_FORMAT_R32G32B32_UINT = 7, + DXGI_FORMAT_R32G32B32_SINT = 8, + + DXGI_FORMAT_R16G16B16A16_TYPELESS = 9, + DXGI_FORMAT_R16G16B16A16_FLOAT = 10, + DXGI_FORMAT_R16G16B16A16_UNORM = 11, + DXGI_FORMAT_R16G16B16A16_UINT = 12, + DXGI_FORMAT_R16G16B16A16_SNORM = 13, + DXGI_FORMAT_R16G16B16A16_SINT = 14, + + DXGI_FORMAT_R32G32_TYPELESS = 15, + DXGI_FORMAT_R32G32_FLOAT = 16, + DXGI_FORMAT_R32G32_UINT = 17, + DXGI_FORMAT_R32G32_SINT = 18, + + DXGI_FORMAT_R32G8X24_TYPELESS = 19, + DXGI_FORMAT_D32_FLOAT_S8X24_UINT = 20, + DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS = 21, + DXGI_FORMAT_X32_TYPELESS_G8X24_UINT = 22, + + DXGI_FORMAT_R10G10B10A2_TYPELESS = 23, + DXGI_FORMAT_R10G10B10A2_UNORM = 24, + DXGI_FORMAT_R10G10B10A2_UINT = 25, + + DXGI_FORMAT_R11G11B10_FLOAT = 26, + + DXGI_FORMAT_R8G8B8A8_TYPELESS = 27, + DXGI_FORMAT_R8G8B8A8_UNORM = 28, + DXGI_FORMAT_R8G8B8A8_UNORM_SRGB = 29, + DXGI_FORMAT_R8G8B8A8_UINT = 30, + DXGI_FORMAT_R8G8B8A8_SNORM = 31, + DXGI_FORMAT_R8G8B8A8_SINT = 32, + + DXGI_FORMAT_R16G16_TYPELESS = 33, + DXGI_FORMAT_R16G16_FLOAT = 34, + DXGI_FORMAT_R16G16_UNORM = 35, + DXGI_FORMAT_R16G16_UINT = 36, + DXGI_FORMAT_R16G16_SNORM = 37, + DXGI_FORMAT_R16G16_SINT = 38, + + DXGI_FORMAT_R32_TYPELESS = 39, + DXGI_FORMAT_D32_FLOAT = 40, + DXGI_FORMAT_R32_FLOAT = 41, + DXGI_FORMAT_R32_UINT = 42, + DXGI_FORMAT_R32_SINT = 43, + + DXGI_FORMAT_R24G8_TYPELESS = 44, + DXGI_FORMAT_D24_UNORM_S8_UINT = 45, + DXGI_FORMAT_R24_UNORM_X8_TYPELESS = 46, + DXGI_FORMAT_X24_TYPELESS_G8_UINT = 47, + + DXGI_FORMAT_R8G8_TYPELESS = 48, + DXGI_FORMAT_R8G8_UNORM = 49, + DXGI_FORMAT_R8G8_UINT = 50, + DXGI_FORMAT_R8G8_SNORM = 51, + DXGI_FORMAT_R8G8_SINT = 52, + + DXGI_FORMAT_R16_TYPELESS = 53, + DXGI_FORMAT_R16_FLOAT = 54, + DXGI_FORMAT_D16_UNORM = 55, + DXGI_FORMAT_R16_UNORM = 56, + DXGI_FORMAT_R16_UINT = 57, + DXGI_FORMAT_R16_SNORM = 58, + DXGI_FORMAT_R16_SINT = 59, + + DXGI_FORMAT_R8_TYPELESS = 60, + DXGI_FORMAT_R8_UNORM = 61, + DXGI_FORMAT_R8_UINT = 62, + DXGI_FORMAT_R8_SNORM = 63, + DXGI_FORMAT_R8_SINT = 64, + DXGI_FORMAT_A8_UNORM = 65, + + DXGI_FORMAT_R1_UNORM = 66, + + DXGI_FORMAT_R9G9B9E5_SHAREDEXP = 67, + + DXGI_FORMAT_R8G8_B8G8_UNORM = 68, + DXGI_FORMAT_G8R8_G8B8_UNORM = 69, + + DXGI_FORMAT_BC1_TYPELESS = 70, + DXGI_FORMAT_BC1_UNORM = 71, + DXGI_FORMAT_BC1_UNORM_SRGB = 72, + + DXGI_FORMAT_BC2_TYPELESS = 73, + DXGI_FORMAT_BC2_UNORM = 74, + DXGI_FORMAT_BC2_UNORM_SRGB = 75, + + DXGI_FORMAT_BC3_TYPELESS = 76, + DXGI_FORMAT_BC3_UNORM = 77, + DXGI_FORMAT_BC3_UNORM_SRGB = 78, + + DXGI_FORMAT_BC4_TYPELESS = 79, + DXGI_FORMAT_BC4_UNORM = 80, + DXGI_FORMAT_BC4_SNORM = 81, + + DXGI_FORMAT_BC5_TYPELESS = 82, + DXGI_FORMAT_BC5_UNORM = 83, + DXGI_FORMAT_BC5_SNORM = 84, + + DXGI_FORMAT_B5G6R5_UNORM = 85, + DXGI_FORMAT_B5G5R5A1_UNORM = 86, + DXGI_FORMAT_B8G8R8A8_UNORM = 87, + DXGI_FORMAT_B8G8R8X8_UNORM = 88, + + DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM = 89, + DXGI_FORMAT_B8G8R8A8_TYPELESS = 90, + DXGI_FORMAT_B8G8R8A8_UNORM_SRGB = 91, + DXGI_FORMAT_B8G8R8X8_TYPELESS = 92, + DXGI_FORMAT_B8G8R8X8_UNORM_SRGB = 93, + + DXGI_FORMAT_BC6H_TYPELESS = 94, + DXGI_FORMAT_BC6H_UF16 = 95, + DXGI_FORMAT_BC6H_SF16 = 96, + + DXGI_FORMAT_BC7_TYPELESS = 97, + DXGI_FORMAT_BC7_UNORM = 98, + DXGI_FORMAT_BC7_UNORM_SRGB = 99, + }; + + NVIMAGE_API extern uint findD3D9Format(uint 
bitcount, uint rmask, uint gmask, uint bmask, uint amask); + + NVIMAGE_API extern uint findDXGIFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask); + + struct RGBAPixelFormat + { + uint bitcount; + uint rmask; + uint gmask; + uint bmask; + uint amask; + }; + + extern const RGBAPixelFormat *findDXGIPixelFormat(uint dxgiFormat); + + struct NVIMAGE_CLASS DDSPixelFormat + { + uint size; + uint flags; + uint fourcc; + uint bitcount; + uint rmask; + uint gmask; + uint bmask; + uint amask; + }; + + struct NVIMAGE_CLASS DDSCaps + { + uint caps1; + uint caps2; + uint caps3; + uint caps4; + }; + + /// DDS file header for DX10. + struct NVIMAGE_CLASS DDSHeader10 + { + uint dxgiFormat; + uint resourceDimension; + uint miscFlag; + uint arraySize; + uint reserved; + }; + + /// DDS file header. + struct NVIMAGE_CLASS DDSHeader + { + uint fourcc; + uint size; + uint flags; + uint height; + uint width; + uint pitch; + uint depth; + uint mipmapcount; + uint reserved[11]; + DDSPixelFormat pf; + DDSCaps caps; + uint notused; + DDSHeader10 header10; + + + // Helper methods. + DDSHeader(); + + void setWidth(uint w); + void setHeight(uint h); + void setDepth(uint d); + void setMipmapCount(uint count); + void setTexture2D(); + void setTexture3D(); + void setTextureCube(); + void setTextureArray(int imageCount); + void setLinearSize(uint size); + void setPitch(uint pitch); + void setFourCC(uint8 c0, uint8 c1, uint8 c2, uint8 c3); + void setFormatCode(uint code); + void setSwizzleCode(uint8 c0, uint8 c1, uint8 c2, uint8 c3); + void setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask); + void setDX10Format(uint format); + void setNormalFlag(bool b); + void setSrgbFlag(bool b); + void setHasAlphaFlag(bool b); + void setUserVersion(int version); + + void swapBytes(); + + bool hasDX10Header() const; + uint signature() const; + uint toolVersion() const; + uint userVersion() const; + bool isNormalMap() const; + bool isSrgb() const; + bool hasAlpha() const; + uint d3d9Format() const; + uint pixelSize() const; // In bits! + uint blockSize() const; // In bytes! + bool isBlockFormat() const; + }; + + NVIMAGE_API Stream & operator<< (Stream & s, DDSHeader & header); + + + /// DirectDraw Surface. (DDS) + class NVIMAGE_CLASS DirectDrawSurface + { + public: + DirectDrawSurface(); + DirectDrawSurface(const char * file); + DirectDrawSurface(Stream * stream); + ~DirectDrawSurface(); + + bool load(const char * filename); + bool load(Stream * stream); + + bool isValid() const; + bool isSupported() const; + + bool hasAlpha() const; + + uint mipmapCount() const; + uint width() const; + uint height() const; + uint depth() const; + uint arraySize() const; + bool isTexture1D() const; + bool isTexture2D() const; + bool isTexture3D() const; + bool isTextureCube() const; + bool isTextureArray() const; + + void setNormalFlag(bool b); + void setHasAlphaFlag(bool b); + void setUserVersion(int version); + + void mipmap(Image * img, uint f, uint m); + + uint surfaceWidth(uint mipmap) const; + uint surfaceHeight(uint mipmap) const; + uint surfaceDepth(uint mipmap) const; + uint surfaceSize(uint mipmap) const; + bool readSurface(uint face, uint mipmap, void * data, uint size); + + void printInfo() const; + + // Only initialized after loading. 
+ DDSHeader header; + + private: + + uint faceSize() const; + uint offset(uint face, uint mipmap); + + void readLinearImage(Image * img, uint bitcount, uint rmask, uint gmask, uint bmask, uint amask); + void readBlockImage(Image * img); + void readBlock(ColorBlock * rgba); + + + private: + Stream * stream; + }; } // nv namespace Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/DirectDrawSurface.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/DirectDrawSurface.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/DirectDrawSurface.cpp @@ -21,1301 +21,1670 @@ // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // OTHER DEALINGS IN THE SOFTWARE. -#include <nvcore/Debug.h> -#include <nvcore/Containers.h> // max -#include <nvcore/StdStream.h> - -#include <nvimage/DirectDrawSurface.h> -#include <nvimage/ColorBlock.h> -#include <nvimage/Image.h> -#include <nvimage/BlockDXT.h> -#include <nvimage/PixelFormat.h> +#include "DirectDrawSurface.h" +#include "ColorBlock.h" +#include "Image.h" +#include "BlockDXT.h" +#include "PixelFormat.h" + +#include "nvcore/Debug.h" +#include "nvcore/Utils.h" // max +#include "nvcore/StdStream.h" +#include "nvmath/Vector.inl" +#include "nvmath/ftoi.h" #include <string.h> // memset using namespace nv; -#if !defined(MAKEFOURCC) -# define MAKEFOURCC(ch0, ch1, ch2, ch3) \ - (uint(uint8(ch0)) | (uint(uint8(ch1)) << 8) | \ - (uint(uint8(ch2)) << 16) | (uint(uint8(ch3)) << 24 )) -#endif - namespace { - static const uint FOURCC_DDS = MAKEFOURCC('D', 'D', 'S', ' '); - static const uint FOURCC_DXT1 = MAKEFOURCC('D', 'X', 'T', '1'); - static const uint FOURCC_DXT2 = MAKEFOURCC('D', 'X', 'T', '2'); - static const uint FOURCC_DXT3 = MAKEFOURCC('D', 'X', 'T', '3'); - static const uint FOURCC_DXT4 = MAKEFOURCC('D', 'X', 'T', '4'); - static const uint FOURCC_DXT5 = MAKEFOURCC('D', 'X', 'T', '5'); - static const uint FOURCC_RXGB = MAKEFOURCC('R', 'X', 'G', 'B'); - static const uint FOURCC_ATI1 = MAKEFOURCC('A', 'T', 'I', '1'); - static const uint FOURCC_ATI2 = MAKEFOURCC('A', 'T', 'I', '2'); - - static const uint FOURCC_A2XY = MAKEFOURCC('A', '2', 'X', 'Y'); - - static const uint FOURCC_DX10 = MAKEFOURCC('D', 'X', '1', '0'); - - // 32 bit RGB formats. - static const uint D3DFMT_R8G8B8 = 20; - static const uint D3DFMT_A8R8G8B8 = 21; - static const uint D3DFMT_X8R8G8B8 = 22; - static const uint D3DFMT_R5G6B5 = 23; - static const uint D3DFMT_X1R5G5B5 = 24; - static const uint D3DFMT_A1R5G5B5 = 25; - static const uint D3DFMT_A4R4G4B4 = 26; - static const uint D3DFMT_R3G3B2 = 27; - static const uint D3DFMT_A8 = 28; - static const uint D3DFMT_A8R3G3B2 = 29; - static const uint D3DFMT_X4R4G4B4 = 30; - static const uint D3DFMT_A2B10G10R10 = 31; - static const uint D3DFMT_A8B8G8R8 = 32; - static const uint D3DFMT_X8B8G8R8 = 33; - static const uint D3DFMT_G16R16 = 34; - static const uint D3DFMT_A2R10G10B10 = 35; - - static const uint D3DFMT_A16B16G16R16 = 36; - - // Palette formats. - static const uint D3DFMT_A8P8 = 40; - static const uint D3DFMT_P8 = 41; - - // Luminance formats.
- static const uint D3DFMT_L8 = 50; - static const uint D3DFMT_A8L8 = 51; - static const uint D3DFMT_A4L4 = 52; - static const uint D3DFMT_L16 = 81; - - // Floating point formats - static const uint D3DFMT_R16F = 111; - static const uint D3DFMT_G16R16F = 112; - static const uint D3DFMT_A16B16G16R16F = 113; - static const uint D3DFMT_R32F = 114; - static const uint D3DFMT_G32R32F = 115; - static const uint D3DFMT_A32B32G32R32F = 116; - - static const uint DDSD_CAPS = 0x00000001U; - static const uint DDSD_PIXELFORMAT = 0x00001000U; - static const uint DDSD_WIDTH = 0x00000004U; - static const uint DDSD_HEIGHT = 0x00000002U; - static const uint DDSD_PITCH = 0x00000008U; - static const uint DDSD_MIPMAPCOUNT = 0x00020000U; - static const uint DDSD_LINEARSIZE = 0x00080000U; - static const uint DDSD_DEPTH = 0x00800000U; - - static const uint DDSCAPS_COMPLEX = 0x00000008U; - static const uint DDSCAPS_TEXTURE = 0x00001000U; - static const uint DDSCAPS_MIPMAP = 0x00400000U; - static const uint DDSCAPS2_VOLUME = 0x00200000U; - static const uint DDSCAPS2_CUBEMAP = 0x00000200U; - - static const uint DDSCAPS2_CUBEMAP_POSITIVEX = 0x00000400U; - static const uint DDSCAPS2_CUBEMAP_NEGATIVEX = 0x00000800U; - static const uint DDSCAPS2_CUBEMAP_POSITIVEY = 0x00001000U; - static const uint DDSCAPS2_CUBEMAP_NEGATIVEY = 0x00002000U; - static const uint DDSCAPS2_CUBEMAP_POSITIVEZ = 0x00004000U; - static const uint DDSCAPS2_CUBEMAP_NEGATIVEZ = 0x00008000U; - static const uint DDSCAPS2_CUBEMAP_ALL_FACES = 0x0000FC00U; - - static const uint DDPF_ALPHAPIXELS = 0x00000001U; - static const uint DDPF_ALPHA = 0x00000002U; - static const uint DDPF_FOURCC = 0x00000004U; - static const uint DDPF_RGB = 0x00000040U; - static const uint DDPF_PALETTEINDEXED1 = 0x00000800U; - static const uint DDPF_PALETTEINDEXED2 = 0x00001000U; - static const uint DDPF_PALETTEINDEXED4 = 0x00000008U; - static const uint DDPF_PALETTEINDEXED8 = 0x00000020U; - static const uint DDPF_LUMINANCE = 0x00020000U; - static const uint DDPF_ALPHAPREMULT = 0x00008000U; - static const uint DDPF_NORMAL = 0x80000000U; // @@ Custom nv flag. - - // DX10 formats. 
- enum DXGI_FORMAT - { - DXGI_FORMAT_UNKNOWN = 0, - - DXGI_FORMAT_R32G32B32A32_TYPELESS = 1, - DXGI_FORMAT_R32G32B32A32_FLOAT = 2, - DXGI_FORMAT_R32G32B32A32_UINT = 3, - DXGI_FORMAT_R32G32B32A32_SINT = 4, - - DXGI_FORMAT_R32G32B32_TYPELESS = 5, - DXGI_FORMAT_R32G32B32_FLOAT = 6, - DXGI_FORMAT_R32G32B32_UINT = 7, - DXGI_FORMAT_R32G32B32_SINT = 8, - - DXGI_FORMAT_R16G16B16A16_TYPELESS = 9, - DXGI_FORMAT_R16G16B16A16_FLOAT = 10, - DXGI_FORMAT_R16G16B16A16_UNORM = 11, - DXGI_FORMAT_R16G16B16A16_UINT = 12, - DXGI_FORMAT_R16G16B16A16_SNORM = 13, - DXGI_FORMAT_R16G16B16A16_SINT = 14, - - DXGI_FORMAT_R32G32_TYPELESS = 15, - DXGI_FORMAT_R32G32_FLOAT = 16, - DXGI_FORMAT_R32G32_UINT = 17, - DXGI_FORMAT_R32G32_SINT = 18, - - DXGI_FORMAT_R32G8X24_TYPELESS = 19, - DXGI_FORMAT_D32_FLOAT_S8X24_UINT = 20, - DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS = 21, - DXGI_FORMAT_X32_TYPELESS_G8X24_UINT = 22, - - DXGI_FORMAT_R10G10B10A2_TYPELESS = 23, - DXGI_FORMAT_R10G10B10A2_UNORM = 24, - DXGI_FORMAT_R10G10B10A2_UINT = 25, - - DXGI_FORMAT_R11G11B10_FLOAT = 26, - - DXGI_FORMAT_R8G8B8A8_TYPELESS = 27, - DXGI_FORMAT_R8G8B8A8_UNORM = 28, - DXGI_FORMAT_R8G8B8A8_UNORM_SRGB = 29, - DXGI_FORMAT_R8G8B8A8_UINT = 30, - DXGI_FORMAT_R8G8B8A8_SNORM = 31, - DXGI_FORMAT_R8G8B8A8_SINT = 32, - - DXGI_FORMAT_R16G16_TYPELESS = 33, - DXGI_FORMAT_R16G16_FLOAT = 34, - DXGI_FORMAT_R16G16_UNORM = 35, - DXGI_FORMAT_R16G16_UINT = 36, - DXGI_FORMAT_R16G16_SNORM = 37, - DXGI_FORMAT_R16G16_SINT = 38, - - DXGI_FORMAT_R32_TYPELESS = 39, - DXGI_FORMAT_D32_FLOAT = 40, - DXGI_FORMAT_R32_FLOAT = 41, - DXGI_FORMAT_R32_UINT = 42, - DXGI_FORMAT_R32_SINT = 43, - - DXGI_FORMAT_R24G8_TYPELESS = 44, - DXGI_FORMAT_D24_UNORM_S8_UINT = 45, - DXGI_FORMAT_R24_UNORM_X8_TYPELESS = 46, - DXGI_FORMAT_X24_TYPELESS_G8_UINT = 47, - - DXGI_FORMAT_R8G8_TYPELESS = 48, - DXGI_FORMAT_R8G8_UNORM = 49, - DXGI_FORMAT_R8G8_UINT = 50, - DXGI_FORMAT_R8G8_SNORM = 51, - DXGI_FORMAT_R8G8_SINT = 52, - - DXGI_FORMAT_R16_TYPELESS = 53, - DXGI_FORMAT_R16_FLOAT = 54, - DXGI_FORMAT_D16_UNORM = 55, - DXGI_FORMAT_R16_UNORM = 56, - DXGI_FORMAT_R16_UINT = 57, - DXGI_FORMAT_R16_SNORM = 58, - DXGI_FORMAT_R16_SINT = 59, - - DXGI_FORMAT_R8_TYPELESS = 60, - DXGI_FORMAT_R8_UNORM = 61, - DXGI_FORMAT_R8_UINT = 62, - DXGI_FORMAT_R8_SNORM = 63, - DXGI_FORMAT_R8_SINT = 64, - DXGI_FORMAT_A8_UNORM = 65, - - DXGI_FORMAT_R1_UNORM = 66, - - DXGI_FORMAT_R9G9B9E5_SHAREDEXP = 67, - - DXGI_FORMAT_R8G8_B8G8_UNORM = 68, - DXGI_FORMAT_G8R8_G8B8_UNORM = 69, - - DXGI_FORMAT_BC1_TYPELESS = 70, - DXGI_FORMAT_BC1_UNORM = 71, - DXGI_FORMAT_BC1_UNORM_SRGB = 72, - - DXGI_FORMAT_BC2_TYPELESS = 73, - DXGI_FORMAT_BC2_UNORM = 74, - DXGI_FORMAT_BC2_UNORM_SRGB = 75, - - DXGI_FORMAT_BC3_TYPELESS = 76, - DXGI_FORMAT_BC3_UNORM = 77, - DXGI_FORMAT_BC3_UNORM_SRGB = 78, - - DXGI_FORMAT_BC4_TYPELESS = 79, - DXGI_FORMAT_BC4_UNORM = 80, - DXGI_FORMAT_BC4_SNORM = 81, - - DXGI_FORMAT_BC5_TYPELESS = 82, - DXGI_FORMAT_BC5_UNORM = 83, - DXGI_FORMAT_BC5_SNORM = 84, - - DXGI_FORMAT_B5G6R5_UNORM = 85, - DXGI_FORMAT_B5G5R5A1_UNORM = 86, - DXGI_FORMAT_B8G8R8A8_UNORM = 87, - DXGI_FORMAT_B8G8R8X8_UNORM = 88, - }; - - enum D3D10_RESOURCE_DIMENSION - { - D3D10_RESOURCE_DIMENSION_UNKNOWN = 0, - D3D10_RESOURCE_DIMENSION_BUFFER = 1, - D3D10_RESOURCE_DIMENSION_TEXTURE1D = 2, - D3D10_RESOURCE_DIMENSION_TEXTURE2D = 3, - D3D10_RESOURCE_DIMENSION_TEXTURE3D = 4, - }; + + static const uint DDSD_CAPS = 0x00000001U; + static const uint DDSD_PIXELFORMAT = 0x00001000U; + static const uint DDSD_WIDTH = 0x00000004U; + static const uint DDSD_HEIGHT = 0x00000002U; + static 
const uint DDSD_PITCH = 0x00000008U; + static const uint DDSD_MIPMAPCOUNT = 0x00020000U; + static const uint DDSD_LINEARSIZE = 0x00080000U; + static const uint DDSD_DEPTH = 0x00800000U; + + static const uint DDSCAPS_COMPLEX = 0x00000008U; + static const uint DDSCAPS_TEXTURE = 0x00001000U; + static const uint DDSCAPS_MIPMAP = 0x00400000U; + static const uint DDSCAPS2_VOLUME = 0x00200000U; + static const uint DDSCAPS2_CUBEMAP = 0x00000200U; + + static const uint DDSCAPS2_CUBEMAP_POSITIVEX = 0x00000400U; + static const uint DDSCAPS2_CUBEMAP_NEGATIVEX = 0x00000800U; + static const uint DDSCAPS2_CUBEMAP_POSITIVEY = 0x00001000U; + static const uint DDSCAPS2_CUBEMAP_NEGATIVEY = 0x00002000U; + static const uint DDSCAPS2_CUBEMAP_POSITIVEZ = 0x00004000U; + static const uint DDSCAPS2_CUBEMAP_NEGATIVEZ = 0x00008000U; + static const uint DDSCAPS2_CUBEMAP_ALL_FACES = 0x0000FC00U; - const char * getDxgiFormatString(DXGI_FORMAT dxgiFormat) - { + const char * getDxgiFormatString(DXGI_FORMAT dxgiFormat) + { #define CASE(format) case DXGI_FORMAT_##format: return #format - switch(dxgiFormat) - { - CASE(UNKNOWN); - - CASE(R32G32B32A32_TYPELESS); - CASE(R32G32B32A32_FLOAT); - CASE(R32G32B32A32_UINT); - CASE(R32G32B32A32_SINT); - - CASE(R32G32B32_TYPELESS); - CASE(R32G32B32_FLOAT); - CASE(R32G32B32_UINT); - CASE(R32G32B32_SINT); - - CASE(R16G16B16A16_TYPELESS); - CASE(R16G16B16A16_FLOAT); - CASE(R16G16B16A16_UNORM); - CASE(R16G16B16A16_UINT); - CASE(R16G16B16A16_SNORM); - CASE(R16G16B16A16_SINT); - - CASE(R32G32_TYPELESS); - CASE(R32G32_FLOAT); - CASE(R32G32_UINT); - CASE(R32G32_SINT); - - CASE(R32G8X24_TYPELESS); - CASE(D32_FLOAT_S8X24_UINT); - CASE(R32_FLOAT_X8X24_TYPELESS); - CASE(X32_TYPELESS_G8X24_UINT); - - CASE(R10G10B10A2_TYPELESS); - CASE(R10G10B10A2_UNORM); - CASE(R10G10B10A2_UINT); - - CASE(R11G11B10_FLOAT); - - CASE(R8G8B8A8_TYPELESS); - CASE(R8G8B8A8_UNORM); - CASE(R8G8B8A8_UNORM_SRGB); - CASE(R8G8B8A8_UINT); - CASE(R8G8B8A8_SNORM); - CASE(R8G8B8A8_SINT); - - CASE(R16G16_TYPELESS); - CASE(R16G16_FLOAT); - CASE(R16G16_UNORM); - CASE(R16G16_UINT); - CASE(R16G16_SNORM); - CASE(R16G16_SINT); - - CASE(R32_TYPELESS); - CASE(D32_FLOAT); - CASE(R32_FLOAT); - CASE(R32_UINT); - CASE(R32_SINT); - - CASE(R24G8_TYPELESS); - CASE(D24_UNORM_S8_UINT); - CASE(R24_UNORM_X8_TYPELESS); - CASE(X24_TYPELESS_G8_UINT); - - CASE(R8G8_TYPELESS); - CASE(R8G8_UNORM); - CASE(R8G8_UINT); - CASE(R8G8_SNORM); - CASE(R8G8_SINT); - - CASE(R16_TYPELESS); - CASE(R16_FLOAT); - CASE(D16_UNORM); - CASE(R16_UNORM); - CASE(R16_UINT); - CASE(R16_SNORM); - CASE(R16_SINT); - - CASE(R8_TYPELESS); - CASE(R8_UNORM); - CASE(R8_UINT); - CASE(R8_SNORM); - CASE(R8_SINT); - CASE(A8_UNORM); - - CASE(R1_UNORM); - - CASE(R9G9B9E5_SHAREDEXP); - - CASE(R8G8_B8G8_UNORM); - CASE(G8R8_G8B8_UNORM); - - CASE(BC1_TYPELESS); - CASE(BC1_UNORM); - CASE(BC1_UNORM_SRGB); - - CASE(BC2_TYPELESS); - CASE(BC2_UNORM); - CASE(BC2_UNORM_SRGB); - - CASE(BC3_TYPELESS); - CASE(BC3_UNORM); - CASE(BC3_UNORM_SRGB); - - CASE(BC4_TYPELESS); - CASE(BC4_UNORM); - CASE(BC4_SNORM); - - CASE(BC5_TYPELESS); - CASE(BC5_UNORM); - CASE(BC5_SNORM); - - CASE(B5G6R5_UNORM); - CASE(B5G5R5A1_UNORM); - CASE(B8G8R8A8_UNORM); - CASE(B8G8R8X8_UNORM); - - default: - return "UNKNOWN"; - } + switch(dxgiFormat) + { + CASE(UNKNOWN); + + CASE(R32G32B32A32_TYPELESS); + CASE(R32G32B32A32_FLOAT); + CASE(R32G32B32A32_UINT); + CASE(R32G32B32A32_SINT); + + CASE(R32G32B32_TYPELESS); + CASE(R32G32B32_FLOAT); + CASE(R32G32B32_UINT); + CASE(R32G32B32_SINT); + + CASE(R16G16B16A16_TYPELESS); + 
CASE(R16G16B16A16_FLOAT); + CASE(R16G16B16A16_UNORM); + CASE(R16G16B16A16_UINT); + CASE(R16G16B16A16_SNORM); + CASE(R16G16B16A16_SINT); + + CASE(R32G32_TYPELESS); + CASE(R32G32_FLOAT); + CASE(R32G32_UINT); + CASE(R32G32_SINT); + + CASE(R32G8X24_TYPELESS); + CASE(D32_FLOAT_S8X24_UINT); + CASE(R32_FLOAT_X8X24_TYPELESS); + CASE(X32_TYPELESS_G8X24_UINT); + + CASE(R10G10B10A2_TYPELESS); + CASE(R10G10B10A2_UNORM); + CASE(R10G10B10A2_UINT); + + CASE(R11G11B10_FLOAT); + + CASE(R8G8B8A8_TYPELESS); + CASE(R8G8B8A8_UNORM); + CASE(R8G8B8A8_UNORM_SRGB); + CASE(R8G8B8A8_UINT); + CASE(R8G8B8A8_SNORM); + CASE(R8G8B8A8_SINT); + + CASE(R16G16_TYPELESS); + CASE(R16G16_FLOAT); + CASE(R16G16_UNORM); + CASE(R16G16_UINT); + CASE(R16G16_SNORM); + CASE(R16G16_SINT); + + CASE(R32_TYPELESS); + CASE(D32_FLOAT); + CASE(R32_FLOAT); + CASE(R32_UINT); + CASE(R32_SINT); + + CASE(R24G8_TYPELESS); + CASE(D24_UNORM_S8_UINT); + CASE(R24_UNORM_X8_TYPELESS); + CASE(X24_TYPELESS_G8_UINT); + + CASE(R8G8_TYPELESS); + CASE(R8G8_UNORM); + CASE(R8G8_UINT); + CASE(R8G8_SNORM); + CASE(R8G8_SINT); + + CASE(R16_TYPELESS); + CASE(R16_FLOAT); + CASE(D16_UNORM); + CASE(R16_UNORM); + CASE(R16_UINT); + CASE(R16_SNORM); + CASE(R16_SINT); + + CASE(R8_TYPELESS); + CASE(R8_UNORM); + CASE(R8_UINT); + CASE(R8_SNORM); + CASE(R8_SINT); + CASE(A8_UNORM); + + CASE(R1_UNORM); + + CASE(R9G9B9E5_SHAREDEXP); + + CASE(R8G8_B8G8_UNORM); + CASE(G8R8_G8B8_UNORM); + + CASE(BC1_TYPELESS); + CASE(BC1_UNORM); + CASE(BC1_UNORM_SRGB); + + CASE(BC2_TYPELESS); + CASE(BC2_UNORM); + CASE(BC2_UNORM_SRGB); + + CASE(BC3_TYPELESS); + CASE(BC3_UNORM); + CASE(BC3_UNORM_SRGB); + + CASE(BC4_TYPELESS); + CASE(BC4_UNORM); + CASE(BC4_SNORM); + + CASE(BC5_TYPELESS); + CASE(BC5_UNORM); + CASE(BC5_SNORM); + + CASE(B5G6R5_UNORM); + CASE(B5G5R5A1_UNORM); + CASE(B8G8R8A8_UNORM); + CASE(B8G8R8X8_UNORM); + + default: + return "UNKNOWN"; + } #undef CASE - } - - const char * getD3d10ResourceDimensionString(D3D10_RESOURCE_DIMENSION resourceDimension) - { - switch(resourceDimension) - { - default: - case D3D10_RESOURCE_DIMENSION_UNKNOWN: return "UNKNOWN"; - case D3D10_RESOURCE_DIMENSION_BUFFER: return "BUFFER"; - case D3D10_RESOURCE_DIMENSION_TEXTURE1D: return "TEXTURE1D"; - case D3D10_RESOURCE_DIMENSION_TEXTURE2D: return "TEXTURE2D"; - case D3D10_RESOURCE_DIMENSION_TEXTURE3D: return "TEXTURE3D"; - } - } + } + + const char * getD3d10ResourceDimensionString(DDS_DIMENSION resourceDimension) + { + switch(resourceDimension) + { + default: + case DDS_DIMENSION_UNKNOWN: return "UNKNOWN"; + case DDS_DIMENSION_BUFFER: return "BUFFER"; + case DDS_DIMENSION_TEXTURE1D: return "TEXTURE1D"; + case DDS_DIMENSION_TEXTURE2D: return "TEXTURE2D"; + case DDS_DIMENSION_TEXTURE3D: return "TEXTURE3D"; + } + } + + static uint pixelSize(D3DFORMAT format) { + if (format == D3DFMT_R16F) return 8*2; + if (format == D3DFMT_G16R16F) return 8*4; + if (format == D3DFMT_A16B16G16R16F) return 8*8; + if (format == D3DFMT_R32F) return 8*4; + if (format == D3DFMT_G32R32F) return 8*8; + if (format == D3DFMT_A32B32G32R32F) return 8*16; + + if (format == D3DFMT_R8G8B8) return 8*3; + if (format == D3DFMT_A8R8G8B8) return 8*4; + if (format == D3DFMT_X8R8G8B8) return 8*4; + if (format == D3DFMT_R5G6B5) return 8*2; + if (format == D3DFMT_X1R5G5B5) return 8*2; + if (format == D3DFMT_A1R5G5B5) return 8*2; + if (format == D3DFMT_A4R4G4B4) return 8*2; + if (format == D3DFMT_R3G3B2) return 8*1; + if (format == D3DFMT_A8) return 8*1; + if (format == D3DFMT_A8R3G3B2) return 8*2; + if (format == D3DFMT_X4R4G4B4) return 8*2; + if (format == 
D3DFMT_A2B10G10R10) return 8*4; + if (format == D3DFMT_A8B8G8R8) return 8*4; + if (format == D3DFMT_X8B8G8R8) return 8*4; + if (format == D3DFMT_G16R16) return 8*4; + if (format == D3DFMT_A2R10G10B10) return 8*4; + if (format == D3DFMT_A2B10G10R10) return 8*4; + + if (format == D3DFMT_L8) return 8*1; + if (format == D3DFMT_L16) return 8*2; + + return 0; + } + + static uint pixelSize(DXGI_FORMAT format) { + switch(format) { + case DXGI_FORMAT_R32G32B32A32_TYPELESS: + case DXGI_FORMAT_R32G32B32A32_FLOAT: + case DXGI_FORMAT_R32G32B32A32_UINT: + case DXGI_FORMAT_R32G32B32A32_SINT: + return 8*16; + + case DXGI_FORMAT_R32G32B32_TYPELESS: + case DXGI_FORMAT_R32G32B32_FLOAT: + case DXGI_FORMAT_R32G32B32_UINT: + case DXGI_FORMAT_R32G32B32_SINT: + return 8*12; + + case DXGI_FORMAT_R16G16B16A16_TYPELESS: + case DXGI_FORMAT_R16G16B16A16_FLOAT: + case DXGI_FORMAT_R16G16B16A16_UNORM: + case DXGI_FORMAT_R16G16B16A16_UINT: + case DXGI_FORMAT_R16G16B16A16_SNORM: + case DXGI_FORMAT_R16G16B16A16_SINT: + + case DXGI_FORMAT_R32G32_TYPELESS: + case DXGI_FORMAT_R32G32_FLOAT: + case DXGI_FORMAT_R32G32_UINT: + case DXGI_FORMAT_R32G32_SINT: + + case DXGI_FORMAT_R32G8X24_TYPELESS: + case DXGI_FORMAT_D32_FLOAT_S8X24_UINT: + case DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS: + case DXGI_FORMAT_X32_TYPELESS_G8X24_UINT: + return 8*8; + + case DXGI_FORMAT_R10G10B10A2_TYPELESS: + case DXGI_FORMAT_R10G10B10A2_UNORM: + case DXGI_FORMAT_R10G10B10A2_UINT: + + case DXGI_FORMAT_R11G11B10_FLOAT: + + case DXGI_FORMAT_R8G8B8A8_TYPELESS: + case DXGI_FORMAT_R8G8B8A8_UNORM: + case DXGI_FORMAT_R8G8B8A8_UNORM_SRGB: + case DXGI_FORMAT_R8G8B8A8_UINT: + case DXGI_FORMAT_R8G8B8A8_SNORM: + case DXGI_FORMAT_R8G8B8A8_SINT: + + case DXGI_FORMAT_R16G16_TYPELESS: + case DXGI_FORMAT_R16G16_FLOAT: + case DXGI_FORMAT_R16G16_UNORM: + case DXGI_FORMAT_R16G16_UINT: + case DXGI_FORMAT_R16G16_SNORM: + case DXGI_FORMAT_R16G16_SINT: + + case DXGI_FORMAT_R32_TYPELESS: + case DXGI_FORMAT_D32_FLOAT: + case DXGI_FORMAT_R32_FLOAT: + case DXGI_FORMAT_R32_UINT: + case DXGI_FORMAT_R32_SINT: + + case DXGI_FORMAT_R24G8_TYPELESS: + case DXGI_FORMAT_D24_UNORM_S8_UINT: + case DXGI_FORMAT_R24_UNORM_X8_TYPELESS: + case DXGI_FORMAT_X24_TYPELESS_G8_UINT: + return 8*4; + + case DXGI_FORMAT_R8G8_TYPELESS: + case DXGI_FORMAT_R8G8_UNORM: + case DXGI_FORMAT_R8G8_UINT: + case DXGI_FORMAT_R8G8_SNORM: + case DXGI_FORMAT_R8G8_SINT: + + case DXGI_FORMAT_R16_TYPELESS: + case DXGI_FORMAT_R16_FLOAT: + case DXGI_FORMAT_D16_UNORM: + case DXGI_FORMAT_R16_UNORM: + case DXGI_FORMAT_R16_UINT: + case DXGI_FORMAT_R16_SNORM: + case DXGI_FORMAT_R16_SINT: + return 8*2; + + case DXGI_FORMAT_R8_TYPELESS: + case DXGI_FORMAT_R8_UNORM: + case DXGI_FORMAT_R8_UINT: + case DXGI_FORMAT_R8_SNORM: + case DXGI_FORMAT_R8_SINT: + case DXGI_FORMAT_A8_UNORM: + return 8*1; + + case DXGI_FORMAT_R1_UNORM: + return 1; + + case DXGI_FORMAT_R9G9B9E5_SHAREDEXP: + return 8*4; + + case DXGI_FORMAT_R8G8_B8G8_UNORM: + case DXGI_FORMAT_G8R8_G8B8_UNORM: + return 8*4; + + case DXGI_FORMAT_B5G6R5_UNORM: + case DXGI_FORMAT_B5G5R5A1_UNORM: + return 8*2; + + case DXGI_FORMAT_B8G8R8A8_UNORM: + case DXGI_FORMAT_B8G8R8X8_UNORM: + return 8*4; + + case DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM: + case DXGI_FORMAT_B8G8R8A8_TYPELESS: + case DXGI_FORMAT_B8G8R8A8_UNORM_SRGB: + case DXGI_FORMAT_B8G8R8X8_TYPELESS: + case DXGI_FORMAT_B8G8R8X8_UNORM_SRGB: + return 8*4; + + default: + return 0; + } + nvUnreachable(); + } } // namespace namespace nv { - static Stream & operator<< (Stream & s, DDSPixelFormat & pf) - { - 
nvStaticCheck(sizeof(DDSPixelFormat) == 32); - s << pf.size; - s << pf.flags; - s << pf.fourcc; - s << pf.bitcount; - s << pf.rmask; - s << pf.gmask; - s << pf.bmask; - s << pf.amask; - return s; - } - - static Stream & operator<< (Stream & s, DDSCaps & caps) - { - nvStaticCheck(sizeof(DDSCaps) == 16); - s << caps.caps1; - s << caps.caps2; - s << caps.caps3; - s << caps.caps4; - return s; - } - - static Stream & operator<< (Stream & s, DDSHeader10 & header) - { - nvStaticCheck(sizeof(DDSHeader10) == 20); - s << header.dxgiFormat; - s << header.resourceDimension; - s << header.miscFlag; - s << header.arraySize; - s << header.reserved; - return s; - } - - Stream & operator<< (Stream & s, DDSHeader & header) - { - nvStaticCheck(sizeof(DDSHeader) == 148); - s << header.fourcc; - s << header.size; - s << header.flags; - s << header.height; - s << header.width; - s << header.pitch; - s << header.depth; - s << header.mipmapcount; - s.serialize(header.reserved, 11 * sizeof(uint)); - s << header.pf; - s << header.caps; - s << header.notused; - - if (header.hasDX10Header()) - { - s << header.header10; - } - - return s; - } + static Stream & operator<< (Stream & s, DDSPixelFormat & pf) + { + nvStaticCheck(sizeof(DDSPixelFormat) == 32); + s << pf.size; + s << pf.flags; + s << pf.fourcc; + s << pf.bitcount; + s.serialize(&pf.rmask, sizeof(pf.rmask)); + s.serialize(&pf.gmask, sizeof(pf.gmask)); + s.serialize(&pf.bmask, sizeof(pf.bmask)); + s.serialize(&pf.amask, sizeof(pf.amask)); + // s << pf.rmask; + // s << pf.gmask; + // s << pf.bmask; + // s << pf.amask; + return s; + } + + static Stream & operator<< (Stream & s, DDSCaps & caps) + { + nvStaticCheck(sizeof(DDSCaps) == 16); + s << caps.caps1; + s << caps.caps2; + s << caps.caps3; + s << caps.caps4; + return s; + } + + static Stream & operator<< (Stream & s, DDSHeader10 & header) + { + nvStaticCheck(sizeof(DDSHeader10) == 20); + s << header.dxgiFormat; + s << header.resourceDimension; + s << header.miscFlag; + s << header.arraySize; + s << header.reserved; + return s; + } + + Stream & operator<< (Stream & s, DDSHeader & header) + { + nvStaticCheck(sizeof(DDSHeader) == 148); + s << header.fourcc; + s << header.size; + s << header.flags; + s << header.height; + s << header.width; + s << header.pitch; + s << header.depth; + s << header.mipmapcount; + for (int i = 0; i < 11; i++) { + s << header.reserved[i]; + } + s << header.pf; + s << header.caps; + s << header.notused; + + if (header.hasDX10Header()) + { + s << header.header10; + } + + return s; + } } // nv namespace -/* Not used! 
namespace { - struct FormatDescriptor - { - uint format; - uint bitcount; - uint rmask; - uint gmask; - uint bmask; - uint amask; - }; - - static const FormatDescriptor s_d3dFormats[] = - { - { D3DFMT_R8G8B8, 24, 0xFF0000, 0xFF00, 0xFF, 0 }, - { D3DFMT_A8R8G8B8, 32, 0xFF0000, 0xFF00, 0xFF, 0xFF000000 }, // DXGI_FORMAT_B8G8R8A8_UNORM - { D3DFMT_X8R8G8B8, 32, 0xFF0000, 0xFF00, 0xFF, 0 }, // DXGI_FORMAT_B8G8R8X8_UNORM - { D3DFMT_R5G6B5, 16, 0xF800, 0x7E0, 0x1F, 0 }, // DXGI_FORMAT_B5G6R5_UNORM - { D3DFMT_X1R5G5B5, 16, 0x7C00, 0x3E0, 0x1F, 0 }, - { D3DFMT_A1R5G5B5, 16, 0x7C00, 0x3E0, 0x1F, 0x8000 }, // DXGI_FORMAT_B5G5R5A1_UNORM - { D3DFMT_A4R4G4B4, 16, 0xF00, 0xF0, 0xF, 0xF000 }, - { D3DFMT_R3G3B2, 8, 0xE0, 0x1C, 0x3, 0 }, - { D3DFMT_A8, 8, 0, 0, 0, 8 }, // DXGI_FORMAT_A8_UNORM - { D3DFMT_A8R3G3B2, 16, 0xE0, 0x1C, 0x3, 0xFF00 }, - { D3DFMT_X4R4G4B4, 16, 0xF00, 0xF0, 0xF, 0 }, - { D3DFMT_A2B10G10R10, 32, 0x3FF, 0xFFC00, 0x3FF00000, 0xC0000000 }, // DXGI_FORMAT_R10G10B10A2 - { D3DFMT_A8B8G8R8, 32, 0xFF, 0xFF00, 0xFF0000, 0xFF000000 }, // DXGI_FORMAT_R8G8B8A8_UNORM - { D3DFMT_X8B8G8R8, 32, 0xFF, 0xFF00, 0xFF0000, 0 }, - { D3DFMT_G16R16, 32, 0xFFFF, 0xFFFF0000, 0, 0 }, // DXGI_FORMAT_R16G16_UNORM - { D3DFMT_A2R10G10B10, 32, 0x3FF00000, 0xFFC00, 0x3FF, 0xC0000000 }, - - { D3DFMT_L8, 8, 8, 0, 0, 0 }, // DXGI_FORMAT_R8_UNORM - { D3DFMT_L16, 16, 16, 0, 0, 0 }, // DXGI_FORMAT_R16_UNORM - }; - - static const uint s_d3dFormatCount = sizeof(s_d3dFormats) / sizeof(s_d3dFormats[0]); - - static uint findD3D9Format(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask) - { - for (int i = 0; i < s_d3dFormatCount; i++) - { - if (s_d3dFormats[i].bitcount == bitcount && - s_d3dFormats[i].rmask == rmask && - s_d3dFormats[i].gmask == gmask && - s_d3dFormats[i].bmask == bmask && - s_d3dFormats[i].amask == amask) - { - return s_d3dFormats[i].format; - } - } + struct FormatDescriptor + { + uint d3d9Format; + uint dxgiFormat; + RGBAPixelFormat pixelFormat; + }; + + static const FormatDescriptor s_formats[] = + { + { D3DFMT_R8G8B8, DXGI_FORMAT_UNKNOWN, { 24, 0xFF0000, 0xFF00, 0xFF, 0 } }, + { D3DFMT_A8R8G8B8, DXGI_FORMAT_B8G8R8A8_UNORM, { 32, 0xFF0000, 0xFF00, 0xFF, 0xFF000000 } }, + { D3DFMT_X8R8G8B8, DXGI_FORMAT_B8G8R8X8_UNORM, { 32, 0xFF0000, 0xFF00, 0xFF, 0 } }, + { D3DFMT_R5G6B5, DXGI_FORMAT_B5G6R5_UNORM, { 16, 0xF800, 0x7E0, 0x1F, 0 } }, + { D3DFMT_X1R5G5B5, DXGI_FORMAT_UNKNOWN, { 16, 0x7C00, 0x3E0, 0x1F, 0 } }, + { D3DFMT_A1R5G5B5, DXGI_FORMAT_B5G5R5A1_UNORM, { 16, 0x7C00, 0x3E0, 0x1F, 0x8000 } }, + { D3DFMT_A4R4G4B4, DXGI_FORMAT_UNKNOWN, { 16, 0xF00, 0xF0, 0xF, 0xF000 } }, + { D3DFMT_R3G3B2, DXGI_FORMAT_UNKNOWN, { 8, 0xE0, 0x1C, 0x3, 0 } }, + { D3DFMT_A8, DXGI_FORMAT_A8_UNORM, { 8, 0, 0, 0, 8 } }, + { D3DFMT_A8R3G3B2, DXGI_FORMAT_UNKNOWN, { 16, 0xE0, 0x1C, 0x3, 0xFF00 } }, + { D3DFMT_X4R4G4B4, DXGI_FORMAT_UNKNOWN, { 16, 0xF00, 0xF0, 0xF, 0 } }, + { D3DFMT_A2B10G10R10, DXGI_FORMAT_R10G10B10A2_UNORM, { 32, 0x3FF, 0xFFC00, 0x3FF00000, 0xC0000000 } }, + { D3DFMT_A8B8G8R8, DXGI_FORMAT_R8G8B8A8_UNORM, { 32, 0xFF, 0xFF00, 0xFF0000, 0xFF000000 } }, + { D3DFMT_X8B8G8R8, DXGI_FORMAT_UNKNOWN, { 32, 0xFF, 0xFF00, 0xFF0000, 0 } }, + { D3DFMT_G16R16, DXGI_FORMAT_R16G16_UNORM, { 32, 0xFFFF, 0xFFFF0000, 0, 0 } }, + { D3DFMT_A2R10G10B10, DXGI_FORMAT_UNKNOWN, { 32, 0x3FF00000, 0xFFC00, 0x3FF, 0xC0000000 } }, + { D3DFMT_A2B10G10R10, DXGI_FORMAT_UNKNOWN, { 32, 0x3FF, 0xFFC00, 0x3FF00000, 0xC0000000 } }, + + { D3DFMT_L8, DXGI_FORMAT_R8_UNORM , { 8, 0xFF, 0, 0, 0 } }, + { D3DFMT_L16, DXGI_FORMAT_R16_UNORM, { 16, 0xFFFF, 0, 
0, 0 } }, + { D3DFMT_A8L8, DXGI_FORMAT_R8G8_UNORM, { 16, 0xFF, 0, 0, 0xFF00 } }, + }; - return 0; - } + static const uint s_formatCount = NV_ARRAY_SIZE(s_formats); -} // nv namespace -*/ +} // namespace + +NVIMAGE_API uint nv::findD3D9Format(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask) +{ + for (int i = 0; i < s_formatCount; i++) + { + if (s_formats[i].pixelFormat.bitcount == bitcount && + s_formats[i].pixelFormat.rmask == rmask && + s_formats[i].pixelFormat.gmask == gmask && + s_formats[i].pixelFormat.bmask == bmask && + s_formats[i].pixelFormat.amask == amask) + { + return s_formats[i].d3d9Format; + } + } + + return 0; +} + +NVIMAGE_API uint nv::findDXGIFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask) +{ + for (int i = 0; i < s_formatCount; i++) + { + if (s_formats[i].pixelFormat.bitcount == bitcount && + s_formats[i].pixelFormat.rmask == rmask && + s_formats[i].pixelFormat.gmask == gmask && + s_formats[i].pixelFormat.bmask == bmask && + s_formats[i].pixelFormat.amask == amask) + { + return s_formats[i].dxgiFormat; + } + } + + return DXGI_FORMAT_UNKNOWN; +} + +const RGBAPixelFormat *nv::findDXGIPixelFormat(uint dxgiFormat) +{ + for (int i = 0; i < s_formatCount; i++) + { + if (s_formats[i].dxgiFormat == dxgiFormat) { + return &s_formats[i].pixelFormat; + } + } + + return NULL; +} DDSHeader::DDSHeader() { - this->fourcc = FOURCC_DDS; - this->size = 124; - this->flags = (DDSD_CAPS|DDSD_PIXELFORMAT); - this->height = 0; - this->width = 0; - this->pitch = 0; - this->depth = 0; - this->mipmapcount = 0; - memset(this->reserved, 0, sizeof(this->reserved)); - - // Store version information on the reserved header attributes. - this->reserved[9] = MAKEFOURCC('N', 'V', 'T', 'T'); - this->reserved[10] = (2 << 16) | (0 << 8) | (8); // major.minor.revision - - this->pf.size = 32; - this->pf.flags = 0; - this->pf.fourcc = 0; - this->pf.bitcount = 0; - this->pf.rmask = 0; - this->pf.gmask = 0; - this->pf.bmask = 0; - this->pf.amask = 0; - this->caps.caps1 = DDSCAPS_TEXTURE; - this->caps.caps2 = 0; - this->caps.caps3 = 0; - this->caps.caps4 = 0; - this->notused = 0; - - this->header10.dxgiFormat = DXGI_FORMAT_UNKNOWN; - this->header10.resourceDimension = D3D10_RESOURCE_DIMENSION_UNKNOWN; - this->header10.miscFlag = 0; - this->header10.arraySize = 0; - this->header10.reserved = 0; + this->fourcc = FOURCC_DDS; + this->size = 124; + this->flags = (DDSD_CAPS|DDSD_PIXELFORMAT); + this->height = 0; + this->width = 0; + this->pitch = 0; + this->depth = 0; + this->mipmapcount = 0; + memset(this->reserved, 0, sizeof(this->reserved)); + + // Store version information on the reserved header attributes. 
+ this->reserved[9] = FOURCC_NVTT; + this->reserved[10] = (2 << 16) | (1 << 8) | (0); // major.minor.revision + + this->pf.size = 32; + this->pf.flags = 0; + this->pf.fourcc = 0; + this->pf.bitcount = 0; + this->pf.rmask = 0; + this->pf.gmask = 0; + this->pf.bmask = 0; + this->pf.amask = 0; + this->caps.caps1 = DDSCAPS_TEXTURE; + this->caps.caps2 = 0; + this->caps.caps3 = 0; + this->caps.caps4 = 0; + this->notused = 0; + + this->header10.dxgiFormat = DXGI_FORMAT_UNKNOWN; + this->header10.resourceDimension = DDS_DIMENSION_UNKNOWN; + this->header10.miscFlag = 0; + this->header10.arraySize = 0; + this->header10.reserved = 0; } void DDSHeader::setWidth(uint w) { - this->flags |= DDSD_WIDTH; - this->width = w; + this->flags |= DDSD_WIDTH; + this->width = w; } void DDSHeader::setHeight(uint h) { - this->flags |= DDSD_HEIGHT; - this->height = h; + this->flags |= DDSD_HEIGHT; + this->height = h; } void DDSHeader::setDepth(uint d) { - this->flags |= DDSD_DEPTH; - this->height = d; + this->flags |= DDSD_DEPTH; + this->depth = d; } void DDSHeader::setMipmapCount(uint count) { - if (count == 0 || count == 1) - { - this->flags &= ~DDSD_MIPMAPCOUNT; - this->mipmapcount = 0; - - if (this->caps.caps2 == 0) { - this->caps.caps1 = DDSCAPS_TEXTURE; - } - else { - this->caps.caps1 = DDSCAPS_TEXTURE | DDSCAPS_COMPLEX; - } - } - else - { - this->flags |= DDSD_MIPMAPCOUNT; - this->mipmapcount = count; + if (count == 0 || count == 1) + { + this->flags &= ~DDSD_MIPMAPCOUNT; + this->mipmapcount = 1; + + if (this->caps.caps2 == 0) { + this->caps.caps1 = DDSCAPS_TEXTURE; + } + else { + this->caps.caps1 = DDSCAPS_TEXTURE | DDSCAPS_COMPLEX; + } + } + else + { + this->flags |= DDSD_MIPMAPCOUNT; + this->mipmapcount = count; - this->caps.caps1 |= DDSCAPS_COMPLEX | DDSCAPS_MIPMAP; - } + this->caps.caps1 |= DDSCAPS_COMPLEX | DDSCAPS_MIPMAP; + } } void DDSHeader::setTexture2D() { - this->header10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE2D; + this->header10.resourceDimension = DDS_DIMENSION_TEXTURE2D; + this->header10.miscFlag = 0; + this->header10.arraySize = 1; } void DDSHeader::setTexture3D() { - this->caps.caps2 = DDSCAPS2_VOLUME; - - this->header10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE3D; + this->caps.caps2 = DDSCAPS2_VOLUME; + + this->header10.resourceDimension = DDS_DIMENSION_TEXTURE3D; + this->header10.miscFlag = 0; + this->header10.arraySize = 1; } void DDSHeader::setTextureCube() { - this->caps.caps1 |= DDSCAPS_COMPLEX; - this->caps.caps2 = DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_ALL_FACES; - - this->header10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE2D; - this->header10.arraySize = 6; + this->caps.caps1 |= DDSCAPS_COMPLEX; + this->caps.caps2 = DDSCAPS2_CUBEMAP | DDSCAPS2_CUBEMAP_ALL_FACES; + + this->header10.resourceDimension = DDS_DIMENSION_TEXTURE2D; + this->header10.miscFlag = DDS_MISC_TEXTURECUBE; + this->header10.arraySize = 1; +} + +void DDSHeader::setTextureArray(int imageCount) +{ + this->header10.resourceDimension = DDS_DIMENSION_TEXTURE2D; + this->header10.arraySize = imageCount; } void DDSHeader::setLinearSize(uint size) { - this->flags &= ~DDSD_PITCH; - this->flags |= DDSD_LINEARSIZE; - this->pitch = size; + this->flags &= ~DDSD_PITCH; + this->flags |= DDSD_LINEARSIZE; + this->pitch = size; } void DDSHeader::setPitch(uint pitch) { - this->flags &= ~DDSD_LINEARSIZE; - this->flags |= DDSD_PITCH; - this->pitch = pitch; + this->flags &= ~DDSD_LINEARSIZE; + this->flags |= DDSD_PITCH; + this->pitch = pitch; } void DDSHeader::setFourCC(uint8 c0, uint8 c1, uint8 c2, uint8 c3) { 
- // set fourcc pixel format. - this->pf.flags = DDPF_FOURCC; - this->pf.fourcc = MAKEFOURCC(c0, c1, c2, c3); - - if (this->pf.fourcc == FOURCC_ATI2) - { - this->pf.bitcount = FOURCC_A2XY; - } - else - { - this->pf.bitcount = 0; - } - - this->pf.rmask = 0; - this->pf.gmask = 0; - this->pf.bmask = 0; - this->pf.amask = 0; + // set fourcc pixel format. + this->pf.flags = DDPF_FOURCC; + this->pf.fourcc = MAKEFOURCC(c0, c1, c2, c3); + + this->pf.bitcount = 0; + this->pf.rmask = 0; + this->pf.gmask = 0; + this->pf.bmask = 0; + this->pf.amask = 0; +} + +void DDSHeader::setFormatCode(uint32 code) +{ + // set fourcc pixel format. + this->pf.flags = DDPF_FOURCC; + this->pf.fourcc = code; + + this->pf.bitcount = 0; + this->pf.rmask = 0; + this->pf.gmask = 0; + this->pf.bmask = 0; + this->pf.amask = 0; +} + +void DDSHeader::setSwizzleCode(uint8 c0, uint8 c1, uint8 c2, uint8 c3) +{ + this->pf.bitcount = MAKEFOURCC(c0, c1, c2, c3); } + void DDSHeader::setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask) { - // Make sure the masks are correct. - nvCheck((rmask & gmask) == 0); - nvCheck((rmask & bmask) == 0); - nvCheck((rmask & amask) == 0); - nvCheck((gmask & bmask) == 0); - nvCheck((gmask & amask) == 0); - nvCheck((bmask & amask) == 0); - - this->pf.flags = DDPF_RGB; - - if (amask != 0) { - this->pf.flags |= DDPF_ALPHAPIXELS; - } - - if (bitcount == 0) - { - // Compute bit count from the masks. - uint total = rmask | gmask | bmask | amask; - while(total != 0) { - bitcount++; - total >>= 1; - } - } - - nvCheck(bitcount > 0 && bitcount <= 32); - - // Align to 8. - if (bitcount <= 8) bitcount = 8; - else if (bitcount <= 16) bitcount = 16; - else if (bitcount <= 24) bitcount = 24; - else bitcount = 32; - - this->pf.fourcc = 0; //findD3D9Format(bitcount, rmask, gmask, bmask, amask); - this->pf.bitcount = bitcount; - this->pf.rmask = rmask; - this->pf.gmask = gmask; - this->pf.bmask = bmask; - this->pf.amask = amask; + // Make sure the masks are correct. + nvCheck((rmask & gmask) == 0); + nvCheck((rmask & bmask) == 0); + nvCheck((rmask & amask) == 0); + nvCheck((gmask & bmask) == 0); + nvCheck((gmask & amask) == 0); + nvCheck((bmask & amask) == 0); + + if (rmask != 0 || gmask != 0 || bmask != 0) + { + if (gmask == 0 && bmask == 0) + { + this->pf.flags = DDPF_LUMINANCE; + } + else + { + this->pf.flags = DDPF_RGB; + } + + if (amask != 0) { + this->pf.flags |= DDPF_ALPHAPIXELS; + } + } + else if (amask != 0) + { + this->pf.flags |= DDPF_ALPHA; + } + + if (bitcount == 0) + { + // Compute bit count from the masks. 
+ uint total = rmask | gmask | bmask | amask; + while(total != 0) { + bitcount++; + total >>= 1; + } + } + + // D3DX functions do not like this: + this->pf.fourcc = 0; //findD3D9Format(bitcount, rmask, gmask, bmask, amask); + /*if (this->pf.fourcc) { + this->pf.flags |= DDPF_FOURCC; + }*/ + + nvCheck(bitcount > 0 && bitcount <= 32); + this->pf.bitcount = bitcount; + this->pf.rmask = rmask; + this->pf.gmask = gmask; + this->pf.bmask = bmask; + this->pf.amask = amask; } void DDSHeader::setDX10Format(uint format) { - //this->pf.flags = 0; - this->pf.fourcc = FOURCC_DX10; - this->header10.dxgiFormat = format; + this->pf.flags = DDPF_FOURCC; + this->pf.fourcc = FOURCC_DX10; + this->header10.dxgiFormat = format; } void DDSHeader::setNormalFlag(bool b) { - if (b) this->pf.flags |= DDPF_NORMAL; - else this->pf.flags &= ~DDPF_NORMAL; + if (b) this->pf.flags |= DDPF_NORMAL; + else this->pf.flags &= ~DDPF_NORMAL; +} + +void DDSHeader::setSrgbFlag(bool b) +{ + if (b) this->pf.flags |= DDPF_SRGB; + else this->pf.flags &= ~DDPF_SRGB; +} + +void DDSHeader::setHasAlphaFlag(bool b) +{ + if (b) this->pf.flags |= DDPF_ALPHAPIXELS; + else this->pf.flags &= ~DDPF_ALPHAPIXELS; +} + +void DDSHeader::setUserVersion(int version) +{ + this->reserved[7] = FOURCC_UVER; + this->reserved[8] = version; } void DDSHeader::swapBytes() { - this->fourcc = POSH_LittleU32(this->fourcc); - this->size = POSH_LittleU32(this->size); - this->flags = POSH_LittleU32(this->flags); - this->height = POSH_LittleU32(this->height); - this->width = POSH_LittleU32(this->width); - this->pitch = POSH_LittleU32(this->pitch); - this->depth = POSH_LittleU32(this->depth); - this->mipmapcount = POSH_LittleU32(this->mipmapcount); - - for(int i = 0; i < 11; i++) { - this->reserved[i] = POSH_LittleU32(this->reserved[i]); - } - - this->pf.size = POSH_LittleU32(this->pf.size); - this->pf.flags = POSH_LittleU32(this->pf.flags); - this->pf.fourcc = POSH_LittleU32(this->pf.fourcc); - this->pf.bitcount = POSH_LittleU32(this->pf.bitcount); - this->pf.rmask = POSH_LittleU32(this->pf.rmask); - this->pf.gmask = POSH_LittleU32(this->pf.gmask); - this->pf.bmask = POSH_LittleU32(this->pf.bmask); - this->pf.amask = POSH_LittleU32(this->pf.amask); - this->caps.caps1 = POSH_LittleU32(this->caps.caps1); - this->caps.caps2 = POSH_LittleU32(this->caps.caps2); - this->caps.caps3 = POSH_LittleU32(this->caps.caps3); - this->caps.caps4 = POSH_LittleU32(this->caps.caps4); - this->notused = POSH_LittleU32(this->notused); - - this->header10.dxgiFormat = POSH_LittleU32(this->header10.dxgiFormat); - this->header10.resourceDimension = POSH_LittleU32(this->header10.resourceDimension); - this->header10.miscFlag = POSH_LittleU32(this->header10.miscFlag); - this->header10.arraySize = POSH_LittleU32(this->header10.arraySize); - this->header10.reserved = POSH_LittleU32(this->header10.reserved); + this->fourcc = POSH_LittleU32(this->fourcc); + this->size = POSH_LittleU32(this->size); + this->flags = POSH_LittleU32(this->flags); + this->height = POSH_LittleU32(this->height); + this->width = POSH_LittleU32(this->width); + this->pitch = POSH_LittleU32(this->pitch); + this->depth = POSH_LittleU32(this->depth); + this->mipmapcount = POSH_LittleU32(this->mipmapcount); + + for(int i = 0; i < 11; i++) { + this->reserved[i] = POSH_LittleU32(this->reserved[i]); + } + + this->pf.size = POSH_LittleU32(this->pf.size); + this->pf.flags = POSH_LittleU32(this->pf.flags); + this->pf.fourcc = POSH_LittleU32(this->pf.fourcc); + this->pf.bitcount = POSH_LittleU32(this->pf.bitcount); + this->pf.rmask = 
POSH_LittleU32(this->pf.rmask); + this->pf.gmask = POSH_LittleU32(this->pf.gmask); + this->pf.bmask = POSH_LittleU32(this->pf.bmask); + this->pf.amask = POSH_LittleU32(this->pf.amask); + this->caps.caps1 = POSH_LittleU32(this->caps.caps1); + this->caps.caps2 = POSH_LittleU32(this->caps.caps2); + this->caps.caps3 = POSH_LittleU32(this->caps.caps3); + this->caps.caps4 = POSH_LittleU32(this->caps.caps4); + this->notused = POSH_LittleU32(this->notused); + + this->header10.dxgiFormat = POSH_LittleU32(this->header10.dxgiFormat); + this->header10.resourceDimension = POSH_LittleU32(this->header10.resourceDimension); + this->header10.miscFlag = POSH_LittleU32(this->header10.miscFlag); + this->header10.arraySize = POSH_LittleU32(this->header10.arraySize); + this->header10.reserved = POSH_LittleU32(this->header10.reserved); } bool DDSHeader::hasDX10Header() const { - return this->pf.fourcc == FOURCC_DX10; // @@ This is according to AMD - //return this->pf.flags == 0; // @@ This is according to MS + //if (pf.flags & DDPF_FOURCC) { + return this->pf.fourcc == FOURCC_DX10; + //} + //return false; } +uint DDSHeader::signature() const +{ + return this->reserved[9]; +} +uint DDSHeader::toolVersion() const +{ + return this->reserved[10]; +} -DirectDrawSurface::DirectDrawSurface(const char * name) : stream(new StdInputStream(name)) +uint DDSHeader::userVersion() const { - if (!stream->isError()) - { - (*stream) << header; - } + if (this->reserved[7] == FOURCC_UVER) { + return this->reserved[8]; + } + return 0; +} + +bool DDSHeader::isNormalMap() const +{ + return (pf.flags & DDPF_NORMAL) != 0; +} + +bool DDSHeader::isSrgb() const +{ + return (pf.flags & DDPF_SRGB) != 0; +} + +bool DDSHeader::hasAlpha() const +{ + return (pf.flags & DDPF_ALPHAPIXELS) != 0; +} + +uint DDSHeader::d3d9Format() const +{ + if (pf.flags & DDPF_FOURCC) { + return pf.fourcc; + } + else { + return findD3D9Format(pf.bitcount, pf.rmask, pf.gmask, pf.bmask, pf.amask); + } +} + +uint DDSHeader::pixelSize() const +{ + if (hasDX10Header()) { + return ::pixelSize((DXGI_FORMAT)header10.dxgiFormat); + } + else { + if (pf.flags & DDPF_FOURCC) { + return ::pixelSize((D3DFORMAT)pf.fourcc); + } + else { + nvDebugCheck((pf.flags & DDPF_RGB) || (pf.flags & DDPF_LUMINANCE)); + return pf.bitcount; + } + } +} + +uint DDSHeader::blockSize() const +{ + switch(pf.fourcc) + { + case FOURCC_DXT1: + case FOURCC_ATI1: + return 8; + case FOURCC_DXT2: + case FOURCC_DXT3: + case FOURCC_DXT4: + case FOURCC_DXT5: + case FOURCC_RXGB: + case FOURCC_ATI2: + return 16; + case FOURCC_DX10: + switch(header10.dxgiFormat) + { + case DXGI_FORMAT_BC1_TYPELESS: + case DXGI_FORMAT_BC1_UNORM: + case DXGI_FORMAT_BC1_UNORM_SRGB: + case DXGI_FORMAT_BC4_TYPELESS: + case DXGI_FORMAT_BC4_UNORM: + case DXGI_FORMAT_BC4_SNORM: + return 8; + case DXGI_FORMAT_BC2_TYPELESS: + case DXGI_FORMAT_BC2_UNORM: + case DXGI_FORMAT_BC2_UNORM_SRGB: + case DXGI_FORMAT_BC3_TYPELESS: + case DXGI_FORMAT_BC3_UNORM: + case DXGI_FORMAT_BC3_UNORM_SRGB: + case DXGI_FORMAT_BC5_TYPELESS: + case DXGI_FORMAT_BC5_UNORM: + case DXGI_FORMAT_BC5_SNORM: + case DXGI_FORMAT_BC6H_TYPELESS: + case DXGI_FORMAT_BC6H_SF16: + case DXGI_FORMAT_BC6H_UF16: + case DXGI_FORMAT_BC7_TYPELESS: + case DXGI_FORMAT_BC7_UNORM: + case DXGI_FORMAT_BC7_UNORM_SRGB: + return 16; + }; + }; + + // Not a block image. 
+ return 0; +} + +bool DDSHeader::isBlockFormat() const +{ + return blockSize() != 0; +} + + + + + +DirectDrawSurface::DirectDrawSurface() : stream(NULL) +{ +} + +DirectDrawSurface::DirectDrawSurface(const char * name) : stream(NULL) +{ + load(name); +} + +DirectDrawSurface::DirectDrawSurface(Stream * s) : stream(NULL) +{ + load(s); } DirectDrawSurface::~DirectDrawSurface() { - delete stream; + delete stream; +} + +bool DirectDrawSurface::load(const char * filename) +{ + return load(new StdInputStream(filename)); +} + +bool DirectDrawSurface::load(Stream * stream) +{ + delete this->stream; + this->stream = stream; + + if (!stream->isError()) + { + (*stream) << header; + return true; + } + + return false; } bool DirectDrawSurface::isValid() const { - if (stream->isError()) - { - return false; - } - - if (header.fourcc != FOURCC_DDS || header.size != 124) - { - return false; - } - - const uint required = (DDSD_WIDTH|DDSD_HEIGHT/*|DDSD_CAPS|DDSD_PIXELFORMAT*/); - if( (header.flags & required) != required ) { - return false; - } - - if (header.pf.size != 32) { - return false; - } - - if( !(header.caps.caps1 & DDSCAPS_TEXTURE) ) { - return false; - } - - return true; + if (stream == NULL || stream->isError()) + { + return false; + } + + if (header.fourcc != FOURCC_DDS || header.size != 124) + { + return false; + } + + const uint required = (DDSD_WIDTH|DDSD_HEIGHT/*|DDSD_CAPS|DDSD_PIXELFORMAT*/); + if( (header.flags & required) != required ) { + return false; + } + + if (header.pf.size != 32) { + return false; + } + + if( !(header.caps.caps1 & DDSCAPS_TEXTURE) ) { + return false; + } + + return true; } bool DirectDrawSurface::isSupported() const { - nvDebugCheck(isValid()); - - if (header.hasDX10Header()) - { - } - else - { - if (header.pf.flags & DDPF_FOURCC) - { - if (header.pf.fourcc != FOURCC_DXT1 && - header.pf.fourcc != FOURCC_DXT2 && - header.pf.fourcc != FOURCC_DXT3 && - header.pf.fourcc != FOURCC_DXT4 && - header.pf.fourcc != FOURCC_DXT5 && - header.pf.fourcc != FOURCC_RXGB && - header.pf.fourcc != FOURCC_ATI1 && - header.pf.fourcc != FOURCC_ATI2) - { - // Unknown fourcc code. - return false; - } - } - else if (header.pf.flags & DDPF_RGB) - { - // All RGB formats are supported now. - } - else - { - return false; - } - - if (isTextureCube() && (header.caps.caps2 & DDSCAPS2_CUBEMAP_ALL_FACES) != DDSCAPS2_CUBEMAP_ALL_FACES) - { - // Cubemaps must contain all faces. - return false; - } - - if (isTexture3D()) - { - // @@ 3D textures not supported yet. - return false; - } - } - - return true; -} + nvDebugCheck(isValid()); + if (header.hasDX10Header()) + { + if (header.header10.dxgiFormat == DXGI_FORMAT_BC1_UNORM || + header.header10.dxgiFormat == DXGI_FORMAT_BC2_UNORM || + header.header10.dxgiFormat == DXGI_FORMAT_BC3_UNORM || + header.header10.dxgiFormat == DXGI_FORMAT_BC4_UNORM || + header.header10.dxgiFormat == DXGI_FORMAT_BC5_UNORM || + header.header10.dxgiFormat == DXGI_FORMAT_BC6H_UF16 || + header.header10.dxgiFormat == DXGI_FORMAT_BC7_UNORM) + { + return true; + } + else { + return findDXGIPixelFormat(header.header10.dxgiFormat) != NULL; + } + } + else + { + if (header.pf.flags & DDPF_FOURCC) + { + if (header.pf.fourcc != FOURCC_DXT1 && + header.pf.fourcc != FOURCC_DXT2 && + header.pf.fourcc != FOURCC_DXT3 && + header.pf.fourcc != FOURCC_DXT4 && + header.pf.fourcc != FOURCC_DXT5 && + header.pf.fourcc != FOURCC_RXGB && + header.pf.fourcc != FOURCC_ATI1 && + header.pf.fourcc != FOURCC_ATI2) + { + // Unknown fourcc code. 
+ return false; + } + } + else if ((header.pf.flags & DDPF_RGB) || (header.pf.flags & DDPF_LUMINANCE)) + { + // All RGB and luminance formats are supported now. + } + else + { + return false; + } + + if (isTextureCube()) { + if (header.width != header.height) return false; + + if ((header.caps.caps2 & DDSCAPS2_CUBEMAP_ALL_FACES) != DDSCAPS2_CUBEMAP_ALL_FACES) + { + // Cubemaps must contain all faces. + return false; + } + } + } + + return true; +} + +bool DirectDrawSurface::hasAlpha() const +{ + if (header.hasDX10Header()) + { +#pragma NV_MESSAGE("TODO: Update hasAlpha to handle all DX10 formats.") + return + header.header10.dxgiFormat == DXGI_FORMAT_BC1_UNORM || + header.header10.dxgiFormat == DXGI_FORMAT_BC2_UNORM || + header.header10.dxgiFormat == DXGI_FORMAT_BC3_UNORM; + } + else + { + if (header.pf.flags & DDPF_RGB) + { + return header.pf.amask != 0; + } + else if (header.pf.flags & DDPF_FOURCC) + { + if (header.pf.fourcc == FOURCC_RXGB || + header.pf.fourcc == FOURCC_ATI1 || + header.pf.fourcc == FOURCC_ATI2 || + header.pf.flags & DDPF_NORMAL) + { + return false; + } + else + { + // @@ Here we could check the ALPHA_PIXELS flag, but nobody sets it. (except us?) + return true; + } + } + + return false; + } +} uint DirectDrawSurface::mipmapCount() const { - nvDebugCheck(isValid()); - if (header.flags & DDSD_MIPMAPCOUNT) return header.mipmapcount; - else return 1; + nvDebugCheck(isValid()); + if (header.flags & DDSD_MIPMAPCOUNT) return header.mipmapcount; + else return 1; } uint DirectDrawSurface::width() const { - nvDebugCheck(isValid()); - if (header.flags & DDSD_WIDTH) return header.width; - else return 1; + nvDebugCheck(isValid()); + if (header.flags & DDSD_WIDTH) return header.width; + else return 1; } uint DirectDrawSurface::height() const { - nvDebugCheck(isValid()); - if (header.flags & DDSD_HEIGHT) return header.height; - else return 1; + nvDebugCheck(isValid()); + if (header.flags & DDSD_HEIGHT) return header.height; + else return 1; } uint DirectDrawSurface::depth() const { - nvDebugCheck(isValid()); - if (header.flags & DDSD_DEPTH) return header.depth; - else return 1; + nvDebugCheck(isValid()); + if (header.flags & DDSD_DEPTH) return header.depth; + else return 1; +} + +uint DirectDrawSurface::arraySize() const +{ + nvDebugCheck(isValid()); + if (header.hasDX10Header()) return header.header10.arraySize; + else return 1; } bool DirectDrawSurface::isTexture1D() const { - nvDebugCheck(isValid()); - if (header.hasDX10Header()) - { - return header.header10.resourceDimension == D3D10_RESOURCE_DIMENSION_TEXTURE1D; - } - return false; + nvDebugCheck(isValid()); + if (header.hasDX10Header()) + { + return header.header10.resourceDimension == DDS_DIMENSION_TEXTURE1D; + } + return false; } bool DirectDrawSurface::isTexture2D() const { - nvDebugCheck(isValid()); - if (header.hasDX10Header()) - { - return header.header10.resourceDimension == D3D10_RESOURCE_DIMENSION_TEXTURE2D; - } - else - { - return !isTexture3D() && !isTextureCube(); - } + nvDebugCheck(isValid()); + if (header.hasDX10Header()) + { + return header.header10.resourceDimension == DDS_DIMENSION_TEXTURE2D && header.header10.arraySize == 1; + } + else + { + return !isTexture3D() && !isTextureCube(); + } } bool DirectDrawSurface::isTexture3D() const { - nvDebugCheck(isValid()); - if (header.hasDX10Header()) - { - return header.header10.resourceDimension == D3D10_RESOURCE_DIMENSION_TEXTURE3D; - } - else - { - return (header.caps.caps2 & DDSCAPS2_VOLUME) != 0; - } + nvDebugCheck(isValid()); + if (header.hasDX10Header()) + { + 
return header.header10.resourceDimension == DDS_DIMENSION_TEXTURE3D; + } + else + { + return (header.caps.caps2 & DDSCAPS2_VOLUME) != 0; + } } bool DirectDrawSurface::isTextureCube() const { - nvDebugCheck(isValid()); - return (header.caps.caps2 & DDSCAPS2_CUBEMAP) != 0; + nvDebugCheck(isValid()); + return (header.caps.caps2 & DDSCAPS2_CUBEMAP) != 0; +} + +bool DirectDrawSurface::isTextureArray() const +{ + nvDebugCheck(isValid()); + return header.hasDX10Header() && header.header10.arraySize > 1; } void DirectDrawSurface::setNormalFlag(bool b) { - nvDebugCheck(isValid()); - header.setNormalFlag(b); + nvDebugCheck(isValid()); + header.setNormalFlag(b); +} + +void DirectDrawSurface::setHasAlphaFlag(bool b) +{ + nvDebugCheck(isValid()); + header.setHasAlphaFlag(b); +} + +void DirectDrawSurface::setUserVersion(int version) +{ + nvDebugCheck(isValid()); + header.setUserVersion(version); } void DirectDrawSurface::mipmap(Image * img, uint face, uint mipmap) { - nvDebugCheck(isValid()); - - stream->seek(offset(face, mipmap)); - - uint w = width(); - uint h = height(); - - // Compute width and height. - for (uint m = 0; m < mipmap; m++) - { - w = max(1U, w / 2); - h = max(1U, h / 2); - } - - img->allocate(w, h); - - if (header.pf.flags & DDPF_RGB) - { - readLinearImage(img); - } - else if (header.pf.flags & DDPF_FOURCC) - { - readBlockImage(img); - } -} - -void DirectDrawSurface::readLinearImage(Image * img) -{ - nvDebugCheck(stream != NULL); - nvDebugCheck(img != NULL); - - const uint w = img->width(); - const uint h = img->height(); - - uint rshift, rsize; - PixelFormat::maskShiftAndSize(header.pf.rmask, &rshift, &rsize); - - uint gshift, gsize; - PixelFormat::maskShiftAndSize(header.pf.gmask, &gshift, &gsize); - - uint bshift, bsize; - PixelFormat::maskShiftAndSize(header.pf.bmask, &bshift, &bsize); - - uint ashift, asize; - PixelFormat::maskShiftAndSize(header.pf.amask, &ashift, &asize); - - uint byteCount = (header.pf.bitcount + 7) / 8; - - // set image format: RGB or ARGB - // alpha channel exists if and only if the alpha mask is non-zero - if (header.pf.amask == 0) - { - img->setFormat(Image::Format_RGB); - } - else - { - img->setFormat(Image::Format_ARGB); - } - - // Read linear RGB images. - for (uint y = 0; y < h; y++) - { - for (uint x = 0; x < w; x++) - { - uint c = 0; - stream->serialize(&c, byteCount); - - Color32 pixel(0, 0, 0, 0xFF); - pixel.r = PixelFormat::convert((c & header.pf.rmask) >> rshift, rsize, 8); - pixel.g = PixelFormat::convert((c & header.pf.gmask) >> gshift, gsize, 8); - pixel.b = PixelFormat::convert((c & header.pf.bmask) >> bshift, bsize, 8); - pixel.a = PixelFormat::convert((c & header.pf.amask) >> ashift, asize, 8); - - img->pixel(x, y) = pixel; - } - } + nvDebugCheck(isValid()); + + stream->seek(offset(face, mipmap)); + + uint w = width(); + uint h = height(); + uint d = depth(); + + // Compute width and height. 
+ for (uint m = 0; m < mipmap; m++) + { + w = max(1U, w / 2); + h = max(1U, h / 2); + d = max(1U, d / 2); + } + + img->allocate(w, h, d); + + if (hasAlpha()) + { + img->setFormat(Image::Format_ARGB); + } + else + { + img->setFormat(Image::Format_RGB); + } + + if (header.hasDX10Header()) + { + if (const RGBAPixelFormat *format = findDXGIPixelFormat(header.header10.dxgiFormat)) { + readLinearImage(img, format->bitcount, format->rmask, format->gmask, format->bmask, format->amask); + } + else { + readBlockImage(img); + } + } + else + { + if (header.pf.flags & DDPF_RGB) + { + readLinearImage(img, header.pf.bitcount, header.pf.rmask, header.pf.gmask, header.pf.bmask, header.pf.amask); + } + else if (header.pf.flags & DDPF_FOURCC) + { + readBlockImage(img); + } + } +} + +/*void * DirectDrawSurface::readData(uint * sizePtr) +{ + uint header_size = 128; // sizeof(DDSHeader); + + if (header.hasDX10Header()) + { + header_size += 20; // sizeof(DDSHeader10); + } + + stream->seek(header_size); + + int size = stream->size() - header_size; + *sizePtr = size; + + void * data = new unsigned char [size]; + + size = stream->serialize(data, size); + nvDebugCheck(size == *sizePtr); + + return data; +}*/ + +/*uint DirectDrawSurface::surfaceSize(uint mipmap) const +{ + uint w = header.width(); + uint h = header.height(); + uint d = header.depth(); + for (int m = 0; m < mipmap; m++) { + w = (w + 1) / 2; + h = (h + 1) / 2; + d = (d + 1) / 2; + } + + bool isBlockFormat; + uint blockOrPixelSize; + + if (header.hasDX10Header()) { + blockOrPixelSize = blockSize(header10.dxgiFormat); + isBlockFormat = (blockOrPixelSize != 0); + if (isBlockFormat) { + blockOrPixelSize = pixelSize(header10.dxgiFormat); + } + } + else { + header.pf.flags + } + + if (isBlockFormat) { + w = (w + 3) / 4; + h = (h + 3) / 4; + d = (d + 3) / 4; // @@ Is it necessary to align the depths? + } + + uint blockOrPixelCount = w * h * d; + + return blockCount = blockOrPixelSize; +}*/ + +bool DirectDrawSurface::readSurface(uint face, uint mipmap, void * data, uint size) +{ + if (size != surfaceSize(mipmap)) return false; + + stream->seek(offset(face, mipmap)); + if (stream->isError()) return false; + + return stream->serialize(data, size) == size; +} + + +void DirectDrawSurface::readLinearImage(Image * img, uint bitcount, uint rmask, uint gmask, uint bmask, uint amask) +{ + nvDebugCheck(stream != NULL); + nvDebugCheck(img != NULL); + + const uint w = img->width(); + const uint h = img->height(); + const uint d = img->depth(); + + uint rshift, rsize; + PixelFormat::maskShiftAndSize(rmask, &rshift, &rsize); + + uint gshift, gsize; + PixelFormat::maskShiftAndSize(gmask, &gshift, &gsize); + + uint bshift, bsize; + PixelFormat::maskShiftAndSize(bmask, &bshift, &bsize); + + uint ashift, asize; + PixelFormat::maskShiftAndSize(amask, &ashift, &asize); + + uint byteCount = (bitcount + 7) / 8; + +#pragma NV_MESSAGE("TODO: Support floating point linear images and other FOURCC codes.") + + // Read linear RGB images. 
+ for (uint z = 0; z < d; z++) + { + for (uint y = 0; y < h; y++) + { + for (uint x = 0; x < w; x++) + { + uint c = 0; + stream->serialize(&c, byteCount); + + Color32 pixel(0, 0, 0, 0xFF); + pixel.r = PixelFormat::convert((c & rmask) >> rshift, rsize, 8); + pixel.g = PixelFormat::convert((c & gmask) >> gshift, gsize, 8); + pixel.b = PixelFormat::convert((c & bmask) >> bshift, bsize, 8); + pixel.a = PixelFormat::convert((c & amask) >> ashift, asize, 8); + + img->pixel(x, y, z) = pixel; + } + } + } } void DirectDrawSurface::readBlockImage(Image * img) { - nvDebugCheck(stream != NULL); - nvDebugCheck(img != NULL); + nvDebugCheck(stream != NULL); + nvDebugCheck(img != NULL); + + const uint w = img->width(); + const uint h = img->height(); + + const uint bw = (w + 3) / 4; + const uint bh = (h + 3) / 4; - // set image format: RGB or ARGB - if (header.pf.fourcc == FOURCC_RXGB || - header.pf.fourcc == FOURCC_ATI1 || - header.pf.fourcc == FOURCC_ATI2 || - header.pf.flags & DDPF_NORMAL) - { - img->setFormat(Image::Format_RGB); - } - else - { - img->setFormat(Image::Format_ARGB); - } - - const uint w = img->width(); - const uint h = img->height(); - - const uint bw = (w + 3) / 4; - const uint bh = (h + 3) / 4; - - for (uint by = 0; by < bh; by++) - { - for (uint bx = 0; bx < bw; bx++) - { - ColorBlock block; - - // Read color block. - readBlock(&block); - - // Write color block. - for (uint y = 0; y < min(4U, h-4*by); y++) - { - for (uint x = 0; x < min(4U, w-4*bx); x++) - { - img->pixel(4*bx+x, 4*by+y) = block.color(x, y); - } - } - } - } + for (uint by = 0; by < bh; by++) + { + for (uint bx = 0; bx < bw; bx++) + { + ColorBlock block; + + // Read color block. + readBlock(&block); + + // Write color block. + for (uint y = 0; y < min(4U, h-4*by); y++) + { + for (uint x = 0; x < min(4U, w-4*bx); x++) + { + img->pixel(4*bx+x, 4*by+y) = block.color(x, y); + } + } + } + } } static Color32 buildNormal(uint8 x, uint8 y) { - float nx = 2 * (x / 255.0f) - 1; - float ny = 2 * (y / 255.0f) - 1; - float nz = 0.0f; - if (1 - nx*nx - ny*ny > 0) nz = sqrtf(1 - nx*nx - ny*ny); - uint8 z = clamp(int(255.0f * (nz + 1) / 2.0f), 0, 255); - - return Color32(x, y, z); + float nx = 2 * (x / 255.0f) - 1; + float ny = 2 * (y / 255.0f) - 1; + float nz = 0.0f; + if (1 - nx*nx - ny*ny > 0) nz = sqrtf(1 - nx*nx - ny*ny); + uint8 z = clamp(int(255.0f * (nz + 1) / 2.0f), 0, 255); + + return Color32(x, y, z); } void DirectDrawSurface::readBlock(ColorBlock * rgba) { - nvDebugCheck(stream != NULL); - nvDebugCheck(rgba != NULL); - - if (header.pf.fourcc == FOURCC_DXT1) - { - BlockDXT1 block; - *stream << block; - block.decodeBlock(rgba); - } - else if (header.pf.fourcc == FOURCC_DXT2 || - header.pf.fourcc == FOURCC_DXT3) - { - BlockDXT3 block; - *stream << block; - block.decodeBlock(rgba); - } - else if (header.pf.fourcc == FOURCC_DXT4 || - header.pf.fourcc == FOURCC_DXT5 || - header.pf.fourcc == FOURCC_RXGB) - { - BlockDXT5 block; - *stream << block; - block.decodeBlock(rgba); - - if (header.pf.fourcc == FOURCC_RXGB) - { - // Swap R & A. - for (int i = 0; i < 16; i++) - { - Color32 & c = rgba->color(i); - uint tmp = c.r; - c.r = c.a; - c.a = tmp; - } - } - } - else if (header.pf.fourcc == FOURCC_ATI1) - { - BlockATI1 block; - *stream << block; - block.decodeBlock(rgba); - } - else if (header.pf.fourcc == FOURCC_ATI2) - { - BlockATI2 block; - *stream << block; - block.decodeBlock(rgba); - } - - // If normal flag set, convert to normal. 
- if (header.pf.flags & DDPF_NORMAL) - { - if (header.pf.fourcc == FOURCC_ATI2) - { - for (int i = 0; i < 16; i++) - { - Color32 & c = rgba->color(i); - c = buildNormal(c.r, c.g); - } - } - else if (header.pf.fourcc == FOURCC_DXT5) - { - for (int i = 0; i < 16; i++) - { - Color32 & c = rgba->color(i); - c = buildNormal(c.a, c.g); - } - } - } -} - - -uint DirectDrawSurface::blockSize() const -{ - switch(header.pf.fourcc) - { - case FOURCC_DXT1: - case FOURCC_ATI1: - return 8; - case FOURCC_DXT2: - case FOURCC_DXT3: - case FOURCC_DXT4: - case FOURCC_DXT5: - case FOURCC_RXGB: - case FOURCC_ATI2: - return 16; - }; - - // Not a block image. - return 0; -} + nvDebugCheck(stream != NULL); + nvDebugCheck(rgba != NULL); -uint DirectDrawSurface::mipmapSize(uint mipmap) const -{ - uint w = width(); - uint h = height(); - uint d = depth(); - - for (uint m = 0; m < mipmap; m++) - { - w = max(1U, w / 2); - h = max(1U, h / 2); - d = max(1U, d / 2); - } + uint fourcc = header.pf.fourcc; - if (header.pf.flags & DDPF_FOURCC) - { - // @@ How are 3D textures aligned? - w = (w + 3) / 4; - h = (h + 3) / 4; - return blockSize() * w * h; - } - else - { - nvDebugCheck(header.pf.flags & DDPF_RGB); - - // Align pixels to bytes. - uint byteCount = (header.pf.bitcount + 7) / 8; - - // Align pitch to 4 bytes. - uint pitch = 4 * ((w * byteCount + 3) / 4); - - return pitch * h * d; - } + // Map DX10 block formats to fourcc codes. + if (header.hasDX10Header()) + { + if (header.header10.dxgiFormat == DXGI_FORMAT_BC1_UNORM) fourcc = FOURCC_DXT1; + else if (header.header10.dxgiFormat == DXGI_FORMAT_BC2_UNORM) fourcc = FOURCC_DXT3; + else if (header.header10.dxgiFormat == DXGI_FORMAT_BC3_UNORM) fourcc = FOURCC_DXT5; + else if (header.header10.dxgiFormat == DXGI_FORMAT_BC4_UNORM) fourcc = FOURCC_ATI1; + else if (header.header10.dxgiFormat == DXGI_FORMAT_BC5_UNORM) fourcc = FOURCC_ATI2; + } + + if (fourcc == FOURCC_DXT1) + { + BlockDXT1 block; + *stream << block; + block.decodeBlock(rgba); + } + else if (fourcc == FOURCC_DXT2 || fourcc == FOURCC_DXT3) + { + BlockDXT3 block; + *stream << block; + block.decodeBlock(rgba); + } + else if (fourcc == FOURCC_DXT4 || fourcc == FOURCC_DXT5 || fourcc == FOURCC_RXGB) + { + BlockDXT5 block; + *stream << block; + block.decodeBlock(rgba); + + if (fourcc == FOURCC_RXGB) + { + // Swap R & A. + for (int i = 0; i < 16; i++) + { + Color32 & c = rgba->color(i); + uint tmp = c.r; + c.r = c.a; + c.a = tmp; + } + } + } + else if (fourcc == FOURCC_ATI1) + { + BlockATI1 block; + *stream << block; + block.decodeBlock(rgba); + } + else if (fourcc == FOURCC_ATI2) + { + BlockATI2 block; + *stream << block; + block.decodeBlock(rgba); + } + else if (header.hasDX10Header() && header.header10.dxgiFormat == DXGI_FORMAT_BC6H_UF16) + { + BlockBC6 block; + *stream << block; + Vector3 colors[16]; + block.decodeBlock(colors); + + // Clamp to [0, 1] and round to 8-bit + for (int y = 0; y < 4; ++y) + { + for (int x = 0; x < 4; ++x) + { + Vector3 px = colors[y*4 + x]; + rgba->color(x, y).setRGBA( + ftoi_round(clamp(px.x, 0.0f, 1.0f) * 255.0f), + ftoi_round(clamp(px.y, 0.0f, 1.0f) * 255.0f), + ftoi_round(clamp(px.z, 0.0f, 1.0f) * 255.0f), + 0xFF); + } + } + } + else if (header.hasDX10Header() && header.header10.dxgiFormat == DXGI_FORMAT_BC7_UNORM) + { + BlockBC7 block; + *stream << block; + block.decodeBlock(rgba); + } + else + { + nvDebugCheck(false); + } + + // If normal flag set, convert to normal. 
+ if (header.pf.flags & DDPF_NORMAL) + { + if (fourcc == FOURCC_ATI2) + { + for (int i = 0; i < 16; i++) + { + Color32 & c = rgba->color(i); + c = buildNormal(c.r, c.g); + } + } + else if (fourcc == FOURCC_DXT5) + { + for (int i = 0; i < 16; i++) + { + Color32 & c = rgba->color(i); + c = buildNormal(c.a, c.g); + } + } + } +} + + +static uint mipmapExtent(uint mipmap, uint x) +{ + for (uint m = 0; m < mipmap; m++) { + x = max(1U, x / 2); + } + return x; +} + +uint DirectDrawSurface::surfaceWidth(uint mipmap) const +{ + return mipmapExtent(mipmap, width()); +} + +uint DirectDrawSurface::surfaceHeight(uint mipmap) const +{ + return mipmapExtent(mipmap, height()); +} + +uint DirectDrawSurface::surfaceDepth(uint mipmap) const +{ + return mipmapExtent(mipmap, depth()); +} + +uint DirectDrawSurface::surfaceSize(uint mipmap) const +{ + uint w = surfaceWidth(mipmap); + uint h = surfaceHeight(mipmap); + uint d = surfaceDepth(mipmap); + + uint blockSize = header.blockSize(); + + if (blockSize == 0) { + uint bitCount = header.pixelSize(); + uint pitch = computeBytePitch(w, bitCount, 1); // Asuming 1 byte alignment, which is the same D3DX expects. + return pitch * h * d; + } + else { + w = (w + 3) / 4; + h = (h + 3) / 4; + d = d; // @@ How are 3D textures aligned? + return blockSize * w * h * d; + } } uint DirectDrawSurface::faceSize() const { - const uint count = mipmapCount(); - uint size = 0; - - for (uint m = 0; m < count; m++) - { - size += mipmapSize(m); - } - - return size; + const uint count = mipmapCount(); + uint size = 0; + + for (uint m = 0; m < count; m++) + { + size += surfaceSize(m); + } + + return size; } uint DirectDrawSurface::offset(const uint face, const uint mipmap) { - uint size = 128; // sizeof(DDSHeader); - - if (header.hasDX10Header()) - { - size += 20; // sizeof(DDSHeader10); - } - - if (face != 0) - { - size += face * faceSize(); - } - - for (uint m = 0; m < mipmap; m++) - { - size += mipmapSize(m); - } - - return size; + uint size = 128; // sizeof(DDSHeader); + + if (header.hasDX10Header()) + { + size += 20; // sizeof(DDSHeader10); + } + + if (face != 0) + { + size += face * faceSize(); + } + + for (uint m = 0; m < mipmap; m++) + { + size += surfaceSize(m); + } + + return size; } void DirectDrawSurface::printInfo() const { - printf("Flags: 0x%.8X\n", header.flags); - if (header.flags & DDSD_CAPS) printf("\tDDSD_CAPS\n"); - if (header.flags & DDSD_PIXELFORMAT) printf("\tDDSD_PIXELFORMAT\n"); - if (header.flags & DDSD_WIDTH) printf("\tDDSD_WIDTH\n"); - if (header.flags & DDSD_HEIGHT) printf("\tDDSD_HEIGHT\n"); - if (header.flags & DDSD_DEPTH) printf("\tDDSD_DEPTH\n"); - if (header.flags & DDSD_PITCH) printf("\tDDSD_PITCH\n"); - if (header.flags & DDSD_LINEARSIZE) printf("\tDDSD_LINEARSIZE\n"); - if (header.flags & DDSD_MIPMAPCOUNT) printf("\tDDSD_MIPMAPCOUNT\n"); - - printf("Height: %d\n", header.height); - printf("Width: %d\n", header.width); - printf("Depth: %d\n", header.depth); - if (header.flags & DDSD_PITCH) printf("Pitch: %d\n", header.pitch); - else if (header.flags & DDSD_LINEARSIZE) printf("Linear size: %d\n", header.pitch); - printf("Mipmap count: %d\n", header.mipmapcount); - - printf("Pixel Format:\n"); - printf("\tFlags: 0x%.8X\n", header.pf.flags); - if (header.pf.flags & DDPF_RGB) printf("\t\tDDPF_RGB\n"); - if (header.pf.flags & DDPF_FOURCC) printf("\t\tDDPF_FOURCC\n"); - if (header.pf.flags & DDPF_ALPHAPIXELS) printf("\t\tDDPF_ALPHAPIXELS\n"); - if (header.pf.flags & DDPF_ALPHA) printf("\t\tDDPF_ALPHA\n"); - if (header.pf.flags & DDPF_PALETTEINDEXED1) 
printf("\t\tDDPF_PALETTEINDEXED1\n"); - if (header.pf.flags & DDPF_PALETTEINDEXED2) printf("\t\tDDPF_PALETTEINDEXED2\n"); - if (header.pf.flags & DDPF_PALETTEINDEXED4) printf("\t\tDDPF_PALETTEINDEXED4\n"); - if (header.pf.flags & DDPF_PALETTEINDEXED8) printf("\t\tDDPF_PALETTEINDEXED8\n"); - if (header.pf.flags & DDPF_ALPHAPREMULT) printf("\t\tDDPF_ALPHAPREMULT\n"); - if (header.pf.flags & DDPF_NORMAL) printf("\t\tDDPF_NORMAL\n"); - - printf("\tFourCC: '%c%c%c%c'\n", - ((header.pf.fourcc >> 0) & 0xFF), - ((header.pf.fourcc >> 8) & 0xFF), - ((header.pf.fourcc >> 16) & 0xFF), - ((header.pf.fourcc >> 24) & 0xFF)); - if ((header.pf.fourcc & DDPF_FOURCC) && (header.pf.bitcount != 0)) - { - printf("\tSwizzle: '%c%c%c%c'\n", - (header.pf.bitcount >> 0) & 0xFF, - (header.pf.bitcount >> 8) & 0xFF, - (header.pf.bitcount >> 16) & 0xFF, - (header.pf.bitcount >> 24) & 0xFF); - } - else - { - printf("\tBit count: %d\n", header.pf.bitcount); - } - printf("\tRed mask: 0x%.8X\n", header.pf.rmask); - printf("\tGreen mask: 0x%.8X\n", header.pf.gmask); - printf("\tBlue mask: 0x%.8X\n", header.pf.bmask); - printf("\tAlpha mask: 0x%.8X\n", header.pf.amask); - - printf("Caps:\n"); - printf("\tCaps 1: 0x%.8X\n", header.caps.caps1); - if (header.caps.caps1 & DDSCAPS_COMPLEX) printf("\t\tDDSCAPS_COMPLEX\n"); - if (header.caps.caps1 & DDSCAPS_TEXTURE) printf("\t\tDDSCAPS_TEXTURE\n"); - if (header.caps.caps1 & DDSCAPS_MIPMAP) printf("\t\tDDSCAPS_MIPMAP\n"); - - printf("\tCaps 2: 0x%.8X\n", header.caps.caps2); - if (header.caps.caps2 & DDSCAPS2_VOLUME) printf("\t\tDDSCAPS2_VOLUME\n"); - else if (header.caps.caps2 & DDSCAPS2_CUBEMAP) - { - printf("\t\tDDSCAPS2_CUBEMAP\n"); - if ((header.caps.caps2 & DDSCAPS2_CUBEMAP_ALL_FACES) == DDSCAPS2_CUBEMAP_ALL_FACES) printf("\t\tDDSCAPS2_CUBEMAP_ALL_FACES\n"); - else { - if (header.caps.caps2 & DDSCAPS2_CUBEMAP_POSITIVEX) printf("\t\tDDSCAPS2_CUBEMAP_POSITIVEX\n"); - if (header.caps.caps2 & DDSCAPS2_CUBEMAP_NEGATIVEX) printf("\t\tDDSCAPS2_CUBEMAP_NEGATIVEX\n"); - if (header.caps.caps2 & DDSCAPS2_CUBEMAP_POSITIVEY) printf("\t\tDDSCAPS2_CUBEMAP_POSITIVEY\n"); - if (header.caps.caps2 & DDSCAPS2_CUBEMAP_NEGATIVEY) printf("\t\tDDSCAPS2_CUBEMAP_NEGATIVEY\n"); - if (header.caps.caps2 & DDSCAPS2_CUBEMAP_POSITIVEZ) printf("\t\tDDSCAPS2_CUBEMAP_POSITIVEZ\n"); - if (header.caps.caps2 & DDSCAPS2_CUBEMAP_NEGATIVEZ) printf("\t\tDDSCAPS2_CUBEMAP_NEGATIVEZ\n"); - } - } - - printf("\tCaps 3: 0x%.8X\n", header.caps.caps3); - printf("\tCaps 4: 0x%.8X\n", header.caps.caps4); - - if (header.hasDX10Header()) - { - printf("DX10 Header:\n"); - printf("\tDXGI Format: %u (%s)\n", header.header10.dxgiFormat, getDxgiFormatString((DXGI_FORMAT)header.header10.dxgiFormat)); - printf("\tResource dimension: %u (%s)\n", header.header10.resourceDimension, getD3d10ResourceDimensionString((D3D10_RESOURCE_DIMENSION)header.header10.resourceDimension)); - printf("\tMisc flag: %u\n", header.header10.miscFlag); - printf("\tArray size: %u\n", header.header10.arraySize); - } - - if (header.reserved[9] == MAKEFOURCC('N', 'V', 'T', 'T')) - { - int major = (header.reserved[10] >> 16) & 0xFF; - int minor = (header.reserved[10] >> 8) & 0xFF; - int revision= header.reserved[10] & 0xFF; - - printf("Version:\n"); - printf("\tNVIDIA Texture Tools %d.%d.%d\n", major, minor, revision); - } + printf("Flags: 0x%.8X\n", header.flags); + if (header.flags & DDSD_CAPS) printf("\tDDSD_CAPS\n"); + if (header.flags & DDSD_PIXELFORMAT) printf("\tDDSD_PIXELFORMAT\n"); + if (header.flags & DDSD_WIDTH) printf("\tDDSD_WIDTH\n"); + if (header.flags 
& DDSD_HEIGHT) printf("\tDDSD_HEIGHT\n"); + if (header.flags & DDSD_DEPTH) printf("\tDDSD_DEPTH\n"); + if (header.flags & DDSD_PITCH) printf("\tDDSD_PITCH\n"); + if (header.flags & DDSD_LINEARSIZE) printf("\tDDSD_LINEARSIZE\n"); + if (header.flags & DDSD_MIPMAPCOUNT) printf("\tDDSD_MIPMAPCOUNT\n"); + + printf("Height: %d\n", header.height); + printf("Width: %d\n", header.width); + printf("Depth: %d\n", header.depth); + if (header.flags & DDSD_PITCH) printf("Pitch: %d\n", header.pitch); + else if (header.flags & DDSD_LINEARSIZE) printf("Linear size: %d\n", header.pitch); + printf("Mipmap count: %d\n", header.mipmapcount); + + printf("Pixel Format:\n"); + printf("\tFlags: 0x%.8X\n", header.pf.flags); + if (header.pf.flags & DDPF_RGB) printf("\t\tDDPF_RGB\n"); + if (header.pf.flags & DDPF_LUMINANCE) printf("\t\tDDPF_LUMINANCE\n"); + if (header.pf.flags & DDPF_FOURCC) printf("\t\tDDPF_FOURCC\n"); + if (header.pf.flags & DDPF_ALPHAPIXELS) printf("\t\tDDPF_ALPHAPIXELS\n"); + if (header.pf.flags & DDPF_ALPHA) printf("\t\tDDPF_ALPHA\n"); + if (header.pf.flags & DDPF_PALETTEINDEXED1) printf("\t\tDDPF_PALETTEINDEXED1\n"); + if (header.pf.flags & DDPF_PALETTEINDEXED2) printf("\t\tDDPF_PALETTEINDEXED2\n"); + if (header.pf.flags & DDPF_PALETTEINDEXED4) printf("\t\tDDPF_PALETTEINDEXED4\n"); + if (header.pf.flags & DDPF_PALETTEINDEXED8) printf("\t\tDDPF_PALETTEINDEXED8\n"); + if (header.pf.flags & DDPF_ALPHAPREMULT) printf("\t\tDDPF_ALPHAPREMULT\n"); + if (header.pf.flags & DDPF_NORMAL) printf("\t\tDDPF_NORMAL\n"); + + if (header.pf.fourcc != 0) { + // Display fourcc code even when DDPF_FOURCC flag not set. + printf("\tFourCC: '%c%c%c%c' (0x%.8X)\n", + ((header.pf.fourcc >> 0) & 0xFF), + ((header.pf.fourcc >> 8) & 0xFF), + ((header.pf.fourcc >> 16) & 0xFF), + ((header.pf.fourcc >> 24) & 0xFF), + header.pf.fourcc); + } + + if ((header.pf.flags & DDPF_FOURCC) && (header.pf.bitcount != 0)) + { + printf("\tSwizzle: '%c%c%c%c' (0x%.8X)\n", + (header.pf.bitcount >> 0) & 0xFF, + (header.pf.bitcount >> 8) & 0xFF, + (header.pf.bitcount >> 16) & 0xFF, + (header.pf.bitcount >> 24) & 0xFF, + header.pf.bitcount); + } + else + { + printf("\tBit count: %d\n", header.pf.bitcount); + } + + printf("\tRed mask: 0x%.8X\n", header.pf.rmask); + printf("\tGreen mask: 0x%.8X\n", header.pf.gmask); + printf("\tBlue mask: 0x%.8X\n", header.pf.bmask); + printf("\tAlpha mask: 0x%.8X\n", header.pf.amask); + + printf("Caps:\n"); + printf("\tCaps 1: 0x%.8X\n", header.caps.caps1); + if (header.caps.caps1 & DDSCAPS_COMPLEX) printf("\t\tDDSCAPS_COMPLEX\n"); + if (header.caps.caps1 & DDSCAPS_TEXTURE) printf("\t\tDDSCAPS_TEXTURE\n"); + if (header.caps.caps1 & DDSCAPS_MIPMAP) printf("\t\tDDSCAPS_MIPMAP\n"); + + printf("\tCaps 2: 0x%.8X\n", header.caps.caps2); + if (header.caps.caps2 & DDSCAPS2_VOLUME) printf("\t\tDDSCAPS2_VOLUME\n"); + else if (header.caps.caps2 & DDSCAPS2_CUBEMAP) + { + printf("\t\tDDSCAPS2_CUBEMAP\n"); + if ((header.caps.caps2 & DDSCAPS2_CUBEMAP_ALL_FACES) == DDSCAPS2_CUBEMAP_ALL_FACES) printf("\t\tDDSCAPS2_CUBEMAP_ALL_FACES\n"); + else { + if (header.caps.caps2 & DDSCAPS2_CUBEMAP_POSITIVEX) printf("\t\tDDSCAPS2_CUBEMAP_POSITIVEX\n"); + if (header.caps.caps2 & DDSCAPS2_CUBEMAP_NEGATIVEX) printf("\t\tDDSCAPS2_CUBEMAP_NEGATIVEX\n"); + if (header.caps.caps2 & DDSCAPS2_CUBEMAP_POSITIVEY) printf("\t\tDDSCAPS2_CUBEMAP_POSITIVEY\n"); + if (header.caps.caps2 & DDSCAPS2_CUBEMAP_NEGATIVEY) printf("\t\tDDSCAPS2_CUBEMAP_NEGATIVEY\n"); + if (header.caps.caps2 & DDSCAPS2_CUBEMAP_POSITIVEZ) printf("\t\tDDSCAPS2_CUBEMAP_POSITIVEZ\n"); + 
            if (header.caps.caps2 & DDSCAPS2_CUBEMAP_NEGATIVEZ) printf("\t\tDDSCAPS2_CUBEMAP_NEGATIVEZ\n");
+        }
+    }
+
+    printf("\tCaps 3: 0x%.8X\n", header.caps.caps3);
+    printf("\tCaps 4: 0x%.8X\n", header.caps.caps4);
+
+    if (header.hasDX10Header())
+    {
+        printf("DX10 Header:\n");
+        printf("\tDXGI Format: %u (%s)\n", header.header10.dxgiFormat, getDxgiFormatString((DXGI_FORMAT)header.header10.dxgiFormat));
+        printf("\tResource dimension: %u (%s)\n", header.header10.resourceDimension, getD3d10ResourceDimensionString((DDS_DIMENSION)header.header10.resourceDimension));
+        printf("\tMisc flag: %u\n", header.header10.miscFlag);
+        printf("\tArray size: %u\n", header.header10.arraySize);
+    }
+
+    if (header.reserved[9] == FOURCC_NVTT)
+    {
+        int major = (header.reserved[10] >> 16) & 0xFF;
+        int minor = (header.reserved[10] >> 8) & 0xFF;
+        int revision= header.reserved[10] & 0xFF;
+
+        printf("Version:\n");
+        printf("\tNVIDIA Texture Tools %d.%d.%d\n", major, minor, revision);
+    }
+
+    if (header.reserved[7] == FOURCC_UVER)
+    {
+        printf("User Version: %d\n", header.reserved[8]);
+    }
 }
Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/ErrorMetric.h
===================================================================
--- ps/trunk/libraries/source/nvtt/src/src/nvimage/ErrorMetric.h
+++ ps/trunk/libraries/source/nvtt/src/src/nvimage/ErrorMetric.h
@@ -0,0 +1,22 @@
+
+#include "nvimage.h"
+
+
+namespace nv
+{
+    class FloatImage;
+
+    NVIMAGE_API float rmsColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight);
+    NVIMAGE_API float rmsAlphaError(const FloatImage * ref, const FloatImage * img);
+
+    NVIMAGE_API float cieLabError(const FloatImage * ref, const FloatImage * img);
+    float cieLab94Error(const FloatImage * ref, const FloatImage * img);
+    float spatialCieLabError(const FloatImage * ref, const FloatImage * img);
+
+    float averageColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight);
+    float averageAlphaError(const FloatImage * ref, const FloatImage * img);
+
+    float averageAngularError(const FloatImage * img0, const FloatImage * img1);
+    NVIMAGE_API float rmsAngularError(const FloatImage * img0, const FloatImage * img1);
+
+} // nv namespace
Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/ErrorMetric.cpp
===================================================================
--- ps/trunk/libraries/source/nvtt/src/src/nvimage/ErrorMetric.cpp
+++ ps/trunk/libraries/source/nvtt/src/src/nvimage/ErrorMetric.cpp
@@ -0,0 +1,460 @@
+
+#include "ErrorMetric.h"
+#include "FloatImage.h"
+#include "Filter.h"
+
+#include "nvmath/Matrix.h"
+#include "nvmath/Vector.inl"
+
+#include <float.h> // FLT_MAX
+
+using namespace nv;
+
+float nv::rmsColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight)
+{
+    if (!sameLayout(img, ref)) {
+        return FLT_MAX;
+    }
+    nvDebugCheck(img->componentCount() == 4);
+    nvDebugCheck(ref->componentCount() == 4);
+
+    double mse = 0;
+
+    const uint count = img->pixelCount();
+    for (uint i = 0; i < count; i++)
+    {
+        float r0 = ref->pixel(i + count * 0);
+        float g0 = ref->pixel(i + count * 1);
+        float b0 = ref->pixel(i + count * 2);
+        float a0 = ref->pixel(i + count * 3);
+        float r1 = img->pixel(i + count * 0);
+        float g1 = img->pixel(i + count * 1);
+        float b1 = img->pixel(i + count * 2);
+        //float a1 = img->pixel(i + count * 3);
+
+        float r = r0 - r1;
+        float g = g0 - g1;
+        float b = b0 - b1;
+
+        float a = 1;
+        if (alphaWeight) a = a0 * a0; // @@ a0*a1 or a0*a0 ?
+ + mse += (r * r) * a; + mse += (g * g) * a; + mse += (b * b) * a; + } + + return float(sqrt(mse / count)); +} + +float nv::rmsAlphaError(const FloatImage * ref, const FloatImage * img) +{ + if (!sameLayout(img, ref)) { + return FLT_MAX; + } + nvDebugCheck(img->componentCount() == 4 && ref->componentCount() == 4); + + double mse = 0; + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) + { + float a0 = img->pixel(i + count * 3); + float a1 = ref->pixel(i + count * 3); + + float a = a0 - a1; + + mse += a * a; + } + + return float(sqrt(mse / count)); +} + + +float nv::averageColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight) +{ + if (!sameLayout(img, ref)) { + return FLT_MAX; + } + nvDebugCheck(img->componentCount() == 4); + nvDebugCheck(ref->componentCount() == 4); + + double mae = 0; + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) + { + float r0 = img->pixel(i + count * 0); + float g0 = img->pixel(i + count * 1); + float b0 = img->pixel(i + count * 2); + //float a0 = img->pixel(i + count * 3); + float r1 = ref->pixel(i + count * 0); + float g1 = ref->pixel(i + count * 1); + float b1 = ref->pixel(i + count * 2); + float a1 = ref->pixel(i + count * 3); + + float r = fabs(r0 - r1); + float g = fabs(g0 - g1); + float b = fabs(b0 - b1); + + float a = 1; + if (alphaWeight) a = a1; + + mae += r * a; + mae += g * a; + mae += b * a; + } + + return float(mae / count); +} + +float nv::averageAlphaError(const FloatImage * ref, const FloatImage * img) +{ + if (img == NULL || ref == NULL || img->width() != ref->width() || img->height() != ref->height()) { + return FLT_MAX; + } + nvDebugCheck(img->componentCount() == 4 && ref->componentCount() == 4); + + double mae = 0; + + const uint count = img->width() * img->height(); + for (uint i = 0; i < count; i++) + { + float a0 = img->pixel(i + count * 3); + float a1 = ref->pixel(i + count * 3); + + float a = a0 - a1; + + mae += fabs(a); + } + + return float(mae / count); +} + + +// Color space conversions based on: +// http://www.brucelindbloom.com/ + +// Assumes input is in *linear* sRGB color space. +static Vector3 rgbToXyz(Vector3::Arg c) +{ + Vector3 xyz; + xyz.x = 0.412453f * c.x + 0.357580f * c.y + 0.180423f * c.z; + xyz.y = 0.212671f * c.x + 0.715160f * c.y + 0.072169f * c.z; + xyz.z = 0.019334f * c.x + 0.119193f * c.y + 0.950227f * c.z; + return xyz; +} + +static Vector3 xyzToRgb(Vector3::Arg c) +{ + Vector3 rgb; + rgb.x = 3.2404542f * c.x - 1.5371385f * c.y - 0.4985314f * c.z; + rgb.y = -0.9692660f * c.x + 1.8760108f * c.y + 0.0415560f * c.z; + rgb.z = 0.0556434f * c.x - 0.2040259f * c.y + 1.0572252f * c.z; + return rgb; +} + +static float toLinear(float f) +{ + return powf(f, 2.2f); +} + +static float toGamma(float f) +{ + // @@ Use sRGB space? + return powf(f, 1.0f/2.2f); +} + +static Vector3 toLinear(Vector3::Arg c) +{ + return Vector3(toLinear(c.x), toLinear(c.y), toLinear(c.z)); +} + +static Vector3 toGamma(Vector3::Arg c) +{ + return Vector3(toGamma(c.x), toGamma(c.y), toGamma(c.z)); +} + +static float f(float t) +{ + const float epsilon = powf(6.0f/29.0f, 3); + + if (t > epsilon) { + return powf(t, 1.0f/3.0f); + } + else { + return 1.0f/3.0f * powf(29.0f/6.0f, 2) * t + 4.0f / 29.0f; + } +} + +static float finv(float t) +{ + if (t > 6.0f / 29.0f) { + return 3.0f * powf(6.0f / 29.0f, 2) * (t - 4.0f / 29.0f); + } + else { + return powf(t, 3.0f); + } +} + +static Vector3 xyzToCieLab(Vector3::Arg c) +{ + // Normalized white point. 
+ const float Xn = 0.950456f; + const float Yn = 1.0f; + const float Zn = 1.088754f; + + float Xr = c.x / Xn; + float Yr = c.y / Yn; + float Zr = c.z / Zn; + + float fx = f(Xr); + float fy = f(Yr); + float fz = f(Zr); + + float L = 116 * fx - 16; + float a = 500 * (fx - fy); + float b = 200 * (fy - fz); + + return Vector3(L, a, b); +} + +static Vector3 rgbToCieLab(Vector3::Arg c) +{ + return xyzToCieLab(rgbToXyz(toLinear(c))); +} + +// h is hue-angle in radians +static Vector3 cieLabToLCh(Vector3::Arg c) +{ + return Vector3(c.x, sqrtf(c.y*c.y + c.z*c.z), atan2f(c.y, c.z)); +} + +static void rgbToCieLab(const FloatImage * rgbImage, FloatImage * LabImage) +{ + nvDebugCheck(rgbImage != NULL && LabImage != NULL); + nvDebugCheck(rgbImage->width() == LabImage->width() && rgbImage->height() == LabImage->height()); + nvDebugCheck(rgbImage->componentCount() >= 3 && LabImage->componentCount() >= 3); + + const uint w = rgbImage->width(); + const uint h = LabImage->height(); + + const float * R = rgbImage->channel(0); + const float * G = rgbImage->channel(1); + const float * B = rgbImage->channel(2); + + float * L = LabImage->channel(0); + float * a = LabImage->channel(1); + float * b = LabImage->channel(2); + + const uint count = w*h; + for (uint i = 0; i < count; i++) + { + Vector3 Lab = rgbToCieLab(Vector3(R[i], G[i], B[i])); + L[i] = Lab.x; + a[i] = Lab.y; + b[i] = Lab.z; + } +} + + +// Assumes input images are in linear sRGB space. +float nv::cieLabError(const FloatImage * img0, const FloatImage * img1) +{ + if (!sameLayout(img0, img1)) return FLT_MAX; + nvDebugCheck(img0->componentCount() == 4 && img1->componentCount() == 4); + + const float * r0 = img0->channel(0); + const float * g0 = img0->channel(1); + const float * b0 = img0->channel(2); + + const float * r1 = img1->channel(0); + const float * g1 = img1->channel(1); + const float * b1 = img1->channel(2); + + double error = 0.0f; + + const uint count = img0->pixelCount(); + for (uint i = 0; i < count; i++) + { + Vector3 lab0 = rgbToCieLab(Vector3(r0[i], g0[i], b0[i])); + Vector3 lab1 = rgbToCieLab(Vector3(r1[i], g1[i], b1[i])); + + // @@ Measure Delta E. + Vector3 delta = lab0 - lab1; + + error += length(delta); + } + + return float(error / count); +} + +// Assumes input images are in linear sRGB space. 
+float nv::cieLab94Error(const FloatImage * img0, const FloatImage * img1) +{ + if (!sameLayout(img0, img1)) return FLT_MAX; + nvDebugCheck(img0->componentCount() == 4 && img1->componentCount() == 4); + + const float kL = 1; + const float kC = 1; + const float kH = 1; + const float k1 = 0.045f; + const float k2 = 0.015f; + + const float sL = 1; + + const float * r0 = img0->channel(0); + const float * g0 = img0->channel(1); + const float * b0 = img0->channel(2); + + const float * r1 = img1->channel(0); + const float * g1 = img1->channel(1); + const float * b1 = img1->channel(2); + + double error = 0.0f; + + const uint count = img0->pixelCount(); + for (uint i = 0; i < count; ++i) + { + Vector3 lab0 = rgbToCieLab(Vector3(r0[i], g0[i], b0[i])); + Vector3 lch0 = cieLabToLCh(lab0); + Vector3 lab1 = rgbToCieLab(Vector3(r1[i], g1[i], b1[i])); + Vector3 lch1 = cieLabToLCh(lab1); + + const float sC = 1 + k1*lch0.x; + const float sH = 1 + k2*lch0.x; + + // @@ Measure Delta E using the 1994 definition + Vector3 labDelta = lab0 - lab1; + Vector3 lchDelta = lch0 - lch1; + + double deltaLsq = powf(lchDelta.x / (kL*sL), 2); + double deltaCsq = powf(lchDelta.y / (kC*sC), 2); + + // avoid possible sqrt of negative value by computing (deltaH/(kH*sH))^2 + double deltaHsq = powf(labDelta.y, 2) + powf(labDelta.z, 2) - powf(lchDelta.y, 2); + deltaHsq /= powf(kH*sH, 2); + + error += sqrt(deltaLsq + deltaCsq + deltaHsq); + } + + return float(error / count); +} + +float nv::spatialCieLabError(const FloatImage * img0, const FloatImage * img1) +{ + if (img0 == NULL || img1 == NULL || img0->width() != img1->width() || img0->height() != img1->height()) { + return FLT_MAX; + } + nvDebugCheck(img0->componentCount() == 4 && img1->componentCount() == 4); + + uint w = img0->width(); + uint h = img0->height(); + uint d = img0->depth(); + + FloatImage lab0, lab1; // Original images in CIE-Lab space. + lab0.allocate(3, w, h, d); + lab1.allocate(3, w, h, d); + + // Convert input images to CIE-Lab. + rgbToCieLab(img0, &lab0); + rgbToCieLab(img1, &lab1); + + // @@ Convolve each channel by the corresponding filter. + /* + GaussianFilter LFilter(5); + GaussianFilter aFilter(5); + GaussianFilter bFilter(5); + + lab0.convolve(0, LFilter); + lab0.convolve(1, aFilter); + lab0.convolve(2, bFilter); + + lab1.convolve(0, LFilter); + lab1.convolve(1, aFilter); + lab1.convolve(2, bFilter); + */ + // @@ Measure Delta E between lab0 and lab1. + + return 0.0f; +} + + +// Assumes input images are normal maps. 
+float nv::averageAngularError(const FloatImage * img0, const FloatImage * img1) +{ + if (img0 == NULL || img1 == NULL || img0->width() != img1->width() || img0->height() != img1->height()) { + return FLT_MAX; + } + nvDebugCheck(img0->componentCount() == 4 && img1->componentCount() == 4); + + uint w = img0->width(); + uint h = img0->height(); + + const float * x0 = img0->channel(0); + const float * y0 = img0->channel(1); + const float * z0 = img0->channel(2); + + const float * x1 = img1->channel(0); + const float * y1 = img1->channel(1); + const float * z1 = img1->channel(2); + + double error = 0.0f; + + const uint count = w*h; + for (uint i = 0; i < count; i++) + { + Vector3 n0 = Vector3(x0[i], y0[i], z0[i]); + Vector3 n1 = Vector3(x1[i], y1[i], z1[i]); + + n0 = 2.0f * n0 - Vector3(1); + n1 = 2.0f * n1 - Vector3(1); + + n0 = normalizeSafe(n0, Vector3(0), 0.0f); + n1 = normalizeSafe(n1, Vector3(0), 0.0f); + + error += acos(clamp(dot(n0, n1), -1.0f, 1.0f)); + } + + return float(error / count); +} + +float nv::rmsAngularError(const FloatImage * img0, const FloatImage * img1) +{ + if (img0 == NULL || img1 == NULL || img0->width() != img1->width() || img0->height() != img1->height()) { + return FLT_MAX; + } + nvDebugCheck(img0->componentCount() == 4 && img1->componentCount() == 4); + + uint w = img0->width(); + uint h = img0->height(); + + const float * x0 = img0->channel(0); + const float * y0 = img0->channel(1); + const float * z0 = img0->channel(2); + + const float * x1 = img1->channel(0); + const float * y1 = img1->channel(1); + const float * z1 = img1->channel(2); + + double error = 0.0f; + + const uint count = w*h; + for (uint i = 0; i < count; i++) + { + Vector3 n0 = Vector3(x0[i], y0[i], z0[i]); + Vector3 n1 = Vector3(x1[i], y1[i], z1[i]); + + n0 = 2.0f * n0 - Vector3(1); + n1 = 2.0f * n1 - Vector3(1); + + n0 = normalizeSafe(n0, Vector3(0), 0.0f); + n1 = normalizeSafe(n1, Vector3(0), 0.0f); + + float angle = acosf(clamp(dot(n0, n1), -1.0f, 1.0f)); + error += angle * angle; + } + + return float(sqrt(error / count)); +} + Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/Filter.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/Filter.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/Filter.h @@ -1,218 +1,233 @@ // This code is in the public domain -- castanyo@yahoo.es +#pragma once #ifndef NV_IMAGE_FILTER_H #define NV_IMAGE_FILTER_H -#include -#include +#include "nvimage.h" +#include "nvcore/Debug.h" namespace nv { - class Vector4; + class Vector4; - /// Base filter class. - class NVIMAGE_CLASS Filter - { - public: - Filter(float width); - virtual ~Filter(); - - float width() const { return m_width; } - float sampleDelta(float x, float scale) const; - float sampleBox(float x, float scale, int samples) const; - float sampleTriangle(float x, float scale, int samples) const; - - virtual float evaluate(float x) const = 0; - - protected: - const float m_width; - }; - - // Box filter. - class NVIMAGE_CLASS BoxFilter : public Filter - { - public: - BoxFilter(); - BoxFilter(float width); - virtual float evaluate(float x) const; - }; - - // Triangle (bilinear/tent) filter. - class NVIMAGE_CLASS TriangleFilter : public Filter - { - public: - TriangleFilter(); - TriangleFilter(float width); - virtual float evaluate(float x) const; - }; - - // Quadratic (bell) filter. 
- class NVIMAGE_CLASS QuadraticFilter : public Filter - { - public: - QuadraticFilter(); - virtual float evaluate(float x) const; - }; - - // Cubic filter from Thatcher Ulrich. - class NVIMAGE_CLASS CubicFilter : public Filter - { - public: - CubicFilter(); - virtual float evaluate(float x) const; - }; - - // Cubic b-spline filter from Paul Heckbert. - class NVIMAGE_CLASS BSplineFilter : public Filter - { - public: - BSplineFilter(); - virtual float evaluate(float x) const; - }; - - /// Mitchell & Netravali's two-param cubic - /// @see "Reconstruction Filters in Computer Graphics", SIGGRAPH 88 - class NVIMAGE_CLASS MitchellFilter : public Filter - { - public: - MitchellFilter(); - virtual float evaluate(float x) const; - - void setParameters(float b, float c); - - private: - float p0, p2, p3; - float q0, q1, q2, q3; - }; - - // Lanczos3 filter. - class NVIMAGE_CLASS LanczosFilter : public Filter - { - public: - LanczosFilter(); - virtual float evaluate(float x) const; - }; - - // Sinc filter. - class NVIMAGE_CLASS SincFilter : public Filter - { - public: - SincFilter(float w); - virtual float evaluate(float x) const; - }; - - // Kaiser filter. - class NVIMAGE_CLASS KaiserFilter : public Filter - { - public: - KaiserFilter(float w); - virtual float evaluate(float x) const; - - void setParameters(float a, float stretch); - - private: - float alpha; - float stretch; - }; - - - - /// A 1D kernel. Used to precompute filter weights. - class NVIMAGE_CLASS Kernel1 - { - NV_FORBID_COPY(Kernel1); - public: - Kernel1(const Filter & f, int iscale, int samples = 32); - ~Kernel1(); - - float valueAt(uint x) const { - nvDebugCheck(x < (uint)m_windowSize); - return m_data[x]; - } - - int windowSize() const { - return m_windowSize; - } - - float width() const { - return m_width; - } - - void debugPrint(); - - private: - int m_windowSize; - float m_width; - float * m_data; - }; - - - /// A 2D kernel. - class NVIMAGE_CLASS Kernel2 - { - public: - Kernel2(uint width); - Kernel2(const Kernel2 & k); - ~Kernel2(); - - void normalize(); - void transpose(); - - float valueAt(uint x, uint y) const { - return m_data[y * m_windowSize + x]; - } - - uint windowSize() const { - return m_windowSize; - } - - void initLaplacian(); - void initEdgeDetection(); - void initSobel(); - void initPrewitt(); - - void initBlendedSobel(const Vector4 & scale); - - private: - const uint m_windowSize; - float * m_data; - }; - - - /// A 1D polyphase kernel - class NVIMAGE_CLASS PolyphaseKernel - { - NV_FORBID_COPY(PolyphaseKernel); - public: - PolyphaseKernel(const Filter & f, uint srcLength, uint dstLength, int samples = 32); - ~PolyphaseKernel(); - - int windowSize() const { - return m_windowSize; - } - - uint length() const { - return m_length; - } - - float width() const { - return m_width; - } - - float valueAt(uint column, uint x) const { - nvDebugCheck(column < m_length); - nvDebugCheck(x < (uint)m_windowSize); - return m_data[column * m_windowSize + x]; - } - - void debugPrint() const; - - private: - int m_windowSize; - uint m_length; - float m_width; - float * m_data; - }; + /// Base filter class. + class NVIMAGE_CLASS Filter + { + public: + Filter(float width); + virtual ~Filter(); + + float width() const { return m_width; } + float sampleDelta(float x, float scale) const; + float sampleBox(float x, float scale, int samples) const; + float sampleTriangle(float x, float scale, int samples) const; + + virtual float evaluate(float x) const = 0; + + protected: + const float m_width; + }; + + // Box filter. 
+ class NVIMAGE_CLASS BoxFilter : public Filter + { + public: + BoxFilter(); + BoxFilter(float width); + virtual float evaluate(float x) const; + }; + + // Triangle (bilinear/tent) filter. + class NVIMAGE_CLASS TriangleFilter : public Filter + { + public: + TriangleFilter(); + TriangleFilter(float width); + virtual float evaluate(float x) const; + }; + + // Quadratic (bell) filter. + class NVIMAGE_CLASS QuadraticFilter : public Filter + { + public: + QuadraticFilter(); + virtual float evaluate(float x) const; + }; + + // Cubic filter from Thatcher Ulrich. + class NVIMAGE_CLASS CubicFilter : public Filter + { + public: + CubicFilter(); + virtual float evaluate(float x) const; + }; + + // Cubic b-spline filter from Paul Heckbert. + class NVIMAGE_CLASS BSplineFilter : public Filter + { + public: + BSplineFilter(); + virtual float evaluate(float x) const; + }; + + /// Mitchell & Netravali's two-param cubic + /// @see "Reconstruction Filters in Computer Graphics", SIGGRAPH 88 + class NVIMAGE_CLASS MitchellFilter : public Filter + { + public: + MitchellFilter(); + virtual float evaluate(float x) const; + + void setParameters(float b, float c); + + private: + float p0, p2, p3; + float q0, q1, q2, q3; + }; + + // Lanczos3 filter. + class NVIMAGE_CLASS LanczosFilter : public Filter + { + public: + LanczosFilter(); + virtual float evaluate(float x) const; + }; + + // Sinc filter. + class NVIMAGE_CLASS SincFilter : public Filter + { + public: + SincFilter(float w); + virtual float evaluate(float x) const; + }; + + // Kaiser filter. + class NVIMAGE_CLASS KaiserFilter : public Filter + { + public: + KaiserFilter(float w); + virtual float evaluate(float x) const; + + void setParameters(float a, float stretch); + + private: + float alpha; + float stretch; + }; + + // Gaussian filter. + class GaussianFilter : public Filter + { + public: + GaussianFilter(float w); + virtual float evaluate(float x) const; + + void setParameters(float variance); + + private: + float variance; + }; + + + + /// A 1D kernel. Used to precompute filter weights. + class NVIMAGE_CLASS Kernel1 + { + NV_FORBID_COPY(Kernel1); + public: + Kernel1(const Filter & f, int iscale, int samples = 32); + ~Kernel1(); + + float valueAt(uint x) const { + nvDebugCheck(x < (uint)m_windowSize); + return m_data[x]; + } + + int windowSize() const { + return m_windowSize; + } + + float width() const { + return m_width; + } + + void debugPrint(); + + private: + int m_windowSize; + float m_width; + float * m_data; + }; + + + /// A 2D kernel. 
+ class NVIMAGE_CLASS Kernel2 + { + public: + Kernel2(uint width); + Kernel2(uint width, const float * data); + Kernel2(const Kernel2 & k); + ~Kernel2(); + + void normalize(); + void transpose(); + + float valueAt(uint x, uint y) const { + return m_data[y * m_windowSize + x]; + } + + uint windowSize() const { + return m_windowSize; + } + + void initLaplacian(); + void initEdgeDetection(); + void initSobel(); + void initPrewitt(); + + void initBlendedSobel(const Vector4 & scale); + + private: + const uint m_windowSize; + float * m_data; + }; + + + /// A 1D polyphase kernel + class NVIMAGE_CLASS PolyphaseKernel + { + NV_FORBID_COPY(PolyphaseKernel); + public: + PolyphaseKernel(const Filter & f, uint srcLength, uint dstLength, int samples = 32); + ~PolyphaseKernel(); + + int windowSize() const { + return m_windowSize; + } + + uint length() const { + return m_length; + } + + float width() const { + return m_width; + } + + float valueAt(uint column, uint x) const { + nvDebugCheck(column < m_length); + nvDebugCheck(x < (uint)m_windowSize); + return m_data[column * m_windowSize + x]; + } + + void debugPrint() const; + + private: + int m_windowSize; + uint m_length; + float m_width; + float * m_data; + }; } // nv namespace Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/Filter.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/Filter.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/Filter.cpp @@ -35,62 +35,64 @@ #include "Filter.h" -#include // Vector4 -#include // swap +#include "nvmath/Vector.h" // Vector4 +#include "nvcore/Utils.h" // swap + +#include // memset using namespace nv; namespace { - // Sinc function. - inline static float sincf(const float x) - { - if (fabs(x) < NV_EPSILON) { - //return 1.0; - return 1.0f + x*x*(-1.0f/6.0f + x*x*1.0f/120.0f); - } - else { - return sin(x) / x; - } - } - - // Bessel function of the first kind from Jon Blow's article. - // http://mathworld.wolfram.com/BesselFunctionoftheFirstKind.html - // http://en.wikipedia.org/wiki/Bessel_function - inline static float bessel0(float x) - { - const float EPSILON_RATIO = 1e-6f; - float xh, sum, pow, ds; - int k; - - xh = 0.5f * x; - sum = 1.0f; - pow = 1.0f; - k = 0; - ds = 1.0; - while (ds > sum * EPSILON_RATIO) { - ++k; - pow = pow * (xh / k); - ds = pow * pow; - sum = sum + ds; - } - - return sum; - } - - /*// Alternative bessel function from Paul Heckbert. - static float _bessel0(float x) - { - const float EPSILON_RATIO = 1E-6; - float sum = 1.0f; - float y = x * x / 4.0f; - float t = y; - for(int i = 2; t > EPSILON_RATIO; i++) { - sum += t; - t *= y / float(i * i); - } - return sum; - }*/ + // Sinc function. + inline static float sincf(const float x) + { + if (fabs(x) < NV_EPSILON) { + //return 1.0; + return 1.0f + x*x*(-1.0f/6.0f + x*x*1.0f/120.0f); + } + else { + return sin(x) / x; + } + } + + // Bessel function of the first kind from Jon Blow's article. + // http://mathworld.wolfram.com/BesselFunctionoftheFirstKind.html + // http://en.wikipedia.org/wiki/Bessel_function + inline static float bessel0(float x) + { + const float EPSILON_RATIO = 1e-6f; + float xh, sum, pow, ds; + int k; + + xh = 0.5f * x; + sum = 1.0f; + pow = 1.0f; + k = 0; + ds = 1.0; + while (ds > sum * EPSILON_RATIO) { + ++k; + pow = pow * (xh / k); + ds = pow * pow; + sum = sum + ds; + } + + return sum; + } + + /*// Alternative bessel function from Paul Heckbert. 
+ static float _bessel0(float x) + { + const float EPSILON_RATIO = 1E-6; + float sum = 1.0f; + float y = x * x / 4.0f; + float t = y; + for(int i = 2; t > EPSILON_RATIO; i++) { + sum += t; + t *= y / float(i * i); + } + return sum; + }*/ } // namespace @@ -105,42 +107,45 @@ float Filter::sampleDelta(float x, float scale) const { - return evaluate((x + 0.5f)* scale); + return evaluate((x + 0.5f)* scale); } float Filter::sampleBox(float x, float scale, int samples) const { - float sum = 0; - float isamples = 1.0f / float(samples); + double sum = 0; + float isamples = 1.0f / float(samples); + + for(int s = 0; s < samples; s++) + { + float p = (x + (float(s) + 0.5f) * isamples) * scale; + float value = evaluate(p); + + //printf("%f: %.8f (%X)\n", p, value, *(uint32 *)&value); - for(int s = 0; s < samples; s++) - { - float p = (x + (float(s) + 0.5f) * isamples) * scale; - float value = evaluate(p); - sum += value; - } - - return sum * isamples; + sum += value; + } + + return float(sum * isamples); } float Filter::sampleTriangle(float x, float scale, int samples) const { - float sum = 0; - float isamples = 1.0f / float(samples); + double sum = 0; + float isamples = 1.0f / float(samples); + + for(int s = 0; s < samples; s++) + { + float offset = (2 * float(s) + 1.0f) * isamples; + float p = (x + offset - 0.5f) * scale; + float value = evaluate(p); + + float weight = offset; + if (weight > 1.0f) weight = 2.0f - weight; - for(int s = 0; s < samples; s++) - { - float offset = (2 * float(s) + 1.0f) * isamples; - float p = (x + offset - 0.5f) * scale; - float value = evaluate(p); - - float weight = offset; - if (weight > 1.0f) weight = 2.0f - weight; - - sum += value * weight; - } - - return 2 * sum * isamples; + sum += value * weight; + } + + return float(2 * sum * isamples); } @@ -152,8 +157,8 @@ float BoxFilter::evaluate(float x) const { - if (fabs(x) <= m_width) return 1.0f; - else return 0.0f; + if (fabs(x) <= m_width) return 1.0f; + else return 0.0f; } @@ -162,7 +167,7 @@ float TriangleFilter::evaluate(float x) const { - x = fabs(x); + x = fabs(x); if( x < m_width ) return m_width - x; return 0.0f; } @@ -172,11 +177,11 @@ float QuadraticFilter::evaluate(float x) const { - x = fabs(x); + x = fabs(x); if( x < 0.5f ) return 0.75f - x * x; if( x < 1.5f ) { - float t = x - 1.5f; - return 0.5f * t * t; + float t = x - 1.5f; + return 0.5f * t * t; } return 0.0f; } @@ -186,10 +191,10 @@ float CubicFilter::evaluate(float x) const { - // f(t) = 2|t|^3 - 3|t|^2 + 1, -1 <= t <= 1 - x = fabs(x); - if( x < 1.0f ) return((2.0f * x - 3.0f) * x * x + 1.0f); - return 0.0f; + // f(t) = 2|t|^3 - 3|t|^2 + 1, -1 <= t <= 1 + x = fabs(x); + if( x < 1.0f ) return((2.0f * x - 3.0f) * x * x + 1.0f); + return 0.0f; } @@ -197,11 +202,11 @@ float BSplineFilter::evaluate(float x) const { - x = fabs(x); + x = fabs(x); if( x < 1.0f ) return (4.0f + x * x * (-6.0f + x * 3.0f)) / 6.0f; if( x < 2.0f ) { - float t = 2.0f - x; - return t * t * t / 6.0f; + float t = 2.0f - x; + return t * t * t / 6.0f; } return 0.0f; } @@ -211,21 +216,21 @@ float MitchellFilter::evaluate(float x) const { - x = fabs(x); - if( x < 1.0f ) return p0 + x * x * (p2 + x * p3); - if( x < 2.0f ) return q0 + x * (q1 + x * (q2 + x * q3)); - return 0.0f; + x = fabs(x); + if( x < 1.0f ) return p0 + x * x * (p2 + x * p3); + if( x < 2.0f ) return q0 + x * (q1 + x * (q2 + x * q3)); + return 0.0f; } void MitchellFilter::setParameters(float b, float c) { - p0 = (6.0f - 2.0f * b) / 6.0f; - p2 = (-18.0f + 12.0f * b + 6.0f * c) / 6.0f; - p3 = (12.0f - 9.0f * b - 6.0f * 
c) / 6.0f; - q0 = (8.0f * b + 24.0f * c) / 6.0f; - q1 = (-12.0f * b - 48.0f * c) / 6.0f; - q2 = (6.0f * b + 30.0f * c) / 6.0f; - q3 = (-b - 6.0f * c) / 6.0f; + p0 = (6.0f - 2.0f * b) / 6.0f; + p2 = (-18.0f + 12.0f * b + 6.0f * c) / 6.0f; + p3 = (12.0f - 9.0f * b - 6.0f * c) / 6.0f; + q0 = (8.0f * b + 24.0f * c) / 6.0f; + q1 = (-12.0f * b - 48.0f * c) / 6.0f; + q2 = (6.0f * b + 30.0f * c) / 6.0f; + q3 = (-b - 6.0f * c) / 6.0f; } @@ -233,9 +238,9 @@ float LanczosFilter::evaluate(float x) const { - x = fabs(x); - if( x < 3.0f ) return sincf(PI * x) * sincf(PI * x / 3.0f); - return 0.0f; + x = fabs(x); + if( x < 3.0f ) return sincf(PI * x) * sincf(PI * x / 3.0f); + return 0.0f; } @@ -243,172 +248,187 @@ float SincFilter::evaluate(float x) const { - return sincf(PI * x); + return sincf(PI * x); } -KaiserFilter::KaiserFilter(float w) : Filter(w) { setParameters(4.0f, 1.0f); } +KaiserFilter::KaiserFilter(float w) : Filter(w) { setParameters(/*alpha=*/4.0f, /*stretch=*/1.0f); } float KaiserFilter::evaluate(float x) const { - const float sinc_value = sincf(PI * x * stretch); - const float t = x / m_width; - if ((1 - t * t) >= 0) return sinc_value * bessel0(alpha * sqrtf(1 - t * t)) / bessel0(alpha); - else return 0; + const float sinc_value = sincf(PI * x * stretch); + const float t = x / m_width; + if ((1 - t * t) >= 0) return sinc_value * bessel0(alpha * sqrtf(1 - t * t)) / bessel0(alpha); + else return 0; } void KaiserFilter::setParameters(float alpha, float stretch) { - this->alpha = alpha; - this->stretch = stretch; + this->alpha = alpha; + this->stretch = stretch; +} + +GaussianFilter::GaussianFilter(float w) : Filter(w) { setParameters(1); } + +float GaussianFilter::evaluate(float x) const +{ + // variance = sigma^2 + return (1.0f / sqrtf(2 * PI * variance)) * expf(-x*x / (2 * variance)); +} + +void GaussianFilter::setParameters(float variance) +{ + this->variance = variance; } -/// Ctor. Kernel1::Kernel1(const Filter & f, int iscale, int samples/*= 32*/) { - nvDebugCheck(iscale > 1); - nvDebugCheck(samples > 0); - - const float scale = 1.0f / iscale; - - m_width = f.width() * iscale; - m_windowSize = (int)ceilf(2 * m_width); - m_data = new float[m_windowSize]; - - const float offset = float(m_windowSize) / 2; - - float total = 0.0f; - for (int i = 0; i < m_windowSize; i++) - { - const float sample = f.sampleBox(i - offset, scale, samples); - m_data[i] = sample; - total += sample; - } - - const float inv = 1.0f / total; - for (int i = 0; i < m_windowSize; i++) - { - m_data[i] *= inv; - } + nvDebugCheck(iscale > 1); + nvDebugCheck(samples > 0); + + const float scale = 1.0f / iscale; + + m_width = f.width() * iscale; + m_windowSize = (int)ceilf(2 * m_width); + m_data = new float[m_windowSize]; + + const float offset = float(m_windowSize) / 2; + + float total = 0.0f; + for (int i = 0; i < m_windowSize; i++) + { + const float sample = f.sampleBox(i - offset, scale, samples); + m_data[i] = sample; + total += sample; + } + + const float inv = 1.0f / total; + for (int i = 0; i < m_windowSize; i++) + { + m_data[i] *= inv; + } } -/// Dtor. Kernel1::~Kernel1() { - delete m_data; + delete [] m_data; } -/// Print the kernel for debugging purposes. +// Print the kernel for debugging purposes. void Kernel1::debugPrint() { - for (int i = 0; i < m_windowSize; i++) { - nvDebug("%d: %f\n", i, m_data[i]); - } + for (int i = 0; i < m_windowSize; i++) { + nvDebug("%d: %f\n", i, m_data[i]); + } } -/// Ctor. 
Kernel2::Kernel2(uint ws) : m_windowSize(ws) { - m_data = new float[m_windowSize * m_windowSize]; + m_data = new float[m_windowSize * m_windowSize]; +} + +Kernel2::Kernel2(uint ws, const float * data) : m_windowSize(ws) +{ + m_data = new float[m_windowSize * m_windowSize]; + + memcpy(m_data, data, sizeof(float) * m_windowSize * m_windowSize); } -/// Copy ctor. Kernel2::Kernel2(const Kernel2 & k) : m_windowSize(k.m_windowSize) { - m_data = new float[m_windowSize * m_windowSize]; - for (uint i = 0; i < m_windowSize * m_windowSize; i++) { - m_data[i] = k.m_data[i]; - } + m_data = new float[m_windowSize * m_windowSize]; + for (uint i = 0; i < m_windowSize * m_windowSize; i++) { + m_data[i] = k.m_data[i]; + } } -/// Dtor. Kernel2::~Kernel2() { - delete m_data; + delete [] m_data; } -/// Normalize the filter. +// Normalize the filter. void Kernel2::normalize() { - float total = 0.0f; - for(uint i = 0; i < m_windowSize*m_windowSize; i++) { - total += fabs(m_data[i]); - } - - float inv = 1.0f / total; - for(uint i = 0; i < m_windowSize*m_windowSize; i++) { - m_data[i] *= inv; - } + float total = 0.0f; + for(uint i = 0; i < m_windowSize*m_windowSize; i++) { + total += fabs(m_data[i]); + } + + float inv = 1.0f / total; + for(uint i = 0; i < m_windowSize*m_windowSize; i++) { + m_data[i] *= inv; + } } -/// Transpose the kernel. +// Transpose the kernel. void Kernel2::transpose() { - for(uint i = 0; i < m_windowSize; i++) { - for(uint j = i+1; j < m_windowSize; j++) { - swap(m_data[i*m_windowSize + j], m_data[j*m_windowSize + i]); - } - } + for(uint i = 0; i < m_windowSize; i++) { + for(uint j = i+1; j < m_windowSize; j++) { + swap(m_data[i*m_windowSize + j], m_data[j*m_windowSize + i]); + } + } } -/// Init laplacian filter, usually used for sharpening. +// Init laplacian filter, usually used for sharpening. void Kernel2::initLaplacian() { - nvDebugCheck(m_windowSize == 3); -// m_data[0] = -1; m_data[1] = -1; m_data[2] = -1; -// m_data[3] = -1; m_data[4] = +8; m_data[5] = -1; -// m_data[6] = -1; m_data[7] = -1; m_data[8] = -1; - - m_data[0] = +0; m_data[1] = -1; m_data[2] = +0; - m_data[3] = -1; m_data[4] = +4; m_data[5] = -1; - m_data[6] = +0; m_data[7] = -1; m_data[8] = +0; - -// m_data[0] = +1; m_data[1] = -2; m_data[2] = +1; -// m_data[3] = -2; m_data[4] = +4; m_data[5] = -2; -// m_data[6] = +1; m_data[7] = -2; m_data[8] = +1; + nvDebugCheck(m_windowSize == 3); + // m_data[0] = -1; m_data[1] = -1; m_data[2] = -1; + // m_data[3] = -1; m_data[4] = +8; m_data[5] = -1; + // m_data[6] = -1; m_data[7] = -1; m_data[8] = -1; + + m_data[0] = +0; m_data[1] = -1; m_data[2] = +0; + m_data[3] = -1; m_data[4] = +4; m_data[5] = -1; + m_data[6] = +0; m_data[7] = -1; m_data[8] = +0; + + // m_data[0] = +1; m_data[1] = -2; m_data[2] = +1; + // m_data[3] = -2; m_data[4] = +4; m_data[5] = -2; + // m_data[6] = +1; m_data[7] = -2; m_data[8] = +1; } -/// Init simple edge detection filter. +// Init simple edge detection filter. void Kernel2::initEdgeDetection() { - nvCheck(m_windowSize == 3); - m_data[0] = 0; m_data[1] = 0; m_data[2] = 0; - m_data[3] =-1; m_data[4] = 0; m_data[5] = 1; - m_data[6] = 0; m_data[7] = 0; m_data[8] = 0; + nvCheck(m_windowSize == 3); + m_data[0] = 0; m_data[1] = 0; m_data[2] = 0; + m_data[3] =-1; m_data[4] = 0; m_data[5] = 1; + m_data[6] = 0; m_data[7] = 0; m_data[8] = 0; } -/// Init sobel filter. +// Init sobel filter. 
void Kernel2::initSobel() { - if (m_windowSize == 3) - { - m_data[0] = -1; m_data[1] = 0; m_data[2] = 1; - m_data[3] = -2; m_data[4] = 0; m_data[5] = 2; - m_data[6] = -1; m_data[7] = 0; m_data[8] = 1; - } - else if (m_windowSize == 5) - { - float elements[] = { + if (m_windowSize == 3) + { + m_data[0] = -1; m_data[1] = 0; m_data[2] = 1; + m_data[3] = -2; m_data[4] = 0; m_data[5] = 2; + m_data[6] = -1; m_data[7] = 0; m_data[8] = 1; + } + else if (m_windowSize == 5) + { + float elements[] = { -1, -2, 0, 2, 1, -2, -3, 0, 3, 2, -3, -4, 0, 4, 3, -2, -3, 0, 3, 2, -1, -2, 0, 2, 1 - }; + }; - for (int i = 0; i < 5*5; i++) { - m_data[i] = elements[i]; - } - } - else if (m_windowSize == 7) - { - float elements[] = { + for (int i = 0; i < 5*5; i++) { + m_data[i] = elements[i]; + } + } + else if (m_windowSize == 7) + { + float elements[] = { -1, -2, -3, 0, 3, 2, 1, -2, -3, -4, 0, 4, 3, 2, -3, -4, -5, 0, 5, 4, 3, @@ -416,15 +436,15 @@ -3, -4, -5, 0, 5, 4, 3, -2, -3, -4, 0, 4, 3, 2, -1, -2, -3, 0, 3, 2, 1 - }; + }; - for (int i = 0; i < 7*7; i++) { - m_data[i] = elements[i]; - } - } - else if (m_windowSize == 9) - { - float elements[] = { + for (int i = 0; i < 7*7; i++) { + m_data[i] = elements[i]; + } + } + else if (m_windowSize == 9) + { + float elements[] = { -1, -2, -3, -4, 0, 4, 3, 2, 1, -2, -3, -4, -5, 0, 5, 4, 3, 2, -3, -4, -5, -6, 0, 6, 5, 4, 3, @@ -434,47 +454,47 @@ -3, -4, -5, -6, 0, 6, 5, 4, 3, -2, -3, -4, -5, 0, 5, 4, 3, 2, -1, -2, -3, -4, 0, 4, 3, 2, 1 - }; - - for (int i = 0; i < 9*9; i++) { - m_data[i] = elements[i]; - } - } + }; + + for (int i = 0; i < 9*9; i++) { + m_data[i] = elements[i]; + } + } } -/// Init prewitt filter. +// Init prewitt filter. void Kernel2::initPrewitt() { - if (m_windowSize == 3) - { - m_data[0] = -1; m_data[1] = 0; m_data[2] = -1; - m_data[3] = -1; m_data[4] = 0; m_data[5] = -1; - m_data[6] = -1; m_data[7] = 0; m_data[8] = -1; - } - else if (m_windowSize == 5) - { - // @@ Is this correct? - float elements[] = { + if (m_windowSize == 3) + { + m_data[0] = -1; m_data[1] = 0; m_data[2] = -1; + m_data[3] = -1; m_data[4] = 0; m_data[5] = -1; + m_data[6] = -1; m_data[7] = 0; m_data[8] = -1; + } + else if (m_windowSize == 5) + { + // @@ Is this correct? + float elements[] = { -2, -1, 0, 1, 2, -2, -1, 0, 1, 2, -2, -1, 0, 1, 2, -2, -1, 0, 1, 2, -2, -1, 0, 1, 2 - }; + }; - for (int i = 0; i < 5*5; i++) { - m_data[i] = elements[i]; - } - } + for (int i = 0; i < 5*5; i++) { + m_data[i] = elements[i]; + } + } } -/// Init blended sobel filter. +// Init blended sobel filter. 
void Kernel2::initBlendedSobel(const Vector4 & scale) { - nvCheck(m_windowSize == 9); + nvCheck(m_windowSize == 9); - { - const float elements[] = { + { + const float elements[] = { -1, -2, -3, -4, 0, 4, 3, 2, 1, -2, -3, -4, -5, 0, 5, 4, 3, 2, -3, -4, -5, -6, 0, 6, 5, 4, 3, @@ -484,14 +504,14 @@ -3, -4, -5, -6, 0, 6, 5, 4, 3, -2, -3, -4, -5, 0, 5, 4, 3, 2, -1, -2, -3, -4, 0, 4, 3, 2, 1 - }; - - for (int i = 0; i < 9*9; i++) { - m_data[i] = elements[i] * scale.w(); - } - } - { - const float elements[] = { + }; + + for (int i = 0; i < 9*9; i++) { + m_data[i] = elements[i] * scale.w; + } + } + { + const float elements[] = { -1, -2, -3, 0, 3, 2, 1, -2, -3, -4, 0, 4, 3, 2, -3, -4, -5, 0, 5, 4, 3, @@ -499,107 +519,109 @@ -3, -4, -5, 0, 5, 4, 3, -2, -3, -4, 0, 4, 3, 2, -1, -2, -3, 0, 3, 2, 1, - }; + }; - for (int i = 0; i < 7; i++) { - for (int e = 0; e < 7; e++) { - m_data[(i + 1) * 9 + e + 1] += elements[i * 7 + e] * scale.z(); - } - } - } - { - const float elements[] = { + for (int i = 0; i < 7; i++) { + for (int e = 0; e < 7; e++) { + m_data[(i + 1) * 9 + e + 1] += elements[i * 7 + e] * scale.z; + } + } + } + { + const float elements[] = { -1, -2, 0, 2, 1, -2, -3, 0, 3, 2, -3, -4, 0, 4, 3, -2, -3, 0, 3, 2, -1, -2, 0, 2, 1 - }; + }; - for (int i = 0; i < 5; i++) { - for (int e = 0; e < 5; e++) { - m_data[(i + 2) * 9 + e + 2] += elements[i * 5 + e] * scale.y(); - } - } - } - { - const float elements[] = { + for (int i = 0; i < 5; i++) { + for (int e = 0; e < 5; e++) { + m_data[(i + 2) * 9 + e + 2] += elements[i * 5 + e] * scale.y; + } + } + } + { + const float elements[] = { -1, 0, 1, -2, 0, 2, -1, 0, 1, - }; + }; - for (int i = 0; i < 3; i++) { - for (int e = 0; e < 3; e++) { - m_data[(i + 3) * 9 + e + 3] += elements[i * 3 + e] * scale.x(); - } - } - } + for (int i = 0; i < 3; i++) { + for (int e = 0; e < 3; e++) { + m_data[(i + 3) * 9 + e + 3] += elements[i * 3 + e] * scale.x; + } + } + } } PolyphaseKernel::PolyphaseKernel(const Filter & f, uint srcLength, uint dstLength, int samples/*= 32*/) { - nvDebugCheck(samples > 0); + nvDebugCheck(samples > 0); - float scale = float(dstLength) / float(srcLength); - const float iscale = 1.0f / scale; + float scale = float(dstLength) / float(srcLength); + const float iscale = 1.0f / scale; - if (scale > 1) { - // Upsampling. - samples = 1; - scale = 1; - } - - m_length = dstLength; - m_width = f.width() * iscale; - m_windowSize = (int)ceilf(m_width * 2) + 1; - - m_data = new float[m_windowSize * m_length]; - memset(m_data, 0, sizeof(float) * m_windowSize * m_length); - - for (uint i = 0; i < m_length; i++) - { - const float center = (0.5f + i) * iscale; - - const int left = (int)floorf(center - m_width); - const int right = (int)ceilf(center + m_width); - nvDebugCheck(right - left <= m_windowSize); - - float total = 0.0f; - for (int j = 0; j < m_windowSize; j++) - { - const float sample = f.sampleBox(left + j - center, scale, samples); - - m_data[i * m_windowSize + j] = sample; - total += sample; - } - - // normalize weights. - for (int j = 0; j < m_windowSize; j++) - { - m_data[i * m_windowSize + j] /= total; - } - } + if (scale > 1) { + // Upsampling. 
+ samples = 1; + scale = 1; + } + + m_length = dstLength; + m_width = f.width() * iscale; + m_windowSize = (int)ceilf(m_width * 2) + 1; + + m_data = new float[m_windowSize * m_length]; + memset(m_data, 0, sizeof(float) * m_windowSize * m_length); + + for (uint i = 0; i < m_length; i++) + { + const float center = (0.5f + i) * iscale; + + const int left = (int)floorf(center - m_width); + const int right = (int)ceilf(center + m_width); + nvDebugCheck(right - left <= m_windowSize); + + float total = 0.0f; + for (int j = 0; j < m_windowSize; j++) + { + const float sample = f.sampleBox(left + j - center, scale, samples); + + //printf("%f %X\n", sample, *(uint32 *)&sample); + + m_data[i * m_windowSize + j] = sample; + total += sample; + } + + // normalize weights. + for (int j = 0; j < m_windowSize; j++) + { + m_data[i * m_windowSize + j] /= total; + } + } } PolyphaseKernel::~PolyphaseKernel() { - delete [] m_data; + delete [] m_data; } -/// Print the kernel for debugging purposes. +// Print the kernel for debugging purposes. void PolyphaseKernel::debugPrint() const { - for (uint i = 0; i < m_length; i++) - { - nvDebug("%d: ", i); - for (int j = 0; j < m_windowSize; j++) - { - nvDebug(" %6.4f", m_data[i * m_windowSize + j]); - } - nvDebug("\n"); - } + for (uint i = 0; i < m_length; i++) + { + nvDebug("%d: ", i); + for (int j = 0; j < m_windowSize; j++) + { + nvDebug(" %6.4f", m_data[i * m_windowSize + j]); + } + nvDebug("\n"); + } } Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/FloatImage.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/FloatImage.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/FloatImage.h @@ -3,265 +3,392 @@ #ifndef NV_IMAGE_FLOATIMAGE_H #define NV_IMAGE_FLOATIMAGE_H -#include +#include "nvimage.h" -#include +#include "nvmath/nvmath.h" // lerp -#include -#include // clamp +#include "nvcore/Debug.h" +#include "nvcore/Utils.h" // clamp #include // abs namespace nv { -class Vector4; -class Matrix; -class Image; -class Filter; -class Kernel1; -class Kernel2; -class PolyphaseKernel; + class Vector4; + class Matrix; + class Image; + class Filter; + class Kernel1; + class Kernel2; + class PolyphaseKernel; + + /// Multicomponent floating point image class. + class FloatImage + { + public: + + enum WrapMode { + WrapMode_Clamp, + WrapMode_Repeat, + WrapMode_Mirror + }; + + NVIMAGE_API FloatImage(); + NVIMAGE_API FloatImage(const Image * img); + NVIMAGE_API virtual ~FloatImage(); + + /** @name Conversion. */ + //@{ + NVIMAGE_API void initFrom(const Image * img); + NVIMAGE_API Image * createImage(uint base_component = 0, uint num = 4) const; + NVIMAGE_API Image * createImageGammaCorrect(float gamma = 2.2f) const; + //@} + + /** @name Allocation. */ + //@{ + NVIMAGE_API void allocate(uint c, uint w, uint h, uint d = 1); + NVIMAGE_API void free(); // Does not clear members. + NVIMAGE_API void resizeChannelCount(uint c); + //@} + + /** @name Manipulation. 
*/ + //@{ + NVIMAGE_API void clear(float f = 0.0f); + NVIMAGE_API void clear(uint component, float f = 0.0f); + NVIMAGE_API void copyChannel(uint src, uint dst); + + NVIMAGE_API void normalize(uint base_component); + + NVIMAGE_API void packNormals(uint base_component); + NVIMAGE_API void expandNormals(uint base_component); + NVIMAGE_API void scaleBias(uint base_component, uint num, float scale, float add); + + NVIMAGE_API void clamp(uint base_component, uint num, float low, float high); + + NVIMAGE_API void toLinear(uint base_component, uint num, float gamma = 2.2f); + NVIMAGE_API void toGamma(uint base_component, uint num, float gamma = 2.2f); + NVIMAGE_API void exponentiate(uint base_component, uint num, float power); + + NVIMAGE_API void transform(uint base_component, const Matrix & m, const Vector4 & offset); + NVIMAGE_API void swizzle(uint base_component, uint r, uint g, uint b, uint a); + + NVIMAGE_API FloatImage * fastDownSample() const; + NVIMAGE_API FloatImage * downSample(const Filter & filter, WrapMode wm) const; + NVIMAGE_API FloatImage * downSample(const Filter & filter, WrapMode wm, uint alpha) const; + NVIMAGE_API FloatImage * resize(const Filter & filter, uint w, uint h, WrapMode wm) const; + NVIMAGE_API FloatImage * resize(const Filter & filter, uint w, uint h, uint d, WrapMode wm) const; + NVIMAGE_API FloatImage * resize(const Filter & filter, uint w, uint h, WrapMode wm, uint alpha) const; + NVIMAGE_API FloatImage * resize(const Filter & filter, uint w, uint h, uint d, WrapMode wm, uint alpha) const; + + NVIMAGE_API void convolve(const Kernel2 & k, uint c, WrapMode wm); + + //NVIMAGE_API FloatImage * downSample(const Kernel1 & filter, WrapMode wm) const; + //NVIMAGE_API FloatImage * downSample(const Kernel1 & filter, uint w, uint h, WrapMode wm) const; + //@} + + NVIMAGE_API float applyKernelXY(const Kernel2 * k, int x, int y, int z, uint c, WrapMode wm) const; + NVIMAGE_API float applyKernelX(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const; + NVIMAGE_API float applyKernelY(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const; + NVIMAGE_API float applyKernelZ(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const; + NVIMAGE_API void applyKernelX(const PolyphaseKernel & k, int y, int z, uint c, WrapMode wm, float * output) const; + NVIMAGE_API void applyKernelY(const PolyphaseKernel & k, int x, int z, uint c, WrapMode wm, float * output) const; + NVIMAGE_API void applyKernelZ(const PolyphaseKernel & k, int x, int y, uint c, WrapMode wm, float * output) const; + NVIMAGE_API void applyKernelX(const PolyphaseKernel & k, int y, int z, uint c, uint a, WrapMode wm, float * output) const; + NVIMAGE_API void applyKernelY(const PolyphaseKernel & k, int x, int z, uint c, uint a, WrapMode wm, float * output) const; + NVIMAGE_API void applyKernelZ(const PolyphaseKernel & k, int x, int y, uint c, uint a, WrapMode wm, float * output) const; + + + NVIMAGE_API void flipX(); + NVIMAGE_API void flipY(); + NVIMAGE_API void flipZ(); + + NVIMAGE_API float alphaTestCoverage(float alphaRef, int alphaChannel, float alphaScale = 1.0f) const; + NVIMAGE_API void scaleAlphaToCoverage(float coverage, float alphaRef, int alphaChannel); + + + uint width() const { return m_width; } + uint height() const { return m_height; } + uint depth() const { return m_depth; } + uint componentCount() const { return m_componentCount; } + uint floatCount() const { return m_floatCount; } + uint pixelCount() const { return m_pixelCount; } + + + /** @name Pixel access. 
*/ + //@{ + const float * channel(uint c) const; + float * channel(uint c); + + const float * plane(uint c, uint z) const; + float * plane(uint c, uint z); + + const float * scanline(uint c, uint y, uint z) const; + float * scanline(uint c, uint y, uint z); + + //float pixel(uint c, uint x, uint y) const; + //float & pixel(uint c, uint x, uint y); + + float pixel(uint c, uint x, uint y, uint z) const; + float & pixel(uint c, uint x, uint y, uint z); + + float pixel(uint c, uint idx) const; + float & pixel(uint c, uint idx); + + float pixel(uint idx) const; + float & pixel(uint idx); + + float sampleNearest(uint c, float x, float y, WrapMode wm) const; + float sampleLinear(uint c, float x, float y, WrapMode wm) const; + + float sampleNearest(uint c, float x, float y, float z, WrapMode wm) const; + float sampleLinear(uint c, float x, float y, float z, WrapMode wm) const; + + float sampleNearestClamp(uint c, float x, float y) const; + float sampleNearestRepeat(uint c, float x, float y) const; + float sampleNearestMirror(uint c, float x, float y) const; + + float sampleNearestClamp(uint c, float x, float y, float z) const; + float sampleNearestRepeat(uint c, float x, float y, float z) const; + float sampleNearestMirror(uint c, float x, float y, float z) const; + + NVIMAGE_API float sampleLinearClamp(uint c, float x, float y) const; + float sampleLinearRepeat(uint c, float x, float y) const; + float sampleLinearMirror(uint c, float x, float y) const; + + float sampleLinearClamp(uint c, float x, float y, float z) const; + float sampleLinearRepeat(uint c, float x, float y, float z) const; + float sampleLinearMirror(uint c, float x, float y, float z) const; + //@} + + + NVIMAGE_API FloatImage* clone() const; + + public: + + uint index(uint x, uint y, uint z) const; + uint indexClamp(int x, int y, int z) const; + uint indexRepeat(int x, int y, int z) const; + uint indexMirror(int x, int y, int z) const; + uint index(int x, int y, int z, WrapMode wm) const; + + float bilerp(uint c, int ix0, int iy0, int ix1, int iy1, float fx, float fy) const; + float trilerp(uint c, int ix0, int iy0, int iz0, int ix1, int iy1, int iz1, float fx, float fy, float fz) const; + + public: + + uint16 m_componentCount; + uint16 m_width; + uint16 m_height; + uint16 m_depth; + uint32 m_pixelCount; + uint32 m_floatCount; + float * m_mem; + + }; + + + /// Get const channel pointer. + inline const float * FloatImage::channel(uint c) const + { + nvDebugCheck(m_mem != NULL); + nvDebugCheck(c < m_componentCount); + return m_mem + c * m_pixelCount; + } + + /// Get channel pointer. + inline float * FloatImage::channel(uint c) { + nvDebugCheck(m_mem != NULL); + nvDebugCheck(c < m_componentCount); + return m_mem + c * m_pixelCount; + } + + inline const float * FloatImage::plane(uint c, uint z) const { + nvDebugCheck(z < m_depth); + return channel(c) + z * m_width * m_height; + } + + inline float * FloatImage::plane(uint c, uint z) { + nvDebugCheck(z < m_depth); + return channel(c) + z * m_width * m_height; + } + + /// Get const scanline pointer. + inline const float * FloatImage::scanline(uint c, uint y, uint z) const + { + nvDebugCheck(y < m_height); + return plane(c, z) + y * m_width; + } + + /// Get scanline pointer. + inline float * FloatImage::scanline(uint c, uint y, uint z) + { + nvDebugCheck(y < m_height); + return plane(c, z) + y * m_width; + } + + /// Get pixel component. 
+ inline float FloatImage::pixel(uint c, uint x, uint y, uint z) const + { + nvDebugCheck(m_mem != NULL); + nvDebugCheck(c < m_componentCount); + nvDebugCheck(x < m_width); + nvDebugCheck(y < m_height); + nvDebugCheck(z < m_depth); + return m_mem[c * m_pixelCount + index(x, y, z)]; + } + + /// Get pixel component. + inline float & FloatImage::pixel(uint c, uint x, uint y, uint z) + { + nvDebugCheck(m_mem != NULL); + nvDebugCheck(c < m_componentCount); + nvDebugCheck(x < m_width); + nvDebugCheck(y < m_height); + nvDebugCheck(z < m_depth); + return m_mem[c * m_pixelCount + index(x, y, z)]; + } + + /// Get pixel component. + inline float FloatImage::pixel(uint c, uint idx) const + { + nvDebugCheck(m_mem != NULL); + nvDebugCheck(c < m_componentCount); + nvDebugCheck(idx < m_pixelCount); + return m_mem[c * m_pixelCount + idx]; + } + + /// Get pixel component. + inline float & FloatImage::pixel(uint c, uint idx) + { + nvDebugCheck(m_mem != NULL); + nvDebugCheck(c < m_componentCount); + nvDebugCheck(idx < m_pixelCount); + return m_mem[c * m_pixelCount + idx]; + } + + /// Get pixel component. + inline float FloatImage::pixel(uint idx) const + { + nvDebugCheck(m_mem != NULL); + nvDebugCheck(idx < m_floatCount); + return m_mem[idx]; + } + + /// Get pixel component. + inline float & FloatImage::pixel(uint idx) + { + nvDebugCheck(m_mem != NULL); + nvDebugCheck(idx < m_floatCount); + return m_mem[idx]; + } + + inline uint FloatImage::index(uint x, uint y, uint z) const + { + nvDebugCheck(x < m_width); + nvDebugCheck(y < m_height); + nvDebugCheck(z < m_depth); + uint idx = (z * m_height + y) * m_width + x; + nvDebugCheck(idx < m_pixelCount); + return idx; + } + + + inline int wrapClamp(int x, int w) + { + return nv::clamp(x, 0, w - 1); + } + inline int wrapRepeat(int x, int w) + { + if (x >= 0) return x % w; + else return (x + 1) % w + w - 1; + } + inline int wrapMirror(int x, int w) + { + if (w == 1) x = 0; + + x = abs(x); + while (x >= w) { + x = abs(w + w - x - 2); + } + + return x; + } + + + + inline uint FloatImage::indexClamp(int x, int y, int z) const + { + x = wrapClamp(x, m_width); + y = wrapClamp(y, m_height); + z = wrapClamp(z, m_depth); + return index(x, y, z); + } + + + inline uint FloatImage::indexRepeat(int x, int y, int z) const + { + x = wrapRepeat(x, m_width); + y = wrapRepeat(y, m_height); + z = wrapRepeat(z, m_depth); + return index(x, y, z); + } + + inline uint FloatImage::indexMirror(int x, int y, int z) const + { + x = wrapMirror(x, m_width); + y = wrapMirror(y, m_height); + z = wrapMirror(z, m_depth); + return index(x, y, z); + } + + inline uint FloatImage::index(int x, int y, int z, WrapMode wm) const + { + if (wm == WrapMode_Clamp) return indexClamp(x, y, z); + if (wm == WrapMode_Repeat) return indexRepeat(x, y, z); + /*if (wm == WrapMode_Mirror)*/ return indexMirror(x, y, z); + } + + inline float FloatImage::bilerp(uint c, int ix0, int iy0, int ix1, int iy1, float fx, float fy) const { + int iz = 0; + float f1 = pixel(c, ix0, iy0, iz); + float f2 = pixel(c, ix1, iy0, iz); + float f3 = pixel(c, ix0, iy1, iz); + float f4 = pixel(c, ix1, iy1, iz); + + float i1 = lerp(f1, f2, fx); + float i2 = lerp(f3, f4, fx); + + return lerp(i1, i2, fy); + } + + inline float FloatImage::trilerp(uint c, int ix0, int iy0, int iz0, int ix1, int iy1, int iz1, float fx, float fy, float fz) const { + float f000 = pixel(c, ix0, iy0, iz0); + float f100 = pixel(c, ix1, iy0, iz0); + float f010 = pixel(c, ix0, iy1, iz0); + float f110 = pixel(c, ix1, iy1, iz0); + float f001 = pixel(c, ix0, iy0, iz1); + 
float f101 = pixel(c, ix1, iy0, iz1); + float f011 = pixel(c, ix0, iy1, iz1); + float f111 = pixel(c, ix1, iy1, iz1); + + float i1 = lerp(f000, f001, fz); + float i2 = lerp(f010, f011, fz); + float j1 = lerp(f100, f101, fz); + float j2 = lerp(f110, f111, fz); + + float w1 = lerp(i1, i2, fy); + float w2 = lerp(j1, j2, fy); + + return lerp(w1, w2, fx); + } + + // Does not compare channel count. + inline bool sameLayout(const FloatImage * img0, const FloatImage * img1) { + if (img0 == NULL || img1 == NULL) return false; + return img0->width() == img1->width() && img0->height() == img1->height() && img0->depth() == img1->depth(); + } -/// Multicomponent floating point image class. -class FloatImage -{ -public: - - enum WrapMode { - WrapMode_Clamp, - WrapMode_Repeat, - WrapMode_Mirror - }; - - NVIMAGE_API FloatImage(); - NVIMAGE_API FloatImage(const Image * img); - NVIMAGE_API virtual ~FloatImage(); - - /** @name Conversion. */ - //@{ - NVIMAGE_API void initFrom(const Image * img); - NVIMAGE_API Image * createImage(uint base_component = 0, uint num = 4) const; - NVIMAGE_API Image * createImageGammaCorrect(float gamma = 2.2f) const; - //@} - - /** @name Allocation. */ - //@{ - NVIMAGE_API void allocate(uint c, uint w, uint h); - NVIMAGE_API void free(); // Does not clear members. - //@} - - /** @name Manipulation. */ - //@{ - NVIMAGE_API void clear(float f=0.0f); - - NVIMAGE_API void normalize(uint base_component); - - NVIMAGE_API void packNormals(uint base_component); - NVIMAGE_API void expandNormals(uint base_component); - NVIMAGE_API void scaleBias(uint base_component, uint num, float scale, float add); - - //NVIMAGE_API void clamp(uint base_component, uint num); - NVIMAGE_API void clamp(float low, float high); - - NVIMAGE_API void toLinear(uint base_component, uint num, float gamma = 2.2f); - NVIMAGE_API void toGamma(uint base_component, uint num, float gamma = 2.2f); - NVIMAGE_API void exponentiate(uint base_component, uint num, float power); - - - NVIMAGE_API FloatImage * fastDownSample() const; - NVIMAGE_API FloatImage * downSample(const Filter & filter, WrapMode wm) const; - NVIMAGE_API FloatImage * downSample(const Filter & filter, WrapMode wm, uint alpha) const; - NVIMAGE_API FloatImage * resize(const Filter & filter, uint w, uint h, WrapMode wm) const; - - NVIMAGE_API FloatImage * resize(const Filter & filter, uint w, uint h, WrapMode wm, uint alpha) const; - //@} - - NVIMAGE_API float applyKernel(const Kernel2 * k, int x, int y, uint c, WrapMode wm) const; - NVIMAGE_API float applyKernelVertical(const Kernel1 * k, int x, int y, uint c, WrapMode wm) const; - NVIMAGE_API float applyKernelHorizontal(const Kernel1 * k, int x, int y, uint c, WrapMode wm) const; - NVIMAGE_API void applyKernelVertical(const PolyphaseKernel & k, int x, uint c, WrapMode wm, float * output) const; - NVIMAGE_API void applyKernelHorizontal(const PolyphaseKernel & k, int y, uint c, WrapMode wm, float * output) const; - NVIMAGE_API void applyKernelVertical(const PolyphaseKernel & k, int x, uint c, uint a, WrapMode wm, float * output) const; - NVIMAGE_API void applyKernelHorizontal(const PolyphaseKernel & k, int y, uint c, uint a, WrapMode wm, float * output) const; - - - uint width() const { return m_width; } - uint height() const { return m_height; } - uint componentNum() const { return m_componentNum; } - uint count() const { return m_count; } - - - /** @name Pixel access. 
*/ - //@{ - const float * channel(uint c) const; - float * channel(uint c); - - const float * scanline(uint y, uint c) const; - float * scanline(uint y, uint c); - - void setPixel(float f, uint x, uint y, uint c); - void addPixel(float f, uint x, uint y, uint c); - float pixel(uint x, uint y, uint c) const; - - void setPixel(float f, uint idx); - float pixel(uint idx) const; - - float sampleNearest(float x, float y, int c, WrapMode wm) const; - float sampleLinear(float x, float y, int c, WrapMode wm) const; - - float sampleNearestClamp(float x, float y, int c) const; - float sampleNearestRepeat(float x, float y, int c) const; - float sampleNearestMirror(float x, float y, int c) const; - - float sampleLinearClamp(float x, float y, int c) const; - float sampleLinearRepeat(float x, float y, int c) const; - float sampleLinearMirror(float x, float y, int c) const; - //@} - - - FloatImage* clone() const; - -public: - - uint index(uint x, uint y) const; - uint indexClamp(int x, int y) const; - uint indexRepeat(int x, int y) const; - uint indexMirror(int x, int y) const; - uint index(int x, int y, WrapMode wm) const; - -public: - - uint16 m_width; ///< Width of the texture. - uint16 m_height; ///< Height of the texture. - uint32 m_componentNum; ///< Number of components. - uint32 m_count; ///< Image pixel count. - float * m_mem; - -}; - - -/// Get const channel pointer. -inline const float * FloatImage::channel(uint c) const -{ - nvDebugCheck(m_mem != NULL); - nvDebugCheck(c < m_componentNum); - return m_mem + c * m_width * m_height; -} - -/// Get channel pointer. -inline float * FloatImage::channel(uint c) { - nvDebugCheck(m_mem != NULL); - nvDebugCheck(c < m_componentNum); - return m_mem + c * m_width * m_height; -} - -/// Get const scanline pointer. -inline const float * FloatImage::scanline(uint y, uint c) const -{ - nvDebugCheck(y < m_height); - return channel(c) + y * m_width; -} - -/// Get scanline pointer. -inline float * FloatImage::scanline(uint y, uint c) -{ - nvDebugCheck(y < m_height); - return channel(c) + y * m_width; -} - -/// Set pixel component. -inline void FloatImage::setPixel(float f, uint x, uint y, uint c) -{ - nvDebugCheck(m_mem != NULL); - nvDebugCheck(x < m_width); - nvDebugCheck(y < m_height); - nvDebugCheck(c < m_componentNum); - m_mem[(c * m_height + y) * m_width + x] = f; -} - -/// Add to pixel component. -inline void FloatImage::addPixel(float f, uint x, uint y, uint c) -{ - nvDebugCheck(m_mem != NULL); - nvDebugCheck(x < m_width); - nvDebugCheck(y < m_height); - nvDebugCheck(c < m_componentNum); - m_mem[(c * m_height + y) * m_width + x] += f; -} - -/// Get pixel component. -inline float FloatImage::pixel(uint x, uint y, uint c) const -{ - nvDebugCheck(m_mem != NULL); - nvDebugCheck(x < m_width); - nvDebugCheck(y < m_height); - nvDebugCheck(c < m_componentNum); - return m_mem[(c * m_height + y) * m_width + x]; -} - -/// Set pixel component. -inline void FloatImage::setPixel(float f, uint idx) -{ - nvDebugCheck(idx < m_count); - m_mem[idx] = f; -} - -/// Get pixel component. 
-inline float FloatImage::pixel(uint idx) const -{ - nvDebugCheck(idx < m_count); - return m_mem[idx]; -} - -inline uint FloatImage::index(uint x, uint y) const -{ - nvDebugCheck(x < m_width); - nvDebugCheck(y < m_height); - return y * m_width + x; -} - -inline uint FloatImage::indexClamp(int x, int y) const -{ - return nv::clamp(y, int(0), int(m_height-1)) * m_width + nv::clamp(x, int(0), int(m_width-1)); -} - -inline int repeat_remainder(int a, int b) -{ - if (a >= 0) return a % b; - else return (a + 1) % b + b - 1; -} - -inline uint FloatImage::indexRepeat(int x, int y) const -{ - return repeat_remainder(y, m_height) * m_width + repeat_remainder(x, m_width); -} - -inline uint FloatImage::indexMirror(int x, int y) const -{ - if (m_width == 1) x = 0; - - x = abs(x); - while (x >= m_width) { - x = abs(m_width + m_width - x - 2); - } - - if (m_height == 1) y = 0; - - y = abs(y); - while (y >= m_height) { - y = abs(m_height + m_height - y - 2); - } - - return index(x, y); -} - -inline uint FloatImage::index(int x, int y, WrapMode wm) const -{ - if (wm == WrapMode_Clamp) return indexClamp(x, y); - if (wm == WrapMode_Repeat) return indexRepeat(x, y); - /*if (wm == WrapMode_Mirror)*/ return indexMirror(x, y); -} } // nv namespace Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/FloatImage.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/FloatImage.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/FloatImage.cpp @@ -4,355 +4,530 @@ #include "Filter.h" #include "Image.h" -#include -#include - -#include -#include +#include "nvmath/Color.h" +#include "nvmath/Vector.inl" +#include "nvmath/Matrix.inl" +#include "nvmath/ftoi.h" +#include "nvmath/Gamma.h" + +#include "nvcore/Utils.h" // max +#include "nvcore/Ptr.h" +#include "nvcore/Memory.h" +#include "nvcore/Array.inl" #include +#include // memset, memcpy using namespace nv; -namespace -{ - static int iround(float f) - { - return int(f); - } - - static int ifloor(float f) - { - return int(floor(f)); - } - - static float frac(float f) - { - return f - floor(f); - } - - static int mirror(int x, int w) - { - x = abs(x); - while (x >= w) { - x = 2 * w - x - 2; - } - return x; - } -} - /// Ctor. -FloatImage::FloatImage() : m_width(0), m_height(0), - m_componentNum(0), m_count(0), m_mem(NULL) +FloatImage::FloatImage() : m_componentCount(0), m_width(0), m_height(0), m_depth(0), + m_pixelCount(0), m_floatCount(0), m_mem(NULL) { } /// Ctor. Init from image. -FloatImage::FloatImage(const Image * img) : m_width(0), m_height(0), - m_componentNum(0), m_count(0), m_mem(NULL) +FloatImage::FloatImage(const Image * img) : m_componentCount(0), m_width(0), m_height(0), m_depth(0), + m_pixelCount(0), m_floatCount(0), m_mem(NULL) { - initFrom(img); + initFrom(img); } /// Dtor. FloatImage::~FloatImage() { - free(); + free(); } /// Init the floating point image from a regular image. 
void FloatImage::initFrom(const Image * img) { - nvCheck(img != NULL); - - allocate(4, img->width(), img->height()); - - float * red_channel = channel(0); - float * green_channel = channel(1); - float * blue_channel = channel(2); - float * alpha_channel = channel(3); - - const uint count = m_width * m_height; - for(uint i = 0; i < count; i++) { - Color32 pixel = img->pixel(i); - red_channel[i] = float(pixel.r) / 255.0f; - green_channel[i] = float(pixel.g) / 255.0f; - blue_channel[i] = float(pixel.b) / 255.0f; - alpha_channel[i] = float(pixel.a) / 255.0f; - } + nvCheck(img != NULL); + + allocate(4, img->width(), img->height(), img->depth()); + + float * red_channel = channel(0); + float * green_channel = channel(1); + float * blue_channel = channel(2); + float * alpha_channel = channel(3); + + const uint count = m_pixelCount; + for (uint i = 0; i < count; i++) { + Color32 pixel = img->pixel(i); + red_channel[i] = float(pixel.r) / 255.0f; + green_channel[i] = float(pixel.g) / 255.0f; + blue_channel[i] = float(pixel.b) / 255.0f; + alpha_channel[i] = float(pixel.a) / 255.0f; + } } /// Convert the floating point image to a regular image. -Image * FloatImage::createImage(uint base_component/*= 0*/, uint num/*= 4*/) const +Image * FloatImage::createImage(uint baseComponent/*= 0*/, uint num/*= 4*/) const { - nvCheck(num <= 4); - nvCheck(base_component + num <= m_componentNum); - - AutoPtr img(new Image()); - img->allocate(m_width, m_height); - - const uint size = m_width * m_height; - for(uint i = 0; i < size; i++) { - - uint c; - uint8 rgba[4]= {0, 0, 0, 0xff}; - - for(c = 0; c < num; c++) { - float f = m_mem[size * (base_component + c) + i]; - rgba[c] = nv::clamp(int(255.0f * f), 0, 255); - } - - img->pixel(i) = Color32(rgba[0], rgba[1], rgba[2], rgba[3]); - } - - return img.release(); + nvCheck(num <= 4); + nvCheck(baseComponent + num <= m_componentCount); + + AutoPtr img(new Image()); + img->allocate(m_width, m_height, m_depth); + + for (uint i = 0; i < m_pixelCount; i++) { + + uint c; + uint8 rgba[4]= {0, 0, 0, 0xff}; + + for (c = 0; c < num; c++) { + float f = pixel(baseComponent + c, i); + rgba[c] = nv::clamp(int(255.0f * f), 0, 255); + } + + img->pixel(i) = Color32(rgba[0], rgba[1], rgba[2], rgba[3]); + } + + return img.release(); } /// Convert the floating point image to a regular image. Correct gamma of rgb, but not alpha. Image * FloatImage::createImageGammaCorrect(float gamma/*= 2.2f*/) const { - nvCheck(m_componentNum == 4); - - AutoPtr img(new Image()); - img->allocate(m_width, m_height); - - const float * rChannel = this->channel(0); - const float * gChannel = this->channel(1); - const float * bChannel = this->channel(2); - const float * aChannel = this->channel(3); - - const uint size = m_width * m_height; - for(uint i = 0; i < size; i++) - { - const uint8 r = nv::clamp(int(255.0f * pow(rChannel[i], 1.0f/gamma)), 0, 255); - const uint8 g = nv::clamp(int(255.0f * pow(gChannel[i], 1.0f/gamma)), 0, 255); - const uint8 b = nv::clamp(int(255.0f * pow(bChannel[i], 1.0f/gamma)), 0, 255); - const uint8 a = nv::clamp(int(255.0f * aChannel[i]), 0, 255); - - img->pixel(i) = Color32(r, g, b, a); - } - - return img.release(); -} - -/// Allocate a 2d float image of the given format and the given extents. 
-void FloatImage::allocate(uint c, uint w, uint h) -{ - free(); - - m_width = w; - m_height = h; - m_componentNum = c; - m_count = w * h * c; - m_mem = reinterpret_cast(::malloc(m_count * sizeof(float))); + nvCheck(m_componentCount == 4); + + AutoPtr img(new Image()); + img->allocate(m_width, m_height, m_depth); + + const float * rChannel = this->channel(0); + const float * gChannel = this->channel(1); + const float * bChannel = this->channel(2); + const float * aChannel = this->channel(3); + + const uint count = m_pixelCount; + for (uint i = 0; i < count; i++) + { + const uint8 r = nv::clamp(int(255.0f * pow(rChannel[i], 1.0f/gamma)), 0, 255); + const uint8 g = nv::clamp(int(255.0f * pow(gChannel[i], 1.0f/gamma)), 0, 255); + const uint8 b = nv::clamp(int(255.0f * pow(bChannel[i], 1.0f/gamma)), 0, 255); + const uint8 a = nv::clamp(int(255.0f * aChannel[i]), 0, 255); + + img->pixel(i) = Color32(r, g, b, a); + } + + return img.release(); +} + +/// Allocate a 2D float image of the given format and the given extents. +void FloatImage::allocate(uint c, uint w, uint h, uint d) +{ + if (m_componentCount != c || m_width != w || m_height != h || m_depth != d) + { + free(); + + m_width = w; + m_height = h; + m_depth = d; + m_componentCount = c; + m_pixelCount = w * h * d; + m_floatCount = m_pixelCount * c; + m_mem = malloc(m_floatCount); + } } /// Free the image, but don't clear the members. void FloatImage::free() { - ::free( reinterpret_cast(m_mem) ); - m_mem = NULL; + ::free(m_mem); + m_mem = NULL; +} + +void FloatImage::resizeChannelCount(uint c) +{ + if (m_componentCount != c) { + uint count = m_pixelCount * c; + m_mem = realloc(m_mem, count); + + if (c > m_componentCount) { + memset(m_mem + m_floatCount, 0, (count - m_floatCount) * sizeof(float)); + } + + m_componentCount = c; + m_floatCount = count; + } } void FloatImage::clear(float f/*=0.0f*/) { - for(uint i = 0; i < m_count; i++) { - m_mem[i] = f; - } + for (uint i = 0; i < m_floatCount; i++) { + m_mem[i] = f; + } } -void FloatImage::normalize(uint base_component) +void FloatImage::clear(uint c, float f/*= 0.0f*/) { - nvCheck(base_component + 3 <= m_componentNum); - - float * xChannel = this->channel(base_component + 0); - float * yChannel = this->channel(base_component + 1); - float * zChannel = this->channel(base_component + 2); + float * channel = this->channel(c); - const uint size = m_width * m_height; - for(uint i = 0; i < size; i++) { - - Vector3 normal(xChannel[i], yChannel[i], zChannel[i]); - normal = normalizeSafe(normal, Vector3(zero), 0.0f); - - xChannel[i] = normal.x(); - yChannel[i] = normal.y(); - zChannel[i] = normal.z(); - } + const uint count = m_pixelCount; + for (uint i = 0; i < count; i++) { + channel[i] = f; + } } -void FloatImage::packNormals(uint base_component) +void FloatImage::copyChannel(uint src, uint dst) { - scaleBias(base_component, 3, 0.5f, 1.0f); + nvCheck(src < m_componentCount); + nvCheck(dst < m_componentCount); + + const float * srcChannel = this->channel(src); + float * dstChannel = this->channel(dst); + + memcpy(dstChannel, srcChannel, sizeof(float)*m_pixelCount); } -void FloatImage::expandNormals(uint base_component) +void FloatImage::normalize(uint baseComponent) { - scaleBias(base_component, 3, 2, -0.5); + nvCheck(baseComponent + 3 <= m_componentCount); + + float * xChannel = this->channel(baseComponent + 0); + float * yChannel = this->channel(baseComponent + 1); + float * zChannel = this->channel(baseComponent + 2); + + const uint count = m_pixelCount; + for (uint i = 0; i < count; i++) { + + 
Vector3 normal(xChannel[i], yChannel[i], zChannel[i]); + normal = normalizeSafe(normal, Vector3(0), 0.0f); + + xChannel[i] = normal.x; + yChannel[i] = normal.y; + zChannel[i] = normal.z; + } +} + +void FloatImage::packNormals(uint baseComponent) +{ + scaleBias(baseComponent, 3, 0.5f, 0.5f); +} + +void FloatImage::expandNormals(uint baseComponent) +{ + scaleBias(baseComponent, 3, 2, -1.0); } -void FloatImage::scaleBias(uint base_component, uint num, float scale, float bias) +void FloatImage::scaleBias(uint baseComponent, uint num, float scale, float bias) { - const uint size = m_width * m_height; - - for(uint c = 0; c < num; c++) { - float * ptr = this->channel(base_component + c); - - for(uint i = 0; i < size; i++) { - ptr[i] = scale * (ptr[i] + bias); - } - } + const uint size = m_pixelCount; + + for (uint c = 0; c < num; c++) { + float * ptr = this->channel(baseComponent + c); + + for (uint i = 0; i < size; i++) { + ptr[i] = scale * ptr[i] + bias; + } + } } /// Clamp the elements of the image. -void FloatImage::clamp(float low, float high) +void FloatImage::clamp(uint baseComponent, uint num, float low, float high) { - for(uint i = 0; i < m_count; i++) { - m_mem[i] = nv::clamp(m_mem[i], low, high); - } + const uint size = m_pixelCount; + + for (uint c = 0; c < num; c++) { + float * ptr = this->channel(baseComponent + c); + + for (uint i = 0; i < size; i++) { + ptr[i] = nv::clamp(ptr[i], low, high); + } + } } /// From gamma to linear space. -void FloatImage::toLinear(uint base_component, uint num, float gamma /*= 2.2f*/) +void FloatImage::toLinear(uint baseComponent, uint num, float gamma /*= 2.2f*/) { - exponentiate(base_component, num, gamma); + if (gamma == 2.2f) { + for (uint c = 0; c < num; c++) { + float * ptr = this->channel(baseComponent + c); + + powf_11_5(ptr, ptr, m_pixelCount); + } + } else { + exponentiate(baseComponent, num, gamma); + } } /// From linear to gamma space. -void FloatImage::toGamma(uint base_component, uint num, float gamma /*= 2.2f*/) +void FloatImage::toGamma(uint baseComponent, uint num, float gamma /*= 2.2f*/) { - exponentiate(base_component, num, 1.0f/gamma); + if (gamma == 2.2f) { + for (uint c = 0; c < num; c++) { + float * ptr = this->channel(baseComponent + c); + + powf_5_11(ptr, ptr, m_pixelCount); + } + } else { + exponentiate(baseComponent, num, 1.0f/gamma); + } } /// Exponentiate the elements of the image. -void FloatImage::exponentiate(uint base_component, uint num, float power) +void FloatImage::exponentiate(uint baseComponent, uint num, float power) +{ + const uint size = m_pixelCount; + + for(uint c = 0; c < num; c++) { + float * ptr = this->channel(baseComponent + c); + + for(uint i = 0; i < size; i++) { + ptr[i] = powf(max(0.0f, ptr[i]), power); + } + } +} + +/// Apply linear transform. 
+void FloatImage::transform(uint baseComponent, const Matrix & m, Vector4::Arg offset) +{ + nvCheck(baseComponent + 4 <= m_componentCount); + + float * r = this->channel(baseComponent + 0); + float * g = this->channel(baseComponent + 1); + float * b = this->channel(baseComponent + 2); + float * a = this->channel(baseComponent + 3); + + const uint size = m_pixelCount; + for (uint i = 0; i < size; i++) + { + Vector4 color = nv::transform(m, Vector4(*r, *g, *b, *a)) + offset; + + *r++ = color.x; + *g++ = color.y; + *b++ = color.z; + *a++ = color.w; + } +} + +void FloatImage::swizzle(uint baseComponent, uint r, uint g, uint b, uint a) +{ + nvCheck(baseComponent + 4 <= m_componentCount); + nvCheck(r < 7 && g < 7 && b < 7 && a < 7); + + float consts[] = { 1.0f, 0.0f, -1.0f }; + float * c[7]; + c[0] = this->channel(baseComponent + 0); + c[1] = this->channel(baseComponent + 1); + c[2] = this->channel(baseComponent + 2); + c[3] = this->channel(baseComponent + 3); + c[4] = consts; + c[5] = consts + 1; + c[6] = consts + 2; + + const uint size = m_pixelCount; + for (uint i = 0; i < size; i++) + { + float tmp[4] = { *c[r], *c[g], *c[b], *c[a] }; + + *c[0]++ = tmp[0]; + *c[1]++ = tmp[1]; + *c[2]++ = tmp[2]; + *c[3]++ = tmp[3]; + } +} + +float FloatImage::sampleNearest(uint c, float x, float y, const WrapMode wm) const { - const uint size = m_width * m_height; + if( wm == WrapMode_Clamp ) return sampleNearestClamp(c, x, y); + else if( wm == WrapMode_Repeat ) return sampleNearestRepeat(c, x, y); + else /*if( wm == WrapMode_Mirror )*/ return sampleNearestMirror(c, x, y); +} - for(uint c = 0; c < num; c++) { - float * ptr = this->channel(base_component + c); - - for(uint i = 0; i < size; i++) { - ptr[i] = pow(ptr[i], power); - } - } +float FloatImage::sampleLinear(uint c, float x, float y, WrapMode wm) const +{ + if( wm == WrapMode_Clamp ) return sampleLinearClamp(c, x, y); + else if( wm == WrapMode_Repeat ) return sampleLinearRepeat(c, x, y); + else /*if( wm == WrapMode_Mirror )*/ return sampleLinearMirror(c, x, y); } -float FloatImage::sampleNearest(const float x, const float y, const int c, const WrapMode wm) const +float FloatImage::sampleNearest(uint c, float x, float y, float z, WrapMode wm) const { - if( wm == WrapMode_Clamp ) return sampleNearestClamp(x, y, c); - else if( wm == WrapMode_Repeat ) return sampleNearestRepeat(x, y, c); - else /*if( wm == WrapMode_Mirror )*/ return sampleNearestMirror(x, y, c); + if( wm == WrapMode_Clamp ) return sampleNearestClamp(c, x, y, z); + else if( wm == WrapMode_Repeat ) return sampleNearestRepeat(c, x, y, z); + else /*if( wm == WrapMode_Mirror )*/ return sampleNearestMirror(c, x, y, z); } -float FloatImage::sampleLinear(const float x, const float y, const int c, const WrapMode wm) const +float FloatImage::sampleLinear(uint c, float x, float y, float z, WrapMode wm) const { - if( wm == WrapMode_Clamp ) return sampleLinearClamp(x, y, c); - else if( wm == WrapMode_Repeat ) return sampleLinearRepeat(x, y, c); - else /*if( wm == WrapMode_Mirror )*/ return sampleLinearMirror(x, y, c); + if( wm == WrapMode_Clamp ) return sampleLinearClamp(c, x, y, z); + else if( wm == WrapMode_Repeat ) return sampleLinearRepeat(c, x, y, z); + else /*if( wm == WrapMode_Mirror )*/ return sampleLinearMirror(c, x, y, z); } -float FloatImage::sampleNearestClamp(const float x, const float y, const int c) const +float FloatImage::sampleNearestClamp(uint c, float x, float y) const { - int ix = ::clamp(iround(x * m_width), 0, m_width-1); - int iy = ::clamp(iround(y * m_height), 0, m_height-1); 
- return pixel(ix, iy, c); + int ix = wrapClamp(iround(x * m_width), m_width); + int iy = wrapClamp(iround(y * m_height), m_height); + return pixel(c, ix, iy, 0); } -float FloatImage::sampleNearestRepeat(const float x, const float y, const int c) const +float FloatImage::sampleNearestRepeat(uint c, float x, float y) const { - int ix = iround(frac(x) * m_width); - int iy = iround(frac(y) * m_height); - return pixel(ix, iy, c); + int ix = wrapRepeat(iround(x * m_width), m_width); + int iy = wrapRepeat(iround(y * m_height), m_height); + return pixel(c, ix, iy, 0); } -float FloatImage::sampleNearestMirror(const float x, const float y, const int c) const +float FloatImage::sampleNearestMirror(uint c, float x, float y) const { - int ix = mirror(iround(x * m_width), m_width); - int iy = mirror(iround(y * m_height), m_height); - return pixel(ix, iy, c); + int ix = wrapMirror(iround(x * m_width), m_width); + int iy = wrapMirror(iround(y * m_height), m_height); + return pixel(c, ix, iy, 0); } -float FloatImage::sampleLinearClamp(float x, float y, const int c) const +float FloatImage::sampleNearestClamp(uint c, float x, float y, float z) const { - const int w = m_width; - const int h = m_height; - - x *= w; - y *= h; - - const float fracX = frac(x); - const float fracY = frac(y); - - const int ix0 = ::clamp(ifloor(x), 0, w-1); - const int iy0 = ::clamp(ifloor(y), 0, h-1); - const int ix1 = ::clamp(ifloor(x)+1, 0, w-1); - const int iy1 = ::clamp(ifloor(y)+1, 0, h-1); + int ix = wrapClamp(iround(x * m_width), m_width); + int iy = wrapClamp(iround(y * m_height), m_height); + int iz = wrapClamp(iround(z * m_depth), m_depth); + return pixel(c, ix, iy, iz); +} - float f1 = pixel(ix0, iy0, c); - float f2 = pixel(ix1, iy0, c); - float f3 = pixel(ix0, iy1, c); - float f4 = pixel(ix1, iy1, c); - - float i1 = lerp(f1, f2, fracX); - float i2 = lerp(f3, f4, fracX); +float FloatImage::sampleNearestRepeat(uint c, float x, float y, float z) const +{ + int ix = wrapRepeat(iround(x * m_width), m_width); + int iy = wrapRepeat(iround(y * m_height), m_height); + int iz = wrapRepeat(iround(z * m_depth), m_depth); + return pixel(c, ix, iy, iz); +} - return lerp(i1, i2, fracY); +float FloatImage::sampleNearestMirror(uint c, float x, float y, float z) const +{ + int ix = wrapMirror(iround(x * m_width), m_width); + int iy = wrapMirror(iround(y * m_height), m_height); + int iz = wrapMirror(iround(z * m_depth), m_depth); + return pixel(c, ix, iy, iz); } -float FloatImage::sampleLinearRepeat(float x, float y, int c) const + +float FloatImage::sampleLinearClamp(uint c, float x, float y) const { - const int w = m_width; - const int h = m_height; - - const float fracX = frac(x * w); - const float fracY = frac(y * h); - - int ix0 = ifloor(frac(x) * w); - int iy0 = ifloor(frac(y) * h); - int ix1 = ifloor(frac(x + 1.0f/w) * w); - int iy1 = ifloor(frac(y + 1.0f/h) * h); - - float f1 = pixel(ix0, iy0, c); - float f2 = pixel(ix1, iy0, c); - float f3 = pixel(ix0, iy1, c); - float f4 = pixel(ix1, iy1, c); - - float i1 = lerp(f1, f2, fracX); - float i2 = lerp(f3, f4, fracX); + const int w = m_width; + const int h = m_height; - return lerp(i1, i2, fracY); + x *= w; + y *= h; + + const float fracX = frac(x); + const float fracY = frac(y); + + const int ix0 = ::clamp(ifloor(x), 0, w-1); + const int iy0 = ::clamp(ifloor(y), 0, h-1); + const int ix1 = ::clamp(ifloor(x)+1, 0, w-1); + const int iy1 = ::clamp(ifloor(y)+1, 0, h-1); + + return bilerp(c, ix0, iy0, ix1, iy1, fracX, fracY); } -float FloatImage::sampleLinearMirror(float x, float y, int 
c) const +float FloatImage::sampleLinearRepeat(uint c, float x, float y) const { - const int w = m_width; - const int h = m_height; + const int w = m_width; + const int h = m_height; + + const float fracX = frac(x * w); + const float fracY = frac(y * h); + + // @@ Using floor in some places, but round in others? + int ix0 = ifloor(frac(x) * w); + int iy0 = ifloor(frac(y) * h); + int ix1 = ifloor(frac(x + 1.0f/w) * w); + int iy1 = ifloor(frac(y + 1.0f/h) * h); + + return bilerp(c, ix0, iy0, ix1, iy1, fracX, fracY); +} - x *= w; - y *= h; +float FloatImage::sampleLinearMirror(uint c, float x, float y) const +{ + const int w = m_width; + const int h = m_height; - const float fracX = frac(x); - const float fracY = frac(y); + x *= w; + y *= h; - int ix0 = mirror(iround(x), w); - int iy0 = mirror(iround(y), h); - int ix1 = mirror(iround(x) + 1, w); - int iy1 = mirror(iround(y) + 1, h); + const float fracX = frac(x); + const float fracY = frac(y); - float f1 = pixel(ix0, iy0, c); - float f2 = pixel(ix1, iy0, c); - float f3 = pixel(ix0, iy1, c); - float f4 = pixel(ix1, iy1, c); - - float i1 = lerp(f1, f2, fracX); - float i2 = lerp(f3, f4, fracX); + int ix0 = wrapMirror(iround(x), w); + int iy0 = wrapMirror(iround(y), h); + int ix1 = wrapMirror(iround(x) + 1, w); + int iy1 = wrapMirror(iround(y) + 1, h); - return lerp(i1, i2, fracY); + return bilerp(c, ix0, iy0, ix1, iy1, fracX, fracY); +} + +float FloatImage::sampleLinearClamp(uint c, float x, float y, float z) const +{ + const int w = m_width; + const int h = m_height; + const int d = m_depth; + + x *= w; + y *= h; + z *= d; + + const float fracX = frac(x); + const float fracY = frac(y); + const float fracZ = frac(z); + + // @@ Using floor in some places, but round in others? + const int ix0 = ::clamp(ifloor(x), 0, w-1); + const int iy0 = ::clamp(ifloor(y), 0, h-1); + const int iz0 = ::clamp(ifloor(z), 0, h-1); + const int ix1 = ::clamp(ifloor(x)+1, 0, w-1); + const int iy1 = ::clamp(ifloor(y)+1, 0, h-1); + const int iz1 = ::clamp(ifloor(z)+1, 0, h-1); + + return trilerp(c, ix0, iy0, iz0, ix1, iy1, iz1, fracX, fracY, fracZ); +} + +float FloatImage::sampleLinearRepeat(uint c, float x, float y, float z) const +{ + const int w = m_width; + const int h = m_height; + const int d = m_depth; + + const float fracX = frac(x * w); + const float fracY = frac(y * h); + const float fracZ = frac(z * d); + + int ix0 = ifloor(frac(x) * w); + int iy0 = ifloor(frac(y) * h); + int iz0 = ifloor(frac(z) * d); + int ix1 = ifloor(frac(x + 1.0f/w) * w); + int iy1 = ifloor(frac(y + 1.0f/h) * h); + int iz1 = ifloor(frac(z + 1.0f/d) * d); + + return trilerp(c, ix0, iy0, iz0, ix1, iy1, iz1, fracX, fracY, fracZ); +} + +float FloatImage::sampleLinearMirror(uint c, float x, float y, float z) const +{ + const int w = m_width; + const int h = m_height; + const int d = m_depth; + + x *= w; + y *= h; + z *= d; + + int ix0 = wrapMirror(iround(x), w); + int iy0 = wrapMirror(iround(y), h); + int iz0 = wrapMirror(iround(z), d); + int ix1 = wrapMirror(iround(x) + 1, w); + int iy1 = wrapMirror(iround(y) + 1, h); + int iz1 = wrapMirror(iround(z) + 1, d); + + const float fracX = frac(x); + const float fracY = frac(y); + const float fracZ = frac(z); + + return trilerp(c, ix0, iy0, iz0, ix1, iy1, iz1, fracX, fracY, fracZ); } @@ -365,545 +540,930 @@ /// FloatImage * FloatImage::fastDownSample() const { - nvDebugCheck(m_width != 1 || m_height != 1); - - AutoPtr dst_image( new FloatImage() ); - - const uint w = max(1, m_width / 2); - const uint h = max(1, m_height / 2); - 
dst_image->allocate(m_componentNum, w, h); - - // 1D box filter. - if (m_width == 1 || m_height == 1) - { - const uint n = w * h; - - if ((m_width * m_height) & 1) - { - const float scale = 1.0f / (2 * n + 1); - - for(uint c = 0; c < m_componentNum; c++) - { - const float * src = this->channel(c); - float * dst = dst_image->channel(c); - - for(uint x = 0; x < n; x++) - { - const float w0 = float(n - x); - const float w1 = float(n - 0); - const float w2 = float(1 + x); - - *dst++ = scale * (w0 * src[0] + w1 * src[1] + w2 * src[2]); - src += 2; - } - } - } - else - { - for(uint c = 0; c < m_componentNum; c++) - { - const float * src = this->channel(c); - float * dst = dst_image->channel(c); - - for(uint x = 0; x < n; x++) - { - *dst = 0.5f * (src[0] + src[1]); - dst++; - src += 2; - } - } - } - } - - // Regular box filter. - else if ((m_width & 1) == 0 && (m_height & 1) == 0) - { - for(uint c = 0; c < m_componentNum; c++) - { - const float * src = this->channel(c); - float * dst = dst_image->channel(c); - - for(uint y = 0; y < h; y++) - { - for(uint x = 0; x < w; x++) - { - *dst = 0.25f * (src[0] + src[1] + src[m_width] + src[m_width + 1]); - dst++; - src += 2; - } - - src += m_width; - } - } - } - - // Polyphase filters. - else if (m_width & 1 && m_height & 1) - { - nvDebugCheck(m_width == 2 * w + 1); - nvDebugCheck(m_height == 2 * h + 1); - - const float scale = 1.0f / (m_width * m_height); - - for(uint c = 0; c < m_componentNum; c++) - { - const float * src = this->channel(c); - float * dst = dst_image->channel(c); - - for(uint y = 0; y < h; y++) - { - const float v0 = float(h - y); - const float v1 = float(h - 0); - const float v2 = float(1 + y); - - for (uint x = 0; x < w; x++) - { - const float w0 = float(w - x); - const float w1 = float(w - 0); - const float w2 = float(1 + x); - - float f = 0.0f; - f += v0 * (w0 * src[0 * m_width + 2 * x] + w1 * src[0 * m_width + 2 * x + 1] + w2 * src[0 * m_width + 2 * x + 2]); - f += v1 * (w0 * src[1 * m_width + 2 * x] + w1 * src[1 * m_width + 2 * x + 1] + w2 * src[0 * m_width + 2 * x + 2]); - f += v2 * (w0 * src[2 * m_width + 2 * x] + w1 * src[2 * m_width + 2 * x + 1] + w2 * src[0 * m_width + 2 * x + 2]); - - *dst = f * scale; - dst++; - } - - src += 2 * m_width; - } - } - } - else if (m_width & 1) - { - nvDebugCheck(m_width == 2 * w + 1); - const float scale = 1.0f / (2 * m_width); - - for(uint c = 0; c < m_componentNum; c++) - { - const float * src = this->channel(c); - float * dst = dst_image->channel(c); - - for(uint y = 0; y < h; y++) - { - for (uint x = 0; x < w; x++) - { - const float w0 = float(w - x); - const float w1 = float(w - 0); - const float w2 = float(1 + x); - - float f = 0.0f; - f += w0 * (src[2 * x + 0] + src[m_width + 2 * x + 0]); - f += w1 * (src[2 * x + 1] + src[m_width + 2 * x + 1]); - f += w2 * (src[2 * x + 2] + src[m_width + 2 * x + 2]); - - *dst = f * scale; - dst++; - } - - src += 2 * m_width; - } - } - } - else if (m_height & 1) - { - nvDebugCheck(m_height == 2 * h + 1); - - const float scale = 1.0f / (2 * m_height); - - for(uint c = 0; c < m_componentNum; c++) - { - const float * src = this->channel(c); - float * dst = dst_image->channel(c); - - for(uint y = 0; y < h; y++) - { - const float v0 = float(h - y); - const float v1 = float(h - 0); - const float v2 = float(1 + y); - - for (uint x = 0; x < w; x++) - { - float f = 0.0f; - f += v0 * (src[0 * m_width + 2 * x] + src[0 * m_width + 2 * x + 1]); - f += v1 * (src[1 * m_width + 2 * x] + src[1 * m_width + 2 * x + 1]); - f += v2 * (src[2 * m_width + 2 * x] + src[2 * 
m_width + 2 * x + 1]); - - *dst = f * scale; - dst++; - } - - src += 2 * m_width; - } - } - } - - return dst_image.release(); + nvDebugCheck(m_depth == 1); + nvDebugCheck(m_width != 1 || m_height != 1); + + AutoPtr dst_image( new FloatImage() ); + + const uint w = max(1, m_width / 2); + const uint h = max(1, m_height / 2); + dst_image->allocate(m_componentCount, w, h); + + // 1D box filter. + if (m_width == 1 || m_height == 1) + { + const uint n = w * h; + + if ((m_width * m_height) & 1) + { + const float scale = 1.0f / (2 * n + 1); + + for(uint c = 0; c < m_componentCount; c++) + { + const float * src = this->channel(c); + float * dst = dst_image->channel(c); + + for(uint x = 0; x < n; x++) + { + const float w0 = float(n - x); + const float w1 = float(n - 0); + const float w2 = float(1 + x); + + *dst++ = scale * (w0 * src[0] + w1 * src[1] + w2 * src[2]); + src += 2; + } + } + } + else + { + for(uint c = 0; c < m_componentCount; c++) + { + const float * src = this->channel(c); + float * dst = dst_image->channel(c); + + for(uint x = 0; x < n; x++) + { + *dst = 0.5f * (src[0] + src[1]); + dst++; + src += 2; + } + } + } + } + + // Regular box filter. + else if ((m_width & 1) == 0 && (m_height & 1) == 0) + { + for(uint c = 0; c < m_componentCount; c++) + { + const float * src = this->channel(c); + float * dst = dst_image->channel(c); + + for(uint y = 0; y < h; y++) + { + for(uint x = 0; x < w; x++) + { + *dst = 0.25f * (src[0] + src[1] + src[m_width] + src[m_width + 1]); + dst++; + src += 2; + } + + src += m_width; + } + } + } + + // Polyphase filters. + else if (m_width & 1 && m_height & 1) + { + nvDebugCheck(m_width == 2 * w + 1); + nvDebugCheck(m_height == 2 * h + 1); + + const float scale = 1.0f / (m_width * m_height); + + for(uint c = 0; c < m_componentCount; c++) + { + const float * src = this->channel(c); + float * dst = dst_image->channel(c); + + for(uint y = 0; y < h; y++) + { + const float v0 = float(h - y); + const float v1 = float(h - 0); + const float v2 = float(1 + y); + + for (uint x = 0; x < w; x++) + { + const float w0 = float(w - x); + const float w1 = float(w - 0); + const float w2 = float(1 + x); + + float f = 0.0f; + f += v0 * (w0 * src[0 * m_width + 2 * x] + w1 * src[0 * m_width + 2 * x + 1] + w2 * src[0 * m_width + 2 * x + 2]); + f += v1 * (w0 * src[1 * m_width + 2 * x] + w1 * src[1 * m_width + 2 * x + 1] + w2 * src[1 * m_width + 2 * x + 2]); + f += v2 * (w0 * src[2 * m_width + 2 * x] + w1 * src[2 * m_width + 2 * x + 1] + w2 * src[2 * m_width + 2 * x + 2]); + + *dst = f * scale; + dst++; + } + + src += 2 * m_width; + } + } + } + else if (m_width & 1) + { + nvDebugCheck(m_width == 2 * w + 1); + const float scale = 1.0f / (2 * m_width); + + for(uint c = 0; c < m_componentCount; c++) + { + const float * src = this->channel(c); + float * dst = dst_image->channel(c); + + for(uint y = 0; y < h; y++) + { + for (uint x = 0; x < w; x++) + { + const float w0 = float(w - x); + const float w1 = float(w - 0); + const float w2 = float(1 + x); + + float f = 0.0f; + f += w0 * (src[2 * x + 0] + src[m_width + 2 * x + 0]); + f += w1 * (src[2 * x + 1] + src[m_width + 2 * x + 1]); + f += w2 * (src[2 * x + 2] + src[m_width + 2 * x + 2]); + + *dst = f * scale; + dst++; + } + + src += 2 * m_width; + } + } + } + else if (m_height & 1) + { + nvDebugCheck(m_height == 2 * h + 1); + + const float scale = 1.0f / (2 * m_height); + + for(uint c = 0; c < m_componentCount; c++) + { + const float * src = this->channel(c); + float * dst = dst_image->channel(c); + + for(uint y = 0; y < h; y++) + { + const 
float v0 = float(h - y); + const float v1 = float(h - 0); + const float v2 = float(1 + y); + + for (uint x = 0; x < w; x++) + { + float f = 0.0f; + f += v0 * (src[0 * m_width + 2 * x] + src[0 * m_width + 2 * x + 1]); + f += v1 * (src[1 * m_width + 2 * x] + src[1 * m_width + 2 * x + 1]); + f += v2 * (src[2 * m_width + 2 * x] + src[2 * m_width + 2 * x + 1]); + + *dst = f * scale; + dst++; + } + + src += 2 * m_width; + } + } + } + + return dst_image.release(); } /// Downsample applying a 1D kernel separately in each dimension. FloatImage * FloatImage::downSample(const Filter & filter, WrapMode wm) const { - const uint w = max(1, m_width / 2); - const uint h = max(1, m_height / 2); + const uint w = max(1, m_width / 2); + const uint h = max(1, m_height / 2); + const uint d = max(1, m_depth / 2); - return resize(filter, w, h, wm); + return resize(filter, w, h, d, wm); } /// Downsample applying a 1D kernel separately in each dimension. FloatImage * FloatImage::downSample(const Filter & filter, WrapMode wm, uint alpha) const { - const uint w = max(1, m_width / 2); - const uint h = max(1, m_height / 2); + const uint w = max(1, m_width / 2); + const uint h = max(1, m_height / 2); + const uint d = max(1, m_depth / 2); - return resize(filter, w, h, wm, alpha); + return resize(filter, w, h, d, wm, alpha); } /// Downsample applying a 1D kernel separately in each dimension. FloatImage * FloatImage::resize(const Filter & filter, uint w, uint h, WrapMode wm) const { - // @@ Use monophase filters when frac(m_width / w) == 0 + // @@ Use monophase filters when frac(m_width / w) == 0 + + AutoPtr tmp_image( new FloatImage() ); + AutoPtr dst_image( new FloatImage() ); + + PolyphaseKernel xkernel(filter, m_width, w, 32); + PolyphaseKernel ykernel(filter, m_height, h, 32); - AutoPtr tmp_image( new FloatImage() ); - AutoPtr dst_image( new FloatImage() ); - - PolyphaseKernel xkernel(filter, m_width, w, 32); - PolyphaseKernel ykernel(filter, m_height, h, 32); - - // @@ Select fastest filtering order: - //if (w * m_height <= h * m_width) - { - tmp_image->allocate(m_componentNum, w, m_height); - dst_image->allocate(m_componentNum, w, h); - - Array tmp_column(h); - tmp_column.resize(h); - - for (uint c = 0; c < m_componentNum; c++) - { - float * tmp_channel = tmp_image->channel(c); - - for (uint y = 0; y < m_height; y++) { - this->applyKernelHorizontal(xkernel, y, c, wm, tmp_channel + y * w); - } - - float * dst_channel = dst_image->channel(c); - - for (uint x = 0; x < w; x++) { - tmp_image->applyKernelVertical(ykernel, x, c, wm, tmp_column.unsecureBuffer()); - - for (uint y = 0; y < h; y++) { - dst_channel[y * w + x] = tmp_column[y]; - } - } - } - } - /*else - { - tmp_image->allocate(m_componentNum, m_width, h); - dst_image->allocate(m_componentNum, w, h); - - Array tmp_column(h); - tmp_column.resize(h); - - for (uint c = 0; c < m_componentNum; c++) - { - float * tmp_channel = tmp_image->channel(c); - - for (uint x = 0; x < w; x++) { - tmp_image->applyKernelVertical(ykernel, x, c, wm, tmp_column.unsecureBuffer()); - - for (uint y = 0; y < h; y++) { - tmp_channel[y * w + x] = tmp_column[y]; - } - } - - float * dst_channel = dst_image->channel(c); - - for (uint y = 0; y < m_height; y++) { - this->applyKernelHorizontal(xkernel, y, c, wm, dst_channel + y * w); - } - } - }*/ - - return dst_image.release(); + // @@ Select fastest filtering order: + //if (w * m_height <= h * m_width) + { + tmp_image->allocate(m_componentCount, w, m_height); + dst_image->allocate(m_componentCount, w, h); + + // @@ We could avoid this 
allocation, write directly to dst_plane. + Array tmp_column(h); + tmp_column.resize(h); + + for (uint c = 0; c < m_componentCount; c++) + { + for (uint z = 0; z < m_depth; z++) + { + float * tmp_plane = tmp_image->plane(c, z); + + for (uint y = 0; y < m_height; y++) { + this->applyKernelX(xkernel, y, z, c, wm, tmp_plane + y * w); + } + + float * dst_plane = dst_image->plane(c, z); + + for (uint x = 0; x < w; x++) { + tmp_image->applyKernelY(ykernel, x, z, c, wm, tmp_column.buffer()); + + // @@ We could avoid this copy, write directly to dst_plane. + for (uint y = 0; y < h; y++) { + dst_plane[y * w + x] = tmp_column[y]; + } + } + } + } + } + + return dst_image.release(); +} + +/// Downsample applying a 1D kernel separately in each dimension. (for 3d textures) +FloatImage * FloatImage::resize(const Filter & filter, uint w, uint h, uint d, WrapMode wm) const +{ + // @@ Use monophase filters when frac(m_width / w) == 0 + + // Use the existing 2d version if we are not resizing in the Z axis: + if (m_depth == d) { + return resize(filter, w, h, wm); + } + + AutoPtr tmp_image( new FloatImage() ); + AutoPtr tmp_image2( new FloatImage() ); + AutoPtr dst_image( new FloatImage() ); + + PolyphaseKernel xkernel(filter, m_width, w, 32); + PolyphaseKernel ykernel(filter, m_height, h, 32); + PolyphaseKernel zkernel(filter, m_depth, d, 32); + + tmp_image->allocate(m_componentCount, w, m_height, m_depth); + tmp_image2->allocate(m_componentCount, w, m_height, d); + dst_image->allocate(m_componentCount, w, h, d); + + Array tmp_column(h); + tmp_column.resize(h); + + for (uint c = 0; c < m_componentCount; c++) + { + float * tmp_channel = tmp_image->channel(c); + + // split width in half + for (uint z = 0; z < m_depth; z++ ) { + for (uint y = 0; y < m_height; y++) { + this->applyKernelX(xkernel, y, z, c, wm, tmp_channel + z * m_height * w + y * w); + } + } + + // split depth in half + float * tmp2_channel = tmp_image2->channel(c); + for (uint y = 0; y < m_height; y++) { + for (uint x = 0; x < w; x++) { + tmp_image->applyKernelZ(zkernel, x, y, c, wm, tmp_column.buffer() ); + + for (uint z = 0; z < d; z++) { + tmp2_channel[z * m_height * w + y * w + x] = tmp_column[z]; + } + } + } + + // split height in half + float * dst_channel = dst_image->channel(c); + + for (uint z = 0; z < d; z++ ) { + for (uint x = 0; x < w; x++) { + tmp_image2->applyKernelY(ykernel, x, z, c, wm, tmp_column.buffer()); + + for (uint y = 0; y < h; y++) { + dst_channel[z * h * w + y * w + x] = tmp_column[y]; + } + } + } + } + + return dst_image.release(); } + /// Downsample applying a 1D kernel separately in each dimension. FloatImage * FloatImage::resize(const Filter & filter, uint w, uint h, WrapMode wm, uint alpha) const { - nvCheck(alpha < m_componentNum); + nvCheck(alpha < m_componentCount); - AutoPtr tmp_image( new FloatImage() ); - AutoPtr dst_image( new FloatImage() ); - - PolyphaseKernel xkernel(filter, m_width, w, 32); - PolyphaseKernel ykernel(filter, m_height, h, 32); - - { - tmp_image->allocate(m_componentNum, w, m_height); - dst_image->allocate(m_componentNum, w, h); - - Array tmp_column(h); - tmp_column.resize(h); - - for (uint c = 0; c < m_componentNum; c++) - { - float * tmp_channel = tmp_image->channel(c); - - for (uint y = 0; y < m_height; y++) { - this->applyKernelHorizontal(xkernel, y, c, alpha, wm, tmp_channel + y * w); - } - } - - // Process all channels before applying vertical kernel to make sure alpha has been computed. 
- - for (uint c = 0; c < m_componentNum; c++) - { - float * dst_channel = dst_image->channel(c); - - for (uint x = 0; x < w; x++) { - tmp_image->applyKernelVertical(ykernel, x, c, alpha, wm, tmp_column.unsecureBuffer()); - - for (uint y = 0; y < h; y++) { - dst_channel[y * w + x] = tmp_column[y]; - } - } - } - } - - return dst_image.release(); + AutoPtr tmp_image( new FloatImage() ); + AutoPtr dst_image( new FloatImage() ); + + PolyphaseKernel xkernel(filter, m_width, w, 32); + PolyphaseKernel ykernel(filter, m_height, h, 32); + + { + tmp_image->allocate(m_componentCount, w, m_height); + dst_image->allocate(m_componentCount, w, h); + + Array tmp_column(h); + tmp_column.resize(h); + + for (uint i = 0; i < m_componentCount; i++) + { + // Process alpha channel first. + uint c; + if (i == 0) c = alpha; + else if (i > alpha) c = i; + else c = i - 1; + + for (uint z = 0; z < m_depth; z++) + { + float * tmp_plane = tmp_image->plane(c, z); + + for (uint y = 0; y < m_height; y++) { + this->applyKernelX(xkernel, y, z, c, wm, tmp_plane + y * w); + } + + float * dst_plane = dst_image->plane(c, z); + + for (uint x = 0; x < w; x++) { + tmp_image->applyKernelY(ykernel, x, z, c, wm, tmp_column.buffer()); + + // @@ Avoid this copy, write directly to dst_plane. + for (uint y = 0; y < h; y++) { + dst_plane[y * w + x] = tmp_column[y]; + } + } + } + } + } + + return dst_image.release(); } +/// Downsample applying a 1D kernel separately in each dimension. (for 3d textures) +FloatImage * FloatImage::resize(const Filter & filter, uint w, uint h, uint d, WrapMode wm, uint alpha) const +{ + nvCheck(alpha < m_componentCount); + + // use the existing 2d version if we are a 2d image: + if (m_depth == d) { + return resize( filter, w, h, wm, alpha ); + } + + AutoPtr tmp_image( new FloatImage() ); + AutoPtr tmp_image2( new FloatImage() ); + AutoPtr dst_image( new FloatImage() ); + + PolyphaseKernel xkernel(filter, m_width, w, 32); + PolyphaseKernel ykernel(filter, m_height, h, 32); + PolyphaseKernel zkernel(filter, m_depth, d, 32); + + tmp_image->allocate(m_componentCount, w, m_height, m_depth); + tmp_image2->allocate(m_componentCount, w, m_height, d); + dst_image->allocate(m_componentCount, w, h, d); + + Array tmp_column(h); + tmp_column.resize(h); + + for (uint i = 0; i < m_componentCount; i++) + { + // Process alpha channel first. 
+ uint c; + if (i == 0) c = alpha; + else if (i > alpha) c = i; + else c = i - 1; + + float * tmp_channel = tmp_image->channel(c); + + for (uint z = 0; z < m_depth; z++ ) { + for (uint y = 0; y < m_height; y++) { + this->applyKernelX(xkernel, y, z, c, wm, tmp_channel + z * m_height * w + y * w); + } + } + + float * tmp2_channel = tmp_image2->channel(c); + for (uint y = 0; y < m_height; y++) { + for (uint x = 0; x < w; x++) { + tmp_image->applyKernelZ(zkernel, x, y, c, wm, tmp_column.buffer() ); + + for (uint z = 0; z < d; z++) { + tmp2_channel[z * m_height * w + y * w + x] = tmp_column[z]; + } + } + } + + float * dst_channel = dst_image->channel(c); + + for (uint z = 0; z < d; z++ ) { + for (uint x = 0; x < w; x++) { + tmp_image2->applyKernelY(ykernel, x, z, c, wm, tmp_column.buffer()); + + for (uint y = 0; y < h; y++) { + dst_channel[z * h * w + y * w + x] = tmp_column[y]; + } + } + } + } + + return dst_image.release(); +} + + +void FloatImage::convolve(const Kernel2 & k, uint c, WrapMode wm) +{ + AutoPtr tmpImage(clone()); + + uint w = m_width; + uint h = m_height; + uint d = m_depth; + + for (uint z = 0; z < d; z++) + { + for (uint y = 0; y < h; y++) + { + for (uint x = 0; x < w; x++) + { + pixel(c, x, y, 0) = tmpImage->applyKernelXY(&k, x, y, z, c, wm); + } + } + } +} + /// Apply 2D kernel at the given coordinates and return result. -float FloatImage::applyKernel(const Kernel2 * k, int x, int y, uint c, WrapMode wm) const +float FloatImage::applyKernelXY(const Kernel2 * k, int x, int y, int z, uint c, WrapMode wm) const { - nvDebugCheck(k != NULL); - - const uint kernelWindow = k->windowSize(); - const int kernelOffset = int(kernelWindow / 2) - 1; - - const float * channel = this->channel(c); - - float sum = 0.0f; - for (uint i = 0; i < kernelWindow; i++) - { - const int src_y = int(y + i) - kernelOffset; - - for (uint e = 0; e < kernelWindow; e++) - { - const int src_x = int(x + e) - kernelOffset; - - int idx = this->index(src_x, src_y, wm); - - sum += k->valueAt(e, i) * channel[idx]; - } - } - - return sum; + nvDebugCheck(k != NULL); + + const uint kernelWindow = k->windowSize(); + const int kernelOffset = int(kernelWindow / 2); + + const float * channel = this->plane(c, z); + + float sum = 0.0f; + for (uint i = 0; i < kernelWindow; i++) + { + int src_y = int(y + i) - kernelOffset; + + for (uint e = 0; e < kernelWindow; e++) + { + int src_x = int(x + e) - kernelOffset; + + int idx = this->index(src_x, src_y, z, wm); + + sum += k->valueAt(e, i) * channel[idx]; + } + } + + return sum; } +/// Apply 1D horizontal kernel at the given coordinates and return result. +float FloatImage::applyKernelX(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const +{ + nvDebugCheck(k != NULL); + + const uint kernelWindow = k->windowSize(); + const int kernelOffset = int(kernelWindow / 2); + + const float * channel = this->channel(c); + + float sum = 0.0f; + for (uint i = 0; i < kernelWindow; i++) + { + const int src_x = int(x + i) - kernelOffset; + const int idx = this->index(src_x, y, z, wm); + + sum += k->valueAt(i) * channel[idx]; + } + + return sum; +} + /// Apply 1D vertical kernel at the given coordinates and return result. 
-float FloatImage::applyKernelVertical(const Kernel1 * k, int x, int y, uint c, WrapMode wm) const +float FloatImage::applyKernelY(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const { - nvDebugCheck(k != NULL); - - const uint kernelWindow = k->windowSize(); - const int kernelOffset = int(kernelWindow / 2) - 1; - - const float * channel = this->channel(c); - - float sum = 0.0f; - for (uint i = 0; i < kernelWindow; i++) - { - const int src_y = int(y + i) - kernelOffset; - const int idx = this->index(x, src_y, wm); - - sum += k->valueAt(i) * channel[idx]; - } - - return sum; + nvDebugCheck(k != NULL); + + const uint kernelWindow = k->windowSize(); + const int kernelOffset = int(kernelWindow / 2); + + const float * channel = this->channel(c); + + float sum = 0.0f; + for (uint i = 0; i < kernelWindow; i++) + { + const int src_y = int(y + i) - kernelOffset; + const int idx = this->index(x, src_y, z, wm); + + sum += k->valueAt(i) * channel[idx]; + } + + return sum; } -/// Apply 1D horizontal kernel at the given coordinates and return result. -float FloatImage::applyKernelHorizontal(const Kernel1 * k, int x, int y, uint c, WrapMode wm) const +/// Apply 1D kernel in the z direction at the given coordinates and return result. +float FloatImage::applyKernelZ(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const { - nvDebugCheck(k != NULL); - - const uint kernelWindow = k->windowSize(); - const int kernelOffset = int(kernelWindow / 2) - 1; - - const float * channel = this->channel(c); - - float sum = 0.0f; - for (uint e = 0; e < kernelWindow; e++) - { - const int src_x = int(x + e) - kernelOffset; - const int idx = this->index(src_x, y, wm); - - sum += k->valueAt(e) * channel[idx]; - } - - return sum; + nvDebugCheck(k != NULL); + + const uint kernelWindow = k->windowSize(); + const int kernelOffset = int(kernelWindow / 2); + + const float * channel = this->channel(c); + + float sum = 0.0f; + for (uint i = 0; i < kernelWindow; i++) + { + const int src_z = int(z + i) - kernelOffset; + const int idx = this->index(x, y, src_z, wm); + + sum += k->valueAt(i) * channel[idx]; + } + + return sum; } +/// Apply 1D horizontal kernel at the given coordinates and return result. +void FloatImage::applyKernelX(const PolyphaseKernel & k, int y, int z, uint c, WrapMode wm, float * __restrict output) const +{ + const uint length = k.length(); + const float scale = float(length) / float(m_width); + const float iscale = 1.0f / scale; + + const float width = k.width(); + const int windowSize = k.windowSize(); + + const float * channel = this->channel(c); + + for (uint i = 0; i < length; i++) + { + const float center = (0.5f + i) * iscale; + + const int left = (int)floorf(center - width); + const int right = (int)ceilf(center + width); + nvDebugCheck(right - left <= windowSize); + + float sum = 0; + for (int j = 0; j < windowSize; ++j) + { + const int idx = this->index(left + j, y, z, wm); + + sum += k.valueAt(i, j) * channel[idx]; + } + + output[i] = sum; + } +} + /// Apply 1D vertical kernel at the given coordinates and return result. 
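+// (For the PolyphaseKernel overloads, output sample i is centered at (0.5 + i) * iscale
+// in the source axis and accumulates windowSize taps starting at floor(center - width),
+// each weighted by k.valueAt(i, j).)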
-void FloatImage::applyKernelVertical(const PolyphaseKernel & k, int x, uint c, WrapMode wm, float * __restrict output) const +void FloatImage::applyKernelY(const PolyphaseKernel & k, int x, int z, uint c, WrapMode wm, float * __restrict output) const { - const uint length = k.length(); - const float scale = float(length) / float(m_height); - const float iscale = 1.0f / scale; - - const float width = k.width(); - const int windowSize = k.windowSize(); - - const float * channel = this->channel(c); - - for (uint i = 0; i < length; i++) - { - const float center = (0.5f + i) * iscale; - - const int left = (int)floorf(center - width); - const int right = (int)ceilf(center + width); - nvCheck(right - left <= windowSize); - - float sum = 0; - for (int j = 0; j < windowSize; ++j) - { - const int idx = this->index(x, j+left, wm); - - sum += k.valueAt(i, j) * channel[idx]; - } - - output[i] = sum; - } + const uint length = k.length(); + const float scale = float(length) / float(m_height); + const float iscale = 1.0f / scale; + + const float width = k.width(); + const int windowSize = k.windowSize(); + + const float * channel = this->channel(c); + + for (uint i = 0; i < length; i++) + { + const float center = (0.5f + i) * iscale; + + const int left = (int)floorf(center - width); + const int right = (int)ceilf(center + width); + nvCheck(right - left <= windowSize); + + float sum = 0; + for (int j = 0; j < windowSize; ++j) + { + const int idx = this->index(x, j+left, z, wm); + + sum += k.valueAt(i, j) * channel[idx]; + } + + output[i] = sum; + } } -/// Apply 1D horizontal kernel at the given coordinates and return result. -void FloatImage::applyKernelHorizontal(const PolyphaseKernel & k, int y, uint c, WrapMode wm, float * __restrict output) const +/// Apply 1D kernel in the Z direction at the given coordinates and return result. +void FloatImage::applyKernelZ(const PolyphaseKernel & k, int x, int y, uint c, WrapMode wm, float * __restrict output) const { - const uint length = k.length(); - const float scale = float(length) / float(m_width); - const float iscale = 1.0f / scale; - - const float width = k.width(); - const int windowSize = k.windowSize(); - - const float * channel = this->channel(c); - - for (uint i = 0; i < length; i++) - { - const float center = (0.5f + i) * iscale; - - const int left = (int)floorf(center - width); - const int right = (int)ceilf(center + width); - nvDebugCheck(right - left <= windowSize); - - float sum = 0; - for (int j = 0; j < windowSize; ++j) - { - const int idx = this->index(left + j, y, wm); - - sum += k.valueAt(i, j) * channel[idx]; - } - - output[i] = sum; - } + const uint length = k.length(); + const float scale = float(length) / float(m_height); + const float iscale = 1.0f / scale; + + const float width = k.width(); + const int windowSize = k.windowSize(); + + const float * channel = this->channel(c); + + for (uint i = 0; i < length; i++) + { + const float center = (0.5f + i) * iscale; + + const int left = (int)floorf(center - width); + const int right = (int)ceilf(center + width); + nvCheck(right - left <= windowSize); + + float sum = 0; + for (int j = 0; j < windowSize; ++j) + { + const int idx = this->index(x, y, j+left, wm); + + sum += k.valueAt(i, j) * channel[idx]; + } + + output[i] = sum; + } } +/// Apply 1D horizontal kernel at the given coordinates and return result. 
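+// (This overload additionally weights every tap by the alpha channel: each kernel value
+// is multiplied by (alpha[idx] + 1/256) and the accumulated sum is renormalized by the
+// total weight, so texels with zero alpha contribute almost nothing to the filtered color.)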
+void FloatImage::applyKernelX(const PolyphaseKernel & k, int y, int z, uint c, uint a, WrapMode wm, float * __restrict output) const +{ + const uint length = k.length(); + const float scale = float(length) / float(m_width); + const float iscale = 1.0f / scale; + + const float width = k.width(); + const int windowSize = k.windowSize(); + + const float * channel = this->channel(c); + const float * alpha = this->channel(a); + + for (uint i = 0; i < length; i++) + { + const float center = (0.5f + i) * iscale; + + const int left = (int)floorf(center - width); + const int right = (int)ceilf(center + width); + nvDebugCheck(right - left <= windowSize); + + float norm = 0.0f; + float sum = 0; + for (int j = 0; j < windowSize; ++j) + { + const int idx = this->index(left + j, y, z, wm); + + float w = k.valueAt(i, j) * (alpha[idx] + (1.0f / 256.0f)); + norm += w; + sum += w * channel[idx]; + } + + output[i] = sum / norm; + } +} + /// Apply 1D vertical kernel at the given coordinates and return result. -void FloatImage::applyKernelVertical(const PolyphaseKernel & k, int x, uint c, uint a, WrapMode wm, float * __restrict output) const +void FloatImage::applyKernelY(const PolyphaseKernel & k, int x, int z, uint c, uint a, WrapMode wm, float * __restrict output) const { - const uint length = k.length(); - const float scale = float(length) / float(m_height); - const float iscale = 1.0f / scale; - - const float width = k.width(); - const int windowSize = k.windowSize(); - - const float * channel = this->channel(c); - const float * alpha = this->channel(a); - - for (uint i = 0; i < length; i++) - { - const float center = (0.5f + i) * iscale; - - const int left = (int)floorf(center - width); - const int right = (int)ceilf(center + width); - nvCheck(right - left <= windowSize); - - float norm = 0; - float sum = 0; - for (int j = 0; j < windowSize; ++j) - { - const int idx = this->index(x, j+left, wm); - - float w = k.valueAt(i, j) * (alpha[idx] + (1.0f / 256.0f)); - norm += w; - sum += w * channel[idx]; - } - - output[i] = sum / norm; - } + const uint length = k.length(); + const float scale = float(length) / float(m_height); + const float iscale = 1.0f / scale; + + const float width = k.width(); + const int windowSize = k.windowSize(); + + const float * channel = this->channel(c); + const float * alpha = this->channel(a); + + for (uint i = 0; i < length; i++) + { + const float center = (0.5f + i) * iscale; + + const int left = (int)floorf(center - width); + const int right = (int)ceilf(center + width); + nvCheck(right - left <= windowSize); + + float norm = 0; + float sum = 0; + for (int j = 0; j < windowSize; ++j) + { + const int idx = this->index(x, j+left, z, wm); + + float w = k.valueAt(i, j) * (alpha[idx] + (1.0f / 256.0f)); + norm += w; + sum += w * channel[idx]; + } + + output[i] = sum / norm; + } } /// Apply 1D horizontal kernel at the given coordinates and return result. 
-void FloatImage::applyKernelHorizontal(const PolyphaseKernel & k, int y, uint c, uint a, WrapMode wm, float * __restrict output) const +void FloatImage::applyKernelZ(const PolyphaseKernel & k, int x, int y, uint c, uint a, WrapMode wm, float * __restrict output) const { - const uint length = k.length(); - const float scale = float(length) / float(m_width); - const float iscale = 1.0f / scale; - - const float width = k.width(); - const int windowSize = k.windowSize(); - - const float * channel = this->channel(c); - const float * alpha = this->channel(a); - - for (uint i = 0; i < length; i++) - { - const float center = (0.5f + i) * iscale; - - const int left = (int)floorf(center - width); - const int right = (int)ceilf(center + width); - nvDebugCheck(right - left <= windowSize); - - float norm = 0.0f; - float sum = 0; - for (int j = 0; j < windowSize; ++j) - { - const int idx = this->index(left + j, y, wm); - - float w = k.valueAt(i, j) * (alpha[idx] + (1.0f / 256.0f)); - norm += w; - sum += w * channel[idx]; - } - - output[i] = sum / norm; - } + const uint length = k.length(); + const float scale = float(length) / float(m_width); + const float iscale = 1.0f / scale; + + const float width = k.width(); + const int windowSize = k.windowSize(); + + const float * channel = this->channel(c); + const float * alpha = this->channel(a); + + for (uint i = 0; i < length; i++) + { + const float center = (0.5f + i) * iscale; + + const int left = (int)floorf(center - width); + const int right = (int)ceilf(center + width); + nvDebugCheck(right - left <= windowSize); + + float norm = 0.0f; + float sum = 0; + for (int j = 0; j < windowSize; ++j) + { + const int idx = this->index(x, y, left + j, wm); + + float w = k.valueAt(i, j) * (alpha[idx] + (1.0f / 256.0f)); + norm += w; + sum += w * channel[idx]; + } + + output[i] = sum / norm; + } +} + + +void FloatImage::flipX() +{ + const uint w = m_width; + const uint h = m_height; + const uint d = m_depth; + const uint w2 = w / 2; + + for (uint c = 0; c < m_componentCount; c++) { + for (uint z = 0; z < d; z++) { + for (uint y = 0; y < h; y++) { + float * line = scanline(c, y, z); + for (uint x = 0; x < w2; x++) { + swap(line[x], line[w - 1 - x]); + } + } + } + } +} + +void FloatImage::flipY() +{ + const uint w = m_width; + const uint h = m_height; + const uint d = m_depth; + const uint h2 = h / 2; + + for (uint c = 0; c < m_componentCount; c++) { + for (uint z = 0; z < d; z++) { + for (uint y = 0; y < h2; y++) { + float * src = scanline(c, y, z); + float * dst = scanline(c, h - 1 - y, z); + for (uint x = 0; x < w; x++) { + swap(src[x], dst[x]); + } + } + } + } +} + +void FloatImage::flipZ() +{ + const uint w = m_width; + const uint h = m_height; + const uint d = m_depth; + const uint d2 = d / 2; + + for (uint c = 0; c < m_componentCount; c++) { + for (uint z = 0; z < d2; z++) { + float * src = plane(c, z); + float * dst = plane(c, d - 1 - z); + for (uint i = 0; i < w*h; i++) { + swap(src[i], dst[i]); + } + } + } +} + + + +float FloatImage::alphaTestCoverage(float alphaRef, int alphaChannel, float alphaScale/*=1*/) const +{ + const uint w = m_width; + const uint h = m_height; + + float coverage = 0.0f; + +#if 0 + const float * alpha = channel(alphaChannel); + + const uint count = m_pixelCount; + for (uint i = 0; i < count; i++) { + if (alpha[i] > alphaRef) coverage += 1.0f; // @@ gt or lt? 
+ } + + return coverage / float(w * h); +#else + const uint n = 8; + + // If we want subsampling: + for (uint y = 0; y < h-1; y++) { + for (uint x = 0; x < w-1; x++) { + + float alpha00 = nv::saturate(pixel(alphaChannel, x+0, y+0, 0) * alphaScale); + float alpha10 = nv::saturate(pixel(alphaChannel, x+1, y+0, 0) * alphaScale); + float alpha01 = nv::saturate(pixel(alphaChannel, x+0, y+1, 0) * alphaScale); + float alpha11 = nv::saturate(pixel(alphaChannel, x+1, y+1, 0) * alphaScale); + + for (float fy = 0.5f/n; fy < 1.0f; fy++) { + for (float fx = 0.5f/n; fx < 1.0f; fx++) { + float alpha = alpha00 * (1 - fx) * (1 - fy) + alpha10 * fx * (1 - fy) + alpha01 * (1 - fx) * fy + alpha11 * fx * fy; + if (alpha > alphaRef) coverage += 1.0f; + } + } + } + } + + return coverage / float(w * h * n * n); +#endif +} + +void FloatImage::scaleAlphaToCoverage(float desiredCoverage, float alphaRef, int alphaChannel) +{ +#if 0 + float minAlphaRef = 0.0f; + float maxAlphaRef = 1.0f; + float midAlphaRef = 0.5f; + + // Determine desired scale using a binary search. Hardcoded to 8 steps max. + for (int i = 0; i < 10; i++) { + float currentCoverage = alphaTestCoverage(midAlphaRef, alphaChannel); + + if (currentCoverage > desiredCoverage) { + minAlphaRef = midAlphaRef; + } + else if (currentCoverage < desiredCoverage) { + maxAlphaRef = midAlphaRef; + } + else { + break; + } + + midAlphaRef = (minAlphaRef + maxAlphaRef) * 0.5f; + } + + float alphaScale = alphaRef / midAlphaRef; + + // Scale alpha channel. + scaleBias(alphaChannel, 1, alphaScale, 0.0f); + clamp(alphaChannel, 1, 0.0f, 1.0f); +#else + float minAlphaScale = 0.0f; + float maxAlphaScale = 4.0f; + float alphaScale = 1.0f; + + // Determine desired scale using a binary search. Hardcoded to 8 steps max. + for (int i = 0; i < 10; i++) { + float currentCoverage = alphaTestCoverage(alphaRef, alphaChannel, alphaScale); + + if (currentCoverage < desiredCoverage) { + minAlphaScale = alphaScale; + } + else if (currentCoverage > desiredCoverage) { + maxAlphaScale = alphaScale; + } + else { + break; + } + + alphaScale = (minAlphaScale + maxAlphaScale) * 0.5f; + } + + // Scale alpha channel. + scaleBias(alphaChannel, 1, alphaScale, 0.0f); + clamp(alphaChannel, 1, 0.0f, 1.0f); +#endif +#if _DEBUG + alphaTestCoverage(alphaRef, alphaChannel); +#endif } FloatImage* FloatImage::clone() const { - FloatImage* copy = new FloatImage(); - copy->m_width = m_width; - copy->m_height = m_height; - copy->m_componentNum = m_componentNum; - copy->m_count = m_count; - - if(m_mem) - { - copy->allocate(m_componentNum, m_width, m_height); - memcpy(copy->m_mem, m_mem, m_count * sizeof(float)); - } - - return copy; + FloatImage* copy = new FloatImage(); + + copy->allocate(m_componentCount, m_width, m_height, m_depth); + memcpy(copy->m_mem, m_mem, m_floatCount * sizeof(float)); + + return copy; } Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/HoleFilling.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/HoleFilling.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/HoleFilling.h @@ -1,96 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#ifndef NV_IMAGE_HOLEFILLING_H -#define NV_IMAGE_HOLEFILLING_H - -#include -#include - -namespace nv -{ - class FloatImage; - - /// Bit mask. 
- class BitMap - { - public: - BitMap(uint w, uint h) : - m_width(w), m_height(h), m_bitArray(w*h) - { - } - - const uint width() const { return m_width; } - const uint height() const { return m_height; } - - bool bitAt(uint x, uint y) const - { - nvDebugCheck(x < m_width && y < m_height); - return m_bitArray.bitAt(y * m_width + x); - } - bool bitAt(uint idx) const - { - return m_bitArray.bitAt(idx); - } - - void setBitAt(uint x, uint y) - { - nvDebugCheck(x < m_width && y < m_height); - m_bitArray.setBitAt(y * m_width + x); - } - void setBitAt(uint idx) - { - m_bitArray.setBitAt(idx); - } - - void clearBitAt(uint x, uint y) - { - nvDebugCheck(x < m_width && y < m_height); - m_bitArray.clearBitAt(y * m_width + x); - } - void clearBitAt(uint idx) - { - m_bitArray.clearBitAt(idx); - } - - void clearAll() - { - m_bitArray.clearAll(); - } - - void setAll() - { - m_bitArray.setAll(); - } - - void toggleAll() - { - m_bitArray.toggleAll(); - } - - friend void swap(BitMap & a, BitMap & b) - { - nvCheck(a.m_width == b.m_width); - nvCheck(a.m_height == b.m_height); - //swap(const_cast(a.m_width), const_cast(b.m_width)); - //swap(const_cast(a.m_height), const_cast(b.m_height)); - swap(a.m_bitArray, b.m_bitArray); - } - - private: - - const uint m_width; - const uint m_height; - BitArray m_bitArray; - - }; - - NVIMAGE_API void fillVoronoi(FloatImage * img, const BitMap * bmap); - NVIMAGE_API void fillBlur(FloatImage * img, const BitMap * bmap); - NVIMAGE_API void fillPullPush(FloatImage * img, const BitMap * bmap); - - NVIMAGE_API void fillExtrapolate(int passCount, FloatImage * img, BitMap * bmap); - NVIMAGE_API void fillQuadraticExtrapolate(int passCount, FloatImage * img, BitMap * bmap, int coverageIndex = -1); - -} // nv namespace - -#endif // NV_IMAGE_HOLEFILLING_H Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/HoleFilling.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/HoleFilling.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/HoleFilling.cpp @@ -1,753 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#include -#include - -#include - -#include -#include - -using namespace nv; - - -// This is a variation of Sapiro's inpainting method. -void nv::fillExtrapolate(int passCount, FloatImage * img, BitMap * bmap) -{ - nvCheck(img != NULL); - nvCheck(bmap != NULL); - - const int w = img->width(); - const int h = img->height(); - const int count = img->componentNum(); - - nvCheck(bmap->width() == uint(w)); - nvCheck(bmap->height() == uint(h)); - - AutoPtr newbmap(new BitMap(w, h)); - - for(int p = 0; p < passCount; p++) - { - for(int c = 0; c < count; c++) - { - float * channel = img->channel(c); - - for(int y = 0; y < h; y++) { - for(int x = 0; x < w; x++) { - - if (bmap->bitAt(x, y)) { - // Not a hole. 
- newbmap->setBitAt(x, y); - continue; - } - - const bool west = bmap->bitAt(img->indexClamp(x-1, y)); - const bool east = bmap->bitAt(img->indexClamp(x+1, y)); - const bool north = bmap->bitAt(img->indexClamp(x, y-1)); - const bool south = bmap->bitAt(img->indexClamp(x, y+1)); - const bool northwest = bmap->bitAt(img->indexClamp(x-1, y-1)); - const bool northeast = bmap->bitAt(img->indexClamp(x+1, y-1)); - const bool southwest = bmap->bitAt(img->indexClamp(x-1, y+1)); - const bool southeast = bmap->bitAt(img->indexClamp(x+1, y+1)); - - int num = west + east + north + south + northwest + northeast + southwest + southeast; - - if (num != 0) { - - float average = 0.0f; - if (num == 3 && west && northwest && southwest) { - average = channel[img->indexClamp(x-1, y)]; - } - else if (num == 3 && east && northeast && southeast) { - average = channel[img->indexClamp(x+1, y)]; - } - else if (num == 3 && north && northwest && northeast) { - average = channel[img->indexClamp(x, y-1)]; - } - else if (num == 3 && south && southwest && southeast) { - average = channel[img->indexClamp(x, y+1)]; - } - else { - float total = 0.0f; - if (west) { average += 1 * channel[img->indexClamp(x-1, y)]; total += 1; } - if (east) { average += 1 * channel[img->indexClamp(x+1, y)]; total += 1; } - if (north) { average += 1 * channel[img->indexClamp(x, y-1)]; total += 1; } - if (south) { average += 1 * channel[img->indexClamp(x, y+1)]; total += 1; } - - if (northwest) { average += channel[img->indexClamp(x-1, y-1)]; ++total; } - if (northeast) { average += channel[img->indexClamp(x+1, y-1)]; ++total; } - if (southwest) { average += channel[img->indexClamp(x-1, y+1)]; ++total; } - if (southeast) { average += channel[img->indexClamp(x+1, y+1)]; ++total; } - - average /= total; - } - - channel[img->indexClamp(x, y)] = average; - newbmap->setBitAt(x, y); - } - } - } - } - - // Update the bit mask. - swap(*newbmap, *bmap); - } -} - - -namespace { - - struct Neighbor { - uint16 x; - uint16 y; - uint32 d; - }; - - // Compute euclidean squared distance. - static uint dist( uint16 ax, uint16 ay, uint16 bx, uint16 by ) { - int dx = bx - ax; - int dy = by - ay; - return uint(dx*dx + dy*dy); - } - - // Check neighbour, this is the core of the EDT algorithm. - static void checkNeighbour( int x, int y, Neighbor * e, const Neighbor & n ) { - nvDebugCheck(e != NULL); - - uint d = dist( x, y, n.x, n.y ); - if( d < e->d ) { - e->x = n.x; - e->y = n.y; - e->d = d; - } - } - -} // namespace - -// Voronoi filling using EDT-4 -void nv::fillVoronoi(FloatImage * img, const BitMap * bmap) -{ - nvCheck(img != NULL); - - const int w = img->width(); - const int h = img->height(); - const int count = img->componentNum(); - - nvCheck(bmap->width() == uint(w)); - nvCheck(bmap->height() == uint(h)); - - Array edm; - edm.resize(w * h); - - int x, y; - int x0, x1, y0, y1; - - // Init edm. - for( y = 0; y < h; y++ ) { - for( x = 0; x < w; x++ ) { - if( bmap->bitAt(x, y) ) { - edm[y * w + x].x = x; - edm[y * w + x].y = y; - edm[y * w + x].d = 0; - } - else { - edm[y * w + x].x = w; - edm[y * w + x].y = h; - edm[y * w + x].d = w*w + h*h; - } - } - } - - // First pass. - for( y = 0; y < h; y++ ) { - for( x = 0; x < w; x++ ) { - x0 = clamp(x-1, 0, w-1); // @@ Wrap? 
- x1 = clamp(x+1, 0, w-1); - y0 = clamp(y-1, 0, h-1); - - Neighbor & e = edm[y * w + x]; - checkNeighbour(x, y, &e, edm[y0 * w + x0]); - checkNeighbour(x, y, &e, edm[y0 * w + x]); - checkNeighbour(x, y, &e, edm[y0 * w + x1]); - checkNeighbour(x, y, &e, edm[y * w + x0]); - } - - for( x = w-1; x >= 0; x-- ) { - x1 = clamp(x+1, 0, w-1); - - Neighbor & e = edm[y * w + x]; - checkNeighbour(x, y, &e, edm[y * w + x1]); - } - } - - // Third pass. - for( y = h-1; y >= 0; y-- ) { - for( x = w-1; x >= 0; x-- ) { - x0 = clamp(x-1, 0, w-1); - x1 = clamp(x+1, 0, w-1); - y1 = clamp(y+1, 0, h-1); - - Neighbor & e = edm[y * w + x]; - checkNeighbour(x, y, &e, edm[y * w + x1]); - checkNeighbour(x, y, &e, edm[y1 * w + x0]); - checkNeighbour(x, y, &e, edm[y1 * w + x]); - checkNeighbour(x, y, &e, edm[y1 * w + x1]); - } - - for( x = 0; x < w; x++ ) { - x0 = clamp(x-1, 0, w-1); - - Neighbor & e = edm[y * w + x]; - checkNeighbour(x, y, &e, edm[y * w + x0]); - } - } - - // Fill empty holes. - for( y = 0; y < h; y++ ) { - for( x = 0; x < w; x++ ) { - const int sx = edm[y * w + x].x; - const int sy = edm[y * w + x].y; - nvDebugCheck(sx < w && sy < h); - - if( sx != x || sy != y ) { - for(int c = 0; c < count; c++ ) { - img->setPixel(img->pixel(sx, sy, c), x, y, c); - } - } - } - } - -} - - -void nv::fillBlur(FloatImage * img, const BitMap * bmap) -{ - nvCheck(img != NULL); - - // @@ Apply a 3x3 kernel. -} - - -static bool downsample(const FloatImage * src, const BitMap * srcMask, const FloatImage ** _dst, const BitMap ** _dstMask) -{ - const uint w = src->width(); - const uint h = src->height(); - const uint count = src->componentNum(); - - // count holes in srcMask, return false if fully filled. - uint holes = 0; - for(uint y = 0; y < h; y++) { - for(uint x = 0; x < w; x++) { - holes += srcMask->bitAt(x, y) == 0; - } - } - if (holes == 0 || (w == 2 || h == 2)) { - // Stop when no holes or when the texture is very small. - return false; - } - - // Apply box filter to image and mask and return true. - const uint nw = w / 2; - const uint nh = h / 2; - - FloatImage * dst = new FloatImage(); - dst->allocate(count, nw, nh); - BitMap * dstMask = new BitMap(nw, nh); - - for(uint c = 0; c < count; c++) { - for(uint y = 0; y < nh; y++) { - for(uint x = 0; x < nw; x++) { - - const uint x0 = 2 * x + 0; - const uint x1 = 2 * x + 1; - const uint y0 = 2 * y + 0; - const uint y1 = 2 * y + 1; - - const float f0 = src->pixel(x0, y0, c); - const float f1 = src->pixel(x1, y0, c); - const float f2 = src->pixel(x0, y1, c); - const float f3 = src->pixel(x1, y1, c); - - const bool b0 = srcMask->bitAt(x0, y0); - const bool b1 = srcMask->bitAt(x1, y0); - const bool b2 = srcMask->bitAt(x0, y1); - const bool b3 = srcMask->bitAt(x1, y1); - - if (b0 || b1 || b2 || b3) { - // Set bit mask. - dstMask->setBitAt(x, y); - - // Set pixel. - float value = 0.0f; - int total = 0; - if (b0) { value += f0; total++; } - if (b1) { value += f1; total++; } - if (b2) { value += f2; total++; } - if (b3) { value += f3; total++; } - dst->setPixel(value / total, x, y, c); - } - } - } - } - - *_dst = dst; - *_dstMask = dstMask; - - return true; -} - -// This is the filter used in the Lumigraph paper. -void nv::fillPullPush(FloatImage * img, const BitMap * bmap) -{ - nvCheck(img != NULL); - - const uint count = img->componentNum(); - const uint w = img->width(); - const uint h = img->height(); - const uint num = log2(max(w,h)); - - // Build mipmap chain. 
- Array mipmaps(num); - Array mipmapMasks(num); - - mipmaps.append(img); - mipmapMasks.append(bmap); - - const FloatImage * current; - const BitMap * currentMask; - - // Compute mipmap chain. - while(downsample(mipmaps.back(), mipmapMasks.back(), ¤t, ¤tMask)) - { - mipmaps.append(current); - mipmapMasks.append(currentMask); - } - - // Sample mipmaps until non-hole is found. - for(uint y = 0; y < h; y++) { - for(uint x = 0; x < w; x++) { - - int sx = x; - int sy = y; - //float sx = x; - //float sy = y; - - const uint levelCount = mipmaps.count(); - for (uint l = 0; l < levelCount; l++) - { - //const float fx = sx / mipmaps[l]->width(); - //const float fy = sy / mipmaps[l]->height(); - - if (mipmapMasks[l]->bitAt(sx, sy)) - { - // Sample mipmaps[l](sx, sy) and copy to img(x, y) - for(uint c = 0; c < count; c++) { - //img->setPixel(mipmaps[l]->linear_clamp(fx, fy, c), x, y, c); - img->setPixel(mipmaps[l]->pixel(sx, sy, c), x, y, c); - } - break; - } - - sx /= 2; - sy /= 2; - } - } - } - - // Don't delete the original image and mask. - mipmaps[0] = NULL; - mipmapMasks[0] = NULL; - - // Delete the mipmaps. - deleteAll(mipmaps); - deleteAll(mipmapMasks); -} - - - -/* - -This Code is from Charles Bloom: - -DoPixelSeamFix -10-20-02 - -Looks in the 5x5 local neighborhood (LocalPixels) of the desired pixel to fill. -It tries to build a quadratic model of the neighborhood surface to use in -extrapolating. You need 5 pixels to establish a 2d quadratic curve. - -This is really just a nice generic way to extrapolate pixels. It also happens -to work great for seam-fixing. - -Note that I'm working on normals, but I treat them just as 3 scalars and normalize -at the end. To be more correct, I would work on the surface of a sphere, but that -just seems like way too much work. 
- -*/ - -struct LocalPixels -{ - // 5x5 neighborhood - // the center is at result - // index [y][x] - bool fill[5][5]; - float data[5][5]; - - mutable float result; - mutable float weight; - - bool Quad3SubH(float * pQ, int row) const - { - const bool * pFill = fill[row]; - const float * pDat = data[row]; - - if ( pFill[1] && pFill[2] && pFill[3] ) - { - // good row - *pQ = pDat[1] - 2.f * pDat[2] + pDat[3]; - return true; - } - else if ( pFill[0] && pFill[1] && pFill[2] ) - { - // good row - *pQ = pDat[0] - 2.f * pDat[1] + pDat[2]; - return true; - } - else if ( pFill[2] && pFill[3] && pFill[4] ) - { - // good row - *pQ = pDat[2] - 2.f * pDat[3] + pDat[4]; - return true; - } - return false; - } - - // improve result with a horizontal quad in row 1 and/or - bool Quad3SubV(float * pQ, int col) const - { - if ( fill[1][col] && fill[2][col] && fill[3][col] ) - { - // good row - *pQ = data[1][col] - 2.f * data[2][col] + data[3][col]; - return true; - } - else if ( fill[0][col] && fill[1][col] && fill[2][col] ) - { - // good row - *pQ = data[0][col] - 2.f * data[1][col] + data[2][col]; - return true; - } - else if ( fill[2][col] && fill[3][col] && fill[4][col] ) - { - // good row - *pQ = data[2][col] - 2.f * data[3][col] + data[4][col]; - return true; - } - return false; - } - - bool Quad3H(float * pQ) const - { - if (!Quad3SubH(pQ,1)) - { - return Quad3SubH(pQ,3); - } - float q = 0.0f; // initializer not needed, just make it shut up - if (Quad3SubH(&q, 3)) - { - // got q and pQ - *pQ = (*pQ+q)*0.5f; - } - return true; - } - - bool Quad3V(float * pQ) const - { - if (!Quad3SubV(pQ, 1)) - { - return Quad3SubV(pQ, 3); - } - float q = 0.0f; // initializer not needed, just make it shut up - if (Quad3SubV(&q, 3)) - { - // got q and pQ - *pQ = (*pQ + q) * 0.5f; - } - return true; - } - // Quad returns ([0]+[2] - 2.f*[1]) - // a common want is [1] - ([0]+[2])*0.5f ; - // so use -0.5f*Quad - - bool tryQuads() const - { - bool res = false; - - // look for a pair that straddles the middle: - if ( fill[2][1] && fill[2][3] ) - { - // got horizontal straddle - float q; - if ( Quad3H(&q) ) - { - result += (data[2][1] + data[2][3] - q) * 0.5f; - weight += 1.f; - res = true; - } - } - if ( fill[1][2] && fill[3][2] ) - { - // got vertical straddle - float q; - if ( Quad3V(&q) ) - { - result += (data[1][2] + data[3][2] - q) * 0.5f; - weight += 1.f; - res = true; - } - } - - // look for pairs that lead into the middle : - if ( fill[2][0] && fill[2][1] ) - { - // got left-side pair - float q; - if ( Quad3H(&q) ) - { - result += data[2][1]*2.f - data[2][0] + q; - weight += 1.f; - res = true; - } - } - if ( fill[2][3] && fill[2][4] ) - { - // got right-side pair - float q; - if ( Quad3H(&q) ) - { - result += data[2][3]*2.f - data[2][4] + q; - weight += 1.f; - res = true; - } - } - if ( fill[0][2] && fill[1][2] ) - { - // got left-side pair - float q; - if ( Quad3V(&q) ) - { - result += data[1][2]*2.f - data[0][2] + q; - weight += 1.f; - res = true; - } - } - if ( fill[3][2] && fill[4][2] ) - { - // got right-side pair - float q; - if ( Quad3V(&q) ) - { - result += data[3][2]*2.f - data[4][2] + q; - weight += 1.f; - res = true; - } - } - return res; - } - - bool tryPlanar() const - { - // four cases : - const int indices[] = - { - 2,1, 1,2, 1,1, - 2,1, 3,2, 3,1, - 2,3, 1,2, 1,3, - 2,3, 3,2, 3,3 - }; - bool res = false; - for (int i = 0; i < 4; i++) - { - const int * I = indices + i*6; - if (!fill[ I[0] ][ I[1] ]) - continue; - if (!fill[ I[2] ][ I[3] ]) - continue; - if (!fill[ I[4] ][ I[5] ]) - continue; - - result 
+= data[ I[0] ][ I[1] ] + data[ I[2] ][ I[3] ] - data[ I[4] ][ I[5] ]; - weight += 1.0f; - res = true; - } - return res; - } - - bool tryTwos() const - { - bool res = false; - - if (fill[2][1] && fill[2][3]) - { - result += (data[2][1] + data[2][3]) * 0.5f; - weight += 1.0f; - res = true; - } - if (fill[1][2] && fill[3][2]) - { - result += (data[1][2] + data[3][2]) * 0.5f; - weight += 1.0f; - res = true; - } - - // four side-rotates : - const int indices[] = - { - 2,1, 2,0, - 2,3, 2,4, - 1,2, 0,2, - 3,2, 4,2, - }; - for (int i = 0; i < 4; i++) - { - const int * I = indices + i*4; - if (!fill[ I[0] ][ I[1] ]) - continue; - if (!fill[ I[2] ][ I[3] ]) - continue; - - result += data[ I[0] ][ I[1] ]*2.0f - data[ I[2] ][ I[3] ]; - weight += 1.0f; - res = true; - } - - return res; - } - - bool doLocalPixelFill() const - { - result = 0.0f; - weight = 0.0f; - - if (tryQuads()) { - return true; - } - - if (tryPlanar()) { - return true; - } - - return tryTwos(); - } - -}; // struct LocalPixels - - - -// This is a quadratic extrapolation filter from Charles Bloom (DoPixelSeamFix). Used with his permission. -void nv::fillQuadraticExtrapolate(int passCount, FloatImage * img, BitMap * bmap, int coverageIndex /*= -1*/) -{ - nvCheck(passCount > 0); - nvCheck(img != NULL); - nvCheck(bmap != NULL); - - const int w = img->width(); - const int h = img->height(); - const int count = img->componentNum(); - - nvCheck(bmap->width() == uint(w)); - nvCheck(bmap->height() == uint(h)); - - AutoPtr newbmap( new BitMap(w, h) ); - - float * coverageChannel = NULL; - if (coverageIndex != -1) - { - coverageChannel = img->channel(coverageIndex); - } - - int firstChannel = -1; - - for (int p = 0; p < passCount; p++) - { - for (int c = 0; c < count; c++) - { - if (c == coverageIndex) continue; - if (firstChannel == -1) firstChannel = c; - - float * channel = img->channel(c); - - for (int yb = 0; yb < h; yb++) { - for (int xb = 0; xb < w; xb++) { - - if (bmap->bitAt(xb, yb)) { - // Not a hole. - newbmap->setBitAt(xb, yb); - continue; - } - - int numFill = 0; - - LocalPixels lp; - for (int ny = 0; ny < 5; ny++) - { - int y = (yb + ny - 2); - if ( y < 0 || y >= h ) - { - // out of range - for(int i = 0; i < 5; i++) - { - lp.fill[ny][i] = false; - } - continue; - } - - for (int nx = 0; nx < 5; nx++) - { - int x = (xb + nx - 2); - if (x < 0 || x >= w) - { - lp.fill[ny][nx] = false; - } - else - { - int idx = img->index(x, y); - if (!bmap->bitAt(idx)) - { - lp.fill[ny][nx] = false; - } - else - { - lp.fill[ny][nx] = true; - lp.data[ny][nx] = channel[idx]; - numFill++; - } - } - } - } - - // need at least 3 to do anything decent - if (numFill < 2) - continue; - - nvDebugCheck(lp.fill[2][2] == false); - - if (lp.doLocalPixelFill()) - { - const int idx = img->index(xb, yb); - channel[idx] = lp.result / lp.weight; - - if (c == firstChannel) - { - //coverageChannel[idx] /= lp.weight; // @@ Not sure what this was for, coverageChannel[idx] is always zero. - newbmap->setBitAt(xb, yb); - } - } - } - } - } - - // Update the bit mask. 
- swap(*newbmap, *bmap); - } -} Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/Image.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/Image.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/Image.h @@ -1,81 +1,90 @@ // This code is in the public domain -- castanyo@yahoo.es +#pragma once #ifndef NV_IMAGE_IMAGE_H #define NV_IMAGE_IMAGE_H -#include -#include +#include "nvimage.h" +#include "nvcore/Debug.h" + +#if NV_USE_ALTIVEC +#undef pixel +#endif namespace nv { - class Color32; - - /// 32 bit RGBA image. - class NVIMAGE_CLASS Image - { - public: - - enum Format - { - Format_RGB, - Format_ARGB, - }; - - Image(); - Image(const Image & img); - ~Image(); - - const Image & operator=(const Image & img); - - - void allocate(uint w, uint h); - bool load(const char * name); - - void wrap(void * data, uint w, uint h); - void unwrap(); - - uint width() const; - uint height() const; - - const Color32 * scanline(uint h) const; - Color32 * scanline(uint h); - - const Color32 * pixels() const; - Color32 * pixels(); - - const Color32 & pixel(uint idx) const; - Color32 & pixel(uint idx); - - const Color32 & pixel(uint x, uint y) const; - Color32 & pixel(uint x, uint y); - - Format format() const; - void setFormat(Format f); - - void fill(Color32 c); - - private: - void free(); - - private: - uint m_width; - uint m_height; - Format m_format; - Color32 * m_data; - }; - - - inline const Color32 & Image::pixel(uint x, uint y) const - { - nvDebugCheck(x < width() && y < height()); - return pixel(y * width() + x); - } - - inline Color32 & Image::pixel(uint x, uint y) - { - nvDebugCheck(x < width() && y < height()); - return pixel(y * width() + x); - } + class Color32; + + /// 32 bit RGBA image. 
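+	/// The image optionally carries a depth for volume textures: allocate(), resize()
+	/// and wrap() take a depth argument that defaults to 1, pixel(x, y, z) defaults z
+	/// to 0, and texels are addressed in the flat array at (z * height + y) * width + x.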
+ class NVIMAGE_CLASS Image + { + public: + + enum Format + { + Format_RGB, + Format_ARGB, + }; + + Image(); + Image(const Image & img); + ~Image(); + + const Image & operator=(const Image & img); + + + void allocate(uint w, uint h, uint d = 1); + bool load(const char * name); + + void resize(uint w, uint h, uint d = 1); + + void wrap(void * data, uint w, uint h, uint d = 1); + void unwrap(); + + uint width() const; + uint height() const; + uint depth() const; + + const Color32 * scanline(uint h) const; + Color32 * scanline(uint h); + + const Color32 * pixels() const; + Color32 * pixels(); + + const Color32 & pixel(uint idx) const; + Color32 & pixel(uint idx); + + const Color32 & pixel(uint x, uint y, uint z = 0) const; + Color32 & pixel(uint x, uint y, uint z = 0); + + Format format() const; + void setFormat(Format f); + + void fill(Color32 c); + + private: + void free(); + + private: + uint m_width; + uint m_height; + uint m_depth; + Format m_format; + Color32 * m_data; + }; + + + inline const Color32 & Image::pixel(uint x, uint y, uint z) const + { + nvDebugCheck(x < m_width && y < m_height && z < m_depth); + return pixel((z * m_height + y) * m_width + x); + } + + inline Color32 & Image::pixel(uint x, uint y, uint z) + { + nvDebugCheck(x < m_width && y < m_height && z < m_depth); + return pixel((z * m_height + y) * m_width + x); + } } // nv namespace Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/Image.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/Image.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/Image.cpp @@ -1,12 +1,16 @@ // This code is in the public domain -- castanyo@yahoo.es -#include -#include +#include "Image.h" +#include "ImageIO.h" -#include +#include "nvmath/Color.h" -#include -#include +#include "nvcore/Debug.h" +#include "nvcore/Ptr.h" +#include "nvcore/Utils.h" // swap +#include "nvcore/Memory.h" // realloc, free + +#include // memcpy using namespace nv; @@ -17,133 +21,182 @@ Image::Image(const Image & img) : m_data(NULL) { - allocate(img.m_width, img.m_height); - m_format = img.m_format; - memcpy(m_data, img.m_data, sizeof(Color32) * m_width * m_height); + allocate(img.m_width, img.m_height, img.m_depth); + m_format = img.m_format; + memcpy(m_data, img.m_data, sizeof(Color32) * m_width * m_height * m_depth); } Image::~Image() { - free(); + free(); } const Image & Image::operator=(const Image & img) { - allocate(img.m_width, img.m_height); - m_format = img.m_format; - memcpy(m_data, img.m_data, sizeof(Color32) * m_width * m_height); - return *this; + allocate(img.m_width, img.m_height, m_depth); + m_format = img.m_format; + memcpy(m_data, img.m_data, sizeof(Color32) * m_width * m_height * m_depth); + return *this; } -void Image::allocate(uint w, uint h) -{ - m_width = w; - m_height = h; - m_data = (Color32 *)realloc(m_data, w * h * sizeof(Color32)); +void Image::allocate(uint w, uint h, uint d/*= 1*/) +{ + free(); + m_width = w; + m_height = h; + m_depth = d; + m_data = realloc(m_data, w * h * d); +} + +void Image::resize(uint w, uint h, uint d/*= 1*/) { + + Image img; + img.allocate(w, h, d); + + Color32 background(0,0,0,0); + + // Copy image. 
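+	// (Texels inside the overlap of the old and new extents are copied; everything
+	// outside the old image is filled with the transparent background color, and the
+	// member fields are swapped with the temporary image at the end.)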
+ uint x, y, z; + for(z = 0; z < min(d, m_depth); z++) { + for(y = 0; y < min(h, m_height); y++) { + for(x = 0; x < min(w, m_width); x++) { + img.pixel(x, y, z) = pixel(x, y, z); + } + for(; x < w; x++) { + img.pixel(x, y, z) = background; + } + } + for(; y < h; y++) { + for(x = 0; x < w; x++) { + img.pixel(x, y, z) = background; + } + } + } + for(; z < d; z++) { + for(y = 0; y < h; y++) { + for(x = 0; x < w; x++) { + img.pixel(x, y, z) = background; + } + } + } + + swap(m_width, img.m_width); + swap(m_height, img.m_height); + swap(m_depth, img.m_depth); + swap(m_format, img.m_format); + swap(m_data, img.m_data); } bool Image::load(const char * name) { - free(); - - AutoPtr img(ImageIO::load(name)); - if (img == NULL) { - return false; - } - - swap(m_width, img->m_width); - swap(m_height, img->m_height); - swap(m_format, img->m_format); - swap(m_data, img->m_data); - - return true; -} - -void Image::wrap(void * data, uint w, uint h) -{ - free(); - m_data = (Color32 *)data; - m_width = w; - m_height = h; + free(); + + AutoPtr img(ImageIO::load(name)); + if (img == NULL) { + return false; + } + + swap(m_width, img->m_width); + swap(m_height, img->m_height); + swap(m_depth, img->m_depth); + swap(m_format, img->m_format); + swap(m_data, img->m_data); + + return true; +} + +void Image::wrap(void * data, uint w, uint h, uint d) +{ + free(); + m_data = (Color32 *)data; + m_width = w; + m_height = h; + m_depth = d; } void Image::unwrap() { - m_data = NULL; - m_width = 0; - m_height = 0; + m_data = NULL; + m_width = 0; + m_height = 0; + m_depth = 0; } void Image::free() { - ::free(m_data); - m_data = NULL; + ::free(m_data); + m_data = NULL; } uint Image::width() const { - return m_width; + return m_width; } uint Image::height() const { - return m_height; + return m_height; +} + +uint Image::depth() const +{ + return m_depth; } const Color32 * Image::scanline(uint h) const { - nvDebugCheck(h < m_height); - return m_data + h * m_width; + nvDebugCheck(h < m_height); + return m_data + h * m_width; } Color32 * Image::scanline(uint h) { - nvDebugCheck(h < m_height); - return m_data + h * m_width; + nvDebugCheck(h < m_height); + return m_data + h * m_width; } const Color32 * Image::pixels() const { - return m_data; + return m_data; } Color32 * Image::pixels() { - return m_data; + return m_data; } const Color32 & Image::pixel(uint idx) const { - nvDebugCheck(idx < m_width * m_height); - return m_data[idx]; + nvDebugCheck(idx < m_width * m_height * m_depth); + return m_data[idx]; } Color32 & Image::pixel(uint idx) { - nvDebugCheck(idx < m_width * m_height); - return m_data[idx]; + nvDebugCheck(idx < m_width * m_height * m_depth); + return m_data[idx]; } Image::Format Image::format() const { - return m_format; + return m_format; } void Image::setFormat(Image::Format f) { - m_format = f; + m_format = f; } void Image::fill(Color32 c) { - const uint size = m_width * m_height; - for (uint i = 0; i < size; ++i) - { - m_data[i] = c; - } + const uint size = m_width * m_height * m_depth; + for (uint i = 0; i < size; ++i) + { + m_data[i] = c; + } } Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/ImageIO.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/ImageIO.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/ImageIO.h @@ -1,58 +1,36 @@ // This code is in the public domain -- castanyo@yahoo.es +#pragma once #ifndef NV_IMAGE_IMAGEIO_H #define NV_IMAGE_IMAGEIO_H -#include +#include "nvimage.h" + +#include "nvcore/StrLib.h" + namespace nv { 
- class Image; - class FloatImage; - class Stream; - - namespace ImageIO - { - NVIMAGE_API Image * load(const char * fileName); - NVIMAGE_API Image * load(const char * fileName, Stream & s); - - NVIMAGE_API FloatImage * loadFloat(const char * fileName); - NVIMAGE_API FloatImage * loadFloat(const char * fileName, Stream & s); - - NVIMAGE_API bool save(const char * fileName, Stream & s, Image * img); - NVIMAGE_API bool save(const char * fileName, Image * img); - NVIMAGE_API bool saveFloat(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components); - - NVIMAGE_API Image * loadTGA(Stream & s); - NVIMAGE_API bool saveTGA(Stream & s, const Image * img); - - NVIMAGE_API Image * loadPSD(Stream & s); - -#if defined(HAVE_PNG) - NVIMAGE_API Image * loadPNG(Stream & s); -#endif - -#if defined(HAVE_JPEG) - NVIMAGE_API Image * loadJPG(Stream & s); -#endif - -#if defined(HAVE_TIFF) - NVIMAGE_API FloatImage * loadFloatTIFF(const char * fileName, Stream & s); - - NVIMAGE_API bool saveFloatTIFF(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components); -#endif - -#if defined(HAVE_OPENEXR) - NVIMAGE_API FloatImage * loadFloatEXR(const char * fileName, Stream & s); - - NVIMAGE_API bool saveFloatEXR(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components); -#endif + class Image; + class FloatImage; + class Stream; + + namespace ImageIO + { + NVIMAGE_API Image * load(const char * fileName); + NVIMAGE_API Image * load(const char * fileName, Stream & s); + + NVIMAGE_API FloatImage * loadFloat(const char * fileName); + NVIMAGE_API FloatImage * loadFloat(const char * fileName, Stream & s); + + NVIMAGE_API bool save(const char * fileName, const Image * img, const char ** tags=NULL); // NULL terminated list. + NVIMAGE_API bool save(const char * fileName, Stream & s, const Image * img, const char ** tags=NULL); + + NVIMAGE_API bool saveFloat(const char * fileName, const FloatImage * fimage, uint baseComponent, uint componentCount); + NVIMAGE_API bool saveFloat(const char * fileName, Stream & s, const FloatImage * fimage, uint baseComponent, uint componentCount); - // NVIMAGE_API FloatImage * loadFloatPFM(const char * fileName, Stream & s); - // NVIMAGE_API bool saveFloatPFM(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components); + } // ImageIO namespace - } // ImageIO namespace - } // nv namespace Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/ImageIO.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/ImageIO.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/ImageIO.cpp @@ -1,752 +1,905 @@ // This code is in the public domain -- castanyo@yahoo.es -#include -#include -#include -#include -//#include // @@ Disable temporarily -#include - -#include - #include "ImageIO.h" #include "Image.h" #include "FloatImage.h" #include "TgaFile.h" #include "PsdFile.h" +#include "DirectDrawSurface.h" +#include "PixelFormat.h" + +#include "nvmath/Color.h" +#include "nvmath/Half.h" + +#include "nvcore/Ptr.h" +#include "nvcore/Utils.h" +#include "nvcore/Array.inl" +#include "nvcore/StrLib.h" +#include "nvcore/StdStream.h" +#include "nvcore/TextWriter.h" // Extern +#if defined(HAVE_FREEIMAGE) +# include +// If FreeImage available, do not use individual libraries, since that produces link conflicts in some platforms. 
+# undef HAVE_JPEG +# undef HAVE_PNG +# undef HAVE_TIFF +# undef HAVE_OPENEXR +#endif + #if defined(HAVE_JPEG) extern "C" { -# include +# include } #endif #if defined(HAVE_PNG) -# include +# include #endif #if defined(HAVE_TIFF) -# define _TIFF_DATA_TYPEDEFS_ -# include +# define _TIFF_DATA_TYPEDEFS_ +# include #endif #if defined(HAVE_OPENEXR) -# include -# include -# include -# include -# include -# include +# include +# include +# include +# include +# include +# include #endif +#if defined(HAVE_STBIMAGE) +# define STBI_NO_STDIO +# include +#endif + + using namespace nv; -namespace { - // Array of image load plugins. -// static HashMap s_plugin_load_map; - // Array of image save plugins. -// static HashMap s_plugin_save_map; - - struct Color555 { - uint16 b : 5; - uint16 g : 5; - uint16 r : 5; - }; - -} // namespace +struct Color555 { + uint16 b : 5; + uint16 g : 5; + uint16 r : 5; +}; + +// Load TGA image. +static Image * loadTGA(Stream & s) +{ + nvCheck(!s.isError()); + nvCheck(s.isLoading()); + + TgaHeader tga; + s << tga; + s.seek(TgaHeader::Size + tga.id_length); + + // Get header info. + bool rle = false; + bool pal = false; + bool rgb = false; + bool grey = false; + + switch( tga.image_type ) { + case TGA_TYPE_RLE_INDEXED: + rle = true; + // no break is intended! + case TGA_TYPE_INDEXED: + if( tga.colormap_type!=1 || tga.colormap_size!=24 || tga.colormap_length>256 ) { + nvDebug( "*** loadTGA: Error, only 24bit paletted images are supported.\n" ); + return NULL; + } + pal = true; + break; + case TGA_TYPE_RLE_RGB: + rle = true; + // no break is intended! + case TGA_TYPE_RGB: + rgb = true; + break; + + case TGA_TYPE_RLE_GREY: + rle = true; + // no break is intended! + case TGA_TYPE_GREY: + grey = true; + break; + + default: + nvDebug( "*** loadTGA: Error, unsupported image type.\n" ); + return NULL; + } -Image * nv::ImageIO::load(const char * fileName) -{ - nvDebugCheck(fileName != NULL); + const uint pixel_size = (tga.pixel_size/8); + nvDebugCheck(pixel_size <= 4); - StdInputStream stream(fileName); - - if (stream.isError()) { - return NULL; - } - - return ImageIO::load(fileName, stream); -} + const uint size = tga.width * tga.height * pixel_size; -Image * nv::ImageIO::load(const char * fileName, Stream & s) -{ - nvDebugCheck(fileName != NULL); - nvDebugCheck(s.isLoading()); - const char * extension = Path::extension(fileName); - - if (strCaseCmp(extension, ".tga") == 0) { - return ImageIO::loadTGA(s); - } -#if defined(HAVE_JPEG) - if (strCaseCmp(extension, ".jpg") == 0 || strCaseCmp(extension, ".jpeg") == 0) { - return loadJPG(s); - } -#endif -#if defined(HAVE_PNG) - if (strCaseCmp(extension, ".png") == 0) { - return loadPNG(s); - } -#endif - if (strCaseCmp(extension, ".psd") == 0) { - return loadPSD(s); + // Read palette + uint8 palette[768]; + if( pal ) { + nvDebugCheck(tga.colormap_length <= 256); + s.serialize(palette, 3 * tga.colormap_length); + } + + // Decode image. + uint8 * mem = new uint8[size]; + if( rle ) { + // Decompress image in src. + uint8 * dst = mem; + int num = size; + + while (num > 0) { + // Get packet header + uint8 c; + s << c; + + uint count = (c & 0x7f) + 1; + num -= count * pixel_size; + + if (c & 0x80) { + // RLE pixels. + uint8 pixel[4]; // uint8 pixel[pixel_size]; + s.serialize( pixel, pixel_size ); + do { + memcpy(dst, pixel, pixel_size); + dst += pixel_size; + } while (--count); + } + else { + // Raw pixels. + count *= pixel_size; + //file->Read8(dst, count); + s.serialize(dst, count); + dst += count; + } } - // @@ use image plugins? 
- return NULL; + } + else { + s.serialize(mem, size); + } + + // Allocate image. + AutoPtr img(new Image()); + img->allocate(tga.width, tga.height); + + int lstep; + Color32 * dst; + if( tga.flags & TGA_ORIGIN_UPPER ) { + lstep = tga.width; + dst = img->pixels(); + } + else { + lstep = - tga.width; + dst = img->pixels() + (tga.height-1) * tga.width; + } + + // Write image. + uint8 * src = mem; + if( pal ) { + for( int y = 0; y < tga.height; y++ ) { + for( int x = 0; x < tga.width; x++ ) { + uint8 idx = *src++; + dst[x].setBGRA(palette[3*idx+0], palette[3*idx+1], palette[3*idx+2], 0xFF); + } + dst += lstep; + } + } + else if( grey ) { + img->setFormat(Image::Format_ARGB); + + for( int y = 0; y < tga.height; y++ ) { + for( int x = 0; x < tga.width; x++ ) { + dst[x].setBGRA(*src, *src, *src, *src); + src++; + } + dst += lstep; + } + } + else { + + if( tga.pixel_size == 16 ) { + for( int y = 0; y < tga.height; y++ ) { + for( int x = 0; x < tga.width; x++ ) { + Color555 c = *reinterpret_cast(src); + uint8 b = (c.b << 3) | (c.b >> 2); + uint8 g = (c.g << 3) | (c.g >> 2); + uint8 r = (c.r << 3) | (c.r >> 2); + dst[x].setBGRA(b, g, r, 0xFF); + src += 2; + } + dst += lstep; + } + } + else if( tga.pixel_size == 24 ) { + for( int y = 0; y < tga.height; y++ ) { + for( int x = 0; x < tga.width; x++ ) { + dst[x].setBGRA(src[0], src[1], src[2], 0xFF); + src += 3; + } + dst += lstep; + } + } + else if( tga.pixel_size == 32 ) { + img->setFormat(Image::Format_ARGB); + + for( int y = 0; y < tga.height; y++ ) { + for( int x = 0; x < tga.width; x++ ) { + dst[x].setBGRA(src[0], src[1], src[2], src[3]); + src += 4; + } + dst += lstep; + } + } + } + + // free uncompressed data. + delete [] mem; + + return img.release(); } -bool nv::ImageIO::save(const char * fileName, Stream & s, Image * img) +// Save TGA image. +static bool saveTGA(Stream & s, const Image * img) { - nvDebugCheck(fileName != NULL); - nvDebugCheck(s.isSaving()); - nvDebugCheck(img != NULL); + nvCheck(!s.isError()); + nvCheck(img != NULL); + nvCheck(img->pixels() != NULL); + + TgaFile tga; + tga.head.id_length = 0; + tga.head.colormap_type = 0; + tga.head.image_type = TGA_TYPE_RGB; + + tga.head.colormap_index = 0; + tga.head.colormap_length = 0; + tga.head.colormap_size = 0; + + tga.head.x_origin = 0; + tga.head.y_origin = 0; + tga.head.width = img->width(); + tga.head.height = img->height(); + if(img->format() == Image::Format_ARGB) { + tga.head.pixel_size = 32; + tga.head.flags = TGA_ORIGIN_UPPER | TGA_HAS_ALPHA; + } + else { + tga.head.pixel_size = 24; + tga.head.flags = TGA_ORIGIN_UPPER; + } - const char * extension = Path::extension(fileName); + // @@ Serialize directly. 
+ tga.allocate(); - if (strCaseCmp(extension, ".tga") == 0) { - return ImageIO::saveTGA(s, img); - } + const uint n = img->width() * img->height(); + if(img->format() == Image::Format_ARGB) { + for(uint i = 0; i < n; i++) { + Color32 color = img->pixel(i); + tga.mem[4 * i + 0] = color.b; + tga.mem[4 * i + 1] = color.g; + tga.mem[4 * i + 2] = color.r; + tga.mem[4 * i + 3] = color.a; + } + } + else { + for(uint i = 0; i < n; i++) { + Color32 color = img->pixel(i); + tga.mem[3 * i + 0] = color.b; + tga.mem[3 * i + 1] = color.g; + tga.mem[3 * i + 2] = color.r; + } + } - return false; + s << tga; + + tga.free(); + + return true; } -bool nv::ImageIO::save(const char * fileName, Image * img) +/*static Image * loadPPM(Stream & s) { - nvDebugCheck(fileName != NULL); - nvDebugCheck(img != NULL); + // @@ + return NULL; +}*/ - StdOutputStream stream(fileName); - if (stream.isError()) - { - return false; - } +// Save PPM image. +static bool savePPM(Stream & s, const Image * img) +{ + //if (img->depth() != 1) return false; + //if (img->format() == Image::Format_ARGB) return false; - return ImageIO::save(fileName, stream, img); + uint w = img->width(); + uint h = img->height(); + + TextWriter writer(&s); + writer.format("P6\n"); + writer.format("%d %d\n", w, h); + writer.writeString("255\n"); + for (uint i = 0; i < w * h; i++) { + Color32 c = img->pixel(i); + s << (uint8_t&)c.r << (uint8_t&)c.g << (uint8_t&)c.b; + } + + return true; } -FloatImage * nv::ImageIO::loadFloat(const char * fileName) + +/*static FloatImage * loadFloatPFM(Stream & s) { - nvDebugCheck(fileName != NULL); + return NULL; +}*/ - StdInputStream stream(fileName); - - if (stream.isError()) { - return NULL; - } - - return loadFloat(fileName, stream); -} +/*static bool saveFloatPFM(Stream & s, const FloatImage * img, uint base_channel, uint channel_count) +{ + return false; +}*/ -FloatImage * nv::ImageIO::loadFloat(const char * fileName, Stream & s) +// Load PSD image. +static Image * loadPSD(Stream & s) { - nvDebugCheck(fileName != NULL); + nvCheck(!s.isError()); + nvCheck(s.isLoading()); - const char * extension = Path::extension(fileName); - -#if defined(HAVE_TIFF) - if (strCaseCmp(extension, ".tif") == 0 || strCaseCmp(extension, ".tiff") == 0) { - return loadFloatTIFF(fileName, s); - } -#endif -#if defined(HAVE_OPENEXR) - if (strCaseCmp(extension, ".exr") == 0) { - return loadFloatEXR(fileName, s); - } -#endif + s.setByteOrder(Stream::BigEndian); -/* // @@ Disable temporarily - if (strCaseCmp(extension, ".pfm") == 0) { - return loadFloatPFM(fileName, s); - } -*/ + PsdHeader header; + s << header; - return NULL; -} + if (!header.isValid()) + { + printf("invalid header!\n"); + return NULL; + } + if (!header.isSupported()) + { + printf("unsupported file!\n"); + return NULL; + } -bool nv::ImageIO::saveFloat(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components) -{ - const char * extension = Path::extension(fileName); + int tmp; -#if defined(HAVE_OPENEXR) - if (strCaseCmp(extension, ".exr") == 0) - { - return ImageIO::saveFloatEXR(fileName, fimage, base_component, num_components); - } -#endif + // Skip mode data. + s << tmp; + s.seek(s.tell() + tmp); + + // Skip image resources. + s << tmp; + s.seek(s.tell() + tmp); + + // Skip the reserved data. + s << tmp; + s.seek(s.tell() + tmp); + + // Find out if the data is compressed. + // Known values: + // 0: no compression + // 1: RLE compressed + uint16 compression; + s << compression; + + if (compression > 1) { + // Unknown compression type. 
+ return NULL; + } -#if defined(HAVE_TIFF) - if (strCaseCmp(extension, ".tif") == 0 || strCaseCmp(extension, ".tiff") == 0) - { - return ImageIO::saveFloatTIFF(fileName, fimage, base_component, num_components); - } -#endif + uint channel_num = header.channel_count; -/* // @@ Disable Temporarily - if (strCaseCmp(extension, ".pfm") == 0) - { -// return ImageIO::saveFloatPFM(fileName, fimage, base_component, num_components); - } -*/ + AutoPtr img(new Image()); + img->allocate(header.width, header.height); - if (num_components == 3 || num_components == 4) - { - AutoPtr image(fimage->createImage(base_component, num_components)); - nvCheck(image != NULL); + if (channel_num < 4) + { + // Clear the image. + img->fill(Color32(0, 0, 0, 0xFF)); + } + else + { + // Enable alpha. + img->setFormat(Image::Format_ARGB); - if (num_components == 4) - { - image->setFormat(Image::Format_ARGB); - } + // Ignore remaining channels. + channel_num = 4; + } - return ImageIO::save(fileName, image.ptr()); - } - return false; -} + const uint pixel_count = header.height * header.width; + + static const uint components[4] = {2, 1, 0, 3}; + + if (compression) + { + s.seek(s.tell() + header.height * header.channel_count * sizeof(uint16)); + + // Read RLE data. + for (uint channel = 0; channel < channel_num; channel++) + { + uint8 * ptr = (uint8 *)img->pixels() + components[channel]; + + uint count = 0; + while( count < pixel_count ) + { + if (s.isAtEnd()) return NULL; + + uint8 c; + s << c; + + uint len = c; + if (len < 128) + { + // Copy next len+1 bytes literally. + len++; + count += len; + if (count > pixel_count) return NULL; + + while (len != 0) + { + s << *ptr; + ptr += 4; + len--; + } + } + else if (len > 128) + { + // Next -len+1 bytes in the dest are replicated from next source byte. + // (Interpret len as a negative 8-bit int.) + len ^= 0xFF; + len += 2; + count += len; + if (s.isAtEnd() || count > pixel_count) return NULL; + + uint8 val; + s << val; + while( len != 0 ) { + *ptr = val; + ptr += 4; + len--; + } + } + else if( len == 128 ) { + // No-op. + } + } + } + } + else + { + // We're at the raw image data. It's each channel in order (Red, Green, Blue, Alpha, ...) + // where each channel consists of an 8-bit value for each pixel in the image. + + // Read the data by channel. + for (uint channel = 0; channel < channel_num; channel++) + { + uint8 * ptr = (uint8 *)img->pixels() + components[channel]; + + // Read the data. + uint count = pixel_count; + while (count != 0) + { + s << *ptr; + ptr += 4; + count--; + } + } + } + return img.release(); +} -/// Load TGA image. -Image * nv::ImageIO::loadTGA(Stream & s) +static FloatImage * loadFloatDDS(Stream & s) { - nvCheck(!s.isError()); - nvCheck(s.isLoading()); - - TgaHeader tga; - s << tga; - s.seek(TgaHeader::Size + tga.id_length); - - // Get header info. - bool rle = false; - bool pal = false; - bool rgb = false; - bool grey = false; - - switch( tga.image_type ) { - case TGA_TYPE_RLE_INDEXED: - rle = true; - // no break is intended! - case TGA_TYPE_INDEXED: - if( tga.colormap_type!=1 || tga.colormap_size!=24 || tga.colormap_length>256 ) { - nvDebug( "*** ImageIO::loadTGA: Error, only 24bit paletted images are supported.\n" ); - return NULL; - } - pal = true; - break; - - case TGA_TYPE_RLE_RGB: - rle = true; - // no break is intended! - case TGA_TYPE_RGB: - rgb = true; - break; - - case TGA_TYPE_RLE_GREY: - rle = true; - // no break is intended! 
- case TGA_TYPE_GREY: - grey = true; - break; - - default: - nvDebug( "*** ImageIO::loadTGA: Error, unsupported image type.\n" ); - return NULL; - } - - const uint pixel_size = (tga.pixel_size/8); - nvDebugCheck(pixel_size <= 4); - - const uint size = tga.width * tga.height * pixel_size; - - - // Read palette - uint8 palette[768]; - if( pal ) { - nvDebugCheck(tga.colormap_length < 256); - s.serialize(palette, 3 * tga.colormap_length); - } + nvCheck(s.isLoading()); + nvCheck(!s.isError()); - // Decode image. - uint8 * mem = new uint8[size]; - if( rle ) { - // Decompress image in src. - uint8 * dst = mem; - int num = size; - - while (num > 0) { - // Get packet header - uint8 c; - s << c; - - uint count = (c & 0x7f) + 1; - num -= count * pixel_size; - - if (c & 0x80) { - // RLE pixels. - uint8 pixel[4]; // uint8 pixel[pixel_size]; - s.serialize( pixel, pixel_size ); - do { - memcpy(dst, pixel, pixel_size); - dst += pixel_size; - } while (--count); - } - else { - // Raw pixels. - count *= pixel_size; - //file->Read8(dst, count); - s.serialize(dst, count); - dst += count; - } - } - } - else { - s.serialize(mem, size); - } + DDSHeader header; + s << header; - // Allocate image. - AutoPtr img(new Image()); - img->allocate(tga.width, tga.height); - - int lstep; - Color32 * dst; - if( tga.flags & TGA_ORIGIN_UPPER ) { - lstep = tga.width; - dst = img->pixels(); - } - else { - lstep = - tga.width; - dst = img->pixels() + (tga.height-1) * tga.width; - } + // @@ We only support a few formats for now. - // Write image. - uint8 * src = mem; - if( pal ) { - for( int y = 0; y < tga.height; y++ ) { - for( int x = 0; x < tga.width; x++ ) { - uint8 idx = *src++; - dst[x].setBGRA(palette[3*idx+0], palette[3*idx+1], palette[3*idx+2], 0xFF); - } - dst += lstep; - } - } - else if( grey ) { - img->setFormat(Image::Format_ARGB); - - for( int y = 0; y < tga.height; y++ ) { - for( int x = 0; x < tga.width; x++ ) { - dst[x].setBGRA(*src, *src, *src, *src); - src++; - } - dst += lstep; - } - } - else { - - if( tga.pixel_size == 16 ) { - for( int y = 0; y < tga.height; y++ ) { - for( int x = 0; x < tga.width; x++ ) { - Color555 c = *reinterpret_cast(src); - uint8 b = (c.b << 3) | (c.b >> 2); - uint8 g = (c.g << 3) | (c.g >> 2); - uint8 r = (c.r << 3) | (c.r >> 2); - dst[x].setBGRA(b, g, r, 0xFF); - src += 2; - } - dst += lstep; - } - } - else if( tga.pixel_size == 24 ) { - for( int y = 0; y < tga.height; y++ ) { - for( int x = 0; x < tga.width; x++ ) { - dst[x].setBGRA(src[0], src[1], src[2], 0xFF); - src += 3; - } - dst += lstep; - } - } - else if( tga.pixel_size == 32 ) { - img->setFormat(Image::Format_ARGB); - - for( int y = 0; y < tga.height; y++ ) { - for( int x = 0; x < tga.width; x++ ) { - dst[x].setBGRA(src[0], src[1], src[2], src[3]); - src += 4; - } - dst += lstep; - } - } - } + if (header.pf.fourcc == D3DFMT_A16B16G16R16F) { + const int size = header.width * header.height; + uint16 * const data = new uint16[size * 4]; + + //s.serialize(data, size * 4 * sizeof(uint16)); + for (int i = 0; i < 4* size; i++) { + s << data[i]; + } + + FloatImage * img = new FloatImage; + img->allocate(4, header.width, header.height); + + uint32 * r = (uint32 *)img->channel(0); + uint32 * g = (uint32 *)img->channel(1); + uint32 * b = (uint32 *)img->channel(2); + uint32 * a = (uint32 *)img->channel(3); + + uint16 * ptr = data; + for (int i = 0; i < size; i++) { + *r++ = half_to_float( *ptr++ ); + *g++ = half_to_float( *ptr++ ); + *b++ = half_to_float( *ptr++ ); + *a++ = half_to_float( *ptr++ ); + } + + delete [] data; + + return 
img; + } + else if (header.pf.fourcc == D3DFMT_R32F) { + const int size = header.width * header.height; + float * const data = new float[size]; + + for (int i = 0; i < size; i++) { + s << data[i]; + } + + FloatImage * img = new FloatImage; + img->allocate(4, header.width, header.height); + + float * r = img->channel(0); + + float * ptr = data; + for (int i = 0; i < size; i++) { + *r++ = *ptr++; + } + + delete [] data; + + img->clear(1, 0.0f); + img->clear(2, 0.0f); + img->clear(3, 1.0f); + + return img; + } + else if (header.pf.fourcc == D3DFMT_L16 || (header.pf.bitcount == 16 && header.pf.rmask == 0xFFFF && header.pf.gmask == 0 && header.pf.bmask == 0 && header.pf.amask == 0)) + { + const int size = header.width * header.height; + uint16 * const data = new uint16[size]; + + for (int i = 0; i < size; i++) { + s << data[i]; + } + + FloatImage * img = new FloatImage; + img->allocate(4, header.width, header.height); + + float * r = img->channel(0); + + uint16 * ptr = data; + for (int i = 0; i < size; i++) { + *r++ = float(*ptr++) / 65535.0f; + } + + delete [] data; + + img->clear(1, 0.0f); + img->clear(2, 0.0f); + img->clear(3, 1.0f); + + return img; + } + else if (header.pf.fourcc == D3DFMT_L8 || (header.pf.bitcount == 8 && header.pf.rmask == 0xFF && header.pf.gmask == 0 && header.pf.bmask == 0 && header.pf.amask == 0)) + { + const int size = header.width * header.height; + uint8 * const data = new uint8[size]; - // free uncompressed data. - delete [] mem; + s.serialize(data, size); - return img.release(); + FloatImage * img = new FloatImage; + img->allocate(4, header.width, header.height); + + float * r = img->channel(0); + + uint8 * ptr = data; + for (int i = 0; i < size; i++) { + *r++ = float(*ptr++) / 255.0f; + } + + delete [] data; + + img->clear(1, 0.0f); + img->clear(2, 0.0f); + img->clear(3, 1.0f); + + return img; + } + return NULL; } -/// Save TGA image. -bool nv::ImageIO::saveTGA(Stream & s, const Image * img) +static bool saveFloatDDS(Stream & s, const FloatImage * img, uint base_component, uint num_components) { - nvCheck(!s.isError()); - nvCheck(img != NULL); - nvCheck(img->pixels() != NULL); - - TgaFile tga; - tga.head.id_length = 0; - tga.head.colormap_type = 0; - tga.head.image_type = TGA_TYPE_RGB; - - tga.head.colormap_index = 0; - tga.head.colormap_length = 0; - tga.head.colormap_size = 0; - - tga.head.x_origin = 0; - tga.head.y_origin = 0; - tga.head.width = img->width(); - tga.head.height = img->height(); - if(img->format() == Image::Format_ARGB) { - tga.head.pixel_size = 32; - tga.head.flags = TGA_ORIGIN_UPPER | TGA_HAS_ALPHA; - } - else { - tga.head.pixel_size = 24; - tga.head.flags = TGA_ORIGIN_UPPER; - } + nvCheck(s.isSaving()); + nvCheck(!s.isError()); - // @@ Serialize directly. - tga.allocate(); + if (num_components != 4) return false; - const uint n = img->width() * img->height(); - if(img->format() == Image::Format_ARGB) { - for(uint i = 0; i < n; i++) { - Color32 color = img->pixel(i); - tga.mem[4 * i + 0] = color.b; - tga.mem[4 * i + 1] = color.g; - tga.mem[4 * i + 2] = color.r; - tga.mem[4 * i + 3] = color.a; - } - } - else { - for(uint i = 0; i < n; i++) { - Color32 color = img->pixel(i); - tga.mem[3 * i + 0] = color.b; - tga.mem[3 * i + 1] = color.g; - tga.mem[3 * i + 2] = color.r; - } - } + static const uint D3DFMT_A16B16G16R16F = 113; - s << tga; - - tga.free(); - - return true; -} - -/// Load PSD image. 
-Image * nv::ImageIO::loadPSD(Stream & s) -{ - nvCheck(!s.isError()); - nvCheck(s.isLoading()); - - s.setByteOrder(Stream::BigEndian); - - PsdHeader header; - s << header; - - if (!header.isValid()) - { - printf("invalid header!\n"); - return NULL; - } - - if (!header.isSupported()) - { - printf("unsupported file!\n"); - return NULL; - } - - int tmp; - - // Skip mode data. - s << tmp; - s.seek(s.tell() + tmp); - - // Skip image resources. - s << tmp; - s.seek(s.tell() + tmp); - - // Skip the reserved data. - s << tmp; - s.seek(s.tell() + tmp); - - // Find out if the data is compressed. - // Known values: - // 0: no compression - // 1: RLE compressed - uint16 compression; - s << compression; - - if (compression > 1) { - // Unknown compression type. - return NULL; - } - - uint channel_num = header.channel_count; - - AutoPtr img(new Image()); - img->allocate(header.width, header.height); - - if (channel_num < 4) - { - // Clear the image. - img->fill(Color32(0, 0, 0, 0xFF)); - } - else - { - // Enable alpha. - img->setFormat(Image::Format_ARGB); - - // Ignore remaining channels. - channel_num = 4; - } - - - const uint pixel_count = header.height * header.width; - - static const uint components[4] = {2, 1, 0, 3}; - - if (compression) - { - s.seek(s.tell() + header.height * header.channel_count * sizeof(uint16)); - - // Read RLE data. - for (uint channel = 0; channel < channel_num; channel++) - { - uint8 * ptr = (uint8 *)img->pixels() + components[channel]; - - uint count = 0; - while( count < pixel_count ) - { - if (s.isAtEnd()) return NULL; - - uint8 c; - s << c; - - uint len = c; - if (len < 128) - { - // Copy next len+1 bytes literally. - len++; - count += len; - if (count > pixel_count) return NULL; - - while (len != 0) - { - s << *ptr; - ptr += 4; - len--; - } - } - else if (len > 128) - { - // Next -len+1 bytes in the dest are replicated from next source byte. - // (Interpret len as a negative 8-bit int.) - len ^= 0xFF; - len += 2; - count += len; - if (s.isAtEnd() || count > pixel_count) return NULL; - - uint8 val; - s << val; - while( len != 0 ) { - *ptr = val; - ptr += 4; - len--; - } - } - else if( len == 128 ) { - // No-op. - } - } - } - } - else - { - // We're at the raw image data. It's each channel in order (Red, Green, Blue, Alpha, ...) - // where each channel consists of an 8-bit value for each pixel in the image. - - // Read the data by channel. - for (uint channel = 0; channel < channel_num; channel++) - { - uint8 * ptr = (uint8 *)img->pixels() + components[channel]; - - // Read the data. - uint count = pixel_count; - while (count != 0) - { - s << *ptr; - ptr += 4; - count--; - } - } - } + DDSHeader header; + header.setTexture2D(); + header.setWidth(img->width()); + header.setHeight(img->height()); + header.setFormatCode(D3DFMT_A16B16G16R16F); + // ... 
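+ // The pixel data written after the header is interleaved RGBA half floats, matching the format code set above.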
+ + s << header; - return img.release(); + uint32 * r = (uint32 *)img->channel(base_component + 0); + uint32 * g = (uint32 *)img->channel(base_component + 1); + uint32 * b = (uint32 *)img->channel(base_component + 2); + uint32 * a = (uint32 *)img->channel(base_component + 3); + + const uint size = img->width() * img->height(); + for (uint i = 0; i < size; i++) { + uint16 R = half_from_float( *r++ ); + uint16 G = half_from_float( *g++ ); + uint16 B = half_from_float( *b++ ); + uint16 A = half_from_float( *a++ ); + + s.serialize(&R, sizeof(uint16)); + s.serialize(&G, sizeof(uint16)); + s.serialize(&B, sizeof(uint16)); + s.serialize(&A, sizeof(uint16)); + } + + return true; } + #if defined(HAVE_PNG) static void user_read_data(png_structp png_ptr, png_bytep data, png_size_t length) { - nvDebugCheck(png_ptr != NULL); - - Stream * s = (Stream *)png_get_io_ptr(png_ptr); - s->serialize(data, (int)length); - - if (s->isError()) { - png_error(png_ptr, "Read Error"); - } + nvDebugCheck(png_ptr != NULL); + + Stream * s = (Stream *)png_get_io_ptr(png_ptr); + s->serialize(data, (int)length); + + if (s->isError()) { + png_error(png_ptr, "Read Error"); + } } -Image * nv::ImageIO::loadPNG(Stream & s) +static Image * loadPNG(Stream & s) { - nvCheck(!s.isError()); - - // Set up a read buffer and check the library version - png_structp png_ptr; - png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); - if (png_ptr == NULL) { - // nvDebug( "*** LoadPNG: Error allocating read buffer in file '%s'.\n", name ); - return NULL; - } + nvCheck(!s.isError()); - // Allocate/initialize a memory block for the image information - png_infop info_ptr = png_create_info_struct(png_ptr); - if (info_ptr == NULL) { - png_destroy_read_struct(&png_ptr, NULL, NULL); - // nvDebug( "*** LoadPNG: Error allocating image information for '%s'.\n", name ); - return NULL; - } + // Set up a read buffer and check the library version + png_structp png_ptr; + png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); + if (png_ptr == NULL) { + // nvDebug( "*** LoadPNG: Error allocating read buffer in file '%s'.\n", name ); + return NULL; + } - // Set up the error handling - if (setjmp(png_jmpbuf(png_ptr))) { - png_destroy_read_struct(&png_ptr, &info_ptr, NULL); - // nvDebug( "*** LoadPNG: Error reading png file '%s'.\n", name ); - return NULL; - } + // Allocate/initialize a memory block for the image information + png_infop info_ptr = png_create_info_struct(png_ptr); + if (info_ptr == NULL) { + png_destroy_read_struct(&png_ptr, NULL, NULL); + // nvDebug( "*** LoadPNG: Error allocating image information for '%s'.\n", name ); + return NULL; + } - // Set up the I/O functions. - png_set_read_fn(png_ptr, (void*)&s, user_read_data); + // Set up the error handling + if (setjmp(png_jmpbuf(png_ptr))) { + png_destroy_read_struct(&png_ptr, &info_ptr, NULL); + // nvDebug( "*** LoadPNG: Error reading png file '%s'.\n", name ); + return NULL; + } + // Set up the I/O functions. 
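+ // png_set_read_fn() makes libpng pull its input through the user_read_data callback above, so the PNG can be decoded from any Stream rather than a FILE*.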
+ png_set_read_fn(png_ptr, (void*)&s, user_read_data); - // Retrieve the image header information - png_uint_32 width, height; - int bit_depth, color_type, interlace_type; - png_read_info(png_ptr, info_ptr); - png_get_IHDR(png_ptr, info_ptr, &width, &height, &bit_depth, &color_type, &interlace_type, NULL, NULL); + // Retrieve the image header information + png_uint_32 width, height; + int bit_depth, color_type, interlace_type; + png_read_info(png_ptr, info_ptr); + png_get_IHDR(png_ptr, info_ptr, &width, &height, &bit_depth, &color_type, &interlace_type, NULL, NULL); - if (color_type == PNG_COLOR_TYPE_PALETTE && bit_depth <= 8) { - // Convert indexed images to RGB. - png_set_expand(png_ptr); - } - else if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) { - // Convert grayscale to RGB. - png_set_expand(png_ptr); - } - else if (png_get_valid(png_ptr, info_ptr, PNG_INFO_tRNS)) { - // Expand images with transparency to full alpha channels - // so the data will be available as RGBA quartets. - png_set_expand(png_ptr); - } - else if (bit_depth < 8) { - // If we have < 8 scale it up to 8. - //png_set_expand(png_ptr); - png_set_packing(png_ptr); - } - // Reduce bit depth. - if (bit_depth == 16) { - png_set_strip_16(png_ptr); - } + if (color_type == PNG_COLOR_TYPE_PALETTE && bit_depth <= 8) { + // Convert indexed images to RGB. + png_set_expand(png_ptr); + } + else if (color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8) { + // Convert grayscale to RGB. + png_set_expand(png_ptr); + } + else if (png_get_valid(png_ptr, info_ptr, PNG_INFO_tRNS)) { + // Expand images with transparency to full alpha channels + // so the data will be available as RGBA quartets. + png_set_expand(png_ptr); + } + else if (bit_depth < 8) { + // If we have < 8 scale it up to 8. + //png_set_expand(png_ptr); + png_set_packing(png_ptr); + } - // Represent gray as RGB - if (color_type == PNG_COLOR_TYPE_GRAY || color_type == PNG_COLOR_TYPE_GRAY_ALPHA) { - png_set_gray_to_rgb(png_ptr); - } + // Reduce bit depth. + if (bit_depth == 16) { + png_set_strip_16(png_ptr); + } - // Convert to RGBA filling alpha with 0xFF. - if (!(color_type & PNG_COLOR_MASK_ALPHA)) { - png_set_filler(png_ptr, 0xFF, PNG_FILLER_AFTER); - } + // Represent gray as RGB + if (color_type == PNG_COLOR_TYPE_GRAY || color_type == PNG_COLOR_TYPE_GRAY_ALPHA) { + png_set_gray_to_rgb(png_ptr); + } - // @todo Choose gamma according to the platform? - double screen_gamma = 2.2; - int intent; - if (png_get_sRGB(png_ptr, info_ptr, &intent)) { - png_set_gamma(png_ptr, screen_gamma, 0.45455); - } - else { - double image_gamma; - if (png_get_gAMA(png_ptr, info_ptr, &image_gamma)) { - png_set_gamma(png_ptr, screen_gamma, image_gamma); - } - else { - png_set_gamma(png_ptr, screen_gamma, 0.45455); - } - } + // Convert to RGBA filling alpha with 0xFF. + if (!(color_type & PNG_COLOR_MASK_ALPHA)) { + png_set_filler(png_ptr, 0xFF, PNG_FILLER_AFTER); + } - // Perform the selected transforms. - png_read_update_info(png_ptr, info_ptr); + // @todo Choose gamma according to the platform? + double screen_gamma = 2.2; + int intent; + if (png_get_sRGB(png_ptr, info_ptr, &intent)) { + png_set_gamma(png_ptr, screen_gamma, 0.45455); + } + else { + double image_gamma; + if (png_get_gAMA(png_ptr, info_ptr, &image_gamma)) { + png_set_gamma(png_ptr, screen_gamma, image_gamma); + } + else { + png_set_gamma(png_ptr, screen_gamma, 0.45455); + } + } - png_get_IHDR(png_ptr, info_ptr, &width, &height, &bit_depth, &color_type, &interlace_type, NULL, NULL); + // Perform the selected transforms. 
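+ // png_read_update_info() commits the transforms requested above so the info structure reflects the final row format.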
+ png_read_update_info(png_ptr, info_ptr); - AutoPtr img(new Image()); - img->allocate(width, height); + png_get_IHDR(png_ptr, info_ptr, &width, &height, &bit_depth, &color_type, &interlace_type, NULL, NULL); - // Set internal format flags. - if(color_type & PNG_COLOR_MASK_COLOR) { - //img->flags |= PI_IF_HAS_COLOR; - } - if(color_type & PNG_COLOR_MASK_ALPHA) { - //img->flags |= PI_IF_HAS_ALPHA; - img->setFormat(Image::Format_ARGB); - } + AutoPtr img(new Image()); + img->allocate(width, height); - // Read the image - uint8 * pixels = (uint8 *)img->pixels(); - png_bytep * row_data = new png_bytep[sizeof(png_byte) * height]; - for (uint i = 0; i < height; i++) { - row_data[i] = &(pixels[width * 4 * i]); - } + // Set internal format flags. + if(color_type & PNG_COLOR_MASK_COLOR) { + //img->flags |= PI_IF_HAS_COLOR; + } + if(color_type & PNG_COLOR_MASK_ALPHA) { + //img->flags |= PI_IF_HAS_ALPHA; + img->setFormat(Image::Format_ARGB); + } - png_read_image(png_ptr, row_data); - delete [] row_data; + // Read the image + uint8 * pixels = (uint8 *)img->pixels(); + png_bytep * row_data = new png_bytep[sizeof(png_byte) * height]; + for (uint i = 0; i < height; i++) { + row_data[i] = &(pixels[width * 4 * i]); + } - // Finish things up - png_read_end(png_ptr, info_ptr); - png_destroy_read_struct(&png_ptr, &info_ptr, NULL); - - // RGBA to BGRA. - uint num = width * height; - for(uint i = 0; i < num; i++) - { - Color32 c = img->pixel(i); - img->pixel(i) = Color32(c.b, c.g, c.r, c.a); - } - - // Compute alpha channel if needed. - /*if( img->flags & PI_IU_BUMPMAP || img->flags & PI_IU_ALPHAMAP ) { - if( img->flags & PI_IF_HAS_COLOR && !(img->flags & PI_IF_HAS_ALPHA)) { - img->ComputeAlphaFromColor(); - } - }*/ + png_read_image(png_ptr, row_data); + delete [] row_data; + + // Finish things up + png_read_end(png_ptr, info_ptr); + png_destroy_read_struct(&png_ptr, &info_ptr, NULL); + + // RGBA to BGRA. + uint num = width * height; + for(uint i = 0; i < num; i++) + { + Color32 c = img->pixel(i); + img->pixel(i) = Color32(c.b, c.g, c.r, c.a); + } + + // Compute alpha channel if needed. + /*if( img->flags & PI_IU_BUMPMAP || img->flags & PI_IU_ALPHAMAP ) { + if( img->flags & PI_IF_HAS_COLOR && !(img->flags & PI_IF_HAS_ALPHA)) { + img->ComputeAlphaFromColor(); + } + }*/ - return img.release(); + return img.release(); +} + +static void user_write_data(png_structp png_ptr, png_bytep data, png_size_t length) +{ + nvDebugCheck(png_ptr != NULL); + + Stream * s = (Stream *)png_get_io_ptr(png_ptr); + s->serialize(data, (int)length); + + if (s->isError()) { + png_error(png_ptr, "Write Error"); + } +} + +static void user_write_flush(png_structp png_ptr) { } + +static bool savePNG(Stream & s, const Image * img, const char ** tags/*=NULL*/) +{ + nvCheck(!s.isError()); + nvCheck(img != NULL); + nvCheck(img->pixels() != NULL); + + // Set up a write buffer and check the library version + png_structp png_ptr; + png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); + if (png_ptr == NULL) { + return false; + } + + // Allocate/initialize a memory block for the image information + png_infop info_ptr = png_create_info_struct(png_ptr); + if (info_ptr == NULL) { + png_destroy_write_struct(&png_ptr, NULL); + return false; + } + + // Set up the error handling + if (setjmp(png_jmpbuf(png_ptr))) { + png_destroy_write_struct(&png_ptr, &info_ptr); + return false; + } + + // Set up the I/O functions. 
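+ // png_set_write_fn() sends the encoded output through user_write_data; the user_write_flush callback above does nothing.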
+ png_set_write_fn(png_ptr, (void*)&s, user_write_data, user_write_flush); + + // Set image header information + int color_type = PNG_COLOR_TYPE_RGBA; + switch(img->format()) + { + case Image::Format_RGB: color_type = PNG_COLOR_TYPE_RGB; break; + case Image::Format_ARGB: color_type = PNG_COLOR_TYPE_RGBA; break; + } + png_set_IHDR(png_ptr, info_ptr, img->width(), img->height(), + 8, color_type, PNG_INTERLACE_NONE, + PNG_COMPRESSION_TYPE_DEFAULT, + PNG_FILTER_TYPE_DEFAULT); + + // Set image data + png_bytep * row_data = new png_bytep[sizeof(png_byte) * img->height()]; + for (uint i = 0; i < img->height(); i++) { + row_data[i] = (png_byte*)img->scanline (i); + if (img->format() == Image::Format_RGB) row_data[i]--; // This is a bit of a hack, libpng expects images in ARGB format not BGRA, it supports BGR swapping, but not alpha swapping. + } + png_set_rows(png_ptr, info_ptr, row_data); + + png_text * text = NULL; + if (tags != NULL) + { + int count = 0; + while(tags[2 * count] != NULL) count++; + + text = new png_text[count]; + memset(text, 0, count * sizeof(png_text); + + for (int i = 0; i < count; i++) { + text[i].compression = PNG_TEXT_COMPRESSION_NONE; + text[i].key = tags[2 * i + 0]; + text[i].text = tags[2 * i + 1]; + } + + png_set_text(png_ptr, info_ptr, text, count); + } + + png_write_png(png_ptr, info_ptr, + // component order is BGR(A) + PNG_TRANSFORM_BGR | + // Strip alpha byte for RGB images + (img->format() == Image::Format_RGB ? PNG_TRANSFORM_STRIP_FILLER : 0) + , NULL); + + // Finish things up + png_destroy_write_struct(&png_ptr, &info_ptr); + + delete [] row_data; + delete [] text; + + return true; } #endif // defined(HAVE_PNG) @@ -756,106 +909,106 @@ static void init_source (j_decompress_ptr /*cinfo*/){ } -static boolean fill_input_buffer (j_decompress_ptr cinfo){ - struct jpeg_source_mgr * src = cinfo->src; - static JOCTET FakeEOI[] = { 0xFF, JPEG_EOI }; - - // Generate warning - nvDebug("jpeglib: Premature end of file\n"); - - // Insert a fake EOI marker - src->next_input_byte = FakeEOI; - src->bytes_in_buffer = 2; +static boolean fill_input_buffer (j_decompress_ptr cinfo) { + struct jpeg_source_mgr * src = cinfo->src; + static JOCTET FakeEOI[] = { 0xFF, JPEG_EOI }; + + // Generate warning + nvDebug("jpeglib: Premature end of file\n"); + + // Insert a fake EOI marker + src->next_input_byte = FakeEOI; + src->bytes_in_buffer = 2; - return TRUE; + return TRUE; } static void skip_input_data (j_decompress_ptr cinfo, long num_bytes) { - struct jpeg_source_mgr * src = cinfo->src; + struct jpeg_source_mgr * src = cinfo->src; - if(num_bytes >= (long)src->bytes_in_buffer) { - fill_input_buffer(cinfo); - return; - } + if(num_bytes >= (long)src->bytes_in_buffer) { + fill_input_buffer(cinfo); + return; + } - src->bytes_in_buffer -= num_bytes; - src->next_input_byte += num_bytes; + src->bytes_in_buffer -= num_bytes; + src->next_input_byte += num_bytes; } static void term_source (j_decompress_ptr /*cinfo*/){ - // no work necessary here + // no work necessary here } -Image * nv::ImageIO::loadJPG(Stream & s) +static Image * loadJPG(Stream & s) { - nvCheck(!s.isError()); - - // Read the entire file. 
- Array byte_array; - byte_array.resize(s.size()); - s.serialize(byte_array.unsecureBuffer(), s.size()); - - jpeg_decompress_struct cinfo; - jpeg_error_mgr jerr; - - cinfo.err = jpeg_std_error(&jerr); - jpeg_create_decompress(&cinfo); - - cinfo.src = (struct jpeg_source_mgr *) (*cinfo.mem->alloc_small) - ((j_common_ptr) &cinfo, JPOOL_PERMANENT, sizeof(struct jpeg_source_mgr)); - cinfo.src->init_source = init_source; - cinfo.src->fill_input_buffer = fill_input_buffer; - cinfo.src->skip_input_data = skip_input_data; - cinfo.src->resync_to_restart = jpeg_resync_to_restart; // use default method - cinfo.src->term_source = term_source; - cinfo.src->bytes_in_buffer = byte_array.size(); - cinfo.src->next_input_byte = byte_array.buffer(); - - jpeg_read_header(&cinfo, TRUE); - jpeg_start_decompress(&cinfo); - - /* - cinfo.do_fancy_upsampling = FALSE; // fast decompression - cinfo.dct_method = JDCT_FLOAT; // Choose floating point DCT method. - */ - - uint8 * tmp_buffer = new uint8 [cinfo.output_width * cinfo.output_height * cinfo.num_components]; - uint8 * scanline = tmp_buffer; - - while( cinfo.output_scanline < cinfo.output_height ){ - int num_scanlines = jpeg_read_scanlines (&cinfo, &scanline, 1); - scanline += num_scanlines * cinfo.output_width * cinfo.num_components; - } + nvCheck(!s.isError()); - jpeg_finish_decompress(&cinfo); + // Read the entire file. + Array byte_array; + byte_array.resize(s.size()); + s.serialize(byte_array.buffer(), s.size()); - AutoPtr img(new Image()); - img->allocate(cinfo.output_width, cinfo.output_height); + jpeg_decompress_struct cinfo; + jpeg_error_mgr jerr; - Color32 * dst = img->pixels(); - const int size = img->height() * img->width(); - const uint8 * src = tmp_buffer; - - if( cinfo.num_components == 3 ) { - img->setFormat(Image::Format_RGB); - for( int i = 0; i < size; i++ ) { - *dst++ = Color32(src[0], src[1], src[2]); - src += 3; - } - } - else { - img->setFormat(Image::Format_ARGB); - for( int i = 0; i < size; i++ ) { - *dst++ = Color32(*src, *src, *src, *src); - src++; - } - } + cinfo.err = jpeg_std_error(&jerr); + jpeg_create_decompress(&cinfo); + + cinfo.src = (struct jpeg_source_mgr *) (*cinfo.mem->alloc_small) + ((j_common_ptr) &cinfo, JPOOL_PERMANENT, sizeof(struct jpeg_source_mgr)); + cinfo.src->init_source = init_source; + cinfo.src->fill_input_buffer = fill_input_buffer; + cinfo.src->skip_input_data = skip_input_data; + cinfo.src->resync_to_restart = jpeg_resync_to_restart; // use default method + cinfo.src->term_source = term_source; + cinfo.src->bytes_in_buffer = byte_array.size(); + cinfo.src->next_input_byte = byte_array.buffer(); + + jpeg_read_header(&cinfo, TRUE); + jpeg_start_decompress(&cinfo); + + /* + cinfo.do_fancy_upsampling = FALSE; // fast decompression + cinfo.dct_method = JDCT_FLOAT; // Choose floating point DCT method. 
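+ (Both lines stay commented out, so the decoder runs with libjpeg's default upsampling and DCT method.)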
+ */ + + uint8 * tmp_buffer = new uint8 [cinfo.output_width * cinfo.output_height * cinfo.num_components]; + uint8 * scanline = tmp_buffer; + + while( cinfo.output_scanline < cinfo.output_height ){ + int num_scanlines = jpeg_read_scanlines (&cinfo, &scanline, 1); + scanline += num_scanlines * cinfo.output_width * cinfo.num_components; + } - delete [] tmp_buffer; - jpeg_destroy_decompress (&cinfo); + jpeg_finish_decompress(&cinfo); - return img.release(); + AutoPtr img(new Image()); + img->allocate(cinfo.output_width, cinfo.output_height); + + Color32 * dst = img->pixels(); + const int size = img->height() * img->width(); + const uint8 * src = tmp_buffer; + + if( cinfo.num_components == 3 ) { + img->setFormat(Image::Format_RGB); + for( int i = 0; i < size; i++ ) { + *dst++ = Color32(src[0], src[1], src[2]); + src += 3; + } + } + else { + img->setFormat(Image::Format_ARGB); + for( int i = 0; i < size; i++ ) { + *dst++ = Color32(*src, *src, *src, *src); + src++; + } + } + + delete [] tmp_buffer; + jpeg_destroy_decompress (&cinfo); + + return img.release(); } #endif // defined(HAVE_JPEG) @@ -865,645 +1018,1005 @@ /* static tsize_t tiffReadWriteProc(thandle_t h, tdata_t ptr, tsize_t size) { - Stream * s = (Stream *)h; - nvDebugCheck(s != NULL); + Stream * s = (Stream *)h; + nvDebugCheck(s != NULL); - s->serialize(ptr, size); + s->serialize(ptr, size); - return size; + return size; } static toff_t tiffSeekProc(thandle_t h, toff_t offset, int whence) { - Stream * s = (Stream *)h; - nvDebugCheck(s != NULL); - - if (!s->isSeekable()) - { - return (toff_t)-1; - } + Stream * s = (Stream *)h; + nvDebugCheck(s != NULL); - if (whence == SEEK_SET) - { - s->seek(offset); - } - else if (whence == SEEK_CUR) - { - s->seek(s->tell() + offset); - } - else if (whence == SEEK_END) - { - s->seek(s->size() + offset); - } + if (!s->isSeekable()) + { + return (toff_t)-1; + } + + if (whence == SEEK_SET) + { + s->seek(offset); + } + else if (whence == SEEK_CUR) + { + s->seek(s->tell() + offset); + } + else if (whence == SEEK_END) + { + s->seek(s->size() + offset); + } - return s->tell(); + return s->tell(); } static int tiffCloseProc(thandle_t) { - return 0; + return 0; } static toff_t tiffSizeProc(thandle_t h) { - Stream * s = (Stream *)h; - nvDebugCheck(s != NULL); - return s->size(); + Stream * s = (Stream *)h; + nvDebugCheck(s != NULL); + return s->size(); } static int tiffMapFileProc(thandle_t, tdata_t*, toff_t*) { - // @@ TODO, Implement these functions. - return -1; + // @@ TODO, Implement these functions. + return -1; } static void tiffUnmapFileProc(thandle_t, tdata_t, toff_t) { - // @@ TODO, Implement these functions. + // @@ TODO, Implement these functions. 
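+ (These stream-based client callbacks are left disabled; loadFloatTIFF below opens the file by name with TIFFOpen instead.)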
} */ -FloatImage * nv::ImageIO::loadFloatTIFF(const char * fileName, Stream & s) +static FloatImage * loadFloatTIFF(const char * fileName, Stream & s) { - nvCheck(!s.isError()); - - TIFF * tif = TIFFOpen(fileName, "r"); - //TIFF * tif = TIFFClientOpen(fileName, "r", &s, tiffReadWriteProc, tiffReadWriteProc, tiffSeekProc, tiffCloseProc, tiffSizeProc, tiffMapFileProc, tiffUnmapFileProc); - - if (!tif) - { - nvDebug("Can't open '%s' for reading\n", fileName); - return NULL; - } - - ::uint16 spp, bpp, format; - ::uint32 width, height; - TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &height); - TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &width); - TIFFGetField(tif, TIFFTAG_BITSPERSAMPLE, &bpp); - TIFFGetField(tif, TIFFTAG_SAMPLESPERPIXEL, &spp); - TIFFGetField(tif, TIFFTAG_SAMPLEFORMAT, &format); - - if (bpp != 8 && bpp != 16 && bpp != 32) { - nvDebug("Can't load '%s', only 1 sample per pixel supported\n", fileName); - TIFFClose(tif); - return NULL; - } - - AutoPtr fimage(new FloatImage()); - fimage->allocate(spp, width, height); - - int linesize = TIFFScanlineSize(tif); - tdata_t buf = (::uint8 *)::malloc(linesize); - - for (uint y = 0; y < height; y++) - { - TIFFReadScanline(tif, buf, y, 0); - - for (uint c=0; cscanline(y, c); + nvCheck(!s.isError()); - for(uint x = 0; x < width; x++) - { - if (bpp == 8) - { - dst[x] = float(((::uint8 *)buf)[x*spp+c]) / float(0xFF); - } - else if (bpp == 16) - { - dst[x] = float(((::uint16 *)buf)[x*spp+c]) / float(0xFFFF); - } - else if (bpp == 32) - { - if (format==SAMPLEFORMAT_IEEEFP) - { - dst[x] = float(((float *)buf)[x*spp+c]); - } - else - { - dst[x] = float(((::uint32 *)buf)[x*spp+c] >> 8) / float(0xFFFFFF); - } + TIFF * tif = TIFFOpen(fileName, "r"); + //TIFF * tif = TIFFClientOpen(fileName, "r", &s, tiffReadWriteProc, tiffReadWriteProc, tiffSeekProc, tiffCloseProc, tiffSizeProc, tiffMapFileProc, tiffUnmapFileProc); - } + if (!tif) + { + nvDebug("Can't open '%s' for reading\n", fileName); + return NULL; + } - } - } - } + ::uint16 spp, bpp, format; + ::uint32 width, height; + TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &height); + TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &width); + TIFFGetField(tif, TIFFTAG_BITSPERSAMPLE, &bpp); + TIFFGetField(tif, TIFFTAG_SAMPLESPERPIXEL, &spp); + TIFFGetField(tif, TIFFTAG_SAMPLEFORMAT, &format); + + if (bpp != 8 && bpp != 16 && bpp != 32) { + nvDebug("Can't load '%s', only 1 sample per pixel supported\n", fileName); + TIFFClose(tif); + return NULL; + } - ::free(buf); - - TIFFClose(tif); - - return fimage.release(); -} - -bool nv::ImageIO::saveFloatTIFF(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components) -{ - nvCheck(fileName != NULL); - nvCheck(fimage != NULL); - nvCheck(base_component + num_components <= fimage->componentNum()); - - const int iW = fimage->width(); - const int iH = fimage->height(); - const int iC = num_components; + AutoPtr fimage(new FloatImage()); + fimage->allocate(spp, width, height); - TIFF * image = TIFFOpen(fileName, "w"); + int linesize = TIFFScanlineSize(tif); + tdata_t buf = malloc(linesize); - // Open the TIFF file - if (image == NULL) - { - nvDebug("Could not open '%s' for writing\n", fileName); - return false; - } + for (uint y = 0; y < height; y++) + { + TIFFReadScanline(tif, buf, y, 0); - TIFFSetField(image, TIFFTAG_IMAGEWIDTH, iW); - TIFFSetField(image, TIFFTAG_IMAGELENGTH, iH); - TIFFSetField(image, TIFFTAG_SAMPLESPERPIXEL, iC); - TIFFSetField(image, TIFFTAG_SAMPLEFORMAT, SAMPLEFORMAT_IEEEFP); - TIFFSetField(image, TIFFTAG_BITSPERSAMPLE, 32); - - uint32 
rowsperstrip = TIFFDefaultStripSize(image, (uint32)-1); - - TIFFSetField(image, TIFFTAG_ROWSPERSTRIP, rowsperstrip); - TIFFSetField(image, TIFFTAG_COMPRESSION, COMPRESSION_PACKBITS); - if (num_components == 3) + for (uint c=0; cscanline(y, c); - float * scanline = new float[iW * iC]; - for (int y = 0; y < iH; y++) - { - for (int c = 0; c < iC; c++) + for(uint x = 0; x < width; x++) + { + if (bpp == 8) + { + dst[x] = float(((::uint8 *)buf)[x*spp+c]) / float(0xFF); + } + else if (bpp == 16) { - const float * src = fimage->scanline(y, base_component + c); - for (int x = 0; x < iW; x++) scanline[x * iC + c] = src[x]; + dst[x] = float(((::uint16 *)buf)[x*spp+c]) / float(0xFFFF); } - if (TIFFWriteScanline(image, scanline, y, 0)==-1) + else if (bpp == 32) { - nvDebug("Error writing scanline %d\n", y); - return false; + if (format==SAMPLEFORMAT_IEEEFP) + { + dst[x] = float(((float *)buf)[x*spp+c]); + } + else + { + dst[x] = float(((::uint32 *)buf)[x*spp+c] >> 8) / float(0xFFFFFF); + } } + } } - delete [] scanline; + } - // Close the file - TIFFClose(image); - return true; -} + free(buf); -#endif + TIFFClose(tif); -#if defined(HAVE_OPENEXR) + return fimage.release(); +} -namespace +static bool saveFloatTIFF(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components) { - class ExrStream : public Imf::IStream - { - public: - ExrStream(const char * name, Stream & s) : Imf::IStream(name), m_stream(s) - { - nvDebugCheck(s.isLoading()); - } - - virtual bool read(char c[], int n) - { - m_stream.serialize(c, n); - - if (m_stream.isError()) - { - throw Iex::InputExc("I/O error."); - } - - return m_stream.isAtEnd(); - } - - virtual Imf::Int64 tellg() - { - return m_stream.tell(); - } - - virtual void seekg(Imf::Int64 pos) - { - m_stream.seek(pos); - } - - virtual void clear() - { - m_stream.clearError(); - } - - private: - Stream & m_stream; - }; + nvCheck(fileName != NULL); + nvCheck(fimage != NULL); + nvCheck(base_component + num_components <= fimage->componentCount()); -} // namespace + const int iW = fimage->width(); + const int iH = fimage->height(); + const int iC = num_components; + + TIFF * image = TIFFOpen(fileName, "w"); + + // Open the TIFF file + if (image == NULL) + { + nvDebug("Could not open '%s' for writing\n", fileName); + return false; + } + + TIFFSetField(image, TIFFTAG_IMAGEWIDTH, iW); + TIFFSetField(image, TIFFTAG_IMAGELENGTH, iH); + TIFFSetField(image, TIFFTAG_SAMPLESPERPIXEL, iC); + TIFFSetField(image, TIFFTAG_SAMPLEFORMAT, SAMPLEFORMAT_IEEEFP); + TIFFSetField(image, TIFFTAG_BITSPERSAMPLE, 32); + + uint32 rowsperstrip = TIFFDefaultStripSize(image, (uint32)-1); + + TIFFSetField(image, TIFFTAG_ROWSPERSTRIP, rowsperstrip); + TIFFSetField(image, TIFFTAG_COMPRESSION, COMPRESSION_PACKBITS); + if (num_components == 3) + { + // Set this so that it can be visualized with pfstools. 
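+ // (Only the three-component case sets a photometric tag here; other channel counts are written without one.)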
+ TIFFSetField(image, TIFFTAG_PHOTOMETRIC, PHOTOMETRIC_RGB); + } + TIFFSetField(image, TIFFTAG_ORIENTATION, ORIENTATION_TOPLEFT); + TIFFSetField(image, TIFFTAG_PLANARCONFIG, PLANARCONFIG_CONTIG); -FloatImage * nv::ImageIO::loadFloatEXR(const char * fileName, Stream & s) + float * scanline = new float[iW * iC]; + for (int y = 0; y < iH; y++) + { + for (int c = 0; c < iC; c++) + { + const float * src = fimage->scanline(y, base_component + c); + for (int x = 0; x < iW; x++) scanline[x * iC + c] = src[x]; + } + if (TIFFWriteScanline(image, scanline, y, 0)==-1) + { + nvDebug("Error writing scanline %d\n", y); + return false; + } + } + delete [] scanline; + + // Close the file + TIFFClose(image); + return true; +} + +#endif // defined(HAVE_TIFF) + +#if defined(HAVE_OPENEXR) + +namespace { - nvCheck(s.isLoading()); - nvCheck(!s.isError()); + class ExrStream : public Imf::IStream + { + public: + ExrStream(const char * name, Stream & s) : Imf::IStream(name), m_stream(s) + { + nvDebugCheck(s.isLoading()); + } - ExrStream stream(fileName, s); - Imf::InputFile inputFile(stream); + virtual bool read(char c[], int n) + { + m_stream.serialize(c, n); - Imath::Box2i box = inputFile.header().dataWindow(); + if (m_stream.isError()) + { + throw Iex::InputExc("I/O error."); + } - int width = box.max.x - box.min.y + 1; - int height = box.max.x - box.min.y + 1; + return m_stream.isAtEnd(); + } - const Imf::ChannelList & channels = inputFile.header().channels(); - - // Count channels. - uint channelCount= 0; - for (Imf::ChannelList::ConstIterator it = channels.begin(); it != channels.end(); ++it) + virtual Imf::Int64 tellg() { - channelCount++; + return m_stream.tell(); } - - // Allocate FloatImage. - AutoPtr fimage(new FloatImage()); - fimage->allocate(channelCount, width, height); - - // Describe image's layout with a framebuffer. - Imf::FrameBuffer frameBuffer; - uint i = 0; - for (Imf::ChannelList::ConstIterator it = channels.begin(); it != channels.end(); ++it, ++i) + + virtual void seekg(Imf::Int64 pos) { - frameBuffer.insert(it.name(), Imf::Slice(Imf::FLOAT, (char *)fimage->channel(i), sizeof(float), sizeof(float) * width)); + nvDebugCheck(pos >= 0 && pos < UINT_MAX); + m_stream.seek((uint)pos); } - - // Read it. 
- inputFile.setFrameBuffer (frameBuffer); - inputFile.readPixels (box.min.y, box.max.y); - - return fimage.release(); -} - -bool nv::ImageIO::saveFloatEXR(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components) -{ - nvCheck(fileName != NULL); - nvCheck(fimage != NULL); - nvCheck(base_component + num_components <= fimage->componentNum()); - nvCheck(num_components > 0 && num_components <= 4); - - const int w = fimage->width(); - const int h = fimage->height(); - - const char * channelNames[] = {"R", "G", "B", "A"}; - - Imf::Header header (w, h); - - for (uint c = 0; c < num_components; c++) + + virtual void clear() { - header.channels().insert(channelNames[c], Imf::Channel(Imf::FLOAT)); + m_stream.clearError(); } - + + private: + Stream & m_stream; + }; + + static int channelIndexFromName(const char* name) + { + char c = tolower(name[0]); + switch (c) + { + default: + case 'r': + return 0; + case 'g': + return 1; + case 'b': + return 2; + case 'a': + return 3; + } + } + +} // namespace + +static FloatImage * loadFloatEXR(const char * fileName, Stream & s) +{ + nvCheck(s.isLoading()); + nvCheck(!s.isError()); + + ExrStream stream(fileName, s); + Imf::InputFile inputFile(stream); + + Imath::Box2i box = inputFile.header().dataWindow(); + + int width = box.max.x - box.min.y + 1; + int height = box.max.x - box.min.y + 1; + + const Imf::ChannelList & channels = inputFile.header().channels(); + + // Count channels. + uint channelCount= 0; + for (Imf::ChannelList::ConstIterator it = channels.begin(); it != channels.end(); ++it) + { + channelCount++; + } + + // Allocate FloatImage. + AutoPtr fimage(new FloatImage()); + fimage->allocate(channelCount, width, height); + + // Describe image's layout with a framebuffer. + Imf::FrameBuffer frameBuffer; + uint i = 0; + for (Imf::ChannelList::ConstIterator it = channels.begin(); it != channels.end(); ++it, ++i) + { + int channelIndex = channelIndexFromName(it.name()); + frameBuffer.insert(it.name(), Imf::Slice(Imf::FLOAT, (char *)fimage->channel(channelIndex), sizeof(float), sizeof(float) * width)); + } + + // Read it. 
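+ // readPixels() fills every slice registered in the frame buffer for the rows of the data window.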
+ inputFile.setFrameBuffer (frameBuffer); + inputFile.readPixels (box.min.y, box.max.y); + + return fimage.release(); +} + +static bool saveFloatEXR(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components) +{ + nvCheck(fileName != NULL); + nvCheck(fimage != NULL); + nvCheck(base_component + num_components <= fimage->componentCount()); + nvCheck(num_components > 0 && num_components <= 4); + + const int w = fimage->width(); + const int h = fimage->height(); + + const char * channelNames[] = {"R", "G", "B", "A"}; + + Imf::Header header (w, h); + + for (uint c = 0; c < num_components; c++) + { + header.channels().insert(channelNames[c], Imf::Channel(Imf::FLOAT)); + } + Imf::OutputFile file(fileName, header); Imf::FrameBuffer frameBuffer; - - for (uint c = 0; c < num_components; c++) - { - char * channel = (char *) fimage->channel(base_component + c); - frameBuffer.insert(channelNames[c], Imf::Slice(Imf::FLOAT, channel, sizeof(float), sizeof(float) * w)); - } - - file.setFrameBuffer(frameBuffer); - file.writePixels(h); - - return true; + + for (uint c = 0; c < num_components; c++) + { + char * channel = (char *) fimage->channel(base_component + c); + frameBuffer.insert(channelNames[c], Imf::Slice(Imf::FLOAT, channel, sizeof(float), sizeof(float) * w)); + } + + file.setFrameBuffer(frameBuffer); + file.writePixels(h); + + return true; } #endif // defined(HAVE_OPENEXR) -#if 0 // @@ Disable temporarily. -FloatImage * nv::ImageIO::loadFloatPFM(const char * fileName, Stream & s) +#if defined(HAVE_FREEIMAGE) + +static unsigned DLL_CALLCONV ReadProc(void *buffer, unsigned size, unsigned count, fi_handle handle) { - nvCheck(s.isLoading()); - nvCheck(!s.isError()); + Stream * s = (Stream *) handle; + s->serialize(buffer, size * count); + return count; +} - Tokenizer parser(&s); +static unsigned DLL_CALLCONV WriteProc(void *buffer, unsigned size, unsigned count, fi_handle handle) +{ + Stream * s = (Stream *) handle; + s->serialize(buffer, size * count); + return count; +} - parser.nextToken(); +static int DLL_CALLCONV SeekProc(fi_handle handle, long offset, int origin) +{ + Stream * s = (Stream *) handle; - bool grayscale; - if (parser.token() == "PF") - { - grayscale = false; - } - else if (parser.token() == "Pf") - { - grayscale = true; - } - else - { - // Invalid file. - return NULL; - } + switch(origin) { + case SEEK_SET : + s->seek(offset); + break; + case SEEK_END : + s->seek(s->size() + offset); + break; + case SEEK_CUR : + s->seek(s->tell() + offset); + break; + default : + return 1; + } - parser.nextLine(); - - int width = parser.token().toInt(); parser.nextToken(); - int height = parser.token().toInt(); + return 0; +} - parser.nextLine(); +static long DLL_CALLCONV TellProc(fi_handle handle) +{ + Stream * s = (Stream *) handle; + return s->tell(); +} - float scaleFactor = parser.token().toFloat(); - if (scaleFactor >= 0) - { - s.setByteOrder(Stream::BigEndian); - } - else - { - s.setByteOrder(Stream::LittleEndian); - } - scaleFactor = fabsf(scaleFactor); +Image * nv::ImageIO::loadFreeImage(FREE_IMAGE_FORMAT fif, Stream & s) +{ + nvCheck(!s.isError()); - // Allocate image. 
- AutoPtr fimage(new FloatImage()); + FreeImageIO io; + io.read_proc = ReadProc; + io.write_proc = NULL; + io.seek_proc = SeekProc; + io.tell_proc = TellProc; - if (grayscale) - { - fimage->allocate(1, width, height); + FIBITMAP * bitmap = FreeImage_LoadFromHandle(fif, &io, (fi_handle)&s, 0); - float * channel = fimage->channel(0); + if (bitmap == NULL) + { + return NULL; + } - for (int i = 0; i < width * height; i++) - { - s << channel[i]; - } - } - else - { - fimage->allocate(3, width, height); + const int w = FreeImage_GetWidth(bitmap); + const int h = FreeImage_GetHeight(bitmap); - float * rchannel = fimage->channel(0); - float * gchannel = fimage->channel(1); - float * bchannel = fimage->channel(2); + if (FreeImage_GetImageType(bitmap) != FIT_BITMAP) + { + // @@ Use tone mapping? + FIBITMAP * tmp = FreeImage_ConvertToType(bitmap, FIT_BITMAP, true); + FreeImage_Unload(bitmap); + bitmap = tmp; + } - for (int i = 0; i < width * height; i++) - { - s << rchannel[i] << gchannel[i] << bchannel[i]; - } - } + nvDebugCheck(FreeImage_GetImageType(bitmap) == FIT_BITMAP); + if (FreeImage_GetBPP(bitmap) != 32) + { + FIBITMAP * tmp = FreeImage_ConvertTo32Bits(bitmap); + FreeImage_Unload(bitmap); + bitmap = tmp; + } + + + Image * image = new Image(); + image->allocate(w, h, 1); // freeimage can only load 2d images: + + // Copy the image over to our internal format, FreeImage has the scanlines bottom to top though. + for (int y=0; y < h; y++) + { + const void * src = FreeImage_GetScanLine(bitmap, h - y - 1); + void * dst = image->scanline(y); + + memcpy(dst, src, 4 * w); + } + + FreeImage_Unload(bitmap); - return fimage.release(); + return image; } -bool nv::ImageIO::saveFloatPFM(const char * fileName, const FloatImage * fimage, uint base_component, uint num_components) +FloatImage * nv::ImageIO::loadFloatFreeImage(FREE_IMAGE_FORMAT fif, Stream & s) { - nvCheck(fileName != NULL); - nvCheck(fimage != NULL); - nvCheck(fimage->componentNum() <= base_component + num_components); - nvCheck(num_components == 1 || num_components == 3); + nvCheck(!s.isError()); - StdOutputStream stream(fileName); - TextWriter writer(&stream); + FreeImageIO io; + io.read_proc = ReadProc; + io.write_proc = NULL; + io.seek_proc = SeekProc; + io.tell_proc = TellProc; - if (num_components == 1) writer.write("Pf\n"); - else /*if (num_components == 3)*/ writer.write("PF\n"); + FIBITMAP * bitmap = FreeImage_LoadFromHandle(fif, &io, (fi_handle)&s, 0); - int w = fimage->width(); - int h = fimage->height(); - writer.write("%d %d\n", w, h); - writer.write("%f\n", -1.0f); // little endian with 1.0 scale. 
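+ // (A null bitmap means FreeImage could not identify or decode the stream.)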
+ if (bitmap == NULL) + { + return NULL; + } - if (num_components == 1) - { - float * channel = const_cast(fimage->channel(0)); + const int w = FreeImage_GetWidth(bitmap); + const int h = FreeImage_GetHeight(bitmap); - for (int i = 0; i < w * h; i++) - { - stream << channel[i]; - } - } - else - { - float * rchannel = const_cast(fimage->channel(0)); - float * gchannel = const_cast(fimage->channel(1)); - float * bchannel = const_cast(fimage->channel(2)); + FREE_IMAGE_TYPE fit = FreeImage_GetImageType(bitmap); - for (int i = 0; i < w * h; i++) - { - stream << rchannel[i] << gchannel[i] << bchannel[i]; - } - } + FloatImage * floatImage = new FloatImage(); + + switch (fit) + { + case FIT_BITMAP: + floatImage->allocate(4, w, h); + { + FIBITMAP * tmp = FreeImage_ConvertTo32Bits(bitmap); + + uint bitcount = FreeImage_GetBPP(bitmap); + uint byteCount = bitcount / 8; + + for (int y=0; y < h; y++) + { + const Color32 * src = (const Color32 *)FreeImage_GetScanLine(bitmap, h - y - 1 ); + + float * r = floatImage->scanline(y, 0); + float * g = floatImage->scanline(y, 1); + float * b = floatImage->scanline(y, 2); + float * a = floatImage->scanline(y, 3); + + for (int x=0; x < w; x++) + { + r[x] = float(src[x].r) / 255.0f; + g[x] = float(src[x].g) / 255.0f; + b[x] = float(src[x].b) / 255.0f; + a[x] = float(src[x].a) / 255.0f; + } + + src += byteCount; + } - return true; + FreeImage_Unload(tmp); + } + break; + case FIT_FLOAT: + floatImage->allocate(1, w, h); + + for (int y=0; y < h; y++) + { + const float * src = (const float *)FreeImage_GetScanLine(bitmap, h - y - 1 ); + float * dst = floatImage->scanline(y, 0); + + for (int x=0; x < w; x++) + { + dst[x] = src[x]; + } + } + break; + case FIT_UINT16: + floatImage->allocate(1, w, h); + + for (int y=0; y < h; y++) + { + const uint16 * src = (const uint16 *)FreeImage_GetScanLine(bitmap, h - y - 1 ); + float * dst = floatImage->scanline(y, 0); + + for (int x=0; x < w; x++) + { + dst[x] = float(src[x]) / 65535; + } + } + break; + case FIT_COMPLEX: + floatImage->allocate(2, w, h); + + for (int y=0; y < h; y++) + { + const FICOMPLEX * src = (const FICOMPLEX *)FreeImage_GetScanLine(bitmap, h - y - 1 ); + + float * dst_real = floatImage->scanline(y, 0); + float * dst_imag = floatImage->scanline(y, 1); + + for (int x=0; x < w; x++) + { + dst_real[x] = (float)src[x].r; + dst_imag[x] = (float)src[x].i; + } + } + break; + case FIT_RGBF: + floatImage->allocate(3, w, h); + + for (int y=0; y < h; y++) + { + const FIRGBF * src = (const FIRGBF *)FreeImage_GetScanLine(bitmap, h - y - 1 ); + + float * dst_red = floatImage->scanline(y, 0); + float * dst_green = floatImage->scanline(y, 1); + float * dst_blue = floatImage->scanline(y, 2); + + for (int x=0; x < w; x++) + { + dst_red[x] = src[x].red; + dst_green[x] = src[x].green; + dst_blue[x] = src[x].blue; + } + } + break; + case FIT_RGBAF: + floatImage->allocate(4, w, h); + + for (int y=0; y < h; y++) + { + const FIRGBAF * src = (const FIRGBAF *)FreeImage_GetScanLine(bitmap, h - y - 1 ); + + float * dst_red = floatImage->scanline(y, 0); + float * dst_green = floatImage->scanline(y, 1); + float * dst_blue = floatImage->scanline(y, 2); + float * dst_alpha = floatImage->scanline(y, 3); + + for (int x=0; x < w; x++) + { + dst_red[x] = src[x].red; + dst_green[x] = src[x].green; + dst_blue[x] = src[x].blue; + dst_alpha[x] = src[x].alpha; + } + } + break; + default: + delete floatImage; + floatImage = NULL; + } + + FreeImage_Unload(bitmap); + + return floatImage; } -#endif +bool nv::ImageIO::saveFreeImage(FREE_IMAGE_FORMAT fif, 
Stream & s, const Image * img, const char ** tags) +{ + nvCheck(!s.isError()); -#if 0 + FreeImageIO io; + io.read_proc = NULL; + io.write_proc = WriteProc; + io.seek_proc = SeekProc; + io.tell_proc = TellProc; -/** Save PNG*/ -static bool SavePNG(const PiImage * img, const char * name) { - nvCheck( img != NULL ); - nvCheck( img->mem != NULL ); + const uint w = img->width(); + const uint h = img->height(); - if( piStrCmp(piExtension(name), ".png" ) != 0 ) { - return false; - } - - if( img->flags & PI_IT_CUBEMAP ) { - nvDebug("*** Cannot save cubemaps as PNG."); - return false; - } - if( img->flags & PI_IT_DDS ) { - nvDebug("*** Cannot save DDS surface as PNG."); - return false; - } + FIBITMAP * bitmap = FreeImage_Allocate(w, h, 32); - nvDebug( "--- Saving '%s'.\n", name ); - - PiAutoPtr ar( PiFileSystem::CreateFileWriter( name ) ); - if( ar == NULL ) { - nvDebug( "*** SavePNG: Error, cannot save file '%s'.\n", name ); - return false; - } + for (uint i = 0; i < h; i++) + { + uint8 * scanline = FreeImage_GetScanLine(bitmap, i); + memcpy(scanline, img->scanline(h - i - 1), w * sizeof(Color32)); + } -/* -public class PNGEnc { + if (tags != NULL) + { + #pragma NV_MESSAGE("TODO: Save image metadata") + //FreeImage_SetMetadata( + } - public static function encode(img:BitmapData):ByteArray { - // Create output byte array - var png:ByteArray = new ByteArray(); - // Write PNG signature - png.writeUnsignedInt(0x89504e47); - png.writeUnsignedInt(0x0D0A1A0A); - // Build IHDR chunk - var IHDR:ByteArray = new ByteArray(); - IHDR.writeInt(img.width); - IHDR.writeInt(img.height); - IHDR.writeUnsignedInt(0x08060000); // 32bit RGBA - IHDR.writeByte(0); - writeChunk(png,0x49484452,IHDR); - // Build IDAT chunk - var IDAT:ByteArray= new ByteArray(); - for(var i:int=0;i < img.height;i++) { - // no filter - IDAT.writeByte(0); - var p:uint; - if ( !img.transparent ) { - for(var j:int=0;j < img.width;j++) { - p = img.getPixel(j,i); - IDAT.writeUnsignedInt( - uint(((p&0xFFFFFF) << 8)|0xFF)); - } - } else { - for(var j:int=0;j < img.width;j++) { - p = img.getPixel32(j,i); - IDAT.writeUnsignedInt( - uint(((p&0xFFFFFF) << 8)| - (shr(p,24)))); - } + bool result = FreeImage_SaveToHandle(fif, bitmap, &io, (fi_handle)&s, 0) != 0; + + FreeImage_Unload(bitmap); + + return result; +} + +bool nv::ImageIO::saveFloatFreeImage(FREE_IMAGE_FORMAT fif, Stream & s, const FloatImage * img, uint baseComponent, uint componentCount) +{ + nvCheck(!s.isError()); + + FreeImageIO io; + io.read_proc = NULL; + io.write_proc = WriteProc; + io.seek_proc = SeekProc; + io.tell_proc = TellProc; + + const uint w = img->width(); + const uint h = img->height(); + + FREE_IMAGE_TYPE type; + if (componentCount == 1) + { + type = FIT_FLOAT; + } + else if (componentCount == 3) + { + type = FIT_RGBF; + } + else if (componentCount == 4) + { + type = FIT_RGBAF; + } + else { + return false; + } + + + FIBITMAP * bitmap = FreeImage_AllocateT(type, w, h); + + for (uint y = 0; y < h; y++) + { + float * scanline = (float *)FreeImage_GetScanLine(bitmap, y); + + for (uint x = 0; x < w; x++) + { + for (uint c = 0; c < componentCount; c++) + { + scanline[x * componentCount + c] = img->pixel(x, y, baseComponent + c); } } - IDAT.compress(); - writeChunk(png,0x49444154,IDAT); - // Build IEND chunk - writeChunk(png,0x49454E44,null); - // return PNG - return png; - } - - private static var crcTable:Array; - private static var crcTableComputed:Boolean = false; - - private static function writeChunk(png:ByteArray, - type:uint, data:ByteArray) { - if (!crcTableComputed) { - 
crcTableComputed = true; - crcTable = []; - for (var n:uint = 0; n < 256; n++) { - var c:uint = n; - for (var k:uint = 0; k < 8; k++) { - if (c & 1) { - c = uint(uint(0xedb88320) ^ - uint(c >>> 1)); - } else { - c = uint(c >>> 1); - } - } - crcTable[n] = c; + } + + bool result = FreeImage_SaveToHandle(fif, bitmap, &io, (fi_handle)&s, 0) != 0; + + FreeImage_Unload(bitmap); + + return result; +} + +#endif // defined(HAVE_FREEIMAGE) + + +#if defined(HAVE_STBIMAGE) + +static Image * loadSTB(Stream & s) +{ + // @@ Assumes stream cursor is at the beginning and that image occupies the whole stream. + const int size = s.size(); + uint8 * buffer = new uint8[size]; + + s.serialize(buffer, size); + + int w, h, n; + uint8 * data = stbi_load_from_memory(buffer, size, &w, &h, &n, 4); + + delete [] buffer; + + if (data != NULL) { + Image * img = new Image; + img->allocate(w, h); + img->setFormat(n == 4 ? Image::Format_ARGB : Image::Format_RGB); + + for (int y = 0; y < h; ++y) + { + nv::Color32* dest = img->scanline(y); + uint8* src = data + y * w * 4; + + for (int x = 0; x < w; ++x) + { + dest[x].r = src[x * 4 + 0]; + dest[x].g = src[x * 4 + 1]; + dest[x].b = src[x * 4 + 2]; + dest[x].a = src[x * 4 + 3]; + } + } + + free(data); + + return img; + } + + return NULL; +} + +static FloatImage * loadFloatSTB(Stream & s) +{ + // @@ Assumes stream cursor is at the beginning and that image occupies the whole stream. + const int size = s.size(); + uint8 * buffer = new uint8[size]; + + s.serialize(buffer, size); + + int w, h, n; + float * data = stbi_loadf_from_memory(buffer, size, &w, &h, &n, 0); + + delete [] buffer; + + // Copy to image. + if (data != NULL) { + FloatImage * img = new FloatImage; + img->allocate(n, w, h); + + const int count = w * h; + + for (int c = 0; c < n; c++) { + float * dst = img->channel(c); + + for (int i = 0; i < count; i++) { + dst[i] = data[i*n + c]; } } - var len:uint = 0; - if (data != null) { - len = data.length; - } - png.writeUnsignedInt(len); - var p:uint = png.position; - png.writeUnsignedInt(type); - if ( data != null ) { - png.writeBytes(data); - } - var e:uint = png.position; - png.position = p; - var c:uint = 0xffffffff; - for (var i:int = 0; i < (e-p); i++) { - c = uint(crcTable[ - (c ^ png.readUnsignedByte()) & - uint(0xff)] ^ uint(c >>> 8)); - } - c = uint(c^uint(0xffffffff)); - png.position = e; - png.writeUnsignedInt(c); + return img; } + + return NULL; } -*/ + +#endif // defined(HAVE_STBIMAGE) + + + + + +Image * nv::ImageIO::load(const char * fileName) +{ + nvDebugCheck(fileName != NULL); + + StdInputStream stream(fileName); + + if (stream.isError()) { + return NULL; + } + + return ImageIO::load(fileName, stream); } -#endif // 0 +Image * nv::ImageIO::load(const char * fileName, Stream & s) +{ + nvDebugCheck(fileName != NULL); + nvDebugCheck(s.isLoading()); + + const char * extension = Path::extension(fileName); -#if 0 + if (strCaseDiff(extension, ".tga") == 0) { + return loadTGA(s); + } + if (strCaseDiff(extension, ".psd") == 0) { + return loadPSD(s); + } -namespace ImageIO { + /*if (strCaseDiff(extension, ".ppm") == 0) { + return loadPPM(s); + }*/ - /** Init ImageIO plugins. 
*/ - void InitPlugins() { - // AddInputPlugin( "", LoadANY ); - AddInputPlugin( "tga", LoadTGA ); -#if HAVE_PNG - AddInputPlugin( "png", LoadPNG ); +#if defined(HAVE_JPEG) + if (strCaseDiff(extension, ".jpg") == 0 || strCaseDiff(extension, ".jpeg") == 0) { + return loadJPG(s); + } #endif -#if HAVE_JPEG - AddInputPlugin( "jpg", LoadJPG ); + +#if defined(HAVE_PNG) + if (strCaseDiff(extension, ".png") == 0) { + return loadPNG(s); + } #endif - AddInputPlugin( "dds", LoadDDS ); - - AddOutputPlugin( "tga", SaveTGA ); - } - - /** Reset ImageIO plugins. */ - void ResetPlugins() { - s_plugin_load_map.Clear(); - s_plugin_save_map.Clear(); - } - - /** Add an input plugin. */ - void AddInputPlugin( const char * ext, ImageInput_Plugin plugin ) { - s_plugin_load_map.Add(ext, plugin); - } - - /** Add an output plugin. */ - void AddOutputPlugin( const char * ext, ImageOutput_Plugin plugin ) { - s_plugin_save_map.Add(ext, plugin); - } - - bool Load(PiImage * img, const char * name, PiStream & stream) { - - // Get name extension. - const char * extension = piExtension(name); - - // Skip the dot. - if( *extension == '.' ) { - extension++; - } - - // Lookup plugin in the map. - ImageInput_Plugin plugin = NULL; - if( s_plugin_load_map.Get(extension, &plugin) ) { - return plugin(img, stream); - } - - /*foreach(i, s_plugin_load_map) { - nvDebug("%s %s %d\n", s_plugin_load_map[i].key.GetStr(), extension, 0 == strcmp(extension, s_plugin_load_map[i].key)); - } - - nvDebug("No plugin found for '%s' %d.\n", extension, s_plugin_load_map.Size());*/ - - return false; - } +#if defined(HAVE_FREEIMAGE) + FREE_IMAGE_FORMAT fif = FreeImage_GetFIFFromFilename(fileName); + if (fif != FIF_UNKNOWN && FreeImage_FIFSupportsReading(fif)) { + return loadFreeImage(fif, s); + } +#endif - bool Save(const PiImage * img, const char * name, PiStream & stream) { - - // Get name extension. - const char * extension = piExtension(name); - - // Skip the dot. - if( *extension == '.' ) { - extension++; - } - - // Lookup plugin in the map. 
- ImageOutput_Plugin plugin = NULL; - if( s_plugin_save_map.Get(extension, &plugin) ) { - return plugin(img, stream); - } - - return false; - } - -} // ImageIO +#if defined(HAVE_STBIMAGE) + return loadSTB(s); +#endif + + return NULL; +} + +bool nv::ImageIO::save(const char * fileName, Stream & s, const Image * img, const char ** tags/*=NULL*/) +{ + nvDebugCheck(fileName != NULL); + nvDebugCheck(s.isSaving()); + nvDebugCheck(img != NULL); + + const char * extension = Path::extension(fileName); + + if (strCaseDiff(extension, ".tga") == 0) { + return saveTGA(s, img); + } + + if (strCaseDiff(extension, ".ppm") == 0) { + return savePPM(s, img); + } -#endif // 0 +#if defined(HAVE_PNG) + if (strCaseDiff(extension, ".png") == 0) { + return savePNG(s, img, tags); + } +#endif + +#if defined(HAVE_FREEIMAGE) + FREE_IMAGE_FORMAT fif = FreeImage_GetFIFFromFilename(fileName); + if (fif != FIF_UNKNOWN && FreeImage_FIFSupportsWriting(fif)) { + return saveFreeImage(fif, s, img, tags); + } +#endif + + return false; +} + +bool nv::ImageIO::save(const char * fileName, const Image * img, const char ** tags/*=NULL*/) +{ + nvDebugCheck(fileName != NULL); + nvDebugCheck(img != NULL); + + StdOutputStream stream(fileName); + if (stream.isError()) + { + return false; + } + + return ImageIO::save(fileName, stream, img, tags); +} + +FloatImage * nv::ImageIO::loadFloat(const char * fileName) +{ + nvDebugCheck(fileName != NULL); + + StdInputStream stream(fileName); + + if (stream.isError()) { + return NULL; + } + + return loadFloat(fileName, stream); +} + +FloatImage * nv::ImageIO::loadFloat(const char * fileName, Stream & s) +{ + nvDebugCheck(fileName != NULL); + + const char * extension = Path::extension(fileName); + + /*if (strCaseDiff(extension, ".pfm") == 0) { + return loadFloatPFM(s); + }*/ + +#if defined(HAVE_TIFF) + #pragma NV_MESSAGE("TODO: Load TIFF from stream.") + if (strCaseDiff(extension, ".tif") == 0 || strCaseDiff(extension, ".tiff") == 0) { + return loadFloatTIFF(fileName, s); + } +#endif + +#if defined(HAVE_OPENEXR) + #pragma NV_MESSAGE("TODO: Load EXR from stream.") + if (strCaseDiff(extension, ".exr") == 0) { + return loadFloatEXR(fileName, s); + } +#endif + +#if defined(HAVE_FREEIMAGE) + FREE_IMAGE_FORMAT fif = FreeImage_GetFIFFromFilename(fileName); + if (fif != FIF_UNKNOWN && FreeImage_FIFSupportsReading(fif)) { + return loadFloatFreeImage(fif, s); + } +#endif + + if (strCaseDiff(extension, ".dds") == 0) { + const uint spos = s.tell(); // Save stream position. + FloatImage * floatImage = loadFloatDDS(s); + if (floatImage != NULL) return floatImage; + else s.seek(spos); + } + // Try to load as an RGBA8 image and convert to float. 
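The new ImageIO entry points above dispatch purely on the file-name extension: load() tries TGA and PSD natively, then JPEG, PNG, FreeImage and stb_image when those are compiled in, and returns a heap-allocated Image or NULL; save() always supports TGA and PPM. A minimal caller sketch under those declarations (convertToTga and the ImageIO.h include path are assumptions, not part of the patch):

#include "nvimage/ImageIO.h"
#include "nvimage/Image.h"
#include "nvcore/Ptr.h"

// Load any supported format and rewrite it as TGA.
static bool convertToTga(const char * inputName, const char * outputName)
{
    nv::AutoPtr<nv::Image> img(nv::ImageIO::load(inputName)); // NULL if unreadable or unsupported
    if (img == NULL) {
        return false;
    }
    // save() dispatches on the output extension; ".tga" is always available.
    return nv::ImageIO::save(outputName, img.ptr());
}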
+ AutoPtr img(load(fileName, s)); + if (img != NULL) { + return new FloatImage(img.ptr()); + } + + return NULL; +} + +bool nv::ImageIO::saveFloat(const char * fileName, Stream & s, const FloatImage * fimage, uint baseComponent, uint componentCount) +{ + if (componentCount == 0) { + componentCount = fimage->componentCount() - baseComponent; + } + if (baseComponent + componentCount < fimage->componentCount()) { + return false; + } + + const char * extension = Path::extension(fileName); + + if (strCaseDiff(extension, ".dds") == 0) { + return saveFloatDDS(s, fimage, baseComponent, componentCount); + } + + /*if (strCaseDiff(extension, ".pfm") == 0) { + return saveFloatPFM(s, fimage, baseComponent, componentCount); + }*/ + +#if defined(HAVE_FREEIMAGE) + FREE_IMAGE_FORMAT fif = FreeImage_GetFIFFromFilename(fileName); + if (fif != FIF_UNKNOWN && FreeImage_FIFSupportsWriting(fif)) { + return saveFloatFreeImage(fif, s, fimage, baseComponent, componentCount); + } +#endif + + // If everything else fails, save as LDR. + if (componentCount <= 4) + { + AutoPtr image(fimage->createImage(baseComponent, componentCount)); + nvCheck(image != NULL); + + if (componentCount == 1) + { + Color32 * c = image->pixels(); + const uint count = image->width() * image->height(); + for (uint i = 0; i < count; i++) + { + c[i].b = c[i].g = c[i].r; + } + } + + if (componentCount == 4) + { + image->setFormat(Image::Format_ARGB); + } + + return ImageIO::save(fileName, s, image.ptr()); + } + + return false; +} + +bool nv::ImageIO::saveFloat(const char * fileName, const FloatImage * fimage, uint baseComponent, uint componentCount) +{ + if (componentCount == 0) { + componentCount = fimage->componentCount() - baseComponent; + } + if (baseComponent + componentCount < fimage->componentCount()) { + return false; + } + + const char * extension = Path::extension(fileName); + +#if defined(HAVE_OPENEXR) + if (strCaseDiff(extension, ".exr") == 0) { + return saveFloatEXR(fileName, fimage, baseComponent, componentCount); + } +#endif + +#if defined(HAVE_TIFF) + if (strCaseDiff(extension, ".tif") == 0 || strCaseDiff(extension, ".tiff") == 0) { + return saveFloatTIFF(fileName, fimage, baseComponent, componentCount); + } +#endif + + StdOutputStream stream(fileName); + + if (stream.isError()) { + return false; + } + + return saveFloat(fileName, stream, fimage, baseComponent, componentCount); +} Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/KtxFile.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/KtxFile.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/KtxFile.h @@ -0,0 +1,102 @@ +// This code is in the public domain -- Ignacio Castaño + +#pragma once +#ifndef NV_IMAGE_KTXFILE_H +#define NV_IMAGE_KTXFILE_H + +#include "nvimage.h" +#include "nvcore/StrLib.h" + +// KTX File format specification: +// http://www.khronos.org/opengles/sdk/tools/KTX/file_format_spec/#key + +namespace nv +{ + class Stream; + + // GL types (Table 3.2) + const uint KTX_UNSIGNED_BYTE; + const uint KTX_UNSIGNED_SHORT_5_6_5; + // ... + + // GL formats (Table 3.3) + // ... + + // GL internal formats (Table 3.12, 3.13) + // ... + + // GL base internal format. (Table 3.11) + const uint KTX_RGB; + const uint KTX_RGBA; + const uint KTX_ALPHA; + // ... 
+ + + struct KtxHeader { + uint8 identifier[12]; + uint32 endianness; + uint32 glType; + uint32 glTypeSize; + uint32 glFormat; + uint32 glInternalFormat; + uint32 glBaseInternalFormat; + uint32 pixelWidth; + uint32 pixelHeight; + uint32 pixelDepth; + uint32 numberOfArrayElements; + uint32 numberOfFaces; + uint32 numberOfMipmapLevels; + uint32 bytesOfKeyValueData; + + KtxHeader(); + + }; + + NVIMAGE_API Stream & operator<< (Stream & s, DDSHeader & header); + + + struct KtxFile { + KtxFile(); + ~KtxFile(); + + void addKeyValue(const char * key, const char * value); + + private: + KtxHeader header; + + Array keyArray; + Array valueArray; + + }; + + NVIMAGE_API Stream & operator<< (Stream & s, KtxFile & file); + + + /* + for each keyValuePair that fits in bytesOfKeyValueData + UInt32 keyAndValueByteSize + Byte keyAndValue[keyAndValueByteSize] + Byte valuePadding[3 - ((keyAndValueByteSize + 3) % 4)] + end + + for each mipmap_level in numberOfMipmapLevels* + UInt32 imageSize; + for each array_element in numberOfArrayElements* + for each face in numberOfFaces + for each z_slice in pixelDepth* + for each row or row_of_blocks in pixelHeight* + for each pixel or block_of_pixels in pixelWidth + Byte data[format-specific-number-of-bytes]** + end + end + end + Byte cubePadding[0-3] + end + end + Byte mipPadding[3 - ((imageSize + 3) % 4)] + end + */ + +} // nv namespace + +#endif // NV_IMAGE_KTXFILE_H Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/KtxFile.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/KtxFile.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/KtxFile.cpp @@ -0,0 +1,83 @@ +// This code is in the public domain -- Ignacio Castaño + +#include "KtxFile.h" + +using namespace nv; + +static const uint8 fileIdentifier[12] = { + 0xAB, 0x4B, 0x54, 0x58, + 0x20, 0x31, 0x31, 0xBB, + 0x0D, 0x0A, 0x1A, 0x0A +}; + + +KtxHeader::KtxHeader() { + memcpy(identifier, fileIdentifier, 12); + + endianness = 0x04030201; + + glType = 0; + glTypeSize = 1; + glFormat = 0; + glInternalFormat = KTX_RGBA; + glBaseInternalFormat = KTX_RGBA; + pixelWidth = 0; + pixelHeight = 0; + pixelDepth = 0; + numberOfArrayElements = 0; + numberOfFaces = 1; + numberOfMipmapLevels = 0; + bytesOfKeyValueData = 0; +} + + +Stream & operator<< (Stream & s, DDSHeader & header) { + s.serialize(header.identifier, 12); + s << header.endiannes << header.glType << header.glTypeSize << header.glFormat << header.glInternalFormat << header.glBaseInternalFormat; + s << header.pixelWidth << header.pixelHeight << header.pixelDepth; + s << header.numberOfArrayElements << header.numberOfFaces << header.numberOfMipmapLevels; + s << header.bytesOfKeyValueData; + return s; +} + + +KtxFile::KtxFile() { +} +KtxFile::~KtxFile() { +} + +void KtxFile::addKeyValue(const char * key, const char * value) { + keyArray.append(key); + valueArray.append(value); + bytesOfKeyValueData += strlen(key) + 1 + strlen(value) + 1; +} + + +Stream & operator<< (Stream & s, KtxFile & file) { + s << header; + + if (s.isSaving()) { + + int keyValueCount = keyArray.count(); + for (int i = 0; i < keyValueCount; i++) { + const String & key = keyArray[i]; + const String & value = valueArray[i]; + uint keySize = key.length() + 1; + uint valueSize = value.length() + 1; + uint keyValueSize = keySize + valueSize; + + s << keyValueSize; + + s.serialize(key.str(), keySize); + s.serialize(value.str(), valueSize); + } + } + else { + // @@ Read key value pairs. 
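The layout comment above pads every KTX key/value entry to a 4-byte boundary: valuePadding[3 - ((keyAndValueByteSize + 3) % 4)]. A small sketch of how a writer would size one entry, assuming NUL-terminated strings as in KtxFile::addKeyValue (ktxKeyValueEntrySize is illustrative only):

#include <cstring>

// Bytes occupied by one KTX key/value pair: the uint32 size field, the
// NUL-terminated key and value, then padding to the next 4-byte boundary.
static unsigned ktxKeyValueEntrySize(const char * key, const char * value)
{
    const unsigned keyAndValueByteSize = unsigned(strlen(key) + 1 + strlen(value) + 1);
    const unsigned valuePadding = 3 - ((keyAndValueByteSize + 3) % 4);
    return 4 + keyAndValueByteSize + valuePadding;
}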
+ } + + return s; +} + + + Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/NormalMap.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/NormalMap.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/NormalMap.h @@ -21,12 +21,14 @@ // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // OTHER DEALINGS IN THE SOFTWARE. +#pragma once #ifndef NV_IMAGE_NORMALMAP_H #define NV_IMAGE_NORMALMAP_H -#include -#include -#include +#include "nvimage.h" +#include "FloatImage.h" + +#include "nvmath/Vector.h" namespace nv @@ -41,11 +43,13 @@ NormalMapFilter_Sobel9x9, // very large }; - FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter = NormalMapFilter_Sobel3x3); + // @@ These two functions should be deprecated: + NVIMAGE_API FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter = NormalMapFilter_Sobel3x3); + NVIMAGE_API FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights); - FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights); + NVIMAGE_API FloatImage * createNormalMap(const FloatImage * img, FloatImage::WrapMode wm, Vector4::Arg filterWeights); - void normalizeNormalMap(FloatImage * img); + NVIMAGE_API void normalizeNormalMap(FloatImage * img); // @@ Add generation of DU/DV maps. Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/NormalMap.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/NormalMap.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/NormalMap.cpp @@ -21,14 +21,17 @@ // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // OTHER DEALINGS IN THE SOFTWARE. -#include -#include -#include -#include +#include "NormalMap.h" +#include "Filter.h" +#include "FloatImage.h" +#include "Image.h" -#include +#include "nvmath/Color.inl" +#include "nvmath/Vector.h" -#include +#include "nvcore/Ptr.h" + +#include // memcpy using namespace nv; @@ -36,106 +39,170 @@ // Create normal map using the given kernels. static FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, const Kernel2 * kdu, const Kernel2 * kdv) { - nvCheck(kdu != NULL); - nvCheck(kdv != NULL); - nvCheck(img != NULL); - - const uint w = img->width(); - const uint h = img->height(); - - AutoPtr fimage(new FloatImage()); - fimage->allocate(4, w, h); - - // Compute height and store in alpha channel: - float * alphaChannel = fimage->channel(3); - for(uint i = 0; i < w*h; i++) - { - Vector4 color = toVector4(img->pixel(i)); - alphaChannel[i] = dot(color, heightWeights); - } - - float heightScale = 1.0f / 16.0f; // @@ Use a user defined factor. 
- - for(uint y = 0; y < h; y++) - { - for(uint x = 0; x < w; x++) - { - const float du = fimage->applyKernel(kdu, x, y, 3, wm); - const float dv = fimage->applyKernel(kdv, x, y, 3, wm); - - Vector3 n = normalize(Vector3(du, dv, heightScale)); - - fimage->setPixel(0.5f * n.x() + 0.5f, x, y, 0); - fimage->setPixel(0.5f * n.y() + 0.5f, x, y, 1); - fimage->setPixel(0.5f * n.z() + 0.5f, x, y, 2); - } - } - - return fimage.release(); + nvDebugCheck(kdu != NULL); + nvDebugCheck(kdv != NULL); + nvDebugCheck(img != NULL); + + const uint w = img->width(); + const uint h = img->height(); + + AutoPtr fimage(new FloatImage()); + fimage->allocate(4, w, h); + + // Compute height and store in alpha channel: + float * alphaChannel = fimage->channel(3); + for(uint i = 0; i < w * h; i++) + { + Vector4 color = toVector4(img->pixel(i)); + alphaChannel[i] = dot(color, heightWeights); + } + + float heightScale = 1.0f / 16.0f; // @@ Use a user defined factor. + + for(uint y = 0; y < h; y++) + { + for(uint x = 0; x < w; x++) + { + const float du = fimage->applyKernelXY(kdu, x, y, 0, 3, wm); + const float dv = fimage->applyKernelXY(kdv, x, y, 0, 3, wm); + + Vector3 n = normalize(Vector3(du, dv, heightScale)); + + fimage->pixel(0, x, y, 0) = 0.5f * n.x + 0.5f; + fimage->pixel(1, x, y, 0) = 0.5f * n.y + 0.5f; + fimage->pixel(2, x, y, 0) = 0.5f * n.z + 0.5f; + } + } + + return fimage.release(); +} + + +// Create normal map using the given kernels. +static FloatImage * createNormalMap(const FloatImage * img, FloatImage::WrapMode wm, const Kernel2 * kdu, const Kernel2 * kdv) +{ + nvDebugCheck(kdu != NULL); + nvDebugCheck(kdv != NULL); + nvDebugCheck(img != NULL); + +#pragma NV_MESSAGE("FIXME: Height scale parameter should go away. It should be a sensible value that produces good results when the heightmap is in the [0, 1] range.") + const float heightScale = 1.0f / 16.0f; + + const uint w = img->width(); + const uint h = img->height(); + + AutoPtr img_out(new FloatImage()); + img_out->allocate(4, w, h); + + for (uint y = 0; y < h; y++) + { + for (uint x = 0; x < w; x++) + { + const float du = img->applyKernelXY(kdu, x, y, 0, 3, wm); + const float dv = img->applyKernelXY(kdv, x, y, 0, 3, wm); + + Vector3 n = normalize(Vector3(du, dv, heightScale)); + + img_out->pixel(0, x, y, 0) = n.x; + img_out->pixel(1, x, y, 0) = n.y; + img_out->pixel(2, x, y, 0) = n.z; + } + } + + // Copy alpha channel. + /*for (uint y = 0; y < h; y++) + { + for (uint x = 0; x < w; x++) + { + + img_out->pixel(3, x, y, 0) = img->pixel(3, x, y, 0); + } + }*/ + memcpy(img_out->channel(3), img->channel(3), w * h * sizeof(float)); + + return img_out.release(); } /// Create normal map using the given filter. FloatImage * nv::createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter /*= Sobel3x3*/) { - nvCheck(img != NULL); - - // Init the kernels. - Kernel2 * kdu = NULL; - Kernel2 * kdv = NULL; - - switch(filter) - { - case NormalMapFilter_Sobel3x3: - kdu = new Kernel2(3); - break; - case NormalMapFilter_Sobel5x5: - kdu = new Kernel2(5); - break; - case NormalMapFilter_Sobel7x7: - kdu = new Kernel2(7); - break; - case NormalMapFilter_Sobel9x9: - kdu = new Kernel2(9); - break; - default: - nvDebugCheck(false); - }; + nvDebugCheck(img != NULL); + + // Init the kernels. 
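Both createNormalMap overloads above build the tangent-space normal from the filtered height derivatives, n = normalize(du, dv, heightScale), and the Image-based path then remaps each component from [-1, 1] to [0, 1] for 8-bit storage. A standalone sketch of that step (packNormal is illustrative, not a library function):

#include <cmath>

// Turn height derivatives (du, dv) into a packed [0,1] normal, as the
// Image-based createNormalMap above does with heightScale = 1/16.
static void packNormal(float du, float dv, float heightScale,
                       float & r, float & g, float & b)
{
    const float len = std::sqrt(du * du + dv * dv + heightScale * heightScale);
    r = 0.5f * (du / len) + 0.5f;
    g = 0.5f * (dv / len) + 0.5f;
    b = 0.5f * (heightScale / len) + 0.5f;
}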
+ Kernel2 * kdu = NULL; + Kernel2 * kdv = NULL; + + switch(filter) + { + case NormalMapFilter_Sobel3x3: + kdu = new Kernel2(3); + break; + case NormalMapFilter_Sobel5x5: + kdu = new Kernel2(5); + break; + case NormalMapFilter_Sobel7x7: + kdu = new Kernel2(7); + break; + case NormalMapFilter_Sobel9x9: + kdu = new Kernel2(9); + break; + default: + nvDebugCheck(false); + }; - kdu->initSobel(); - kdu->normalize(); + kdu->initSobel(); + kdu->normalize(); - kdv = new Kernel2(*kdu); - kdv->transpose(); + kdv = new Kernel2(*kdu); + kdv->transpose(); - return ::createNormalMap(img, wm, heightWeights, kdu, kdv); + return ::createNormalMap(img, wm, heightWeights, kdu, kdv); } /// Create normal map combining multiple sobel filters. FloatImage * nv::createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights) { - nvCheck(img != NULL); + nvDebugCheck(img != NULL); + + Kernel2 * kdu = NULL; + Kernel2 * kdv = NULL; + + kdu = new Kernel2(9); + kdu->initBlendedSobel(filterWeights); + kdu->normalize(); - Kernel2 * kdu = NULL; - Kernel2 * kdv = NULL; + kdv = new Kernel2(*kdu); + kdv->transpose(); - kdu = new Kernel2(9); - kdu->initBlendedSobel(filterWeights); - kdu->normalize(); - - kdv = new Kernel2(*kdu); - kdv->transpose(); - - return ::createNormalMap(img, wm, heightWeights, kdu, kdv); + return ::createNormalMap(img, wm, heightWeights, kdu, kdv); } + +FloatImage * nv::createNormalMap(const FloatImage * img, FloatImage::WrapMode wm, Vector4::Arg filterWeights) +{ + nvDebugCheck(img != NULL); + + Kernel2 * kdu = NULL; + Kernel2 * kdv = NULL; + + kdu = new Kernel2(9); + kdu->initBlendedSobel(filterWeights); + kdu->normalize(); + + kdv = new Kernel2(*kdu); + kdv->transpose(); + + return ::createNormalMap(img, wm, kdu, kdv); +} + + /// Normalize the given image in place. 
void nv::normalizeNormalMap(FloatImage * img) { - nvCheck(img != NULL); - img->expandNormals(0); - img->normalize(0); - img->packNormals(0); + nvDebugCheck(img != NULL); + + img->normalize(0); } Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/NormalMipmap.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/NormalMipmap.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/NormalMipmap.h @@ -1,17 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#ifndef NV_IMAGE_NORMALMIPMAP_H -#define NV_IMAGE_NORMALMIPMAP_H - -#include - - -namespace nv -{ - class FloatImage; - - FloatImage * createNormalMipmapMap(const FloatImage * img); - -} // nv namespace - -#endif // NV_IMAGE_NORMALMIPMAP_H Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/NormalMipmap.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/NormalMipmap.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/NormalMipmap.cpp @@ -1,98 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#include - -#include -#include - -#include -#include - -using namespace nv; - -FloatImage * nv::createNormalMipmapMap(const FloatImage * img) -{ - nvDebugCheck(img != NULL); - - uint w = img->width(); - uint h = img->height(); - - uint hw = w / 2; - uint hh = h / 2; - - FloatImage dotImg; - dotImg.allocate(1, w, h); - - FloatImage shImg; - shImg.allocate(9, hw, hh); - - SampleDistribution distribution(256); - const uint sampleCount = distribution.sampleCount(); - - for (uint d = 0; d < sampleCount; d++) - { - const float * xChannel = img->channel(0); - const float * yChannel = img->channel(1); - const float * zChannel = img->channel(2); - - Vector3 dir = distribution.sampleDir(d); - - Sh2 basis; - basis.eval(dir); - - for(uint i = 0; i < w*h; i++) - { - Vector3 normal(xChannel[i], yChannel[i], zChannel[i]); - normal = normalizeSafe(normal, Vector3(zero), 0.0f); - - dotImg.setPixel(dot(dir, normal), d); - } - - // @@ It would be nice to have a fastDownSample that took an existing image as an argument, to avoid allocations. - AutoPtr dotMip(dotImg.fastDownSample()); - - for(uint p = 0; p < hw*hh; p++) - { - float f = dotMip->pixel(p); - - // Project irradiance to sh basis and accumulate. - for (uint i = 0; i < 9; i++) - { - float & sum = shImg.channel(i)[p]; - sum += f * basis.elemAt(i); - } - } - } - - - - FloatImage * normalMipmap = new FloatImage; - normalMipmap->allocate(4, hw, hh); - - // Precompute the clamped cosine radiance transfer. - Sh2 prt; - prt.cosineTransfer(); - - // Allocate outside the loop. - Sh2 sh; - - for(uint p = 0; p < hw*hh; p++) - { - for (uint i = 0; i < 9; i++) - { - sh.elemAt(i) = shImg.channel(i)[p]; - } - - // Convolve sh irradiance by radiance transfer. - sh *= prt; - - // Now sh(0) is the ambient occlusion. - // and sh(1) is the normal direction. - - // Should we use SVD to fit only the normals to the SH? - - } - - return normalMipmap; -} - Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/PixelFormat.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/PixelFormat.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/PixelFormat.h @@ -21,60 +21,96 @@ // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // OTHER DEALINGS IN THE SOFTWARE. 
+#pragma once #ifndef NV_IMAGE_PIXELFORMAT_H #define NV_IMAGE_PIXELFORMAT_H -#include +#include "nvimage.h" namespace nv { - namespace PixelFormat - { + namespace PixelFormat + { - // Convert component @a c having @a inbits to the returned value having @a outbits. - inline uint convert(uint c, uint inbits, uint outbits) - { - if (inbits == 0) - { - return 0; - } - else if (inbits >= outbits) - { - // truncate - return c >> (inbits - outbits); - } - else - { - // bitexpand - return (c << (outbits - inbits)) | convert(c, inbits, outbits - inbits); - } - } - - // Get pixel component shift and size given its mask. - inline void maskShiftAndSize(uint mask, uint * shift, uint * size) - { - if (!mask) - { - *shift = 0; - *size = 0; - return; - } - - *shift = 0; - while((mask & 1) == 0) { - ++(*shift); - mask >>= 1; - } - - *size = 0; - while((mask & 1) == 1) { - ++(*size); - mask >>= 1; - } - } + // Convert component @a c having @a inbits to the returned value having @a outbits. + inline uint convert(uint c, uint inbits, uint outbits) + { + if (inbits == 0) + { + return 0; + } + else if (inbits >= outbits) + { + // truncate + return c >> (inbits - outbits); + } + else + { + // bitexpand + return (c << (outbits - inbits)) | convert(c, inbits, outbits - inbits); + } + } + + // Get pixel component shift and size given its mask. + inline void maskShiftAndSize(uint mask, uint * shift, uint * size) + { + if (!mask) + { + *shift = 0; + *size = 0; + return; + } + + *shift = 0; + while((mask & 1) == 0) { + ++(*shift); + mask >>= 1; + } + + *size = 0; + while((mask & 1) == 1) { + ++(*size); + mask >>= 1; + } + } + + inline float quantizeCeil(float f, int inbits, int outbits) + { + nvDebugCheck(f >= 0.0f && f <= 1.0f); + //uint i = f * (float(1 << inbits) - 1); + //i = convert(i, inbits, outbits); + //float result = float(i) / (float(1 << outbits) - 1); + //nvCheck(result >= f); + float result; + int offset = 0; + do { + uint i = offset + uint(f * (float(1 << inbits) - 1)); + i = convert(i, inbits, outbits); + result = float(i) / (float(1 << outbits) - 1); + offset++; + } while (result < f); + + return result; + } + + /* + inline float quantizeRound(float f, int bits) + { + nvDebugCheck(f >= 0.0f && f <= 1.0f); + float scale = float(1 << bits); + return fround(f * scale) / scale; + } + + inline float quantizeFloor(float f, int bits) + { + nvDebugCheck(f >= 0.0f && f <= 1.0f); + float scale = float(1 << bits); + return floor(f * scale) / scale; + } + */ - } // PixelFormat namespace + } // PixelFormat namespace } // nv namespace Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/PsdFile.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/PsdFile.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/PsdFile.h @@ -1,69 +1,70 @@ // This code is in the public domain -- castanyo@yahoo.es +#pragma once #ifndef NV_IMAGE_PSDFILE_H #define NV_IMAGE_PSDFILE_H -#include +#include "nvcore/Stream.h" namespace nv { - enum PsdColorMode - { - PsdColorMode_Bitmap = 0, - PsdColorMode_GrayScale = 1, - PsdColorMode_Indexed = 2, - PsdColorMode_RGB = 3, - PsdColorMode_CMYK = 4, - PsdColorMode_MultiChannel = 7, - PsdColorMode_DuoTone = 8, - PsdColorMode_LabColor = 9 - }; - - /// PSD header. 
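PixelFormat::convert above either truncates (inbits >= outbits) or bit-expands by recursively replicating the high bits, so 0 stays 0 and the largest input maps to the largest output; quantizeCeil then searches for the smallest representable value not below f. A quick worked check of convert() (convertExamples is illustrative only):

#include "nvimage/PixelFormat.h"
#include <cassert>

static void convertExamples()
{
    using nv::PixelFormat::convert;
    assert(convert(0x1F, 5, 8) == 0xFF); // 5-bit max bit-expands to 8-bit max
    assert(convert(0x10, 5, 8) == 0x84); // 10000 -> 10000100, high bits replicated
    assert(convert(0xFF, 8, 5) == 0x1F); // truncation keeps the top 5 bits
}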
- struct PsdHeader - { - uint32 signature; - uint16 version; - uint8 reserved[6]; - uint16 channel_count; - uint32 height; - uint32 width; - uint16 depth; - uint16 color_mode; - - bool isValid() const - { - return signature == 0x38425053; // '8BPS' - } - - bool isSupported() const - { - if (version != 1) { - nvDebug("*** bad version number %u\n", version); - return false; - } - if (channel_count > 4) { - return false; - } - if (depth != 8) { - return false; - } - if (color_mode != PsdColorMode_RGB) { - return false; - } - return true; - } - }; - - - inline Stream & operator<< (Stream & s, PsdHeader & head) - { - s << head.signature << head.version; - for (int i = 0; i < 6; i++) { - s << head.reserved[i]; - } - return s << head.channel_count << head.height << head.width << head.depth << head.color_mode; - } + enum PsdColorMode + { + PsdColorMode_Bitmap = 0, + PsdColorMode_GrayScale = 1, + PsdColorMode_Indexed = 2, + PsdColorMode_RGB = 3, + PsdColorMode_CMYK = 4, + PsdColorMode_MultiChannel = 7, + PsdColorMode_DuoTone = 8, + PsdColorMode_LabColor = 9 + }; + + /// PSD header. + struct PsdHeader + { + uint32 signature; + uint16 version; + uint8 reserved[6]; + uint16 channel_count; + uint32 height; + uint32 width; + uint16 depth; + uint16 color_mode; + + bool isValid() const + { + return signature == 0x38425053; // '8BPS' + } + + bool isSupported() const + { + if (version != 1) { + nvDebug("*** bad version number %u\n", version); + return false; + } + if (channel_count > 4) { + return false; + } + if (depth != 8) { // @@ Add support for 16 bit depths. + return false; + } + if (color_mode != PsdColorMode_RGB) { + return false; + } + return true; + } + }; + + + inline Stream & operator<< (Stream & s, PsdHeader & head) + { + s << head.signature << head.version; + for (int i = 0; i < 6; i++) { + s << head.reserved[i]; + } + return s << head.channel_count << head.height << head.width << head.depth << head.color_mode; + } } // nv namespace Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/Quantize.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/Quantize.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/Quantize.h @@ -1,9 +1,10 @@ // This code is in the public domain -- castanyo@yahoo.es +#pragma once #ifndef NV_IMAGE_QUANTIZE_H #define NV_IMAGE_QUANTIZE_H -#include +#include "nvimage.h" namespace nv Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/Quantize.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/Quantize.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/Quantize.cpp @@ -12,13 +12,16 @@ @@ This code needs to be reviewed, I'm not sure it's correct. */ -#include -#include -#include +#include "Quantize.h" +#include "Image.h" +#include "PixelFormat.h" -#include +#include "nvmath/Color.h" +#include "nvmath/Vector.inl" -#include // swap +#include "nvcore/Utils.h" // swap + +#include // memset using namespace nv; @@ -82,8 +85,8 @@ memset(row0, 0, sizeof(float)*(w+2)); memset(row1, 0, sizeof(float)*(w+2)); - for(uint y = 0; y < h; y++) { - for(uint x = 0; x < w; x++) { + for (uint y = 0; y < h; y++) { + for (uint x = 0; x < w; x++) { Color32 pixel = image->pixel(x, y); @@ -91,7 +94,7 @@ int alpha = int(pixel.a) + int(row0[1+x]); // Convert color. - if( alpha > alpha_threshold ) pixel.a = 255; + if (alpha > alpha_threshold) pixel.a = 255; else pixel.a = 0; // Store color. 
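The alpha loop above thresholds each pixel to 0 or 255 while carrying the quantization error forward in row0/row1, so large translucent regions keep roughly the right average coverage after binarization. A stripped-down 1-D version of the same error-diffusion idea (binarizeAlphaRow is illustrative only):

#include <vector>

// Threshold alpha to 0/255 while propagating the error into the next pixel,
// a simplified form of the row0/row1 diffusion used above.
static void binarizeAlphaRow(std::vector<int> & alpha, int threshold)
{
    float error = 0.0f;
    for (size_t i = 0; i < alpha.size(); i++) {
        const float a = float(alpha[i]) + error;
        const int out = (a > float(threshold)) ? 255 : 0;
        error = a - float(out); // carry the difference forward
        alpha[i] = out;
    }
}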
@@ -174,10 +177,10 @@ Color32 pixel = image->pixel(x, y); // Add error. - pixel.r = clamp(int(pixel.r) + int(row0[1+x].x()), 0, 255); - pixel.g = clamp(int(pixel.g) + int(row0[1+x].y()), 0, 255); - pixel.b = clamp(int(pixel.b) + int(row0[1+x].z()), 0, 255); - pixel.a = clamp(int(pixel.a) + int(row0[1+x].w()), 0, 255); + pixel.r = clamp(int(pixel.r) + int(row0[1+x].x), 0, 255); + pixel.g = clamp(int(pixel.g) + int(row0[1+x].y), 0, 255); + pixel.b = clamp(int(pixel.b) + int(row0[1+x].z), 0, 255); + pixel.a = clamp(int(pixel.a) + int(row0[1+x].w), 0, 255); int r = pixel.r; int g = pixel.g; Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/TgaFile.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/TgaFile.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/TgaFile.h @@ -1,9 +1,10 @@ // This code is in the public domain -- castanyo@yahoo.es +#pragma once #ifndef NV_IMAGE_TGAFILE_H #define NV_IMAGE_TGAFILE_H -#include +#include "nvcore/Stream.h" namespace nv { Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/ValveTextureFormat.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/ValveTextureFormat.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/ValveTextureFormat.h @@ -0,0 +1,122 @@ + +/* +For more info: +http://developer.valvesoftware.com/wiki/VTF + +File Layout: + VTF Header + VTF Low Resolution Image Data + For Each Mipmap (Smallest to Largest) + For Each Frame (First to Last) + For Each Face (First to Last) + For Each Z Slice (Min to Max; Varies with Mipmap) + VTF High Resolution Image Data + + +*/ + + +enum +{ + IMAGE_FORMAT_NONE = -1, + IMAGE_FORMAT_RGBA8888 = 0, + IMAGE_FORMAT_ABGR8888, + IMAGE_FORMAT_RGB888, + IMAGE_FORMAT_BGR888, + IMAGE_FORMAT_RGB565, + IMAGE_FORMAT_I8, + IMAGE_FORMAT_IA88, + IMAGE_FORMAT_P8, + IMAGE_FORMAT_A8, + IMAGE_FORMAT_RGB888_BLUESCREEN, + IMAGE_FORMAT_BGR888_BLUESCREEN, + IMAGE_FORMAT_ARGB8888, + IMAGE_FORMAT_BGRA8888, + IMAGE_FORMAT_DXT1, + IMAGE_FORMAT_DXT3, + IMAGE_FORMAT_DXT5, + IMAGE_FORMAT_BGRX8888, + IMAGE_FORMAT_BGR565, + IMAGE_FORMAT_BGRX5551, + IMAGE_FORMAT_BGRA4444, + IMAGE_FORMAT_DXT1_ONEBITALPHA, + IMAGE_FORMAT_BGRA5551, + IMAGE_FORMAT_UV88, + IMAGE_FORMAT_UVWQ8888, + IMAGE_FORMAT_RGBA16161616F, + IMAGE_FORMAT_RGBA16161616, + IMAGE_FORMAT_UVLX8888, + IMAGE_FORMAT_R32F, //!< = Luminance - 32 bpp + IMAGE_FORMAT_RGB323232F, //!< = Red, Green, Blue - 96 bpp + IMAGE_FORMAT_RGBA32323232F, //!< = Red, Green, Blue, Alpha - 128 bpp + IMAGE_FORMAT_NV_DST16, + IMAGE_FORMAT_NV_DST24, + IMAGE_FORMAT_NV_INTZ, + IMAGE_FORMAT_NV_RAWZ, + IMAGE_FORMAT_ATI_DST16, + IMAGE_FORMAT_ATI_DST24, + IMAGE_FORMAT_NV_NULL, + IMAGE_FORMAT_ATI2N, + IMAGE_FORMAT_ATI1N, +}; + + +enum +{ + TEXTUREFLAGS_POINTSAMPLE = 0x00000001, + TEXTUREFLAGS_TRILINEAR = 0x00000002, + TEXTUREFLAGS_CLAMPS = 0x00000004, + TEXTUREFLAGS_CLAMPT = 0x00000008, + TEXTUREFLAGS_ANISOTROPIC = 0x00000010, + TEXTUREFLAGS_HINT_DXT5 = 0x00000020, + TEXTUREFLAGS_NOCOMPRESS = 0x00000040, + TEXTUREFLAGS_NORMAL = 0x00000080, + TEXTUREFLAGS_NOMIP = 0x00000100, + TEXTUREFLAGS_NOLOD = 0x00000200, + TEXTUREFLAGS_MINMIP = 0x00000400, + TEXTUREFLAGS_PROCEDURAL = 0x00000800, + TEXTUREFLAGS_ONEBITALPHA = 0x00001000, + TEXTUREFLAGS_EIGHTBITALPHA = 0x00002000, + TEXTUREFLAGS_ENVMAP = 0x00004000, + TEXTUREFLAGS_RENDERTARGET = 0x00008000, + TEXTUREFLAGS_DEPTHRENDERTARGET = 0x00010000, + TEXTUREFLAGS_NODEBUGOVERRIDE = 0x00020000, + TEXTUREFLAGS_SINGLECOPY = 0x00040000, + 
TEXTUREFLAGS_ONEOVERMIPLEVELINALPHA = 0x00080000, + TEXTUREFLAGS_PREMULTCOLORBYONEOVERMIPLEVEL = 0x00100000, + TEXTUREFLAGS_NORMALTODUDV = 0x00200000, + TEXTUREFLAGS_ALPHATESTMIPGENERATION = 0x00400000, + TEXTUREFLAGS_NODEPTHBUFFER = 0x00800000, + TEXTUREFLAGS_NICEFILTERED = 0x01000000, + TEXTUREFLAGS_CLAMPU = 0x02000000 +}; + + +struct VtfHeader +{ + char signature[4]; // File signature ("VTF\0"). + uint32 version[2]; // version[0].version[1] (currently 7.2). + uint32 headerSize; // Size of the header struct (16 byte aligned; currently 80 bytes). + + // 7.0 + uint16 width; // Width of the largest mipmap in pixels. Must be a power of 2. + uint16 height; // Height of the largest mipmap in pixels. Must be a power of 2. + uint32 flags; // VTF flags. + uint16 frames; // Number of frames, if animated (1 for no animation). + uint16 firstFrame; // First frame in animation (0 based). + uint8 padding0[4]; // reflectivity padding (16 byte alignment). + float reflectivity[3]; // reflectivity vector. + uint8 padding1[4]; // reflectivity padding (8 byte packing). + float bumpmapScale; // Bumpmap scale. + uint32 highResImageFormat; // High resolution image format. + uint8 mipmapCount; // Number of mipmaps. + uint32 lowResImageFormat; // Low resolution image format (always DXT1). + uint8 lowResImageWidth; // Low resolution image width. + uint8 lowResImageHeight; // Low resolution image height. + + // 7.2 + uint16 depth; // Depth of the largest mipmap in pixels. + // Must be a power of 2. Can be 0 or 1 for a 2D texture (v7.2 only). +}; + + Index: ps/trunk/libraries/source/nvtt/src/src/nvimage/nvimage.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvimage/nvimage.h +++ ps/trunk/libraries/source/nvtt/src/src/nvimage/nvimage.h @@ -1,9 +1,12 @@ // This code is in the public domain -- castanyo@yahoo.es +#pragma once #ifndef NV_IMAGE_H #define NV_IMAGE_H -#include +#include "nvcore/nvcore.h" +#include "nvcore/Debug.h" // nvDebugCheck +#include "nvcore/Utils.h" // isPowerOfTwo // Function linkage #if NVIMAGE_SHARED @@ -19,4 +22,27 @@ #define NVIMAGE_CLASS #endif + +namespace nv { + + // Some utility functions: + + inline uint computeBitPitch(uint w, uint bitsize, uint alignmentInBits) + { + nvDebugCheck(isPowerOfTwo(alignmentInBits)); + + return ((w * bitsize + alignmentInBits - 1) / alignmentInBits) * alignmentInBits; + } + + inline uint computeBytePitch(uint w, uint bitsize, uint alignmentInBytes) + { + uint pitch = computeBitPitch(w, bitsize, 8*alignmentInBytes); + nvDebugCheck((pitch & 7) == 0); + + return (pitch + 7) / 8; + } + + +} // nv namespace + #endif // NV_IMAGE_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Basis.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Basis.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Basis.h @@ -1,78 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#ifndef NV_MATH_BASIS_H -#define NV_MATH_BASIS_H - -#include -#include -#include - -namespace nv -{ - - /// Basis class to compute tangent space basis, ortogonalizations and to - /// transform vectors from one space to another. - struct Basis - { - /// Create a null basis. - Basis() : tangent(0, 0, 0), bitangent(0, 0, 0), normal(0, 0, 0) {} - - /// Create a basis given three vectors. - Basis(Vector3::Arg n, Vector3::Arg t, Vector3::Arg b) : tangent(t), bitangent(b), normal(n) {} - - /// Create a basis with the given tangent vectors and the handness. 
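computeBitPitch and computeBytePitch, added to nvimage.h above, round a row of w pixels of bitsize bits up to the requested alignment (which must be a power of two). For instance, a 10-pixel RGB8 row is 30 bytes unpadded but 32 bytes with 4-byte row alignment. A small check of those helpers (pitchExamples is illustrative only):

#include "nvimage/nvimage.h"
#include <cassert>

static void pitchExamples()
{
    // 10 RGB8 pixels (24 bpp): 30 bytes unpadded, 32 bytes with 4-byte alignment.
    assert(nv::computeBytePitch(10, 24, 4) == 32);

    // Rows that are already aligned are unchanged: 16 RGBA8 pixels.
    assert(nv::computeBytePitch(16, 32, 4) == 64);
}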
- Basis(Vector3::Arg n, Vector3::Arg t, float sign) - { - build(n, t, sign); - } - - NVMATH_API void normalize(float epsilon = NV_EPSILON); - NVMATH_API void orthonormalize(float epsilon = NV_EPSILON); - NVMATH_API void robustOrthonormalize(float epsilon = NV_EPSILON); - NVMATH_API void buildFrameForDirection(Vector3::Arg d); - - /// Calculate the determinant [ F G N ] to obtain the handness of the basis. - float handness() const - { - return determinant() > 0.0f ? 1.0f : -1.0f; - } - - /// Build a basis from 2 vectors and a handness flag. - void build(Vector3::Arg n, Vector3::Arg t, float sign) - { - normal = n; - tangent = t; - bitangent = sign * cross(t, n); - } - - /// Compute the determinant of this basis. - float determinant() const - { - return - tangent.x() * bitangent.y() * normal.z() - tangent.z() * bitangent.y() * normal.x() + - tangent.y() * bitangent.z() * normal.x() - tangent.y() * bitangent.x() * normal.z() + - tangent.z() * bitangent.x() * normal.y() - tangent.x() * bitangent.z() * normal.y(); - } - - /* - // Get transform matrix for this basis. - NVMATH_API Matrix matrix() const; - - // Transform by this basis. (From this basis to object space). - NVMATH_API Vector3 transform(Vector3::Arg v) const; - - // Transform by the transpose. (From object space to this basis). - NVMATH_API Vector3 transformT(Vector3::Arg v); - - // Transform by the inverse. (From object space to this basis). - NVMATH_API Vector3 transformI(Vector3::Arg v) const; - */ - - Vector3 tangent; - Vector3 bitangent; - Vector3 normal; - }; - -} // nv namespace - -#endif // NV_MATH_BASIS_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Basis.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Basis.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Basis.cpp @@ -1,173 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#include - -using namespace nv; - - -/// Normalize basis vectors. -void Basis::normalize(float epsilon /*= NV_EPSILON*/) -{ - normal = ::normalize(normal, epsilon); - tangent = ::normalize(tangent, epsilon); - bitangent = ::normalize(bitangent, epsilon); -} - - -/// Gram-Schmidt orthogonalization. -/// @note Works only if the vectors are close to orthogonal. -void Basis::orthonormalize(float epsilon /*= NV_EPSILON*/) -{ - // N' = |N| - // T' = |T - (N' dot T) N'| - // B' = |B - (N' dot B) N' - (T' dot B) T'| - - normal = ::normalize(normal, epsilon); - - tangent -= normal * dot(normal, tangent); - tangent = ::normalize(tangent, epsilon); - - bitangent -= normal * dot(normal, bitangent); - bitangent -= tangent * dot(tangent, bitangent); - bitangent = ::normalize(bitangent, epsilon); -} - - -/// Robust orthonormalization. -/// Returns an orthonormal basis even when the original is degenerate. 
-void Basis::robustOrthonormalize(float epsilon /*= NV_EPSILON*/) -{ - if (length(normal) < epsilon) - { - normal = cross(tangent, bitangent); - - if (length(normal) < epsilon) - { - tangent = Vector3(1, 0, 0); - bitangent = Vector3(0, 1, 0); - normal = Vector3(0, 0, 1); - return; - } - } - normal = ::normalize(normal, epsilon); - - tangent -= normal * dot(normal, tangent); - bitangent -= normal * dot(normal, bitangent); - - if (length(tangent) < epsilon) - { - if (length(bitangent) < epsilon) - { - buildFrameForDirection(normal); - } - else - { - tangent = cross(bitangent, normal); - nvCheck(isNormalized(tangent, epsilon)); - } - } - else - { - tangent = ::normalize(tangent, epsilon); - bitangent -= tangent * dot(tangent, bitangent); - - if (length(bitangent) < epsilon) - { - bitangent = cross(tangent, normal); - nvCheck(isNormalized(bitangent)); - } - else - { - tangent = ::normalize(tangent, epsilon); - } - } - - // Check vector lengths. - nvCheck(isNormalized(normal, epsilon)); - nvCheck(isNormalized(tangent, epsilon)); - nvCheck(isNormalized(bitangent, epsilon)); - - // Check vector angles. - nvCheck(equal(dot(normal, tangent), 0.0f, epsilon)); - nvCheck(equal(dot(normal, bitangent), 0.0f, epsilon)); - nvCheck(equal(dot(tangent, bitangent), 0.0f, epsilon)); - - // Check vector orientation. - const float det = dot(cross(normal, tangent), bitangent); - nvCheck(equal(det, 1.0f, epsilon) || equal(det, -1.0f, epsilon)); -} - - -/// Build an arbitrary frame for the given direction. -void Basis::buildFrameForDirection(Vector3::Arg d) -{ - nvCheck(isNormalized(d)); - normal = d; - - // Choose minimum axis. - if (fabsf(normal.x()) < fabsf(normal.y()) && fabsf(normal.x()) < fabsf(normal.z())) - { - tangent = Vector3(1, 0, 0); - } - else if (fabsf(normal.y()) < fabsf(normal.z())) - { - tangent = Vector3(0, 1, 0); - } - else - { - tangent = Vector3(0, 0, 1); - } - - // Ortogonalize - tangent -= normal * dot(normal, tangent); - tangent = ::normalize(tangent); - - bitangent = cross(normal, tangent); -} - - - -/* -/// Transform by this basis. (From this basis to object space). -Vector3 Basis::transform(Vector3::Arg v) const -{ - Vector3 o = tangent * v.x(); - o += bitangent * v.y(); - o += normal * v.z(); - return o; -} - -/// Transform by the transpose. (From object space to this basis). -Vector3 Basis::transformT(Vector3::Arg v) -{ - return Vector3(dot(tangent, v), dot(bitangent, v), dot(normal, v)); -} - -/// Transform by the inverse. (From object space to this basis). -/// @note Uses Kramer's rule so the inverse is not accurate if the basis is ill-conditioned. -Vector3 Basis::transformI(Vector3::Arg v) const -{ - const float det = determinant(); - nvCheck(!equalf(det, 0.0f)); - - const float idet = 1.0f / det; - - // Rows of the inverse matrix. 
- Vector3 r0, r1, r2; - r0.x = (bitangent.y() * normal.z() - bitangent.z() * normal.y()) * idet; - r0.y = -(bitangent.x() * normal.z() - bitangent.z() * normal.x()) * idet; - r0.z = (bitangent.x() * normal.y() - bitangent.y() * normal.x()) * idet; - - r1.x = -(tangent.y() * normal.z() - tangent.z() * normal.y()) * idet; - r1.y = (tangent.x() * normal.z() - tangent.z() * normal.x()) * idet; - r1.z = -(tangent.x() * normal.y() - tangent.y() * normal.x()) * idet; - - r2.x = (tangent.y() * bitangent.z() - tangent.z() * bitangent.y()) * idet; - r2.y = -(tangent.x() * bitangent.z() - tangent.z() * bitangent.x()) * idet; - r2.z = (tangent.x() * bitangent.y() - tangent.y() * bitangent.x()) * idet; - - return Vector3(dot(v, r0), dot(v, r1), dot(v, r2)); -} -*/ - - Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Box.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Box.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Box.h @@ -1,138 +1,101 @@ // This code is in the public domain -- castanyo@yahoo.es +#pragma once #ifndef NV_MATH_BOX_H #define NV_MATH_BOX_H -#include +#include "Vector.h" #include // FLT_MAX namespace nv { + class Vector; + class Stream; + class Sphere; + + // Axis Aligned Bounding Box. + class Box + { + public: + + inline Box() {} + inline Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) {} + inline Box(const Vector3 & mins, const Vector3 & maxs) : minCorner(mins), maxCorner(maxs) {} + + Box & operator=(const Box & b); + + operator const float * () const { return reinterpret_cast(this); } + + // Clear the bounds. + void clearBounds(); + + // min < max + bool isValid() const; + + // Build a cube centered on center and with edge = 2*dist + void cube(const Vector3 & center, float dist); + + // Build a box, given center and extents. + void setCenterExtents(const Vector3 & center, const Vector3 & extents); + + // Get box center. + Vector3 center() const; + + // Return extents of the box. + Vector3 extents() const; + + // Return extents of the box. + float extents(uint axis) const; + + // Add a point to this box. + void addPointToBounds(const Vector3 & p); + + // Add a box to this box. + void addBoxToBounds(const Box & b); + + // Add sphere to this box. + void addSphereToBounds(const Vector3 & p, float r); + + // Translate box. + void translate(const Vector3 & v); + + // Scale the box. + void scale(float s); + + // Expand the box by a fixed amount. + void expand(float r); + + // Get the area of the box. + float area() const; + + // Get the volume of the box. + float volume() const; + + // Return true if the box contains the given point. + bool contains(const Vector3 & p) const; + + // Split the given box in 8 octants and assign the ith one to this box. + void setOctant(const Box & box, const Vector3 & center, int i); + + + // Clip the given segment against this box. + bool clipSegment(const Vector3 & origin, const Vector3 & dir, float * t_near, float * t_far) const; -/// Axis Aligned Bounding Box. -class Box -{ -public: - /// Default ctor. - Box() { }; + friend Stream & operator<< (Stream & s, Box & box); - /// Copy ctor. - Box( const Box & b ) : m_mins(b.m_mins), m_maxs(b.m_maxs) { } + const Vector3 & corner(int i) const { return (&minCorner)[i]; } - /// Init ctor. - Box( Vector3::Arg mins, Vector3::Arg maxs ) : m_mins(mins), m_maxs(maxs) { } - - // Cast operators. - operator const float * () const { return reinterpret_cast(this); } - - /// Min corner of the box. 
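The rewritten Box above exposes minCorner and maxCorner as public members and drops the old mins()/maxs() accessors, so client code reads the corners directly. A sketch of the change from the caller's side (boxSize is illustrative, not part of the patch):

#include "nvmath/Box.h"
#include "nvmath/Vector.inl"

// Before (NVTT 2.0.8): box.maxs() - box.mins()
// After  (NVTT 2.1.1): the corners are plain public members.
static nv::Vector3 boxSize(const nv::Box & box)
{
    return box.maxCorner - box.minCorner;
}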
- Vector3 mins() const { return m_mins; } - - /// Max corner of the box. - Vector3 maxs() const { return m_maxs; } - - /// Clear the bounds. - void clearBounds() - { - m_mins.set(FLT_MAX, FLT_MAX, FLT_MAX); - m_maxs.set(-FLT_MAX, -FLT_MAX, -FLT_MAX); - } - - /// Build a cube centered on center and with edge = 2*dist - void cube(Vector3::Arg center, float dist) - { - setCenterExtents(center, Vector3(dist, dist, dist)); - } - - /// Build a box, given center and extents. - void setCenterExtents(Vector3::Arg center, Vector3::Arg extents) - { - m_mins = center - extents; - m_maxs = center + extents; - } - - /// Get box center. - Vector3 center() const - { - return (m_mins + m_maxs) * 0.5f; - } - - /// Return extents of the box. - Vector3 extents() const - { - return (m_maxs - m_mins) * 0.5f; - } - - /// Return extents of the box. - scalar extents(uint axis) const - { - nvDebugCheck(axis < 3); - if (axis == 0) return (m_maxs.x() - m_mins.x()) * 0.5f; - if (axis == 1) return (m_maxs.y() - m_mins.y()) * 0.5f; - if (axis == 2) return (m_maxs.z() - m_mins.z()) * 0.5f; - nvAssume(false); - return 0.0f; - } - - /// Add a point to this box. - void addPointToBounds(Vector3::Arg p) - { - m_mins = min(m_mins, p); - m_maxs = max(m_maxs, p); - } - - /// Add a box to this box. - void addBoxToBounds(const Box & b) - { - m_mins = min(m_mins, b.m_mins); - m_maxs = max(m_maxs, b.m_maxs); - } - - /// Translate box. - void translate(Vector3::Arg v) - { - m_mins += v; - m_maxs += v; - } - - /// Scale the box. - void scale(float s) - { - m_mins *= s; - m_maxs *= s; - } - - /// Get the area of the box. - float area() const - { - const Vector3 d = extents(); - return 8.0f * (d.x()*d.y() + d.x()*d.z() + d.y()*d.z()); - } - - /// Get the volume of the box. - float volume() const - { - Vector3 d = extents(); - return 8.0f * (d.x() * d.y() * d.z()); - } - - /// Return true if the box contains the given point. - bool contains(Vector3::Arg p) const - { - return - m_mins.x() < p.x() && m_mins.y() < p.y() && m_mins.z() < p.z() && - m_maxs.x() > p.x() && m_maxs.y() > p.y() && m_maxs.z() > p.z(); - } - -private: - - Vector3 m_mins; - Vector3 m_maxs; -}; + Vector3 minCorner; + Vector3 maxCorner; + }; + float distanceSquared(const Box &box, const Vector3 &point); + bool overlap(const Box &box, const Sphere &sphere); + // p is ray origin, id is inverse ray direction. + bool intersect(const Box & box, const Vector3 & p, const Vector3 & id, float * t); } // nv namespace Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Box.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Box.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Box.cpp @@ -0,0 +1,119 @@ +// This code is in the public domain -- castanyo@yahoo.es + +#include "Box.h" +#include "Box.inl" +//#include "Sphere.h" + +using namespace nv; + + + + +// Clip the given segment against this box. +bool Box::clipSegment(const Vector3 & origin, const Vector3 & dir, float * t_near, float * t_far) const { + + // Avoid aliasing. 
+ float tnear = *t_near; + float tfar = *t_far; + + // clip ray segment to box + for (int i = 0; i < 3; i++) + { + const float pos = origin.component[i] + tfar * dir.component[i]; + const float dt = tfar - tnear; + + if (dir.component[i] < 0) { + + // clip end point + if (pos < minCorner.component[i]) { + tfar = tnear + dt * (origin.component[i] - minCorner.component[i]) / (origin.component[i] - pos); + } + + // clip start point + if (origin.component[i] > maxCorner.component[i]) { + tnear = tnear + dt * (origin.component[i] - maxCorner.component[i]) / (tfar * dir.component[i]); + } + } + else { + + // clip end point + if (pos > maxCorner.component[i]) { + tfar = tnear + dt * (maxCorner.component[i] - origin.component[i]) / (pos - origin.component[i]); + } + + // clip start point + if (origin.component[i] < minCorner.component[i]) { + tnear = tnear + dt * (minCorner.component[i] - origin.component[i]) / (tfar * dir.component[i]); + } + } + + if (tnear > tfar) { + // Clipped away. + return false; + } + } + + // Return result. + *t_near = tnear; + *t_far = tfar; + return true; +} + + +float nv::distanceSquared(const Box &box, const Vector3 &point) { + Vector3 closest; + + if (point.x < box.minCorner.x) closest.x = box.minCorner.x; + else if (point.x > box.maxCorner.x) closest.x = box.maxCorner.x; + else closest.x = point.x; + + if (point.y < box.minCorner.y) closest.y = box.minCorner.y; + else if (point.y > box.maxCorner.y) closest.y = box.maxCorner.y; + else closest.y = point.y; + + if (point.z < box.minCorner.z) closest.z = box.minCorner.z; + else if (point.z > box.maxCorner.z) closest.z = box.maxCorner.z; + else closest.z = point.z; + + return lengthSquared(point - closest); +} + +/*bool nv::overlap(const Box &box, const Sphere &sphere) { + return distanceSquared(box, sphere.center) < sphere.radius * sphere.radius; +}*/ + + +bool nv::intersect(const Box & box, const Vector3 & p, const Vector3 & id, float * t /*= NULL*/) { + // Precompute these in ray structure? + int sdx = (id.x < 0); + int sdy = (id.y < 0); + int sdz = (id.z < 0); + + float tmin = (box.corner( sdx).x - p.x) * id.x; + float tmax = (box.corner(1-sdx).x - p.x) * id.x; + float tymin = (box.corner( sdy).y - p.y) * id.y; + float tymax = (box.corner(1-sdy).y - p.y) * id.y; + + if ((tmin > tymax) || (tymin > tmax)) + return false; + + if (tymin > tmin) tmin = tymin; + if (tymax < tmax) tmax = tymax; + + float tzmin = (box.corner( sdz).z - p.z) * id.z; + float tzmax = (box.corner(1-sdz).z - p.z) * id.z; + + if ((tmin > tzmax) || (tzmin > tmax)) + return false; + + if (tzmin > tmin) tmin = tzmin; + if (tzmax < tmax) tmax = tzmax; + + if (tmax < 0) + return false; + + if (t != NULL) *t = tmin; + + return true; +} + Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Box.inl =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Box.inl +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Box.inl @@ -0,0 +1,154 @@ +// This code is in the public domain -- castanyo@yahoo.es + +#pragma once +#ifndef NV_MATH_BOX_INL +#define NV_MATH_BOX_INL + +#include "Box.h" +#include "Vector.inl" + +#include // FLT_MAX + +namespace nv +{ + // Default ctor. + //inline Box::Box() { }; + + // Copy ctor. + //inline Box::Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { } + + // Init ctor. + //inline Box::Box(const Vector3 & mins, const Vector3 & maxs) : minCorner(mins), maxCorner(maxs) { } + + // Assignment operator. 
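nv::intersect above is the usual slab test; the caller passes the ray origin p together with the componentwise inverse of the ray direction id, so each slab check is a multiply rather than a divide. A usage sketch under that assumption, with non-zero direction components (rayHitsBox is illustrative only):

#include "nvmath/Box.h"
#include "nvmath/Vector.inl"

// Precompute the inverse direction as intersect() expects; dir components must be non-zero.
static bool rayHitsBox(const nv::Box & box, const nv::Vector3 & origin,
                       const nv::Vector3 & dir, float & tHit)
{
    const nv::Vector3 invDir(1.0f / dir.x, 1.0f / dir.y, 1.0f / dir.z);
    return nv::intersect(box, origin, invDir, &tHit);
}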
+ inline Box & Box::operator=(const Box & b) { minCorner = b.minCorner; maxCorner = b.maxCorner; return *this; } + + // Clear the bounds. + inline void Box::clearBounds() + { + minCorner.set(FLT_MAX, FLT_MAX, FLT_MAX); + maxCorner.set(-FLT_MAX, -FLT_MAX, -FLT_MAX); + } + + // min < max + inline bool Box::isValid() const + { + return minCorner.x <= maxCorner.x && minCorner.y <= maxCorner.y && minCorner.z <= maxCorner.z; + } + + // Build a cube centered on center and with edge = 2*dist + inline void Box::cube(const Vector3 & center, float dist) + { + setCenterExtents(center, Vector3(dist)); + } + + // Build a box, given center and extents. + inline void Box::setCenterExtents(const Vector3 & center, const Vector3 & extents) + { + minCorner = center - extents; + maxCorner = center + extents; + } + + // Get box center. + inline Vector3 Box::center() const + { + return (minCorner + maxCorner) * 0.5f; + } + + // Return extents of the box. + inline Vector3 Box::extents() const + { + return (maxCorner - minCorner) * 0.5f; + } + + // Return extents of the box. + inline float Box::extents(uint axis) const + { + nvDebugCheck(axis < 3); + if (axis == 0) return (maxCorner.x - minCorner.x) * 0.5f; + if (axis == 1) return (maxCorner.y - minCorner.y) * 0.5f; + if (axis == 2) return (maxCorner.z - minCorner.z) * 0.5f; + nvUnreachable(); + return 0.0f; + } + + // Add a point to this box. + inline void Box::addPointToBounds(const Vector3 & p) + { + minCorner = min(minCorner, p); + maxCorner = max(maxCorner, p); + } + + // Add a box to this box. + inline void Box::addBoxToBounds(const Box & b) + { + minCorner = min(minCorner, b.minCorner); + maxCorner = max(maxCorner, b.maxCorner); + } + + // Add sphere to this box. + inline void Box::addSphereToBounds(const Vector3 & p, float r) { + minCorner = min(minCorner, p - Vector3(r)); + maxCorner = min(maxCorner, p + Vector3(r)); + } + + // Translate box. + inline void Box::translate(const Vector3 & v) + { + minCorner += v; + maxCorner += v; + } + + // Scale the box. + inline void Box::scale(float s) + { + minCorner *= s; + maxCorner *= s; + } + + // Expand the box by a fixed amount. + inline void Box::expand(float r) { + minCorner -= Vector3(r,r,r); + maxCorner += Vector3(r,r,r); + } + + // Get the area of the box. + inline float Box::area() const + { + const Vector3 d = extents(); + return 8.0f * (d.x*d.y + d.x*d.z + d.y*d.z); + } + + // Get the volume of the box. + inline float Box::volume() const + { + Vector3 d = extents(); + return 8.0f * (d.x * d.y * d.z); + } + + // Return true if the box contains the given point. + inline bool Box::contains(const Vector3 & p) const + { + return + minCorner.x < p.x && minCorner.y < p.y && minCorner.z < p.z && + maxCorner.x > p.x && maxCorner.y > p.y && maxCorner.z > p.z; + } + + // Split the given box in 8 octants and assign the ith one to this box. 
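One detail worth flagging in the inlines above: addSphereToBounds clamps maxCorner with min(), which can only shrink the box, whereas growing the bounds to enclose the sphere presumably needs max(). A corrected standalone sketch of the apparent intent (addSphereToBoundsGrow is illustrative, not part of the patch):

#include "nvmath/Box.h"
#include "nvmath/Vector.inl"

// Grow the box so that it encloses the sphere centered at p with radius r.
static void addSphereToBoundsGrow(nv::Box & box, const nv::Vector3 & p, float r)
{
    box.minCorner = nv::min(box.minCorner, p - nv::Vector3(r));
    box.maxCorner = nv::max(box.maxCorner, p + nv::Vector3(r));
}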
+ inline void Box::setOctant(const Box & box, const Vector3 & center, int i) + { + minCorner = box.minCorner; + maxCorner = box.maxCorner; + + if (i & 4) minCorner.x = center.x; + else maxCorner.x = center.x; + if (i & 2) minCorner.y = center.y; + else maxCorner.y = center.y; + if (i & 1) minCorner.z = center.z; + else maxCorner.z = center.z; + } + +} // nv namespace + + +#endif // NV_MATH_BOX_INL Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/CMakeLists.txt =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/CMakeLists.txt +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/CMakeLists.txt @@ -1,17 +1,17 @@ PROJECT(nvmath) SET(MATH_SRCS - nvmath.h - Vector.h - Matrix.h - Quaternion.h - Box.h - Color.h - Montecarlo.h Montecarlo.cpp - Random.h Random.cpp - SphericalHarmonic.h SphericalHarmonic.cpp - Basis.h Basis.cpp - Triangle.h Triangle.cpp TriBox.cpp) + nvmath.h + Box.h Box.inl + Color.h Color.inl + Fitting.h Fitting.cpp + Gamma.h Gamma.cpp + Half.h Half.cpp + Matrix.h + Plane.h Plane.inl Plane.cpp + SphericalHarmonic.h SphericalHarmonic.cpp + SimdVector.h SimdVector_SSE.h SimdVector_VE.h + Vector.h Vector.inl) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) @@ -19,15 +19,15 @@ ADD_DEFINITIONS(-DNVMATH_EXPORTS) IF(NVMATH_SHARED) - ADD_DEFINITIONS(-DNVMATH_SHARED=1) - ADD_LIBRARY(nvmath SHARED ${MATH_SRCS}) + ADD_DEFINITIONS(-DNVMATH_SHARED=1) + ADD_LIBRARY(nvmath SHARED ${MATH_SRCS}) ELSE(NVMATH_SHARED) - ADD_LIBRARY(nvmath ${MATH_SRCS}) + ADD_LIBRARY(nvmath ${MATH_SRCS}) ENDIF(NVMATH_SHARED) TARGET_LINK_LIBRARIES(nvmath ${LIBS} nvcore) INSTALL(TARGETS nvmath - RUNTIME DESTINATION ${BINDIR} - LIBRARY DESTINATION ${LIBDIR} - ARCHIVE DESTINATION ${LIBDIR}) + RUNTIME DESTINATION ${BINDIR} + LIBRARY DESTINATION ${LIBDIR} + ARCHIVE DESTINATION ${LIBDIR}) Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Color.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Color.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Color.h @@ -1,178 +1,149 @@ // This code is in the public domain -- castanyo@yahoo.es +#pragma once #ifndef NV_MATH_COLOR_H #define NV_MATH_COLOR_H -#include -#include +#include "nvmath.h" namespace nv { -/// 64 bit color stored as BGRA. -class NVMATH_CLASS Color64 -{ -public: - Color64() { } - Color64(const Color64 & c) : u(c.u) { } - Color64(uint16 R, uint16 G, uint16 B, uint16 A) { setRGBA(R, G, B, A); } - explicit Color64(uint64 U) : u(U) { } - - void setRGBA(uint16 R, uint16 G, uint16 B, uint16 A) - { - r = R; - g = G; - b = B; - a = A; - } - - operator uint64 () const { - return u; - } + /// 64 bit color stored as BGRA. + class NVMATH_CLASS Color64 + { + public: + Color64() { } + Color64(const Color64 & c) : u(c.u) { } + Color64(uint16 R, uint16 G, uint16 B, uint16 A) { setRGBA(R, G, B, A); } + explicit Color64(uint64 U) : u(U) { } + + void setRGBA(uint16 R, uint16 G, uint16 B, uint16 A) + { + r = R; + g = G; + b = B; + a = A; + } + + operator uint64 () const { + return u; + } - union { - struct { + union { + struct { #if NV_LITTLE_ENDIAN - uint16 r, a, b, g; + uint16 r, a, b, g; #else - uint16 a: 16; - uint16 r: 16; - uint16 g: 16; - uint16 b: 16; + uint16 a: 16; + uint16 r: 16; + uint16 g: 16; + uint16 b: 16; #endif - }; - uint64 u; - }; -}; + }; + uint64 u; + }; + }; + + /// 32 bit color stored as BGRA. 
+ class NVMATH_CLASS Color32 + { + public: + Color32() { } + Color32(const Color32 & c) : u(c.u) { } + Color32(uint8 R, uint8 G, uint8 B) { setRGBA(R, G, B, 0xFF); } + Color32(uint8 R, uint8 G, uint8 B, uint8 A) { setRGBA( R, G, B, A); } + //Color32(uint8 c[4]) { setRGBA(c[0], c[1], c[2], c[3]); } + //Color32(float R, float G, float B) { setRGBA(uint(R*255), uint(G*255), uint(B*255), 0xFF); } + //Color32(float R, float G, float B, float A) { setRGBA(uint(R*255), uint(G*255), uint(B*255), uint(A*255)); } + explicit Color32(uint32 U) : u(U) { } + + void setRGBA(uint8 R, uint8 G, uint8 B, uint8 A) + { + r = R; + g = G; + b = B; + a = A; + } + + void setBGRA(uint8 B, uint8 G, uint8 R, uint8 A = 0xFF) + { + r = R; + g = G; + b = B; + a = A; + } + + operator uint32 () const { + return u; + } -/// 32 bit color stored as BGRA. -class NVMATH_CLASS Color32 -{ -public: - Color32() { } - Color32(const Color32 & c) : u(c.u) { } - Color32(uint8 R, uint8 G, uint8 B) { setRGBA(R, G, B, 0xFF); } - Color32(uint8 R, uint8 G, uint8 B, uint8 A) { setRGBA( R, G, B, A); } - //Color32(uint8 c[4]) { setRGBA(c[0], c[1], c[2], c[3]); } - //Color32(float R, float G, float B) { setRGBA(uint(R*255), uint(G*255), uint(B*255), 0xFF); } - //Color32(float R, float G, float B, float A) { setRGBA(uint(R*255), uint(G*255), uint(B*255), uint(A*255)); } - explicit Color32(uint32 U) : u(U) { } - - void setRGBA(uint8 R, uint8 G, uint8 B, uint8 A) - { - r = R; - g = G; - b = B; - a = A; - } - - void setBGRA(uint8 B, uint8 G, uint8 R, uint8 A = 0xFF) - { - r = R; - g = G; - b = B; - a = A; - } - - operator uint32 () const { - return u; - } - - union { - struct { + union { + struct { #if NV_LITTLE_ENDIAN - uint8 b, g, r, a; + uint8 b, g, r, a; #else - uint8 a: 8; - uint8 r: 8; - uint8 g: 8; - uint8 b: 8; + uint8 a: 8; + uint8 r: 8; + uint8 g: 8; + uint8 b: 8; #endif - }; - uint32 u; - }; -}; - + }; + uint8 component[4]; + uint32 u; + }; + }; + + + /// 16 bit 565 BGR color. + class NVMATH_CLASS Color16 + { + public: + Color16() { } + Color16(const Color16 & c) : u(c.u) { } + explicit Color16(uint16 U) : u(U) { } -/// 16 bit 565 BGR color. -class NVMATH_CLASS Color16 -{ -public: - Color16() { } - Color16(const Color16 & c) : u(c.u) { } - explicit Color16(uint16 U) : u(U) { } - - union { - struct { + union { + struct { #if NV_LITTLE_ENDIAN - uint16 b : 5; - uint16 g : 6; - uint16 r : 5; + uint16 b : 5; + uint16 g : 6; + uint16 r : 5; #else - uint16 r : 5; - uint16 g : 6; - uint16 b : 5; + uint16 r : 5; + uint16 g : 6; + uint16 b : 5; #endif - }; - uint16 u; - }; -}; - - -/// Clamp color components. -inline Vector3 colorClamp(Vector3::Arg c) -{ - return Vector3(clamp(c.x(), 0.0f, 1.0f), clamp(c.y(), 0.0f, 1.0f), clamp(c.z(), 0.0f, 1.0f)); -} - -/// Clamp without allowing the hue to change. -inline Vector3 colorNormalize(Vector3::Arg c) -{ - float scale = 1.0f; - if (c.x() > scale) scale = c.x(); - if (c.y() > scale) scale = c.y(); - if (c.z() > scale) scale = c.z(); - return c / scale; -} + }; + uint16 u; + }; + }; + + /// 16 bit 4444 BGRA color. + class NVMATH_CLASS Color16_4444 + { + public: + Color16_4444() { } + Color16_4444(const Color16_4444 & c) : u(c.u) { } + explicit Color16_4444(uint16 U) : u(U) { } -/// Convert Color32 to Color16. 
-inline Color16 toColor16(Color32 c) -{ - Color16 color; - // rrrrrggggggbbbbb - // rrrrr000gggggg00bbbbb000 -// color.u = (c.u >> 3) & 0x1F; -// color.u |= (c.u >> 5) & 0x7E0; -// color.u |= (c.u >> 8) & 0xF800; - - color.r = c.r >> 3; - color.g = c.g >> 2; - color.b = c.b >> 3; - return color; -} - - -/// Promote 16 bit color to 32 bit using regular bit expansion. -inline Color32 toColor32(Color16 c) -{ - Color32 color; -// c.u = ((col0.u << 3) & 0xf8) | ((col0.u << 5) & 0xfc00) | ((col0.u << 8) & 0xf80000); -// c.u |= (c.u >> 5) & 0x070007; -// c.u |= (c.u >> 6) & 0x000300; - - color.b = (c.b << 3) | (c.b >> 2); - color.g = (c.g << 2) | (c.g >> 4); - color.r = (c.r << 3) | (c.r >> 2); - color.a = 0xFF; - - return color; -} - -inline Vector4 toVector4(Color32 c) -{ - const float scale = 1.0f / 255.0f; - return Vector4(c.r * scale, c.g * scale, c.b * scale, c.a * scale); -} + union { + struct { +#if NV_LITTLE_ENDIAN + uint16 b : 4; + uint16 g : 4; + uint16 r : 4; + uint16 a : 4; +#else + uint16 a : 4; + uint16 r : 4; + uint16 g : 4; + uint16 b : 4; +#endif + }; + uint16 u; + }; + }; } // nv namespace Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Color.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Color.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Color.cpp @@ -0,0 +1,4 @@ +// This code is in the public domain -- castanyo@yahoo.es + +#include "Color.h" +#include "Color.inl" Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Color.inl =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Color.inl +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Color.inl @@ -0,0 +1,203 @@ +// This code is in the public domain -- castanyo@yahoo.es + +#pragma once +#ifndef NV_MATH_COLOR_INL +#define NV_MATH_COLOR_INL + +#include "Color.h" +#include "Vector.inl" +#include "ftoi.h" + + +namespace nv +{ + // for Color16 & Color16_4444 bitfields + NV_FORCEINLINE uint32 U32round(float f) { return uint32(floorf(f + 0.5f)); } + NV_FORCEINLINE uint16 U16round(float f) { return uint16(floorf(f + 0.5f)); } + NV_FORCEINLINE uint16 toU4_in_U16(int x) { nvDebugCheck(x >= 0 && x <= 15u); return (uint16)x; } + NV_FORCEINLINE uint16 toU5_in_U16(int x) { nvDebugCheck(x >= 0 && x <= 31u); return (uint16)x; } + NV_FORCEINLINE uint16 toU6_in_U16(int x) { nvDebugCheck(x >= 0 && x <= 63u); return (uint16)x; } + + // Clamp color components. + inline Vector3 colorClamp(Vector3::Arg c) + { + return Vector3(saturate(c.x), saturate(c.y), saturate(c.z)); + } + + // Clamp without allowing the hue to change. + inline Vector3 colorNormalize(Vector3::Arg c) + { + float scale = 1.0f; + if (c.x > scale) scale = c.x; + if (c.y > scale) scale = c.y; + if (c.z > scale) scale = c.z; + return c / scale; + } + + // Convert Color16 from float components + inline Color16 toColor16(float r, float g, float b) + { + Color16 color; // 5,6,5 + color.r = toU5_in_U16(nv::U16round(saturate(r) * 31u)); + color.g = toU6_in_U16(nv::U16round(saturate(g) * 63u)); + color.b = toU5_in_U16(nv::U16round(saturate(b) * 31u)); + return color; + } + + // Convert Color32 to Color16. 
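    // (Illustrative note, not part of the upstream patch: the 565 conversion below just
    //  drops low-order bits, and toColor32 further down re-expands by replicating high
    //  bits into the low bits. Worked example for one channel:
    //      r = 200 (11001000b)  ->  200 >> 3 = 25 (11001b)       // pack into 5 bits
    //      (25 << 3) | (25 >> 2) = 200 | 6 = 206                 // expand back to 8 bits
    //  so a round trip stays within a few least-significant bits of the original.)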
+ inline Color16 toColor16(Color32 c) + { + Color16 color; + // rrrrrggggggbbbbb + // rrrrr000gggggg00bbbbb000 + // color.u = (c.u >> 3) & 0x1F; + // color.u |= (c.u >> 5) & 0x7E0; + // color.u |= (c.u >> 8) & 0xF800; + + color.r = c.r >> 3; + color.g = c.g >> 2; + color.b = c.b >> 3; + return color; + } + + // Convert Color32 to Color16_4444. + inline Color16_4444 toColor16_4444(Color32 c) + { + Color16_4444 color; + color.a = c.a >> 4; + color.r = c.r >> 4; + color.g = c.g >> 4; + color.b = c.b >> 4; + return color; + } + + // Convert float[4] to Color16_4444. + inline Color16_4444 toColor16_4444(float r, float g, float b, float a) + { + Color16_4444 color; + color.a = toU4_in_U16(nv::U16round(saturate(a) * 15u)); + color.r = toU4_in_U16(nv::U16round(saturate(r) * 15u)); + color.g = toU4_in_U16(nv::U16round(saturate(g) * 15u)); + color.b = toU4_in_U16(nv::U16round(saturate(b) * 15u)); + return color; + } + + // Convert float[4] to Color16_4444. + inline Color16_4444 toColor16_4444_from_argb(float * fc) + { + Color16_4444 color; + color.a = toU4_in_U16(nv::U16round(saturate(fc[0]) * 15u)); + color.r = toU4_in_U16(nv::U16round(saturate(fc[1]) * 15u)); + color.g = toU4_in_U16(nv::U16round(saturate(fc[2]) * 15u)); + color.b = toU4_in_U16(nv::U16round(saturate(fc[3]) * 15u)); + return color; + } + + // Convert float[4] to Color16_4444. + inline Color16_4444 toColor16_4444_from_bgra(float * fc) + { + Color16_4444 color; + color.b = toU4_in_U16(nv::U16round(saturate(fc[0]) * 15u)); + color.g = toU4_in_U16(nv::U16round(saturate(fc[1]) * 15u)); + color.r = toU4_in_U16(nv::U16round(saturate(fc[2]) * 15u)); + color.a = toU4_in_U16(nv::U16round(saturate(fc[3]) * 15u)); + return color; + } + + // Promote 16 bit color to 32 bit using regular bit expansion. + inline Color32 toColor32(Color16 c) + { + Color32 color; + // c.u = ((col0.u << 3) & 0xf8) | ((col0.u << 5) & 0xfc00) | ((col0.u << 8) & 0xf80000); + // c.u |= (c.u >> 5) & 0x070007; + // c.u |= (c.u >> 6) & 0x000300; + + color.b = (c.b << 3) | (c.b >> 2); + color.g = (c.g << 2) | (c.g >> 4); + color.r = (c.r << 3) | (c.r >> 2); + color.a = 0xFF; + + return color; + } + + // @@ Quantize with exact endpoints or with uniform bins? 
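    // (Illustrative note, not from the patch: the @@ question above concerns how the
    //  float -> byte conversion below should quantize. As written it clamps to [0,1] and
    //  rounds uniformly, e.g. v.x = 0.25f gives ftoi_round(0.25f * 255) =
    //  ftoi_round(63.75f) = 64.)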
+ inline Color32 toColor32(const Vector4 & v) + { + Color32 color; + color.r = U8(ftoi_round(saturate(v.x) * 255)); + color.g = U8(ftoi_round(saturate(v.y) * 255)); + color.b = U8(ftoi_round(saturate(v.z) * 255)); + color.a = U8(ftoi_round(saturate(v.w) * 255)); + return color; + } + + inline Color32 toColor32_from_bgra(const Vector4 & v) + { + Color32 color; + color.b = U8(ftoi_round(saturate(v.x) * 255)); + color.g = U8(ftoi_round(saturate(v.y) * 255)); + color.r = U8(ftoi_round(saturate(v.z) * 255)); + color.a = U8(ftoi_round(saturate(v.w) * 255)); + return color; + } + + inline Color32 toColor32_from_argb(const Vector4 & v) + { + Color32 color; + color.a = U8(ftoi_round(saturate(v.x) * 255)); + color.r = U8(ftoi_round(saturate(v.y) * 255)); + color.g = U8(ftoi_round(saturate(v.z) * 255)); + color.b = U8(ftoi_round(saturate(v.w) * 255)); + return color; + } + + inline Vector4 toVector4(Color32 c) + { + const float scale = 1.0f / 255.0f; + return Vector4(c.r * scale, c.g * scale, c.b * scale, c.a * scale); + } + + + inline float perceptualColorDistance(Vector3::Arg c0, Vector3::Arg c1) + { + float rmean = (c0.x + c1.x) * 0.5f; + float r = c1.x - c0.x; + float g = c1.y - c0.y; + float b = c1.z - c0.z; + return sqrtf((2 + rmean)*r*r + 4*g*g + (3 - rmean)*b*b); + } + + + inline float hue(float r, float g, float b) { + float h = atan2f(sqrtf(3.0f)*(g-b), 2*r-g-b) * (1.0f / (2 * PI)) + 0.5f; + return h; + } + + inline float toSrgb(float f) { + if (nv::isNan(f)) f = 0.0f; + else if (f <= 0.0f) f = 0.0f; + else if (f <= 0.0031308f) f = 12.92f * f; + else if (f <= 1.0f) f = (powf(f, 0.41666f) * 1.055f) - 0.055f; + else f = 1.0f; + return f; + } + + inline float fromSrgb(float f) { + if (f < 0.0f) f = 0.0f; + else if (f < 0.04045f) f = f / 12.92f; + else if (f <= 1.0f) f = powf((f + 0.055f) / 1.055f, 2.4f); + else f = 1.0f; + return f; + } + + inline Vector3 toSrgb(const Vector3 & v) { + return Vector3(toSrgb(v.x), toSrgb(v.y), toSrgb(v.z)); + } + + inline Vector3 fromSrgb(const Vector3 & v) { + return Vector3(fromSrgb(v.x), fromSrgb(v.y), fromSrgb(v.z)); + } + +} // nv namespace + +#endif // NV_MATH_COLOR_INL Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Fitting.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Fitting.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Fitting.h @@ -0,0 +1,50 @@ +// This code is in the public domain -- Ignacio Castaño + +#pragma once +#ifndef NV_MATH_FITTING_H +#define NV_MATH_FITTING_H + +#include "Vector.h" +#include "Plane.h" + +namespace nv +{ + namespace Fit + { + Vector3 computeCentroid(int n, const Vector3 * points); + Vector3 computeCentroid(int n, const Vector3 * points, const float * weights, const Vector3 & metric); + + Vector4 computeCentroid(int n, const Vector4 * points); + Vector4 computeCentroid(int n, const Vector4 * points, const float * weights, const Vector4 & metric); + + Vector3 computeCovariance(int n, const Vector3 * points, float * covariance); + Vector3 computeCovariance(int n, const Vector3 * points, const float * weights, const Vector3 & metric, float * covariance); + + Vector4 computeCovariance(int n, const Vector4 * points, float * covariance); + Vector4 computeCovariance(int n, const Vector4 * points, const float * weights, const Vector4 & metric, float * covariance); + + NVMATH_API Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points); + NVMATH_API Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points, const 
float * weights, const Vector3 & metric); + + NVMATH_API Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points); + NVMATH_API Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points, const float * weights, const Vector3 & metric); + + NVMATH_API Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points); + NVMATH_API Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points, const float * weights, const Vector4 & metric); + + Vector3 computePrincipalComponent_SVD(int n, const Vector3 * points); + Vector4 computePrincipalComponent_SVD(int n, const Vector4 * points); + + Plane bestPlane(int n, const Vector3 * points); + bool isPlanar(int n, const Vector3 * points, float epsilon = NV_EPSILON); + + bool eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]); + bool eigenSolveSymmetric4(const float matrix[10], float eigenValues[4], Vector4 eigenVectors[4]); + + // Returns number of clusters [1-4]. + int compute4Means(int n, const Vector3 * points, const float * weights, const Vector3 & metric, Vector3 * cluster); + } + +} // nv namespace + +#endif // NV_MATH_FITTING_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Fitting.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Fitting.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Fitting.cpp @@ -0,0 +1,1205 @@ +// This code is in the public domain -- Ignacio Castaño + +#include "Fitting.h" +#include "Vector.inl" +#include "Plane.inl" + +#include "nvcore/Array.inl" +#include "nvcore/Utils.h" // max, swap + +#include // FLT_MAX +//#include +#include + +using namespace nv; + +// @@ Move to EigenSolver.h + +// @@ We should be able to do something cheaper... 
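// (Note, not in the upstream source: the helper below seeds the power iteration in
//  firstEigenVector_PowerMethod by returning whichever row of the symmetric covariance
//  matrix has the largest squared length; power iteration only needs a starting vector
//  with a nonzero component along the dominant eigenvector, so this cheap guess suffices.)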
+static Vector3 estimatePrincipalComponent(const float * __restrict matrix) +{ + const Vector3 row0(matrix[0], matrix[1], matrix[2]); + const Vector3 row1(matrix[1], matrix[3], matrix[4]); + const Vector3 row2(matrix[2], matrix[4], matrix[5]); + + float r0 = lengthSquared(row0); + float r1 = lengthSquared(row1); + float r2 = lengthSquared(row2); + + if (r0 > r1 && r0 > r2) return row0; + if (r1 > r2) return row1; + return row2; +} + + +static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix) +{ + if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0) + { + return Vector3(0.0f); + } + + Vector3 v = estimatePrincipalComponent(matrix); + + const int NUM = 8; + for (int i = 0; i < NUM; i++) + { + float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2]; + float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4]; + float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5]; + + float norm = max(max(x, y), z); + + v = Vector3(x, y, z) / norm; + } + + return v; +} + + +Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points) +{ + Vector3 centroid(0.0f); + + for (int i = 0; i < n; i++) + { + centroid += points[i]; + } + centroid /= float(n); + + return centroid; +} + +Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric) +{ + Vector3 centroid(0.0f); + float total = 0.0f; + + for (int i = 0; i < n; i++) + { + total += weights[i]; + centroid += weights[i]*points[i]; + } + centroid /= total; + + return centroid; +} + +Vector4 nv::Fit::computeCentroid(int n, const Vector4 *__restrict points) +{ + Vector4 centroid(0.0f); + + for (int i = 0; i < n; i++) + { + centroid += points[i]; + } + centroid /= float(n); + + return centroid; +} + +Vector4 nv::Fit::computeCentroid(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric) +{ + Vector4 centroid(0.0f); + float total = 0.0f; + + for (int i = 0; i < n; i++) + { + total += weights[i]; + centroid += weights[i]*points[i]; + } + centroid /= total; + + return centroid; +} + + + +Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, float *__restrict covariance) +{ + // compute the centroid + Vector3 centroid = computeCentroid(n, points); + + // compute covariance matrix + for (int i = 0; i < 6; i++) + { + covariance[i] = 0.0f; + } + + for (int i = 0; i < n; i++) + { + Vector3 v = points[i] - centroid; + + covariance[0] += v.x * v.x; + covariance[1] += v.x * v.y; + covariance[2] += v.x * v.z; + covariance[3] += v.y * v.y; + covariance[4] += v.y * v.z; + covariance[5] += v.z * v.z; + } + + return centroid; +} + +Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, float *__restrict covariance) +{ + // compute the centroid + Vector3 centroid = computeCentroid(n, points, weights, metric); + + // compute covariance matrix + for (int i = 0; i < 6; i++) + { + covariance[i] = 0.0f; + } + + for (int i = 0; i < n; i++) + { + Vector3 a = (points[i] - centroid) * metric; + Vector3 b = weights[i]*a; + + covariance[0] += a.x * b.x; + covariance[1] += a.x * b.y; + covariance[2] += a.x * b.z; + covariance[3] += a.y * b.y; + covariance[4] += a.y * b.z; + covariance[5] += a.z * b.z; + } + + return centroid; +} + +Vector4 nv::Fit::computeCovariance(int n, const Vector4 *__restrict points, float *__restrict covariance) +{ + // compute the centroid + Vector4 centroid = computeCentroid(n, points); + + // compute 
covariance matrix + for (int i = 0; i < 10; i++) + { + covariance[i] = 0.0f; + } + + for (int i = 0; i < n; i++) + { + Vector4 v = points[i] - centroid; + + covariance[0] += v.x * v.x; + covariance[1] += v.x * v.y; + covariance[2] += v.x * v.z; + covariance[3] += v.x * v.w; + + covariance[4] += v.y * v.y; + covariance[5] += v.y * v.z; + covariance[6] += v.y * v.w; + + covariance[7] += v.z * v.z; + covariance[8] += v.z * v.w; + + covariance[9] += v.w * v.w; + } + + return centroid; +} + +Vector4 nv::Fit::computeCovariance(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric, float *__restrict covariance) +{ + // compute the centroid + Vector4 centroid = computeCentroid(n, points, weights, metric); + + // compute covariance matrix + for (int i = 0; i < 10; i++) + { + covariance[i] = 0.0f; + } + + for (int i = 0; i < n; i++) + { + Vector4 a = (points[i] - centroid) * metric; + Vector4 b = weights[i]*a; + + covariance[0] += a.x * b.x; + covariance[1] += a.x * b.y; + covariance[2] += a.x * b.z; + covariance[3] += a.x * b.w; + + covariance[4] += a.y * b.y; + covariance[5] += a.y * b.z; + covariance[6] += a.y * b.w; + + covariance[7] += a.z * b.z; + covariance[8] += a.z * b.w; + + covariance[9] += a.w * b.w; + } + + return centroid; +} + + + +Vector3 nv::Fit::computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points) +{ + float matrix[6]; + computeCovariance(n, points, matrix); + + return firstEigenVector_PowerMethod(matrix); +} + +Vector3 nv::Fit::computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric) +{ + float matrix[6]; + computeCovariance(n, points, weights, metric, matrix); + + return firstEigenVector_PowerMethod(matrix); +} + + + +static inline Vector3 firstEigenVector_EigenSolver3(const float *__restrict matrix) +{ + if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0) + { + return Vector3(0.0f); + } + + float eigenValues[3]; + Vector3 eigenVectors[3]; + if (!nv::Fit::eigenSolveSymmetric3(matrix, eigenValues, eigenVectors)) + { + return Vector3(0.0f); + } + + return eigenVectors[0]; +} + +Vector3 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector3 *__restrict points) +{ + float matrix[6]; + computeCovariance(n, points, matrix); + + return firstEigenVector_EigenSolver3(matrix); +} + +Vector3 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric) +{ + float matrix[6]; + computeCovariance(n, points, weights, metric, matrix); + + return firstEigenVector_EigenSolver3(matrix); +} + + + +static inline Vector4 firstEigenVector_EigenSolver4(const float *__restrict matrix) +{ + if (matrix[0] == 0 && matrix[4] == 0 && matrix[7] == 0&& matrix[9] == 0) + { + return Vector4(0.0f); + } + + float eigenValues[4]; + Vector4 eigenVectors[4]; + if (!nv::Fit::eigenSolveSymmetric4(matrix, eigenValues, eigenVectors)) + { + return Vector4(0.0f); + } + + return eigenVectors[0]; +} + +Vector4 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector4 *__restrict points) +{ + float matrix[10]; + computeCovariance(n, points, matrix); + + return firstEigenVector_EigenSolver4(matrix); +} + +Vector4 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric) +{ + float matrix[10]; + computeCovariance(n, points, weights, metric, matrix); + + return firstEigenVector_EigenSolver4(matrix); +} 
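A minimal usage sketch of the fitting interface declared in Fitting.h above (illustrative
only; the nv::Fit names come from this patch, while the wrapper function and include paths
are assumptions):

    #include "nvmath/Fitting.h"
    #include "nvmath/Vector.inl"

    // Dominant axis of a set of points, e.g. the colors of a block being compressed;
    // the first eigenvector of the covariance matrix is the direction of greatest variance.
    nv::Vector3 dominantAxis(int n, const nv::Vector3 * points)
    {
        return nv::Fit::computePrincipalComponent_EigenSolver(n, points);
    }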
+ + + +void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R); + +Vector3 nv::Fit::computePrincipalComponent_SVD(int n, const Vector3 *__restrict points) +{ + // Store the points in an n x n matrix + Array Q; Q.resize(n*n, 0.0f); + for (int i = 0; i < n; ++i) + { + Q[i*n+0] = points[i].x; + Q[i*n+1] = points[i].y; + Q[i*n+2] = points[i].z; + } + + // Alloc space for the SVD outputs + Array diag; diag.resize(n, 0.0f); + Array R; R.resize(n*n, 0.0f); + + ArvoSVD(n, n, &Q[0], &diag[0], &R[0]); + + // Get the principal component + return Vector3(R[0], R[1], R[2]); +} + +Vector4 nv::Fit::computePrincipalComponent_SVD(int n, const Vector4 *__restrict points) +{ + // Store the points in an n x n matrix + Array Q; Q.resize(n*n, 0.0f); + for (int i = 0; i < n; ++i) + { + Q[i*n+0] = points[i].x; + Q[i*n+1] = points[i].y; + Q[i*n+2] = points[i].z; + Q[i*n+3] = points[i].w; + } + + // Alloc space for the SVD outputs + Array diag; diag.resize(n, 0.0f); + Array R; R.resize(n*n, 0.0f); + + ArvoSVD(n, n, &Q[0], &diag[0], &R[0]); + + // Get the principal component + return Vector4(R[0], R[1], R[2], R[3]); +} + + + +Plane nv::Fit::bestPlane(int n, const Vector3 *__restrict points) +{ + // compute the centroid and covariance + float matrix[6]; + Vector3 centroid = computeCovariance(n, points, matrix); + + if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0) + { + // If no plane defined, then return a horizontal plane. + return Plane(Vector3(0, 0, 1), centroid); + } + + float eigenValues[3]; + Vector3 eigenVectors[3]; + if (!eigenSolveSymmetric3(matrix, eigenValues, eigenVectors)) { + // If no plane defined, then return a horizontal plane. + return Plane(Vector3(0, 0, 1), centroid); + } + + return Plane(eigenVectors[2], centroid); +} + +bool nv::Fit::isPlanar(int n, const Vector3 * points, float epsilon/*=NV_EPSILON*/) +{ + // compute the centroid and covariance + float matrix[6]; + computeCovariance(n, points, matrix); + + float eigenValues[3]; + Vector3 eigenVectors[3]; + if (!eigenSolveSymmetric3(matrix, eigenValues, eigenVectors)) { + return false; + } + + return eigenValues[2] < epsilon; +} + + + +// Tridiagonal solver from Charles Bloom. +// Householder transforms followed by QL decomposition. +// Seems to be based on the code from Numerical Recipes in C. 
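For reference, a sketch of how the symmetric eigensolver below is typically driven
(illustrative; the packed layout and sort order are taken from eigenSolveSymmetric3 itself):

    // The 3x3 symmetric matrix is passed as its upper triangle {m00, m01, m02, m11, m12, m22}.
    float cov[6] = { 2.0f, 0.5f, 0.0f, 1.0f, 0.0f, 0.5f };
    float eigenValues[3];
    nv::Vector3 eigenVectors[3];
    if (nv::Fit::eigenSolveSymmetric3(cov, eigenValues, eigenVectors))
    {
        // Sorted on return: eigenValues[0] >= eigenValues[1] >= eigenValues[2],
        // so eigenVectors[0] is the principal direction.
    }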
+ +static void EigenSolver3_Tridiagonal(float mat[3][3], float * diag, float * subd); +static bool EigenSolver3_QLAlgorithm(float mat[3][3], float * diag, float * subd); + +bool nv::Fit::eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]) +{ + nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL); + + float subd[3]; + float diag[3]; + float work[3][3]; + + work[0][0] = matrix[0]; + work[0][1] = work[1][0] = matrix[1]; + work[0][2] = work[2][0] = matrix[2]; + work[1][1] = matrix[3]; + work[1][2] = work[2][1] = matrix[4]; + work[2][2] = matrix[5]; + + EigenSolver3_Tridiagonal(work, diag, subd); + if (!EigenSolver3_QLAlgorithm(work, diag, subd)) + { + for (int i = 0; i < 3; i++) { + eigenValues[i] = 0; + eigenVectors[i] = Vector3(0); + } + return false; + } + + for (int i = 0; i < 3; i++) { + eigenValues[i] = (float)diag[i]; + } + + // eigenvectors are the columns; make them the rows : + + for (int i=0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + eigenVectors[j].component[i] = (float) work[i][j]; + } + } + + // shuffle to sort by singular value : + if (eigenValues[2] > eigenValues[0] && eigenValues[2] > eigenValues[1]) + { + swap(eigenValues[0], eigenValues[2]); + swap(eigenVectors[0], eigenVectors[2]); + } + if (eigenValues[1] > eigenValues[0]) + { + swap(eigenValues[0], eigenValues[1]); + swap(eigenVectors[0], eigenVectors[1]); + } + if (eigenValues[2] > eigenValues[1]) + { + swap(eigenValues[1], eigenValues[2]); + swap(eigenVectors[1], eigenVectors[2]); + } + + nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2]); + nvDebugCheck(eigenValues[1] >= eigenValues[2]); + + return true; +} + +static void EigenSolver3_Tridiagonal(float mat[3][3], float * diag, float * subd) +{ + // Householder reduction T = Q^t M Q + // Input: + // mat, symmetric 3x3 matrix M + // Output: + // mat, orthogonal matrix Q + // diag, diagonal entries of T + // subd, subdiagonal entries of T (T is symmetric) + const float epsilon = 1e-08f; + + float a = mat[0][0]; + float b = mat[0][1]; + float c = mat[0][2]; + float d = mat[1][1]; + float e = mat[1][2]; + float f = mat[2][2]; + + diag[0] = a; + subd[2] = 0.f; + if (fabsf(c) >= epsilon) + { + const float ell = sqrtf(b*b+c*c); + b /= ell; + c /= ell; + const float q = 2*b*e+c*(f-d); + diag[1] = d+c*q; + diag[2] = f-c*q; + subd[0] = ell; + subd[1] = e-b*q; + mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0; + mat[1][0] = 0; mat[1][1] = b; mat[1][2] = c; + mat[2][0] = 0; mat[2][1] = c; mat[2][2] = -b; + } + else + { + diag[1] = d; + diag[2] = f; + subd[0] = b; + subd[1] = e; + mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0; + mat[1][0] = 0; mat[1][1] = 1; mat[1][2] = 0; + mat[2][0] = 0; mat[2][1] = 0; mat[2][2] = 1; + } +} + +static bool EigenSolver3_QLAlgorithm(float mat[3][3], float * diag, float * subd) +{ + // QL iteration with implicit shifting to reduce matrix from tridiagonal + // to diagonal + const int maxiter = 32; + + for (int ell = 0; ell < 3; ell++) + { + int iter; + for (iter = 0; iter < maxiter; iter++) + { + int m; + for (m = ell; m <= 1; m++) + { + float dd = fabsf(diag[m]) + fabsf(diag[m+1]); + if ( fabsf(subd[m]) + dd == dd ) + break; + } + if ( m == ell ) + break; + + float g = (diag[ell+1]-diag[ell])/(2*subd[ell]); + float r = sqrtf(g*g+1); + if ( g < 0 ) + g = diag[m]-diag[ell]+subd[ell]/(g-r); + else + g = diag[m]-diag[ell]+subd[ell]/(g+r); + float s = 1, c = 1, p = 0; + for (int i = m-1; i >= ell; i--) + { + float f = s*subd[i], b = c*subd[i]; + if ( fabsf(f) 
>= fabsf(g) ) + { + c = g/f; + r = sqrtf(c*c+1); + subd[i+1] = f*r; + c *= (s = 1/r); + } + else + { + s = f/g; + r = sqrtf(s*s+1); + subd[i+1] = g*r; + s *= (c = 1/r); + } + g = diag[i+1]-p; + r = (diag[i]-g)*s+2*b*c; + p = s*r; + diag[i+1] = g+p; + g = c*r-b; + + for (int k = 0; k < 3; k++) + { + f = mat[k][i+1]; + mat[k][i+1] = s*mat[k][i]+c*f; + mat[k][i] = c*mat[k][i]-s*f; + } + } + diag[ell] -= p; + subd[ell] = g; + subd[m] = 0; + } + + if ( iter == maxiter ) + // should not get here under normal circumstances + return false; + } + + return true; +} + + + +// Tridiagonal solver for 4x4 symmetric matrices. + +static void EigenSolver4_Tridiagonal(float mat[4][4], float * diag, float * subd); +static bool EigenSolver4_QLAlgorithm(float mat[4][4], float * diag, float * subd); + +bool nv::Fit::eigenSolveSymmetric4(const float matrix[10], float eigenValues[4], Vector4 eigenVectors[4]) +{ + nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL); + + float subd[4]; + float diag[4]; + float work[4][4]; + + work[0][0] = matrix[0]; + work[0][1] = work[1][0] = matrix[1]; + work[0][2] = work[2][0] = matrix[2]; + work[0][3] = work[3][0] = matrix[3]; + work[1][1] = matrix[4]; + work[1][2] = work[2][1] = matrix[5]; + work[1][3] = work[3][1] = matrix[6]; + work[2][2] = matrix[7]; + work[2][3] = work[3][2] = matrix[8]; + work[3][3] = matrix[9]; + + EigenSolver4_Tridiagonal(work, diag, subd); + if (!EigenSolver4_QLAlgorithm(work, diag, subd)) + { + for (int i = 0; i < 4; i++) { + eigenValues[i] = 0; + eigenVectors[i] = Vector4(0); + } + return false; + } + + for (int i = 0; i < 4; i++) { + eigenValues[i] = (float)diag[i]; + } + + // eigenvectors are the columns; make them the rows + + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + eigenVectors[j].component[i] = (float) work[i][j]; + } + } + + // sort by singular value + + for (int i = 0; i < 3; ++i) + { + for (int j = i+1; j < 4; ++j) + { + if (eigenValues[j] > eigenValues[i]) + { + swap(eigenValues[i], eigenValues[j]); + swap(eigenVectors[i], eigenVectors[j]); + } + } + } + + nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2] && eigenValues[0] >= eigenValues[3]); + nvDebugCheck(eigenValues[1] >= eigenValues[2] && eigenValues[1] >= eigenValues[3]); + nvDebugCheck(eigenValues[2] >= eigenValues[2]); + + return true; +} + +#include "nvmath/Matrix.inl" + +inline float signNonzero(float x) +{ + return (x >= 0.0f) ? 1.0f : -1.0f; +} + +static void EigenSolver4_Tridiagonal(float mat[4][4], float * diag, float * subd) +{ + // Householder reduction T = Q^t M Q + // Input: + // mat, symmetric 3x3 matrix M + // Output: + // mat, orthogonal matrix Q + // diag, diagonal entries of T + // subd, subdiagonal entries of T (T is symmetric) + + static const int n = 4; + + // Set epsilon relative to size of elements in matrix + static const float relEpsilon = 1e-6f; + float maxElement = FLT_MAX; + for (int i = 0; i < n; ++i) + for (int j = 0; j < n; ++j) + maxElement = max(maxElement, fabsf(mat[i][j])); + float epsilon = relEpsilon * maxElement; + + // Iterative algorithm, works for any size of matrix but might be slower than + // a closed-form solution for symmetric 4x4 matrices. Based on this article: + // http://en.wikipedia.org/wiki/Householder_transformation#Tridiagonalization + + Matrix A, Q(identity); + memcpy(&A, mat, sizeof(float)*n*n); + + // We proceed from left to right, making the off-tridiagonal entries zero in + // one column of the matrix at a time. 
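        // (Note, not in the upstream source: each pass builds a Householder reflection
        //  P = I - 2*v*v^T from the entries of column k below the diagonal; alpha is the
        //  signed length of that sub-column and r normalizes v so that P*A*P zeroes the
        //  entries below the first subdiagonal of column k. After the n-2 passes A is
        //  tridiagonal and Q holds the accumulated product of the reflections.)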
+ for (int k = 0; k < n - 2; ++k) + { + float sum = 0.0f; + for (int j = k+1; j < n; ++j) + sum += A(j,k)*A(j,k); + float alpha = -signNonzero(A(k+1,k)) * sqrtf(sum); + float r = sqrtf(0.5f * (alpha*alpha - A(k+1,k)*alpha)); + + // If r is zero, skip this column - already in tridiagonal form + if (fabsf(r) < epsilon) + continue; + + float v[n] = {}; + v[k+1] = 0.5f * (A(k+1,k) - alpha) / r; + for (int j = k+2; j < n; ++j) + v[j] = 0.5f * A(j,k) / r; + + Matrix P(identity); + for (int i = 0; i < n; ++i) + for (int j = 0; j < n; ++j) + P(i,j) -= 2.0f * v[i] * v[j]; + + A = mul(mul(P, A), P); + Q = mul(Q, P); + } + + nvDebugCheck(fabsf(A(2,0)) < epsilon); + nvDebugCheck(fabsf(A(0,2)) < epsilon); + nvDebugCheck(fabsf(A(3,0)) < epsilon); + nvDebugCheck(fabsf(A(0,3)) < epsilon); + nvDebugCheck(fabsf(A(3,1)) < epsilon); + nvDebugCheck(fabsf(A(1,3)) < epsilon); + + for (int i = 0; i < n; ++i) + diag[i] = A(i,i); + for (int i = 0; i < n - 1; ++i) + subd[i] = A(i+1,i); + subd[n-1] = 0.0f; + + memcpy(mat, &Q, sizeof(float)*n*n); +} + +static bool EigenSolver4_QLAlgorithm(float mat[4][4], float * diag, float * subd) +{ + // QL iteration with implicit shifting to reduce matrix from tridiagonal + // to diagonal + const int maxiter = 32; + + for (int ell = 0; ell < 4; ell++) + { + int iter; + for (iter = 0; iter < maxiter; iter++) + { + int m; + for (m = ell; m < 3; m++) + { + float dd = fabsf(diag[m]) + fabsf(diag[m+1]); + if ( fabsf(subd[m]) + dd == dd ) + break; + } + if ( m == ell ) + break; + + float g = (diag[ell+1]-diag[ell])/(2*subd[ell]); + float r = sqrtf(g*g+1); + if ( g < 0 ) + g = diag[m]-diag[ell]+subd[ell]/(g-r); + else + g = diag[m]-diag[ell]+subd[ell]/(g+r); + float s = 1, c = 1, p = 0; + for (int i = m-1; i >= ell; i--) + { + float f = s*subd[i], b = c*subd[i]; + if ( fabsf(f) >= fabsf(g) ) + { + c = g/f; + r = sqrtf(c*c+1); + subd[i+1] = f*r; + c *= (s = 1/r); + } + else + { + s = f/g; + r = sqrtf(s*s+1); + subd[i+1] = g*r; + s *= (c = 1/r); + } + g = diag[i+1]-p; + r = (diag[i]-g)*s+2*b*c; + p = s*r; + diag[i+1] = g+p; + g = c*r-b; + + for (int k = 0; k < 4; k++) + { + f = mat[k][i+1]; + mat[k][i+1] = s*mat[k][i]+c*f; + mat[k][i] = c*mat[k][i]-s*f; + } + } + diag[ell] -= p; + subd[ell] = g; + subd[m] = 0; + } + + if ( iter == maxiter ) + // should not get here under normal circumstances + return false; + } + + return true; +} + + + +int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, Vector3 *__restrict cluster) +{ + // Compute principal component. + float matrix[6]; + Vector3 centroid = computeCovariance(n, points, weights, metric, matrix); + Vector3 principal = firstEigenVector_PowerMethod(matrix); + + // Pick initial solution. + int mini, maxi; + mini = maxi = 0; + + float mindps, maxdps; + mindps = maxdps = dot(points[0] - centroid, principal); + + for (int i = 1; i < n; ++i) + { + float dps = dot(points[i] - centroid, principal); + + if (dps < mindps) { + mindps = dps; + mini = i; + } + else { + maxdps = dps; + maxi = i; + } + } + + cluster[0] = centroid + mindps * principal; + cluster[1] = centroid + maxdps * principal; + cluster[2] = (2.0f * cluster[0] + cluster[1]) / 3.0f; + cluster[3] = (2.0f * cluster[1] + cluster[0]) / 3.0f; + + // Now we have to iteratively refine the clusters. + while (true) + { + Vector3 newCluster[4] = { Vector3(0.0f), Vector3(0.0f), Vector3(0.0f), Vector3(0.0f) }; + float total[4] = {0, 0, 0, 0}; + + for (int i = 0; i < n; ++i) + { + // Find nearest cluster. 
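            // (Note, not from upstream: "nearest" is squared distance after scaling each
            //  channel difference by the caller-supplied metric, so the same perceptual
            //  weighting used for the covariance above also drives the k-means assignment.)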
+ int nearest = 0; + float mindist = FLT_MAX; + for (int j = 0; j < 4; j++) + { + float dist = lengthSquared((cluster[j] - points[i]) * metric); + if (dist < mindist) + { + mindist = dist; + nearest = j; + } + } + + newCluster[nearest] += weights[i] * points[i]; + total[nearest] += weights[i]; + } + + for (int j = 0; j < 4; j++) + { + if (total[j] != 0) + newCluster[j] /= total[j]; + } + + if (equal(cluster[0], newCluster[0]) && equal(cluster[1], newCluster[1]) && + equal(cluster[2], newCluster[2]) && equal(cluster[3], newCluster[3])) + { + return (total[0] != 0) + (total[1] != 0) + (total[2] != 0) + (total[3] != 0); + } + + cluster[0] = newCluster[0]; + cluster[1] = newCluster[1]; + cluster[2] = newCluster[2]; + cluster[3] = newCluster[3]; + + // Sort clusters by weight. + for (int i = 0; i < 4; i++) + { + for (int j = i; j > 0 && total[j] > total[j - 1]; j--) + { + swap( total[j], total[j - 1] ); + swap( cluster[j], cluster[j - 1] ); + } + } + } +} + + + +// Adaptation of James Arvo's SVD code, as found in ZOH. + +inline float Sqr(float x) { return x*x; } + +inline float svd_pythag( float a, float b ) +{ + float at = fabsf(a); + float bt = fabsf(b); + if( at > bt ) + return at * sqrtf( 1.0f + Sqr( bt / at ) ); + else if( bt > 0.0f ) + return bt * sqrtf( 1.0f + Sqr( at / bt ) ); + else return 0.0f; +} + +inline float SameSign( float a, float b ) +{ + float t; + if( b >= 0.0f ) t = fabsf( a ); + else t = -fabsf( a ); + return t; +} + +void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R) +{ + static const int MaxIterations = 30; + + int i, j, k, l, p, q, iter; + float c, f, h, s, x, y, z; + float norm = 0.0f; + float g = 0.0f; + float scale = 0.0f; + + Array temp; temp.resize(cols, 0.0f); + + for( i = 0; i < cols; i++ ) + { + temp[i] = scale * g; + scale = 0.0f; + g = 0.0f; + s = 0.0f; + l = i + 1; + + if( i < rows ) + { + for( k = i; k < rows; k++ ) scale += fabsf( Q[k*cols+i] ); + if( scale != 0.0f ) + { + for( k = i; k < rows; k++ ) + { + Q[k*cols+i] /= scale; + s += Sqr( Q[k*cols+i] ); + } + f = Q[i*cols+i]; + g = -SameSign( sqrtf(s), f ); + h = f * g - s; + Q[i*cols+i] = f - g; + if( i != cols - 1 ) + { + for( j = l; j < cols; j++ ) + { + s = 0.0f; + for( k = i; k < rows; k++ ) s += Q[k*cols+i] * Q[k*cols+j]; + f = s / h; + for( k = i; k < rows; k++ ) Q[k*cols+j] += f * Q[k*cols+i]; + } + } + for( k = i; k < rows; k++ ) Q[k*cols+i] *= scale; + } + } + + diag[i] = scale * g; + g = 0.0f; + s = 0.0f; + scale = 0.0f; + + if( i < rows && i != cols - 1 ) + { + for( k = l; k < cols; k++ ) scale += fabsf( Q[i*cols+k] ); + if( scale != 0.0f ) + { + for( k = l; k < cols; k++ ) + { + Q[i*cols+k] /= scale; + s += Sqr( Q[i*cols+k] ); + } + f = Q[i*cols+l]; + g = -SameSign( sqrtf(s), f ); + h = f * g - s; + Q[i*cols+l] = f - g; + for( k = l; k < cols; k++ ) temp[k] = Q[i*cols+k] / h; + if( i != rows - 1 ) + { + for( j = l; j < rows; j++ ) + { + s = 0.0f; + for( k = l; k < cols; k++ ) s += Q[j*cols+k] * Q[i*cols+k]; + for( k = l; k < cols; k++ ) Q[j*cols+k] += s * temp[k]; + } + } + for( k = l; k < cols; k++ ) Q[i*cols+k] *= scale; + } + } + norm = max( norm, fabsf( diag[i] ) + fabsf( temp[i] ) ); + } + + + for( i = cols - 1; i >= 0; i-- ) + { + if( i < cols - 1 ) + { + if( g != 0.0f ) + { + for( j = l; j < cols; j++ ) R[i*cols+j] = ( Q[i*cols+j] / Q[i*cols+l] ) / g; + for( j = l; j < cols; j++ ) + { + s = 0.0f; + for( k = l; k < cols; k++ ) s += Q[i*cols+k] * R[j*cols+k]; + for( k = l; k < cols; k++ ) R[j*cols+k] += s * R[i*cols+k]; + } + } + for( j = l; j < cols; j++ ) + { + 
R[i*cols+j] = 0.0f; + R[j*cols+i] = 0.0f; + } + } + R[i*cols+i] = 1.0f; + g = temp[i]; + l = i; + } + + + for( i = cols - 1; i >= 0; i-- ) + { + l = i + 1; + g = diag[i]; + if( i < cols - 1 ) for( j = l; j < cols; j++ ) Q[i*cols+j] = 0.0f; + if( g != 0.0f ) + { + g = 1.0f / g; + if( i != cols - 1 ) + { + for( j = l; j < cols; j++ ) + { + s = 0.0f; + for( k = l; k < rows; k++ ) s += Q[k*cols+i] * Q[k*cols+j]; + f = ( s / Q[i*cols+i] ) * g; + for( k = i; k < rows; k++ ) Q[k*cols+j] += f * Q[k*cols+i]; + } + } + for( j = i; j < rows; j++ ) Q[j*cols+i] *= g; + } + else + { + for( j = i; j < rows; j++ ) Q[j*cols+i] = 0.0f; + } + Q[i*cols+i] += 1.0f; + } + + + for( k = cols - 1; k >= 0; k-- ) + { + for( iter = 1; iter <= MaxIterations; iter++ ) + { + int jump; + + for( l = k; l >= 0; l-- ) + { + q = l - 1; + if( fabsf( temp[l] ) + norm == norm ) { jump = 1; break; } + if( fabsf( diag[q] ) + norm == norm ) { jump = 0; break; } + } + + if( !jump ) + { + c = 0.0f; + s = 1.0f; + for( i = l; i <= k; i++ ) + { + f = s * temp[i]; + temp[i] *= c; + if( fabsf( f ) + norm == norm ) break; + g = diag[i]; + h = svd_pythag( f, g ); + diag[i] = h; + h = 1.0f / h; + c = g * h; + s = -f * h; + for( j = 0; j < rows; j++ ) + { + y = Q[j*cols+q]; + z = Q[j*cols+i]; + Q[j*cols+q] = y * c + z * s; + Q[j*cols+i] = z * c - y * s; + } + } + } + + z = diag[k]; + if( l == k ) + { + if( z < 0.0f ) + { + diag[k] = -z; + for( j = 0; j < cols; j++ ) R[k*cols+j] *= -1.0f; + } + break; + } + if( iter >= MaxIterations ) return; + x = diag[l]; + q = k - 1; + y = diag[q]; + g = temp[q]; + h = temp[k]; + f = ( ( y - z ) * ( y + z ) + ( g - h ) * ( g + h ) ) / ( 2.0f * h * y ); + g = svd_pythag( f, 1.0f ); + f = ( ( x - z ) * ( x + z ) + h * ( ( y / ( f + SameSign( g, f ) ) ) - h ) ) / x; + c = 1.0f; + s = 1.0f; + for( j = l; j <= q; j++ ) + { + i = j + 1; + g = temp[i]; + y = diag[i]; + h = s * g; + g = c * g; + z = svd_pythag( f, h ); + temp[j] = z; + c = f / z; + s = h / z; + f = x * c + g * s; + g = g * c - x * s; + h = y * s; + y = y * c; + for( p = 0; p < cols; p++ ) + { + x = R[j*cols+p]; + z = R[i*cols+p]; + R[j*cols+p] = x * c + z * s; + R[i*cols+p] = z * c - x * s; + } + z = svd_pythag( f, h ); + diag[j] = z; + if( z != 0.0f ) + { + z = 1.0f / z; + c = f * z; + s = h * z; + } + f = c * g + s * y; + x = c * y - s * g; + for( p = 0; p < rows; p++ ) + { + y = Q[p*cols+j]; + z = Q[p*cols+i]; + Q[p*cols+j] = y * c + z * s; + Q[p*cols+i] = z * c - y * s; + } + } + temp[l] = 0.0f; + temp[k] = f; + diag[k] = x; + } + } + + // Sort the singular values into descending order. + + for( i = 0; i < cols - 1; i++ ) + { + float biggest = diag[i]; // Biggest singular value so far. + int bindex = i; // The row/col it occurred in. + for( j = i + 1; j < cols; j++ ) + { + if( diag[j] > biggest ) + { + biggest = diag[j]; + bindex = j; + } + } + if( bindex != i ) // Need to swap rows and columns. + { + // Swap columns in Q. + for (int j = 0; j < rows; ++j) + swap(Q[j*cols+i], Q[j*cols+bindex]); + + // Swap rows in R. + for (int j = 0; j < rows; ++j) + swap(R[i*cols+j], R[bindex*cols+j]); + + // Swap elements in diag. 
+ swap(diag[i], diag[bindex]); + } + } +} Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Gamma.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Gamma.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Gamma.h @@ -0,0 +1,38 @@ +// +// Fast implementations of powf(x,5/11) and powf(x,11/5) for gamma conversion +// Copyright 2017 Ken Cooke +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included +// in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +#pragma once +#ifndef NV_MATH_GAMMA_H +#define NV_MATH_GAMMA_H + +#include "nvmath.h" + +namespace nv { + + // gamma conversion of float array (in-place is allowed) + NVMATH_API void powf_5_11(const float* src, float* dst, int count); + NVMATH_API void powf_11_5(const float* src, float* dst, int count); + +} // nv namespace + +#endif // NV_MATH_GAMMA_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Gamma.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Gamma.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Gamma.cpp @@ -0,0 +1,444 @@ +// +// Fast implementations of powf(x,5/11) and powf(x,11/5) for gamma conversion +// Copyright 2017 Ken Cooke +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included +// in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+// + +#include "Gamma.h" +#include +#include + +#define INFINITE_RESULT std::numeric_limits::infinity() + +// +// pow(2.0, e * 5/11.0) over e=[-127,128] +// +static const float pow_5_11_table[512] = { + // sign bit = 0 + 0.00000000e+00f, 5.74369237e-18f, 7.87087416e-18f, 1.07858603e-17f, + 1.47804139e-17f, 2.02543544e-17f, 2.77555756e-17f, 3.80348796e-17f, + 5.21211368e-17f, 7.14242467e-17f, 9.78762916e-17f, 1.34124875e-16f, + 1.83798156e-16f, 2.51867973e-16f, 3.45147530e-16f, 4.72973245e-16f, + 6.48139341e-16f, 8.88178420e-16f, 1.21711615e-15f, 1.66787638e-15f, + 2.28557589e-15f, 3.13204133e-15f, 4.29199599e-15f, 5.88154098e-15f, + 8.05977514e-15f, 1.10447209e-14f, 1.51351438e-14f, 2.07404589e-14f, + 2.84217094e-14f, 3.89477167e-14f, 5.33720441e-14f, 7.31384286e-14f, + 1.00225323e-13f, 1.37343872e-13f, 1.88209311e-13f, 2.57912805e-13f, + 3.53431070e-13f, 4.84324603e-13f, 6.63694685e-13f, 9.09494702e-13f, + 1.24632693e-12f, 1.70790541e-12f, 2.34042972e-12f, 3.20721032e-12f, + 4.39500389e-12f, 6.02269797e-12f, 8.25320975e-12f, 1.13097942e-11f, + 1.54983873e-11f, 2.12382299e-11f, 2.91038305e-11f, 3.98824619e-11f, + 5.46529731e-11f, 7.48937509e-11f, 1.02630730e-10f, 1.40640125e-10f, + 1.92726335e-10f, 2.64102712e-10f, 3.61913416e-10f, 4.95948393e-10f, + 6.79623358e-10f, 9.31322575e-10f, 1.27623878e-09f, 1.74889514e-09f, + 2.39660003e-09f, 3.28418337e-09f, 4.50048399e-09f, 6.16724272e-09f, + 8.45128678e-09f, 1.15812293e-08f, 1.58703486e-08f, 2.17479474e-08f, + 2.98023224e-08f, 4.08396410e-08f, 5.59646445e-08f, 7.66912009e-08f, + 1.05093868e-07f, 1.44015488e-07f, 1.97351767e-07f, 2.70441177e-07f, + 3.70599338e-07f, 5.07851155e-07f, 6.95934318e-07f, 9.53674316e-07f, + 1.30686851e-06f, 1.79086862e-06f, 2.45411843e-06f, 3.36300377e-06f, + 4.60849560e-06f, 6.31525654e-06f, 8.65411766e-06f, 1.18591788e-05f, + 1.62512370e-05f, 2.22698982e-05f, 3.05175781e-05f, 4.18197924e-05f, + 5.73077959e-05f, 7.85317898e-05f, 1.07616121e-04f, 1.47471859e-04f, + 2.02088209e-04f, 2.76931765e-04f, 3.79493722e-04f, 5.20039583e-04f, + 7.12636742e-04f, 9.76562500e-04f, 1.33823336e-03f, 1.83384947e-03f, + 2.51301727e-03f, 3.44371586e-03f, 4.71909950e-03f, 6.46682270e-03f, + 8.86181649e-03f, 1.21437991e-02f, 1.66412666e-02f, 2.28043757e-02f, + 3.12500000e-02f, 4.28234674e-02f, 5.86831830e-02f, 8.04165527e-02f, + 1.10198908e-01f, 1.51011184e-01f, 2.06938326e-01f, 2.83578128e-01f, + 3.88601571e-01f, 5.32520533e-01f, 7.29740024e-01f, 1.00000000e+00f, + 1.37035096e+00f, 1.87786186e+00f, 2.57332969e+00f, 3.52636504e+00f, + 4.83235788e+00f, 6.62202644e+00f, 9.07450008e+00f, 1.24352503e+01f, + 1.70406570e+01f, 2.33516808e+01f, 3.20000000e+01f, 4.38512306e+01f, + 6.00915794e+01f, 8.23465500e+01f, 1.12843681e+02f, 1.54635452e+02f, + 2.11904846e+02f, 2.90384003e+02f, 3.97928009e+02f, 5.45301025e+02f, + 7.47253784e+02f, 1.02400000e+03f, 1.40323938e+03f, 1.92293054e+03f, + 2.63508960e+03f, 3.61099780e+03f, 4.94833447e+03f, 6.78095508e+03f, + 9.29228809e+03f, 1.27336963e+04f, 1.74496328e+04f, 2.39121211e+04f, + 3.27680000e+04f, 4.49036602e+04f, 6.15337773e+04f, 8.43228672e+04f, + 1.15551930e+05f, 1.58346703e+05f, 2.16990563e+05f, 2.97353219e+05f, + 4.07478281e+05f, 5.58388250e+05f, 7.65187875e+05f, 1.04857600e+06f, + 1.43691713e+06f, 1.96908088e+06f, 2.69833175e+06f, 3.69766175e+06f, + 5.06709450e+06f, 6.94369800e+06f, 9.51530300e+06f, 1.30393050e+07f, + 1.78684240e+07f, 2.44860120e+07f, 3.35544320e+07f, 4.59813480e+07f, + 6.30105880e+07f, 8.63466160e+07f, 1.18325176e+08f, 1.62147024e+08f, + 2.22198336e+08f, 3.04489696e+08f, 
4.17257760e+08f, 5.71789568e+08f, + 7.83552384e+08f, 1.07374182e+09f, 1.47140314e+09f, 2.01633882e+09f, + 2.76309171e+09f, 3.78640563e+09f, 5.18870477e+09f, 7.11034675e+09f, + 9.74367027e+09f, 1.33522483e+10f, 1.82972662e+10f, 2.50736763e+10f, + 3.43597384e+10f, 4.70849004e+10f, 6.45228421e+10f, 8.84189348e+10f, + 1.21164980e+11f, 1.66038553e+11f, 2.27531096e+11f, 3.11797449e+11f, + 4.27271946e+11f, 5.85512518e+11f, 8.02357641e+11f, 1.09951163e+12f, + 1.50671681e+12f, 2.06473095e+12f, 2.82940591e+12f, 3.87727937e+12f, + 5.31323368e+12f, 7.28099507e+12f, 9.97751836e+12f, 1.36727023e+13f, + 1.87364006e+13f, 2.56754445e+13f, 3.51843721e+13f, 4.82149380e+13f, + 6.60713903e+13f, 9.05409892e+13f, 1.24072940e+14f, 1.70023478e+14f, + 2.32991842e+14f, 3.19280587e+14f, 4.37526473e+14f, 5.99564818e+14f, + 8.21614225e+14f, 1.12589991e+15f, 1.54287801e+15f, 2.11428449e+15f, + 2.89731166e+15f, 3.97033407e+15f, 5.44075129e+15f, 7.45573896e+15f, + 1.02169788e+16f, 1.40008471e+16f, 1.91860742e+16f, 2.62916552e+16f, + 3.60287970e+16f, 4.93720965e+16f, 6.76571037e+16f, 9.27139730e+16f, + 1.27050690e+17f, 1.74104041e+17f, 2.38583647e+17f, INFINITE_RESULT, + // sign bit = 1 + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 
0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, +}; + +// +// pow(2.0, e * 11/5.0) over e=[-127,128] +// +static const float pow_11_5_table[512] = { + // sign bit = 0 + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 1.40129846e-45f, + 4.20389539e-45f, 1.96181785e-44f, 8.96831017e-44f, 4.11981749e-43f, + 1.89315423e-42f, 8.69926087e-42f, 3.99734400e-41f, 1.83670992e-40f, + 8.43930599e-40f, 3.87768572e-39f, 1.78171625e-38f, 8.18661824e-38f, + 3.76158192e-37f, 1.72836915e-36f, 7.94149964e-36f, 3.64895487e-35f, + 
1.67661942e-34f, 7.70371978e-34f, 3.53970002e-33f, 1.62641913e-32f, + 7.47305957e-32f, 3.43371656e-31f, 1.57772181e-30f, 7.24930563e-30f, + 3.33090637e-29f, 1.53048260e-28f, 7.03225152e-28f, 3.23117427e-27f, + 1.48465779e-26f, 6.82169625e-26f, 3.13442837e-25f, 1.44020511e-24f, + 6.61744490e-24f, 3.04057916e-23f, 1.39708339e-22f, 6.41930929e-22f, + 2.94954007e-21f, 1.35525272e-20f, 6.22710612e-20f, 2.86122679e-19f, + 1.31467454e-18f, 6.04065806e-18f, 2.77555756e-17f, 1.27531133e-16f, + 5.85979246e-16f, 2.69245347e-15f, 1.23712677e-14f, 5.68434189e-14f, + 2.61183761e-13f, 1.20008550e-12f, 5.51414470e-12f, 2.53363563e-11f, + 1.16415322e-10f, 5.34904343e-10f, 2.45777509e-09f, 1.12929683e-08f, + 5.18888577e-08f, 2.38418579e-07f, 1.09548409e-06f, 5.03352339e-06f, + 2.31279992e-05f, 1.06268380e-04f, 4.88281250e-04f, 2.24355143e-03f, + 1.03086559e-02f, 4.73661423e-02f, 2.17637643e-01f, 1.00000000e+00f, + 4.59479332e+00f, 2.11121273e+01f, 9.70058594e+01f, 4.45721893e+02f, + 2.04800000e+03f, 9.41013672e+03f, 4.32376367e+04f, 1.98668000e+05f, + 9.12838438e+05f, 4.19430400e+06f, 1.92719600e+07f, 8.85506800e+07f, + 4.06872064e+08f, 1.86949312e+09f, 8.58993459e+09f, 3.94689741e+10f, + 1.81351793e+11f, 8.33273987e+11f, 3.82872191e+12f, 1.75921860e+13f, + 8.08324589e+13f, 3.71408471e+14f, 1.70654513e+15f, 7.84122247e+15f, + 3.60287970e+16f, 1.65544876e+17f, 7.60644549e+17f, 3.49500442e+18f, + 1.60588236e+19f, 7.37869763e+19f, 3.39035906e+20f, 1.55780004e+21f, + 7.15776905e+21f, 3.28884708e+22f, 1.51115727e+23f, 6.94345535e+23f, + 3.19037448e+24f, 1.46591110e+25f, 6.73555881e+25f, 3.09485010e+26f, + 1.42201966e+27f, 6.53388693e+27f, 3.00218593e+28f, 1.37944245e+29f, + 6.33825300e+29f, 2.91229625e+30f, 1.33814004e+31f, 6.14847679e+31f, + 2.82509813e+32f, 1.29807421e+33f, 5.96438273e+33f, 2.74051081e+34f, + 1.25920805e+35f, 5.78580097e+35f, 2.65845599e+36f, 1.22150558e+37f, + 5.61256613e+37f, 2.57885808e+38f, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, INFINITE_RESULT, + // sign bit = 1 + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 
0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 
0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, + 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, 0.00000000e+00f, +}; + +// +// powf(x, 5/11.0f) +// +// for x = +inf or NaN, returns +inf +// for x = subnormal or 0.0f, returns 0.0f +// for x < 0.0f, returns 0.0f +// +// rel |error| < 1.2e-5, smooth +// +static inline float _powf_5_11(float x) { + + union { float f; uint32_t u; } m = { x }; + + // split into mantissa and exponent + int k = m.u >> 23; // [sign|exponent] bits + m.u = (m.u & ((1 << 23) - 1)) | (127 << 23); // mantissa with zero exponent + + // pow(2, e * 5/11) from table + float pow_e = pow_5_11_table[k]; + + // polynomial for pow(m, 5/11) over m=[1,2) + float pow_m = (((-0.0110083047f * m.f + 0.0905038750f) * m.f - 0.324697506f) * m.f + 0.876040946f) * m.f + 0.369160989f; + + // recontruct the result + return pow_e * pow_m; +} + +// +// powf(x, 11/5.0f) +// +// for x = +inf or NaN, returns +inf +// for x = subnormal or 0.0f, returns 0.0f +// for x < 0.0f, returns 0.0f +// +// rel |error| < 2.9e-6, smooth +// +static inline float _powf_11_5(float x) { + + union { float f; uint32_t u; } m = { x }; + + // split into mantissa and exponent + int k = m.u >> 23; // [sign|exponent] bits + m.u = (m.u & ((1 << 23) - 1)) | (127 << 23); // mantissa with zero exponent + + // pow(2, e * 11/5) from table + float pow_e = pow_11_5_table[k]; + + // polynomial for pow(m, 11/5) over m=[1,2) + float pow_m = (((-0.00916587552f * m.f + 0.119315466f) * m.f + 1.01847068f) * m.f - 0.158338739f) * m.f + 0.0297184721f; + + // recontruct the result + return pow_e * pow_m; +} + +#if (NV_USE_SSE > 1) +#include // SSE2 + +void nv::powf_5_11(const float* src, float* dst, int count) { + + int i = 0; + for (; i < count - 3; i += 4) { + + __m128 x = _mm_loadu_ps(&src[i]); + + // split into mantissa and exponent + __m128i k = _mm_srli_epi32(_mm_castps_si128(x), 23); + x = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32((1 << 23) - 1))); + x = _mm_or_ps(x, _mm_castsi128_ps(_mm_set1_epi32(127 << 23))); + + // pow(2, e * 5/11) from table + __m128 pow_e = _mm_setr_ps( + pow_5_11_table[_mm_cvtsi128_si32(k)], + pow_5_11_table[_mm_extract_epi16(k, 2)], + pow_5_11_table[_mm_extract_epi16(k, 4)], + pow_5_11_table[_mm_extract_epi16(k, 6)] + ); + + // polynomial for pow(m, 5/11) over m=[1,2) + __m128 pow_m = _mm_set1_ps(-0.0110083047f); + pow_m = _mm_add_ps(_mm_mul_ps(pow_m, x), _mm_set1_ps(0.0905038750f)); + pow_m = _mm_add_ps(_mm_mul_ps(pow_m, x), _mm_set1_ps(-0.324697506f)); + pow_m = _mm_add_ps(_mm_mul_ps(pow_m, x), _mm_set1_ps(0.876040946f)); + pow_m = _mm_add_ps(_mm_mul_ps(pow_m, x), _mm_set1_ps(0.369160989f)); + + // recontruct the result + _mm_storeu_ps(&dst[i], _mm_mul_ps(pow_e, pow_m)); + } + + for (; i < count; i++) { + dst[i] = _powf_5_11(src[i]); + } +} + +void nv::powf_11_5(const float* src, float* dst, int count) { + + int i = 0; + for (; i < count - 3; i += 4) { + + __m128 x = _mm_loadu_ps(&src[i]); + + // split into mantissa and exponent + __m128i k = _mm_srli_epi32(_mm_castps_si128(x), 23); + x = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32((1 << 23) - 1))); + x = _mm_or_ps(x, 
_mm_castsi128_ps(_mm_set1_epi32(127 << 23))); + + // pow(2, e * 11/5) from table + __m128 pow_e = _mm_setr_ps( + pow_11_5_table[_mm_cvtsi128_si32(k)], + pow_11_5_table[_mm_extract_epi16(k, 2)], + pow_11_5_table[_mm_extract_epi16(k, 4)], + pow_11_5_table[_mm_extract_epi16(k, 6)] + ); + + // polynomial for pow(m, 11/5) over m=[1,2) + __m128 pow_m = _mm_set1_ps(-0.00916587552f); + pow_m = _mm_add_ps(_mm_mul_ps(pow_m, x), _mm_set1_ps(0.119315466f)); + pow_m = _mm_add_ps(_mm_mul_ps(pow_m, x), _mm_set1_ps(1.01847068f)); + pow_m = _mm_add_ps(_mm_mul_ps(pow_m, x), _mm_set1_ps(-0.158338739f)); + pow_m = _mm_add_ps(_mm_mul_ps(pow_m, x), _mm_set1_ps(0.0297184721f)); + + // recontruct the result + _mm_storeu_ps(&dst[i], _mm_mul_ps(pow_e, pow_m)); + } + + for (; i < count; i++) { + dst[i] = _powf_11_5(src[i]); + } +} + +#else + +void nv::powf_5_11(const float* src, float* dst, int count) { + for (int i = 0; i < count; i++) { + dst[i] = _powf_5_11(src[i]); + } +} +void nv::powf_11_5(const float* src, float* dst, int count) { + for (int i = 0; i < count; i++) { + dst[i] = _powf_11_5(src[i]); + } +} + +#endif // SSE2 Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Half.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Half.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Half.h @@ -0,0 +1,78 @@ +#pragma once +#ifndef NV_MATH_HALF_H +#define NV_MATH_HALF_H + +#include "nvmath.h" + +namespace nv { + + NVMATH_API uint32 half_to_float( uint16 h ); + NVMATH_API uint16 half_from_float( uint32 f ); + + // vin,vout must be 16 byte aligned. count must be a multiple of 8. + // implement a non-SSE version if we need it. For now, this naming makes it clear this is only available when SSE2 is + void half_to_float_array_SSE2(const uint16 * vin, float * vout, int count); + + NVMATH_API void half_init_tables(); + NVMATH_API uint32 fast_half_to_float(uint16 h); + + inline uint16 to_half(float c) { + union { float f; uint32 u; } f; + f.f = c; + return nv::half_from_float( f.u ); + } + + inline float to_float(uint16 c) { + union { float f; uint32 u; } f; + f.u = nv::fast_half_to_float( c ); + return f.f; + } + + + union Half { + uint16 raw; + struct { + #if NV_BIG_ENDIAN + uint negative:1; + uint biasedexponent:5; + uint mantissa:10; + #else + uint mantissa:10; + uint biasedexponent:5; + uint negative:1; + #endif + } field; + }; + + + inline float TestHalfPrecisionAwayFromZero(float input) + { + Half h; + h.raw = to_half(input); + h.raw += 1; + + float f = to_float(h.raw); + + // Subtract the initial value to find our precision + float delta = f - input; + + return delta; + } + + inline float TestHalfPrecisionTowardsZero(float input) + { + Half h; + h.raw = to_half(input); + h.raw -= 1; + + float f = to_float(h.raw); + + // Subtract the initial value to find our precision + float delta = f - input; + + return -delta; + } + +} // nv namespace + +#endif // NV_MATH_HALF_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Half.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Half.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Half.cpp @@ -0,0 +1,787 @@ +// Branch-free implementation of half-precision (16 bit) floating point +// Copyright 2006 Mike Acton +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including 
without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included +// in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE +// +// Half-precision floating point format +// ------------------------------------ +// +// | Field | Last | First | Note +// |----------|------|-------|---------- +// | Sign | 15 | 15 | +// | Exponent | 14 | 10 | Bias = 15 +// | Mantissa | 9 | 0 | +// +// Compiling +// --------- +// +// Preferred compile flags for GCC: +// -O3 -fstrict-aliasing -std=c99 -pedantic -Wall -Wstrict-aliasing +// +// This file is a C99 source file, intended to be compiled with a C99 +// compliant compiler. However, for the moment it remains compatible +// with C++98. Therefore if you are using a compiler that poorly implements +// C standards (e.g. MSVC), it may be compiled as C++. This is not +// guaranteed for future versions. +// +// Features +// -------- +// +// * QNaN + <x> = QNaN +// * <x> + +INF = +INF +// * <x> - -INF = -INF +// * INF - INF = SNaN +// * Denormalized values +// * Difference of ZEROs is always +ZERO +// * Sum round with guard + round + sticky bit (grs) +// * And of course... 
no branching +// +// Precision of Sum +// ---------------- +// +// (SUM) uint16 z = half_add( x, y ); +// (DIFFERENCE) uint16 z = half_add( x, -y ); +// +// Will have exactly (0 ulps difference) the same result as: +// (For 32 bit IEEE 784 floating point and same rounding mode) +// +// union FLOAT_32 +// { +// float f32; +// uint32 u32; +// }; +// +// union FLOAT_32 fx = { .u32 = half_to_float( x ) }; +// union FLOAT_32 fy = { .u32 = half_to_float( y ) }; +// union FLOAT_32 fz = { .f32 = fx.f32 + fy.f32 }; +// uint16 z = float_to_half( fz ); +// + +#include "Half.h" +#include + + +// Load immediate +static inline uint32 _uint32_li( uint32 a ) +{ + return (a); +} + +// Decrement +static inline uint32 _uint32_dec( uint32 a ) +{ + return (a - 1); +} + +// Increment +static inline uint32 _uint32_inc( uint32 a ) +{ + return (a + 1); +} + +// Complement +static inline uint32 _uint32_not( uint32 a ) +{ + return (~a); +} + +// Negate +static inline uint32 _uint32_neg( uint32 a ) +{ +#pragma warning(disable : 4146) // unary minus operator applied to unsigned type, result still unsigned + return (-a); +#pragma warning(default : 4146) +} + +// Extend sign +static inline uint32 _uint32_ext( uint32 a ) +{ + return (((int32)a)>>31); +} + +// And +static inline uint32 _uint32_and( uint32 a, uint32 b ) +{ + return (a & b); +} + +// And with Complement +static inline uint32 _uint32_andc( uint32 a, uint32 b ) +{ + return (a & ~b); +} + +// Or +static inline uint32 _uint32_or( uint32 a, uint32 b ) +{ + return (a | b); +} + +// Shift Right Logical +static inline uint32 _uint32_srl( uint32 a, int sa ) +{ + return (a >> sa); +} + +// Shift Left Logical +static inline uint32 _uint32_sll( uint32 a, int sa ) +{ + return (a << sa); +} + +// Add +static inline uint32 _uint32_add( uint32 a, uint32 b ) +{ + return (a + b); +} + +// Subtract +static inline uint32 _uint32_sub( uint32 a, uint32 b ) +{ + return (a - b); +} + +// Select on Sign bit +static inline uint32 _uint32_sels( uint32 test, uint32 a, uint32 b ) +{ + const uint32 mask = _uint32_ext( test ); + const uint32 sel_a = _uint32_and( a, mask ); + const uint32 sel_b = _uint32_andc( b, mask ); + const uint32 result = _uint32_or( sel_a, sel_b ); + + return (result); +} + +// Load Immediate +static inline uint16 _uint16_li( uint16 a ) +{ + return (a); +} + +// Extend sign +static inline uint16 _uint16_ext( uint16 a ) +{ + return (((int16)a)>>15); +} + +// Negate +static inline uint16 _uint16_neg( uint16 a ) +{ + return (-a); +} + +// Complement +static inline uint16 _uint16_not( uint16 a ) +{ + return (~a); +} + +// Decrement +static inline uint16 _uint16_dec( uint16 a ) +{ + return (a - 1); +} + +// Shift Left Logical +static inline uint16 _uint16_sll( uint16 a, int sa ) +{ + return (a << sa); +} + +// Shift Right Logical +static inline uint16 _uint16_srl( uint16 a, int sa ) +{ + return (a >> sa); +} + +// Add +static inline uint16 _uint16_add( uint16 a, uint16 b ) +{ + return (a + b); +} + +// Subtract +static inline uint16 _uint16_sub( uint16 a, uint16 b ) +{ + return (a - b); +} + +// And +static inline uint16 _uint16_and( uint16 a, uint16 b ) +{ + return (a & b); +} + +// Or +static inline uint16 _uint16_or( uint16 a, uint16 b ) +{ + return (a | b); +} + +// Exclusive Or +static inline uint16 _uint16_xor( uint16 a, uint16 b ) +{ + return (a ^ b); +} + +// And with Complement +static inline uint16 _uint16_andc( uint16 a, uint16 b ) +{ + return (a & ~b); +} + +// And then Shift Right Logical +static inline uint16 _uint16_andsrl( uint16 a, uint16 b, int sa ) +{ 
+ return ((a & b) >> sa); +} + +// Shift Right Logical then Mask +static inline uint16 _uint16_srlm( uint16 a, int sa, uint16 mask ) +{ + return ((a >> sa) & mask); +} + +// Add then Mask +static inline uint16 _uint16_addm( uint16 a, uint16 b, uint16 mask ) +{ + return ((a + b) & mask); +} + + +// Select on Sign bit +static inline uint16 _uint16_sels( uint16 test, uint16 a, uint16 b ) +{ + const uint16 mask = _uint16_ext( test ); + const uint16 sel_a = _uint16_and( a, mask ); + const uint16 sel_b = _uint16_andc( b, mask ); + const uint16 result = _uint16_or( sel_a, sel_b ); + + return (result); +} + +#if NV_OS_XBOX +#include +#elif NV_CC_MSVC + +#include +#pragma intrinsic(_BitScanReverse) + +uint32 _uint32_nlz( uint32 x ) { + unsigned long index; + _BitScanReverse(&index, x); + return 31 - index; +} +#endif + + +// Count Leading Zeros +static inline uint32 _uint32_cntlz( uint32 x ) +{ +#if NV_CC_GCC + /* On PowerPC, this will map to insn: cntlzw */ + /* On Pentium, this will map to insn: clz */ + uint32 is_x_nez_msb = _uint32_neg( x ); + uint32 nlz = __builtin_clz( x ); + uint32 result = _uint32_sels( is_x_nez_msb, nlz, 0x00000020 ); + return (result); +#elif NV_OS_XBOX + // Xbox PPC has this as an intrinsic. + return _CountLeadingZeros(x); +#elif NV_CC_MSVC + uint32 is_x_nez_msb = _uint32_neg( x ); + uint32 nlz = _uint32_nlz( x ); + uint32 result = _uint32_sels( is_x_nez_msb, nlz, 0x00000020 ); + return (result); +#else + const uint32 x0 = _uint32_srl( x, 1 ); + const uint32 x1 = _uint32_or( x, x0 ); + const uint32 x2 = _uint32_srl( x1, 2 ); + const uint32 x3 = _uint32_or( x1, x2 ); + const uint32 x4 = _uint32_srl( x3, 4 ); + const uint32 x5 = _uint32_or( x3, x4 ); + const uint32 x6 = _uint32_srl( x5, 8 ); + const uint32 x7 = _uint32_or( x5, x6 ); + const uint32 x8 = _uint32_srl( x7, 16 ); + const uint32 x9 = _uint32_or( x7, x8 ); + const uint32 xA = _uint32_not( x9 ); + const uint32 xB = _uint32_srl( xA, 1 ); + const uint32 xC = _uint32_and( xB, 0x55555555 ); + const uint32 xD = _uint32_sub( xA, xC ); + const uint32 xE = _uint32_and( xD, 0x33333333 ); + const uint32 xF = _uint32_srl( xD, 2 ); + const uint32 x10 = _uint32_and( xF, 0x33333333 ); + const uint32 x11 = _uint32_add( xE, x10 ); + const uint32 x12 = _uint32_srl( x11, 4 ); + const uint32 x13 = _uint32_add( x11, x12 ); + const uint32 x14 = _uint32_and( x13, 0x0f0f0f0f ); + const uint32 x15 = _uint32_srl( x14, 8 ); + const uint32 x16 = _uint32_add( x14, x15 ); + const uint32 x17 = _uint32_srl( x16, 16 ); + const uint32 x18 = _uint32_add( x16, x17 ); + const uint32 x19 = _uint32_and( x18, 0x0000003f ); + return ( x19 ); +#endif +} + +// Count Leading Zeros +static inline uint16 _uint16_cntlz( uint16 x ) +{ +#ifdef __GNUC__ + /* On PowerPC, this will map to insn: cntlzw */ + /* On Pentium, this will map to insn: clz */ + uint16 nlz32 = (uint16)_uint32_cntlz( (uint32)x ); + uint32 nlz = _uint32_sub( nlz32, 16 ); + return (nlz); +#elif _NV_OS_XBOX_ + uint16 nlz32 = (uint16)_CountLeadingZeros( (uint32)x ); + return _uint32_sub( nlz32, 16); +#else + const uint16 x0 = _uint16_srl( x, 1 ); + const uint16 x1 = _uint16_or( x, x0 ); + const uint16 x2 = _uint16_srl( x1, 2 ); + const uint16 x3 = _uint16_or( x1, x2 ); + const uint16 x4 = _uint16_srl( x3, 4 ); + const uint16 x5 = _uint16_or( x3, x4 ); + const uint16 x6 = _uint16_srl( x5, 8 ); + const uint16 x7 = _uint16_or( x5, x6 ); + const uint16 x8 = _uint16_not( x7 ); + const uint16 x9 = _uint16_srlm( x8, 1, 0x5555 ); + const uint16 xA = _uint16_sub( x8, x9 ); + const uint16 xB = 
_uint16_and( xA, 0x3333 ); + const uint16 xC = _uint16_srlm( xA, 2, 0x3333 ); + const uint16 xD = _uint16_add( xB, xC ); + const uint16 xE = _uint16_srl( xD, 4 ); + const uint16 xF = _uint16_addm( xD, xE, 0x0f0f ); + const uint16 x10 = _uint16_srl( xF, 8 ); + const uint16 x11 = _uint16_addm( xF, x10, 0x001f ); + return ( x11 ); +#endif +} + +uint16 +nv::half_from_float( uint32 f ) +{ + const uint32 one = _uint32_li( 0x00000001 ); + const uint32 f_s_mask = _uint32_li( 0x80000000 ); + const uint32 f_e_mask = _uint32_li( 0x7f800000 ); + const uint32 f_m_mask = _uint32_li( 0x007fffff ); + const uint32 f_m_hidden_bit = _uint32_li( 0x00800000 ); + const uint32 f_m_round_bit = _uint32_li( 0x00001000 ); + const uint32 f_snan_mask = _uint32_li( 0x7fc00000 ); + const uint32 f_e_pos = _uint32_li( 0x00000017 ); + const uint32 h_e_pos = _uint32_li( 0x0000000a ); + const uint32 h_e_mask = _uint32_li( 0x00007c00 ); + const uint32 h_snan_mask = _uint32_li( 0x00007e00 ); + const uint32 h_e_mask_value = _uint32_li( 0x0000001f ); + const uint32 f_h_s_pos_offset = _uint32_li( 0x00000010 ); + const uint32 f_h_bias_offset = _uint32_li( 0x00000070 ); + const uint32 f_h_m_pos_offset = _uint32_li( 0x0000000d ); + const uint32 h_nan_min = _uint32_li( 0x00007c01 ); + const uint32 f_h_e_biased_flag = _uint32_li( 0x0000008f ); + const uint32 f_s = _uint32_and( f, f_s_mask ); + const uint32 f_e = _uint32_and( f, f_e_mask ); + const uint16 h_s = _uint32_srl( f_s, f_h_s_pos_offset ); + const uint32 f_m = _uint32_and( f, f_m_mask ); + const uint16 f_e_amount = _uint32_srl( f_e, f_e_pos ); + const uint32 f_e_half_bias = _uint32_sub( f_e_amount, f_h_bias_offset ); + const uint32 f_snan = _uint32_and( f, f_snan_mask ); + const uint32 f_m_round_mask = _uint32_and( f_m, f_m_round_bit ); + const uint32 f_m_round_offset = _uint32_sll( f_m_round_mask, one ); + const uint32 f_m_rounded = _uint32_add( f_m, f_m_round_offset ); + const uint32 f_m_denorm_sa = _uint32_sub( one, f_e_half_bias ); + const uint32 f_m_with_hidden = _uint32_or( f_m_rounded, f_m_hidden_bit ); + const uint32 f_m_denorm = _uint32_srl( f_m_with_hidden, f_m_denorm_sa ); + const uint32 h_m_denorm = _uint32_srl( f_m_denorm, f_h_m_pos_offset ); + const uint32 f_m_rounded_overflow = _uint32_and( f_m_rounded, f_m_hidden_bit ); + const uint32 m_nan = _uint32_srl( f_m, f_h_m_pos_offset ); + const uint32 h_em_nan = _uint32_or( h_e_mask, m_nan ); + const uint32 h_e_norm_overflow_offset = _uint32_inc( f_e_half_bias ); + const uint32 h_e_norm_overflow = _uint32_sll( h_e_norm_overflow_offset, h_e_pos ); + const uint32 h_e_norm = _uint32_sll( f_e_half_bias, h_e_pos ); + const uint32 h_m_norm = _uint32_srl( f_m_rounded, f_h_m_pos_offset ); + const uint32 h_em_norm = _uint32_or( h_e_norm, h_m_norm ); + const uint32 is_h_ndenorm_msb = _uint32_sub( f_h_bias_offset, f_e_amount ); + const uint32 is_f_e_flagged_msb = _uint32_sub( f_h_e_biased_flag, f_e_half_bias ); + const uint32 is_h_denorm_msb = _uint32_not( is_h_ndenorm_msb ); + const uint32 is_f_m_eqz_msb = _uint32_dec( f_m ); + const uint32 is_h_nan_eqz_msb = _uint32_dec( m_nan ); + const uint32 is_f_inf_msb = _uint32_and( is_f_e_flagged_msb, is_f_m_eqz_msb ); + const uint32 is_f_nan_underflow_msb = _uint32_and( is_f_e_flagged_msb, is_h_nan_eqz_msb ); + const uint32 is_e_overflow_msb = _uint32_sub( h_e_mask_value, f_e_half_bias ); + const uint32 is_h_inf_msb = _uint32_or( is_e_overflow_msb, is_f_inf_msb ); + const uint32 is_f_nsnan_msb = _uint32_sub( f_snan, f_snan_mask ); + const uint32 is_m_norm_overflow_msb = _uint32_neg( 
f_m_rounded_overflow ); + const uint32 is_f_snan_msb = _uint32_not( is_f_nsnan_msb ); + const uint32 h_em_overflow_result = _uint32_sels( is_m_norm_overflow_msb, h_e_norm_overflow, h_em_norm ); + const uint32 h_em_nan_result = _uint32_sels( is_f_e_flagged_msb, h_em_nan, h_em_overflow_result ); + const uint32 h_em_nan_underflow_result = _uint32_sels( is_f_nan_underflow_msb, h_nan_min, h_em_nan_result ); + const uint32 h_em_inf_result = _uint32_sels( is_h_inf_msb, h_e_mask, h_em_nan_underflow_result ); + const uint32 h_em_denorm_result = _uint32_sels( is_h_denorm_msb, h_m_denorm, h_em_inf_result ); + const uint32 h_em_snan_result = _uint32_sels( is_f_snan_msb, h_snan_mask, h_em_denorm_result ); + const uint32 h_result = _uint32_or( h_s, h_em_snan_result ); + + return (uint16)(h_result); +} + +uint32 +nv::half_to_float( uint16 h ) +{ + const uint32 h_e_mask = _uint32_li( 0x00007c00 ); + const uint32 h_m_mask = _uint32_li( 0x000003ff ); + const uint32 h_s_mask = _uint32_li( 0x00008000 ); + const uint32 h_f_s_pos_offset = _uint32_li( 0x00000010 ); + const uint32 h_f_e_pos_offset = _uint32_li( 0x0000000d ); + const uint32 h_f_bias_offset = _uint32_li( 0x0001c000 ); + const uint32 f_e_mask = _uint32_li( 0x7f800000 ); + const uint32 f_m_mask = _uint32_li( 0x007fffff ); + const uint32 h_f_e_denorm_bias = _uint32_li( 0x0000007e ); + const uint32 h_f_m_denorm_sa_bias = _uint32_li( 0x00000008 ); + const uint32 f_e_pos = _uint32_li( 0x00000017 ); + const uint32 h_e_mask_minus_one = _uint32_li( 0x00007bff ); + const uint32 h_e = _uint32_and( h, h_e_mask ); + const uint32 h_m = _uint32_and( h, h_m_mask ); + const uint32 h_s = _uint32_and( h, h_s_mask ); + const uint32 h_e_f_bias = _uint32_add( h_e, h_f_bias_offset ); + const uint32 h_m_nlz = _uint32_cntlz( h_m ); + const uint32 f_s = _uint32_sll( h_s, h_f_s_pos_offset ); + const uint32 f_e = _uint32_sll( h_e_f_bias, h_f_e_pos_offset ); + const uint32 f_m = _uint32_sll( h_m, h_f_e_pos_offset ); + const uint32 f_em = _uint32_or( f_e, f_m ); + const uint32 h_f_m_sa = _uint32_sub( h_m_nlz, h_f_m_denorm_sa_bias ); + const uint32 f_e_denorm_unpacked = _uint32_sub( h_f_e_denorm_bias, h_f_m_sa ); + const uint32 h_f_m = _uint32_sll( h_m, h_f_m_sa ); + const uint32 f_m_denorm = _uint32_and( h_f_m, f_m_mask ); + const uint32 f_e_denorm = _uint32_sll( f_e_denorm_unpacked, f_e_pos ); + const uint32 f_em_denorm = _uint32_or( f_e_denorm, f_m_denorm ); + const uint32 f_em_nan = _uint32_or( f_e_mask, f_m ); + const uint32 is_e_eqz_msb = _uint32_dec( h_e ); + const uint32 is_m_nez_msb = _uint32_neg( h_m ); + const uint32 is_e_flagged_msb = _uint32_sub( h_e_mask_minus_one, h_e ); + const uint32 is_zero_msb = _uint32_andc( is_e_eqz_msb, is_m_nez_msb ); + const uint32 is_inf_msb = _uint32_andc( is_e_flagged_msb, is_m_nez_msb ); + const uint32 is_denorm_msb = _uint32_and( is_m_nez_msb, is_e_eqz_msb ); + const uint32 is_nan_msb = _uint32_and( is_e_flagged_msb, is_m_nez_msb ); + const uint32 is_zero = _uint32_ext( is_zero_msb ); + const uint32 f_zero_result = _uint32_andc( f_em, is_zero ); + const uint32 f_denorm_result = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result ); + const uint32 f_inf_result = _uint32_sels( is_inf_msb, f_e_mask, f_denorm_result ); + const uint32 f_nan_result = _uint32_sels( is_nan_msb, f_em_nan, f_inf_result ); + const uint32 f_result = _uint32_or( f_s, f_nan_result ); + + return (f_result); +} + + +#if !NV_OS_IOS && (defined(__i386__) || defined(__x86_64__)) + +#if NV_CC_GNUC +#if defined(__i386__) || defined(__x86_64__) +#include +#endif 
+#endif + +#include "nvcore/Memory.h" // NV_ALIGN_16 + +static __m128 half_to_float4_SSE2(__m128i h) +{ +#define SSE_CONST4(name, val) static const NV_ALIGN_16 uint name[4] = { (val), (val), (val), (val) } + +#define CONST(name) *(const __m128i *)&name + + SSE_CONST4(mask_nosign, 0x7fff); + SSE_CONST4(mask_justsign, 0x8000); + SSE_CONST4(mask_shifted_exp, 0x7c00 << 13); + SSE_CONST4(expadjust_normal, (127 - 15) << 23); + SSE_CONST4(expadjust_infnan, (128 - 16) << 23); + SSE_CONST4(expadjust_denorm, 1 << 23); + SSE_CONST4(magic_denorm, 113 << 23); + + __m128i mnosign = CONST(mask_nosign); + __m128i expmant = _mm_and_si128(mnosign, h); + __m128i justsign = _mm_and_si128(h, CONST(mask_justsign)); + __m128i mshiftexp = CONST(mask_shifted_exp); + __m128i eadjust = CONST(expadjust_normal); + __m128i shifted = _mm_slli_epi32(expmant, 13); + __m128i adjusted = _mm_add_epi32(eadjust, shifted); + __m128i justexp = _mm_and_si128(shifted, mshiftexp); + + __m128i zero = _mm_setzero_si128(); + __m128i b_isinfnan = _mm_cmpeq_epi32(mshiftexp, justexp); + __m128i b_isdenorm = _mm_cmpeq_epi32(zero, justexp); + + __m128i adj_infnan = _mm_and_si128(b_isinfnan, CONST(expadjust_infnan)); + __m128i adjusted2 = _mm_add_epi32(adjusted, adj_infnan); + + __m128i adj_den = CONST(expadjust_denorm); + __m128i den1 = _mm_add_epi32(adj_den, adjusted2); + __m128 den2 = _mm_sub_ps(_mm_castsi128_ps(den1), *(const __m128 *)&magic_denorm); + __m128 adjusted3 = _mm_and_ps(den2, _mm_castsi128_ps(b_isdenorm)); + __m128 adjusted4 = _mm_andnot_ps(_mm_castsi128_ps(b_isdenorm), _mm_castsi128_ps(adjusted2)); + __m128 adjusted5 = _mm_or_ps(adjusted3, adjusted4); + __m128i sign = _mm_slli_epi32(justsign, 16); + __m128 final = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign)); + + // ~21 SSE2 ops. + return final; + +#undef SSE_CONST4 +#undef CONST +} + + +void nv::half_to_float_array_SSE2(const uint16 * vin, float * vout, int count) { + nvDebugCheck((intptr_t(vin) & 15) == 0); + nvDebugCheck((intptr_t(vout) & 15) == 0); + nvDebugCheck((count & 7) == 0); + + __m128i zero = _mm_setzero_si128(); + + for (int i = 0; i < count; i += 8) + { + __m128i in = _mm_loadu_si128((const __m128i *)(vin + i)); + __m128i a = _mm_unpacklo_epi16(in, zero); + __m128i b = _mm_unpackhi_epi16(in, zero); + + __m128 outa = half_to_float4_SSE2(a); + _mm_storeu_ps((float *)(vout + i), outa); + + __m128 outb = half_to_float4_SSE2(b); + _mm_storeu_ps((float *)(vout + i + 4), outb); + } +} + +#endif + + +// @@ These tables could be smaller. +namespace nv { + uint32 mantissa_table[2048] = { 0xDEADBEEF }; + uint32 exponent_table[64]; + uint32 offset_table[64]; +} + +void nv::half_init_tables() +{ + // Init mantissa table. + mantissa_table[0] = 0; + + // denormals + for (int i = 1; i < 1024; i++) { + uint m = i << 13; + uint e = 0; + + while ((m & 0x00800000) == 0) { + e -= 0x00800000; + m <<= 1; + } + m &= ~0x00800000; + e += 0x38800000; + mantissa_table[i] = m | e; + } + + // normals + for (int i = 1024; i < 2048; i++) { + mantissa_table[i] = (i - 1024) << 13; + } + + + // Init exponent table. + exponent_table[0] = 0; + + for (int i = 1; i < 31; i++) { + exponent_table[i] = 0x38000000 + (i << 23); + } + + exponent_table[31] = 0x7f800000; + exponent_table[32] = 0x80000000; + + for (int i = 33; i < 63; i++) { + exponent_table[i] = 0xb8000000 + ((i - 32) << 23); + } + + exponent_table[63] = 0xff800000; + + + // Init offset table. 
+ offset_table[0] = 0; + + for (int i = 1; i < 32; i++) { + offset_table[i] = 1024; + } + + offset_table[32] = 0; + + for (int i = 33; i < 64; i++) { + offset_table[i] = 1024; + } +} + +// Fast half to float conversion based on: +// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf +uint32 nv::fast_half_to_float(uint16 h) +{ + // Initialize table if necessary. + if (mantissa_table[0] != 0) + half_init_tables(); + uint exp = h >> 10; + return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp]; +} + +#if 0 + +// Inaccurate conversion suggested at the ffmpeg mailing list: +// http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2009-July/068949.html +uint32 nv::fast_half_to_float(uint16 v) +{ + if (v & 0x8000) return 0; + uint exp = v >> 10; + if (!exp) return (v>>9)&1; + if (exp >= 15) return 0xffff; + v <<= 6; + return (v+(1<<16)) >> (15-exp); +} + +#endif + +#if 0 + +// Some more from a gamedev thread: +// http://www.devmaster.net/forums/showthread.php?t=10924 + +// I believe it does not handle specials either. + +// Mike Acton's code should be fairly easy to vectorize and that would handle all cases too, the table method might still be faster, though. + + +static __declspec(align(16)) unsigned half_sign[4] = {0x00008000, 0x00008000, 0x00008000, 0x00008000}; +static __declspec(align(16)) unsigned half_exponent[4] = {0x00007C00, 0x00007C00, 0x00007C00, 0x00007C00}; +static __declspec(align(16)) unsigned half_mantissa[4] = {0x000003FF, 0x000003FF, 0x000003FF, 0x000003FF}; +static __declspec(align(16)) unsigned half_bias_offset[4] = {0x0001C000, 0x0001C000, 0x0001C000, 0x0001C000}; + +__asm +{ + movaps xmm1, xmm0 // Input in xmm0 + movaps xmm2, xmm0 + + andps xmm0, half_sign + andps xmm1, half_exponent + andps xmm2, half_mantissa + paddd xmm1, half_bias_offset + + pslld xmm0, 16 + pslld xmm1, 13 + pslld xmm2, 13 + + orps xmm1, xmm2 + orps xmm0, xmm1 // Result in xmm0 +} + + +#endif + +#if 0 +// These versions compute the tables at compile time: +// http://gamedev.stackexchange.com/questions/17326/conversion-of-a-number-from-single-precision-floating-point-representation-to-a + +/* This method is faster than the OpenEXR implementation (very often + * used, eg. in Ogre), with the additional benefit of rounding, inspired + * by James Tursa’s half-precision code. */ +static inline uint16_t float_to_half_branch(uint32_t x) +{ + uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */ + uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */ + unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */ + + /* If zero, or denormal, or exponent underflows too much for a denormal + * half, return signed zero. */ + if (e < 103) + return bits; + + /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */ + if (e > 142) + { + bits |= 0x7c00u; + /* If exponent was 0xff and one mantissa bit was set, it means NaN, + * not Inf, so make sure we set one mantissa bit too. */ + bits |= e == 255 && (x & 0x007fffffu); + return bits; + } + + /* If exponent underflows but not too much, return a denormal */ + if (e < 113) + { + m |= 0x0800u; + /* Extra rounding may overflow and set mantissa to 0 and exponent + * to 1, which is OK. */ + bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1); + return bits; + } + + bits |= ((e - 112) << 10) | (m >> 1); + /* Extra rounding. An overflow will set mantissa to 0 and increment + * the exponent, which is OK. */ + bits += m & 1; + return bits; +} + +/* These macros implement a finite iterator useful to build lookup + * tables. 
For instance, S64(0) will call S1(x) for all values of x + * between 0 and 63. + * Due to the exponential behaviour of the calls, the stress on the + * compiler may be important. */ +#define S4(x) S1((x)), S1((x)+1), S1((x)+2), S1((x)+3) +#define S16(x) S4((x)), S4((x)+4), S4((x)+8), S4((x)+12) +#define S64(x) S16((x)), S16((x)+16), S16((x)+32), S16((x)+48) +#define S256(x) S64((x)), S64((x)+64), S64((x)+128), S64((x)+192) +#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768) + +/* Lookup table-based algorithm from “Fast Half Float Conversions” + * by Jeroen van der Zijp, November 2008. No rounding is performed, + * and some NaN values may be incorrectly converted to Inf. */ +static inline uint16_t float_to_half_nobranch(uint32_t x) +{ + static uint16_t const basetable[512] = + { +#define S1(i) (((i) < 103) ? 0x0000 : \ + ((i) < 113) ? 0x0400 >> (113 - (i)) : \ + ((i) < 143) ? ((i) - 112) << 10 : 0x7c00) + S256(0), +#undef S1 +#define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \ + ((i) < 113) ? 0x0400 >> (113 - (i)) : \ + ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)) + S256(0), +#undef S1 + }; + + static uint8_t const shifttable[512] = + { +#define S1(i) (((i) < 103) ? 24 : \ + ((i) < 113) ? 126 - (i) : \ + ((i) < 143 || (i) == 255) ? 13 : 24) + S256(0), S256(0), +#undef S1 + }; + + uint16_t bits = basetable[(x >> 23) & 0x1ff]; + bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff]; + return bits; +} +#endif Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Matrix.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Matrix.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Matrix.h @@ -1,1000 +1,113 @@ // This code is in the public domain -- castanyo@yahoo.es +#pragma once #ifndef NV_MATH_MATRIX_H #define NV_MATH_MATRIX_H -#include -#include +#include "Vector.h" -namespace nv -{ - -// @@ Use scalar defined in Vector.h, but should use a template instead. - -/// 4x4 transformation matrix. -/// -# Matrices are stored in memory in column major order. -/// -# Points are to be though of as column vectors. -/// -# Transformation of a point p by a matrix M is: p' = M * p -class NVMATH_CLASS Matrix -{ -public: - typedef Matrix const & Arg; - - Matrix(); - Matrix(zero_t); - Matrix(identity_t); - Matrix(const Matrix & m); - - scalar data(uint idx) const; - scalar & data(uint idx); - scalar get(uint row, uint col) const; - scalar operator()(uint row, uint col) const; - scalar & operator()(uint row, uint col); - const scalar * ptr() const; - - Vector4 row(uint i) const; - Vector4 column(uint i) const; - - void scale(scalar s); - void scale(Vector3::Arg s); - void translate(Vector3::Arg t); - void rotate(scalar theta, scalar v0, scalar v1, scalar v2); - scalar determinant() const; - - void apply(Matrix::Arg m); - -private: - scalar m_data[16]; -}; - - -inline Matrix::Matrix() -{ -} - -inline Matrix::Matrix(zero_t) -{ - for(int i = 0; i < 16; i++) { - m_data[i] = 0.0f; - } -} - -inline Matrix::Matrix(identity_t) -{ - for(int i = 0; i < 4; i++) { - for(int j = 0; j < 4; j++) { - m_data[4*j+i] = (i == j) ? 
1.0f : 0.0f; - } - } -} - -inline Matrix::Matrix(const Matrix & m) -{ - for(int i = 0; i < 16; i++) { - m_data[i] = m.m_data[i]; - } -} - - -// Accessors -inline scalar Matrix::data(uint idx) const -{ - nvDebugCheck(idx < 16); - return m_data[idx]; -} -inline scalar & Matrix::data(uint idx) -{ - nvDebugCheck(idx < 16); - return m_data[idx]; -} -inline scalar Matrix::get(uint row, uint col) const -{ - nvDebugCheck(row < 4 && col < 4); - return m_data[col * 4 + row]; -} -inline scalar Matrix::operator()(uint row, uint col) const -{ - nvDebugCheck(row < 4 && col < 4); - return m_data[col * 4 + row]; -} -inline scalar & Matrix::operator()(uint row, uint col) -{ - nvDebugCheck(row < 4 && col < 4); - return m_data[col * 4 + row]; -} - -inline const scalar * Matrix::ptr() const -{ - return m_data; -} - -inline Vector4 Matrix::row(uint i) const -{ - nvDebugCheck(i < 4); - return Vector4(get(i, 0), get(i, 1), get(i, 2), get(i, 3)); -} - -inline Vector4 Matrix::column(uint i) const -{ - nvDebugCheck(i < 4); - return Vector4(get(0, i), get(1, i), get(2, i), get(3, i)); -} - -/// Apply scale. -inline void Matrix::scale(scalar s) -{ - m_data[0] *= s; m_data[1] *= s; m_data[2] *= s; m_data[3] *= s; - m_data[4] *= s; m_data[5] *= s; m_data[6] *= s; m_data[7] *= s; - m_data[8] *= s; m_data[9] *= s; m_data[10] *= s; m_data[11] *= s; - m_data[12] *= s; m_data[13] *= s; m_data[14] *= s; m_data[15] *= s; -} - -/// Apply scale. -inline void Matrix::scale(Vector3::Arg s) -{ - m_data[0] *= s.x(); m_data[1] *= s.x(); m_data[2] *= s.x(); m_data[3] *= s.x(); - m_data[4] *= s.y(); m_data[5] *= s.y(); m_data[6] *= s.y(); m_data[7] *= s.y(); - m_data[8] *= s.z(); m_data[9] *= s.z(); m_data[10] *= s.z(); m_data[11] *= s.z(); -} - -/// Apply translation. -inline void Matrix::translate(Vector3::Arg t) -{ - m_data[12] = m_data[0] * t.x() + m_data[4] * t.y() + m_data[8] * t.z() + m_data[12]; - m_data[13] = m_data[1] * t.x() + m_data[5] * t.y() + m_data[9] * t.z() + m_data[13]; - m_data[14] = m_data[2] * t.x() + m_data[6] * t.y() + m_data[10] * t.z() + m_data[14]; - m_data[15] = m_data[3] * t.x() + m_data[7] * t.y() + m_data[11] * t.z() + m_data[15]; -} - -Matrix rotation(scalar theta, scalar v0, scalar v1, scalar v2); - -/// Apply rotation. -inline void Matrix::rotate(scalar theta, scalar v0, scalar v1, scalar v2) -{ - Matrix R(rotation(theta, v0, v1, v2)); - apply(R); -} - -/// Apply transform. -inline void Matrix::apply(Matrix::Arg m) -{ - nvDebugCheck(this != &m); - - for(int i = 0; i < 4; i++) { - const scalar ai0 = get(i,0), ai1 = get(i,1), ai2 = get(i,2), ai3 = get(i,3); - m_data[0 + i] = ai0 * m(0,0) + ai1 * m(1,0) + ai2 * m(2,0) + ai3 * m(3,0); - m_data[4 + i] = ai0 * m(0,1) + ai1 * m(1,1) + ai2 * m(2,1) + ai3 * m(3,1); - m_data[8 + i] = ai0 * m(0,2) + ai1 * m(1,2) + ai2 * m(2,2) + ai3 * m(3,2); - m_data[12+ i] = ai0 * m(0,3) + ai1 * m(1,3) + ai2 * m(2,3) + ai3 * m(3,3); - } -} - -/// Get scale matrix. -inline Matrix scale(Vector3::Arg s) -{ - Matrix m(identity); - m(0,0) = s.x(); - m(1,1) = s.y(); - m(2,2) = s.z(); - return m; -} - -/// Get scale matrix. -inline Matrix scale(scalar s) -{ - Matrix m(identity); - m(0,0) = m(1,1) = m(2,2) = s; - return m; -} - -/// Get translation matrix. -inline Matrix translation(Vector3::Arg t) -{ - Matrix m(identity); - m(0,3) = t.x(); - m(1,3) = t.y(); - m(2,3) = t.z(); - return m; -} - -/// Get rotation matrix. 
-inline Matrix rotation(scalar theta, scalar v0, scalar v1, scalar v2) -{ - scalar cost = cosf(theta); - scalar sint = sinf(theta); - - Matrix m(identity); - - if( 1 == v0 && 0 == v1 && 0 == v2 ) { - m(1,1) = cost; m(2,1) = -sint; - m(1,2) = sint; m(2,2) = cost; - } - else if( 0 == v0 && 1 == v1 && 0 == v2 ) { - m(0,0) = cost; m(2,0) = sint; - m(1,2) = -sint; m(2,2) = cost; - } - else if( 0 == v0 && 0 == v1 && 1 == v2 ) { - m(0,0) = cost; m(1,0) = -sint; - m(0,1) = sint; m(1,1) = cost; - } - else { - scalar a2, b2, c2; - a2 = v0 * v0; - b2 = v1 * v1; - c2 = v2 * v2; - - scalar iscale = 1.0f / sqrtf(a2 + b2 + c2); - v0 *= iscale; - v1 *= iscale; - v2 *= iscale; - - scalar abm, acm, bcm; - scalar mcos, asin, bsin, csin; - mcos = 1.0f - cost; - abm = v0 * v1 * mcos; - acm = v0 * v2 * mcos; - bcm = v1 * v2 * mcos; - asin = v0 * sint; - bsin = v1 * sint; - csin = v2 * sint; - m(0,0) = a2 * mcos + cost; - m(1,0) = abm - csin; - m(2,0) = acm + bsin; - m(3,0) = abm + csin; - m(1,1) = b2 * mcos + cost; - m(2,1) = bcm - asin; - m(3,1) = acm - bsin; - m(1,2) = bcm + asin; - m(2,2) = c2 * mcos + cost; - } - return m; -} - -//Matrix rotation(scalar yaw, scalar pitch, scalar roll); -//Matrix skew(scalar angle, Vector3::Arg v1, Vector3::Arg v2); - -/// Get frustum matrix. -inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear, scalar zFar) -{ - Matrix m(zero); - - scalar doubleznear = 2.0f * zNear; - scalar one_deltax = 1.0f / (xmax - xmin); - scalar one_deltay = 1.0f / (ymax - ymin); - scalar one_deltaz = 1.0f / (zFar - zNear); - - m(0,0) = doubleznear * one_deltax; - m(1,1) = doubleznear * one_deltay; - m(0,2) = (xmax + xmin) * one_deltax; - m(1,2) = (ymax + ymin) * one_deltay; - m(2,2) = -(zFar + zNear) * one_deltaz; - m(3,2) = -1.0f; - m(2,3) = -(zFar * doubleznear) * one_deltaz; - - return m; -} - -/// Get infinite frustum matrix. -inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear) -{ - Matrix m(zero); - - scalar doubleznear = 2.0f * zNear; - scalar one_deltax = 1.0f / (xmax - xmin); - scalar one_deltay = 1.0f / (ymax - ymin); - scalar nudge = 1.0; // 0.999; - - m(0,0) = doubleznear * one_deltax; - m(1,1) = doubleznear * one_deltay; - m(0,2) = (xmax + xmin) * one_deltax; - m(1,2) = (ymax + ymin) * one_deltay; - m(2,2) = -1.0f * nudge; - m(3,2) = -1.0f; - m(2,3) = -doubleznear * nudge; - - return m; -} - -/// Get perspective matrix. -inline Matrix perspective(scalar fovy, scalar aspect, scalar zNear, scalar zFar) -{ - scalar xmax = zNear * tan(fovy / 2); - scalar xmin = -xmax; - - scalar ymax = xmax / aspect; - scalar ymin = -ymax; - - return frustum(xmin, xmax, ymin, ymax, zNear, zFar); -} - -/// Get infinite perspective matrix. -inline Matrix perspective(scalar fovy, scalar aspect, scalar zNear) -{ - scalar x = zNear * tan(fovy / 2); - scalar y = x / aspect; - return frustum( -x, x, -y, y, zNear ); -} - -/// Get matrix determinant. 
-inline scalar Matrix::determinant() const -{ - return - m_data[3] * m_data[6] * m_data[ 9] * m_data[12] - m_data[2] * m_data[7] * m_data[ 9] * m_data[12] - m_data[3] * m_data[5] * m_data[10] * m_data[12] + m_data[1] * m_data[7] * m_data[10] * m_data[12] + - m_data[2] * m_data[5] * m_data[11] * m_data[12] - m_data[1] * m_data[6] * m_data[11] * m_data[12] - m_data[3] * m_data[6] * m_data[ 8] * m_data[13] + m_data[2] * m_data[7] * m_data[ 8] * m_data[13] + - m_data[3] * m_data[4] * m_data[10] * m_data[13] - m_data[0] * m_data[7] * m_data[10] * m_data[13] - m_data[2] * m_data[4] * m_data[11] * m_data[13] + m_data[0] * m_data[6] * m_data[11] * m_data[13] + - m_data[3] * m_data[5] * m_data[ 8] * m_data[14] - m_data[1] * m_data[7] * m_data[ 8] * m_data[14] - m_data[3] * m_data[4] * m_data[ 9] * m_data[14] + m_data[0] * m_data[7] * m_data[ 9] * m_data[14] + - m_data[1] * m_data[4] * m_data[11] * m_data[14] - m_data[0] * m_data[5] * m_data[11] * m_data[14] - m_data[2] * m_data[5] * m_data[ 8] * m_data[15] + m_data[1] * m_data[6] * m_data[ 8] * m_data[15] + - m_data[2] * m_data[4] * m_data[ 9] * m_data[15] - m_data[0] * m_data[6] * m_data[ 9] * m_data[15] - m_data[1] * m_data[4] * m_data[10] * m_data[15] + m_data[0] * m_data[5] * m_data[10] * m_data[15]; -} - -inline Matrix transpose(Matrix::Arg m) -{ - Matrix r; - for (int i = 0; i < 4; i++) - { - for (int j = 0; j < 4; j++) - { - r(i, j) = m(j, i); - } - } - return r; -} - -inline Matrix inverse(Matrix::Arg m) -{ - Matrix r; - r.data( 0) = m.data(6)*m.data(11)*m.data(13) - m.data(7)*m.data(10)*m.data(13) + m.data(7)*m.data(9)*m.data(14) - m.data(5)*m.data(11)*m.data(14) - m.data(6)*m.data(9)*m.data(15) + m.data(5)*m.data(10)*m.data(15); - r.data( 1) = m.data(3)*m.data(10)*m.data(13) - m.data(2)*m.data(11)*m.data(13) - m.data(3)*m.data(9)*m.data(14) + m.data(1)*m.data(11)*m.data(14) + m.data(2)*m.data(9)*m.data(15) - m.data(1)*m.data(10)*m.data(15); - r.data( 2) = m.data(2)*m.data( 7)*m.data(13) - m.data(3)*m.data( 6)*m.data(13) + m.data(3)*m.data(5)*m.data(14) - m.data(1)*m.data( 7)*m.data(14) - m.data(2)*m.data(5)*m.data(15) + m.data(1)*m.data( 6)*m.data(15); - r.data( 3) = m.data(3)*m.data( 6)*m.data( 9) - m.data(2)*m.data( 7)*m.data( 9) - m.data(3)*m.data(5)*m.data(10) + m.data(1)*m.data( 7)*m.data(10) + m.data(2)*m.data(5)*m.data(11) - m.data(1)*m.data( 6)*m.data(11); - r.data( 4) = m.data(7)*m.data(10)*m.data(12) - m.data(6)*m.data(11)*m.data(12) - m.data(7)*m.data(8)*m.data(14) + m.data(4)*m.data(11)*m.data(14) + m.data(6)*m.data(8)*m.data(15) - m.data(4)*m.data(10)*m.data(15); - r.data( 5) = m.data(2)*m.data(11)*m.data(12) - m.data(3)*m.data(10)*m.data(12) + m.data(3)*m.data(8)*m.data(14) - m.data(0)*m.data(11)*m.data(14) - m.data(2)*m.data(8)*m.data(15) + m.data(0)*m.data(10)*m.data(15); - r.data( 6) = m.data(3)*m.data( 6)*m.data(12) - m.data(2)*m.data( 7)*m.data(12) - m.data(3)*m.data(4)*m.data(14) + m.data(0)*m.data( 7)*m.data(14) + m.data(2)*m.data(4)*m.data(15) - m.data(0)*m.data( 6)*m.data(15); - r.data( 7) = m.data(2)*m.data( 7)*m.data( 8) - m.data(3)*m.data( 6)*m.data( 8) + m.data(3)*m.data(4)*m.data(10) - m.data(0)*m.data( 7)*m.data(10) - m.data(2)*m.data(4)*m.data(11) + m.data(0)*m.data( 6)*m.data(11); - r.data( 8) = m.data(5)*m.data(11)*m.data(12) - m.data(7)*m.data( 9)*m.data(12) + m.data(7)*m.data(8)*m.data(13) - m.data(4)*m.data(11)*m.data(13) - m.data(5)*m.data(8)*m.data(15) + m.data(4)*m.data( 9)*m.data(15); - r.data( 9) = m.data(3)*m.data( 9)*m.data(12) - m.data(1)*m.data(11)*m.data(12) - m.data(3)*m.data(8)*m.data(13) + 
m.data(0)*m.data(11)*m.data(13) + m.data(1)*m.data(8)*m.data(15) - m.data(0)*m.data( 9)*m.data(15); - r.data(10) = m.data(1)*m.data( 7)*m.data(12) - m.data(3)*m.data( 5)*m.data(12) + m.data(3)*m.data(4)*m.data(13) - m.data(0)*m.data( 7)*m.data(13) - m.data(1)*m.data(4)*m.data(15) + m.data(0)*m.data( 5)*m.data(15); - r.data(11) = m.data(3)*m.data( 5)*m.data( 8) - m.data(1)*m.data( 7)*m.data( 8) - m.data(3)*m.data(4)*m.data( 9) + m.data(0)*m.data( 7)*m.data( 9) + m.data(1)*m.data(4)*m.data(11) - m.data(0)*m.data( 5)*m.data(11); - r.data(12) = m.data(6)*m.data( 9)*m.data(12) - m.data(5)*m.data(10)*m.data(12) - m.data(6)*m.data(8)*m.data(13) + m.data(4)*m.data(10)*m.data(13) + m.data(5)*m.data(8)*m.data(14) - m.data(4)*m.data( 9)*m.data(14); - r.data(13) = m.data(1)*m.data(10)*m.data(12) - m.data(2)*m.data( 9)*m.data(12) + m.data(2)*m.data(8)*m.data(13) - m.data(0)*m.data(10)*m.data(13) - m.data(1)*m.data(8)*m.data(14) + m.data(0)*m.data( 9)*m.data(14); - r.data(14) = m.data(2)*m.data( 5)*m.data(12) - m.data(1)*m.data( 6)*m.data(12) - m.data(2)*m.data(4)*m.data(13) + m.data(0)*m.data( 6)*m.data(13) + m.data(1)*m.data(4)*m.data(14) - m.data(0)*m.data( 5)*m.data(14); - r.data(15) = m.data(1)*m.data( 6)*m.data( 8) - m.data(2)*m.data( 5)*m.data( 8) + m.data(2)*m.data(4)*m.data( 9) - m.data(0)*m.data( 6)*m.data( 9) - m.data(1)*m.data(4)*m.data(10) + m.data(0)*m.data( 5)*m.data(10); - r.scale(1.0f / m.determinant()); - return r; -} +// - Matrices are stored in memory in *column major* order. +// - Points are to be though of as column vectors. +// - Transformation of a point p by a matrix M is: p' = M * p -inline Matrix isometryInverse(Matrix::Arg m) -{ - Matrix r(identity); - - // transposed 3x3 upper left matrix - for (int i = 0; i < 3; i++) - { - for (int j = 0; j < 3; j++) - { - r(i, j) = m(j, i); - } - } - - // translate by the negative offsets - r.translate(-Vector3(m.data(12), m.data(13), m.data(14))); - - return r; -} - -//Matrix affineInverse(Matrix::Arg m); - -/// Transform the given 3d point with the given matrix. -inline Vector3 transformPoint(Matrix::Arg m, Vector3::Arg p) -{ - return Vector3( - p.x() * m(0,0) + p.y() * m(0,1) + p.z() * m(0,2) + m(0,3), - p.x() * m(1,0) + p.y() * m(1,1) + p.z() * m(1,2) + m(1,3), - p.x() * m(2,0) + p.y() * m(2,1) + p.z() * m(2,2) + m(2,3)); -} - -/// Transform the given 3d vector with the given matrix. -inline Vector3 transformVector(Matrix::Arg m, Vector3::Arg p) +namespace nv { - return Vector3( - p.x() * m(0,0) + p.y() * m(0,1) + p.z() * m(0,2), - p.x() * m(1,0) + p.y() * m(1,1) + p.z() * m(1,2), - p.x() * m(2,0) + p.y() * m(2,1) + p.z() * m(2,2)); -} + enum identity_t { identity }; -/// Transform the given 4d vector with the given matrix. -inline Vector4 transform(Matrix::Arg m, Vector4::Arg p) -{ - return Vector4( - p.x() * m(0,0) + p.y() * m(0,1) + p.z() * m(0,2) + p.w() * m(0,3), - p.x() * m(1,0) + p.y() * m(1,1) + p.z() * m(1,2) + p.w() * m(1,3), - p.x() * m(2,0) + p.y() * m(2,1) + p.z() * m(2,2) + p.w() * m(2,3), - p.x() * m(3,0) + p.y() * m(3,1) + p.z() * m(3,2) + p.w() * m(3,3)); -} - -inline Matrix mul(Matrix::Arg a, Matrix::Arg b) -{ - // @@ Is this the right order? mul(a, b) = b * a - Matrix m = a; - m.apply(b); - return m; -} + // 3x3 matrix. 
+ class NVMATH_CLASS Matrix3 + { + public: + Matrix3(); + explicit Matrix3(float f); + explicit Matrix3(identity_t); + Matrix3(const Matrix3 & m); + Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2); + + float data(uint idx) const; + float & data(uint idx); + float get(uint row, uint col) const; + float operator()(uint row, uint col) const; + float & operator()(uint row, uint col); + + Vector3 row(uint i) const; + Vector3 column(uint i) const; + + void operator*=(float s); + void operator/=(float s); + void operator+=(const Matrix3 & m); + void operator-=(const Matrix3 & m); + + void scale(float s); + void scale(Vector3::Arg s); + float determinant() const; + + private: + float m_data[9]; + }; + + // Solve equation system using LU decomposition and back-substitution. + extern bool solveLU(const Matrix3 & m, const Vector3 & b, Vector3 * x); + + // Solve equation system using Cramer's inverse. + extern bool solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x); + + + // 4x4 matrix. + class NVMATH_CLASS Matrix + { + public: + typedef Matrix const & Arg; + + Matrix(); + explicit Matrix(float f); + explicit Matrix(identity_t); + Matrix(const Matrix3 & m); + Matrix(const Matrix & m); + Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3); + //explicit Matrix(const float m[]); // m is assumed to contain 16 elements + + float data(uint idx) const; + float & data(uint idx); + float get(uint row, uint col) const; + float operator()(uint row, uint col) const; + float & operator()(uint row, uint col); + const float * ptr() const; + + Vector4 row(uint i) const; + Vector4 column(uint i) const; + + void zero(); + void identity(); + + void scale(float s); + void scale(Vector3::Arg s); + void translate(Vector3::Arg t); + void rotate(float theta, float v0, float v1, float v2); + float determinant() const; + + void operator+=(const Matrix & m); + void operator-=(const Matrix & m); + + void apply(Matrix::Arg m); + + private: + float m_data[16]; + }; + + // Solve equation system using LU decomposition and back-substitution. + extern bool solveLU(const Matrix & A, const Vector4 & b, Vector4 * x); + + // Solve equation system using Cramer's inverse. + extern bool solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x); + + // Compute inverse using LU decomposition. + extern Matrix inverseLU(const Matrix & m); + + // Compute inverse using Gaussian elimination and partial pivoting. + extern Matrix inverse(const Matrix & m); + extern Matrix3 inverse(const Matrix3 & m); } // nv namespace - - - -#if 0 - /** @name Special matrices. */ - //@{ - /** Generate a translation matrix. */ - void TranslationMatrix(const Vec3 & v) { - data[0] = 1; data[1] = 0; data[2] = 0; data[3] = 0; - data[4] = 0; data[5] = 1; data[6] = 0; data[7] = 0; - data[8] = 0; data[9] = 0; data[10] = 1; data[11] = 0; - data[12] = v.x; data[13] = v.y; data[14] = v.z; data[15] = 1; - } - - /** Rotate theta degrees around v. 
*/ - void RotationMatrix( scalar theta, scalar v0, scalar v1, scalar v2 ) { - scalar cost = cos(theta); - scalar sint = sin(theta); - - if( 1 == v0 && 0 == v1 && 0 == v2 ) { - data[0] = 1.0f; data[1] = 0.0f; data[2] = 0.0f; data[3] = 0.0f; - data[4] = 0.0f; data[5] = cost; data[6] = -sint;data[7] = 0.0f; - data[8] = 0.0f; data[9] = sint; data[10] = cost;data[11] = 0.0f; - data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f; - } - else if( 0 == v0 && 1 == v1 && 0 == v2 ) { - data[0] = cost; data[1] = 0.0f; data[2] = sint; data[3] = 0.0f; - data[4] = 0.0f; data[5] = 1.0f; data[6] = 0.0f; data[7] = 0.0f; - data[8] = -sint;data[9] = 0.0f;data[10] = cost; data[11] = 0.0f; - data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f; - } - else if( 0 == v0 && 0 == v1 && 1 == v2 ) { - data[0] = cost; data[1] = -sint;data[2] = 0.0f; data[3] = 0.0f; - data[4] = sint; data[5] = cost; data[6] = 0.0f; data[7] = 0.0f; - data[8] = 0.0f; data[9] = 0.0f; data[10] = 1.0f;data[11] = 0.0f; - data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f; - } - else { - //we need scale a,b,c to unit length. - scalar a2, b2, c2; - a2 = v0 * v0; - b2 = v1 * v1; - c2 = v2 * v2; - - scalar iscale = 1.0f / sqrtf(a2 + b2 + c2); - v0 *= iscale; - v1 *= iscale; - v2 *= iscale; - - scalar abm, acm, bcm; - scalar mcos, asin, bsin, csin; - mcos = 1.0f - cost; - abm = v0 * v1 * mcos; - acm = v0 * v2 * mcos; - bcm = v1 * v2 * mcos; - asin = v0 * sint; - bsin = v1 * sint; - csin = v2 * sint; - data[0] = a2 * mcos + cost; - data[1] = abm - csin; - data[2] = acm + bsin; - data[3] = abm + csin; - data[4] = 0.0f; - data[5] = b2 * mcos + cost; - data[6] = bcm - asin; - data[7] = acm - bsin; - data[8] = 0.0f; - data[9] = bcm + asin; - data[10] = c2 * mcos + cost; - data[11] = 0.0f; - data[12] = 0.0f; - data[13] = 0.0f; - data[14] = 0.0f; - data[15] = 1.0f; - } - } - - /* - void SkewMatrix(scalar angle, const Vec3 & v1, const Vec3 & v2) { - v1.Normalize(); - v2.Normalize(); - - Vec3 v3; - v3.Cross(v1, v2); - v3.Normalize(); - - // Get skew factor. - scalar costheta = Vec3DotProduct(v1, v2); - scalar sintheta = Real.Sqrt(1 - costheta * costheta); - scalar skew = tan(Trig.DegreesToRadians(angle) + acos(sintheta)) * sintheta - costheta; - - // Build orthonormal matrix. - v1 = FXVector3.Cross(v3, v2); - v1.Normalize(); - - Matrix R = Matrix::Identity; - R[0, 0] = v3.X; // Not sure this is in the correct order... - R[1, 0] = v3.Y; - R[2, 0] = v3.Z; - R[0, 1] = v1.X; - R[1, 1] = v1.Y; - R[2, 1] = v1.Z; - R[0, 2] = v2.X; - R[1, 2] = v2.Y; - R[2, 2] = v2.Z; - - // Build skew matrix. - Matrix S = Matrix::Identity; - S[2, 1] = -skew; - - // Return skew transform. - return R * S * R.Transpose; // Not sure this is in the correct order... - } - */ - - /** - * Generate rotation matrix for the euler angles. This is the same as computing - * 3 rotation matrices and multiplying them together in our custom order. - * - * @todo Have to recompute this code for our new convention. 
- **/ - void RotationMatrix( scalar yaw, scalar pitch, scalar roll ) { - scalar sy = sin(yaw+ToRadian(90)); - scalar cy = cos(yaw+ToRadian(90)); - scalar sp = sin(pitch-ToRadian(90)); - scalar cp = cos(pitch-ToRadian(90)); - scalar sr = sin(roll); - scalar cr = cos(roll); - - data[0] = cr*cy + sr*sp*sy; - data[1] = cp*sy; - data[2] = -sr*cy + cr*sp*sy; - data[3] = 0; - - data[4] = -cr*sy + sr*sp*cy; - data[5] = cp*cy; - data[6] = sr*sy + cr*sp*cy; - data[7] = 0; - - data[8] = sr*cp; - data[9] = -sp; - data[10] = cr*cp; - data[11] = 0; - - data[12] = 0; - data[13] = 0; - data[14] = 0; - data[15] = 1; - } - - /** Create a frustum matrix with the far plane at the infinity. */ - void Frustum( scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear, scalar zFar ) { - scalar one_deltax, one_deltay, one_deltaz, doubleznear; - - doubleznear = 2.0f * zNear; - one_deltax = 1.0f / (xmax - xmin); - one_deltay = 1.0f / (ymax - ymin); - one_deltaz = 1.0f / (zFar - zNear); - - data[0] = (scalar)(doubleznear * one_deltax); - data[1] = 0.0f; - data[2] = 0.0f; - data[3] = 0.0f; - data[4] = 0.0f; - data[5] = (scalar)(doubleznear * one_deltay); - data[6] = 0.f; - data[7] = 0.f; - data[8] = (scalar)((xmax + xmin) * one_deltax); - data[9] = (scalar)((ymax + ymin) * one_deltay); - data[10] = (scalar)(-(zFar + zNear) * one_deltaz); - data[11] = -1.f; - data[12] = 0.f; - data[13] = 0.f; - data[14] = (scalar)(-(zFar * doubleznear) * one_deltaz); - data[15] = 0.f; - } - - /** Create a frustum matrix with the far plane at the infinity. */ - void FrustumInf( scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear ) { - scalar one_deltax, one_deltay, doubleznear, nudge; - - doubleznear = 2.0f * zNear; - one_deltax = 1.0f / (xmax - xmin); - one_deltay = 1.0f / (ymax - ymin); - nudge = 1.0; // 0.999; - - data[0] = doubleznear * one_deltax; - data[1] = 0.0f; - data[2] = 0.0f; - data[3] = 0.0f; - - data[4] = 0.0f; - data[5] = doubleznear * one_deltay; - data[6] = 0.f; - data[7] = 0.f; - - data[8] = (xmax + xmin) * one_deltax; - data[9] = (ymax + ymin) * one_deltay; - data[10] = -1.0f * nudge; - data[11] = -1.0f; - - data[12] = 0.f; - data[13] = 0.f; - data[14] = -doubleznear * nudge; - data[15] = 0.f; - } - - /** Create an inverse frustum matrix with the far plane at the infinity. */ - void FrustumInfInv( scalar left, scalar right, scalar bottom, scalar top, scalar zNear ) { - // this matrix is wrong (not tested scalarly) I think it should be transposed. - data[0] = (right - left) / (2 * zNear); - data[1] = 0; - data[2] = 0; - data[3] = (right + left) / (2 * zNear); - data[4] = 0; - data[5] = (top - bottom) / (2 * zNear); - data[6] = 0; - data[7] = (top + bottom) / (2 * zNear); - data[8] = 0; - data[9] = 0; - data[10] = 0; - data[11] = -1; - data[12] = 0; - data[13] = 0; - data[14] = -1 / (2 * zNear); - data[15] = 1 / (2 * zNear); - } - - /** Create an homogeneous projection matrix. */ - void Perspective( scalar fov, scalar aspect, scalar zNear, scalar zFar ) { - scalar xmin, xmax, ymin, ymax; - - xmax = zNear * tan( fov/2 ); - xmin = -xmax; - - ymax = xmax / aspect; - ymin = -ymax; - - Frustum(xmin, xmax, ymin, ymax, zNear, zFar); - } - - /** Create a projection matrix with the far plane at the infinity. */ - void PerspectiveInf( scalar fov, scalar aspect, scalar zNear ) { - scalar x = zNear * tan( fov/2 ); - scalar y = x / aspect; - FrustumInf( -x, x, -y, y, zNear ); - } - - /** Create an inverse projection matrix with far plane at the infinity. 
*/ - void PerspectiveInfInv( scalar fov, scalar aspect, scalar zNear ) { - scalar x = zNear * tan( fov/2 ); - scalar y = x / aspect; - FrustumInfInv( -x, x, -y, y, zNear ); - } - - /** Build bone matrix from quatertion and offset. */ - void BoneMatrix(const Quat & q, const Vec3 & offset) { - scalar x2, y2, z2, xx, xy, xz, yy, yz, zz, wx, wy, wz; - - // calculate coefficients - x2 = q.x + q.x; - y2 = q.y + q.y; - z2 = q.z + q.z; - - xx = q.x * x2; xy = q.x * y2; xz = q.x * z2; - yy = q.y * y2; yz = q.y * z2; zz = q.z * z2; - wx = q.w * x2; wy = q.w * y2; wz = q.w * z2; - - data[0] = 1.0f - (yy + zz); - data[1] = xy - wz; - data[2] = xz + wy; - data[3] = 0.0f; - - data[4] = xy + wz; - data[5] = 1.0f - (xx + zz); - data[6] = yz - wx; - data[7] = 0.0f; - - data[8] = xz - wy; - data[9] = yz + wx; - data[10] = 1.0f - (xx + yy); - data[11] = 0.0f; - - data[12] = offset.x; - data[13] = offset.y; - data[14] = offset.z; - data[15] = 1.0f; - } - - //@} - - - /** @name Transformations: */ - //@{ - - /** Apply a general scale. */ - void Scale( scalar x, scalar y, scalar z ) { - data[0] *= x; data[4] *= y; data[8] *= z; - data[1] *= x; data[5] *= y; data[9] *= z; - data[2] *= x; data[6] *= y; data[10] *= z; - data[3] *= x; data[7] *= y; data[11] *= z; - } - - /** Apply a rotation of theta degrees around the axis v*/ - void Rotate( scalar theta, const Vec3 & v ) { - Matrix b; - b.RotationMatrix( theta, v[0], v[1], v[2] ); - Multiply4x3( b ); - } - - /** Apply a rotation of theta degrees around the axis v*/ - void Rotate( scalar theta, scalar v0, scalar v1, scalar v2 ) { - Matrix b; - b.RotationMatrix( theta, v0, v1, v2 ); - Multiply4x3( b ); - } - - /** - * Translate the matrix by t. This is the same as multiplying by a - * translation matrix with the given offset. - * this = T * this - */ - void Translate( const Vec3 &t ) { - data[12] = data[0] * t.x + data[4] * t.y + data[8] * t.z + data[12]; - data[13] = data[1] * t.x + data[5] * t.y + data[9] * t.z + data[13]; - data[14] = data[2] * t.x + data[6] * t.y + data[10] * t.z + data[14]; - data[15] = data[3] * t.x + data[7] * t.y + data[11] * t.z + data[15]; - } - - /** - * Translate the matrix by x, y, z. This is the same as multiplying by a - * translation matrix with the given offsets. - */ - void Translate( scalar x, scalar y, scalar z ) { - data[12] = data[0] * x + data[4] * y + data[8] * z + data[12]; - data[13] = data[1] * x + data[5] * y + data[9] * z + data[13]; - data[14] = data[2] * x + data[6] * y + data[10] * z + data[14]; - data[15] = data[3] * x + data[7] * y + data[11] * z + data[15]; - } - - /** Compute the transposed matrix. */ - void Transpose() { - piSwap(data[1], data[4]); - piSwap(data[2], data[8]); - piSwap(data[6], data[9]); - piSwap(data[3], data[12]); - piSwap(data[7], data[13]); - piSwap(data[11], data[14]); - } - - /** Compute the inverse of a rigid-body/isometry/orthonormal matrix. */ - void IsometryInverse() { - // transposed 3x3 upper left matrix - piSwap(data[1], data[4]); - piSwap(data[2], data[8]); - piSwap(data[6], data[9]); - - // translate by the negative offsets - Vec3 v(-data[12], -data[13], -data[14]); - data[12] = data[13] = data[14] = 0; - Translate(v); - } - - /** Compute the inverse of the affine portion of this matrix. */ - void AffineInverse() { - data[12] = data[13] = data[14] = 0; - Transpose(); - } - //@} - - /** @name Matrix operations: */ - //@{ - - /** Return the determinant of this matrix. 
*/ - scalar Determinant() const { - return data[0] * data[5] * data[10] * data[15] + - data[1] * data[6] * data[11] * data[12] + - data[2] * data[7] * data[ 8] * data[13] + - data[3] * data[4] * data[ 9] * data[14] - - data[3] * data[6] * data[ 9] * data[12] - - data[2] * data[5] * data[ 8] * data[15] - - data[1] * data[4] * data[11] * data[14] - - data[0] * data[7] * data[10] * data[12]; - } - - - /** Standard matrix product: this *= B. */ - void Multiply4x4( const Matrix & restrict B ) { - Multiply4x4(*this, B); - } - - /** Standard matrix product: this = A * B. this != B*/ - void Multiply4x4( const Matrix & A, const Matrix & restrict B ) { - piDebugCheck(this != &B); - - for(int i = 0; i < 4; i++) { - const scalar ai0 = A(i,0), ai1 = A(i,1), ai2 = A(i,2), ai3 = A(i,3); - GetElem(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0); - GetElem(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1); - GetElem(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2); - GetElem(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3); - } - - /* Unrolled but does not allow this == A - data[0] = A.data[0] * B.data[0] + A.data[4] * B.data[1] + A.data[8] * B.data[2] + A.data[12] * B.data[3]; - data[1] = A.data[1] * B.data[0] + A.data[5] * B.data[1] + A.data[9] * B.data[2] + A.data[13] * B.data[3]; - data[2] = A.data[2] * B.data[0] + A.data[6] * B.data[1] + A.data[10] * B.data[2] + A.data[14] * B.data[3]; - data[3] = A.data[3] * B.data[0] + A.data[7] * B.data[1] + A.data[11] * B.data[2] + A.data[15] * B.data[3]; - data[4] = A.data[0] * B.data[4] + A.data[4] * B.data[5] + A.data[8] * B.data[6] + A.data[12] * B.data[7]; - data[5] = A.data[1] * B.data[4] + A.data[5] * B.data[5] + A.data[9] * B.data[6] + A.data[13] * B.data[7]; - data[6] = A.data[2] * B.data[4] + A.data[6] * B.data[5] + A.data[10] * B.data[6] + A.data[14] * B.data[7]; - data[7] = A.data[3] * B.data[4] + A.data[7] * B.data[5] + A.data[11] * B.data[6] + A.data[15] * B.data[7]; - data[8] = A.data[0] * B.data[8] + A.data[4] * B.data[9] + A.data[8] * B.data[10] + A.data[12] * B.data[11]; - data[9] = A.data[1] * B.data[8] + A.data[5] * B.data[9] + A.data[9] * B.data[10] + A.data[13] * B.data[11]; - data[10]= A.data[2] * B.data[8] + A.data[6] * B.data[9] + A.data[10] * B.data[10] + A.data[14] * B.data[11]; - data[11]= A.data[3] * B.data[8] + A.data[7] * B.data[9] + A.data[11] * B.data[10] + A.data[15] * B.data[11]; - data[12]= A.data[0] * B.data[12] + A.data[4] * B.data[13] + A.data[8] * B.data[14] + A.data[12] * B.data[15]; - data[13]= A.data[1] * B.data[12] + A.data[5] * B.data[13] + A.data[9] * B.data[14] + A.data[13] * B.data[15]; - data[14]= A.data[2] * B.data[12] + A.data[6] * B.data[13] + A.data[10] * B.data[14] + A.data[14] * B.data[15]; - data[15]= A.data[3] * B.data[12] + A.data[7] * B.data[13] + A.data[11] * B.data[14] + A.data[15] * B.data[15]; - */ - } - - /** Standard matrix product: this *= B. */ - void Multiply4x3( const Matrix & restrict B ) { - Multiply4x3(*this, B); - } - - /** Standard product of matrices, where the last row is [0 0 0 1]. 
*/ - void Multiply4x3( const Matrix & A, const Matrix & restrict B ) { - piDebugCheck(this != &B); - - for(int i = 0; i < 3; i++) { - const scalar ai0 = A(i,0), ai1 = A(i,1), ai2 = A(i,2), ai3 = A(i,3); - GetElem(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0); - GetElem(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1); - GetElem(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2); - GetElem(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3); - } - data[3] = 0.0f; data[7] = 0.0f; data[11] = 0.0f; data[15] = 1.0f; - - /* Unrolled but does not allow this == A - data[0] = a.data[0] * b.data[0] + a.data[4] * b.data[1] + a.data[8] * b.data[2] + a.data[12] * b.data[3]; - data[1] = a.data[1] * b.data[0] + a.data[5] * b.data[1] + a.data[9] * b.data[2] + a.data[13] * b.data[3]; - data[2] = a.data[2] * b.data[0] + a.data[6] * b.data[1] + a.data[10] * b.data[2] + a.data[14] * b.data[3]; - data[3] = 0.0f; - data[4] = a.data[0] * b.data[4] + a.data[4] * b.data[5] + a.data[8] * b.data[6] + a.data[12] * b.data[7]; - data[5] = a.data[1] * b.data[4] + a.data[5] * b.data[5] + a.data[9] * b.data[6] + a.data[13] * b.data[7]; - data[6] = a.data[2] * b.data[4] + a.data[6] * b.data[5] + a.data[10] * b.data[6] + a.data[14] * b.data[7]; - data[7] = 0.0f; - data[8] = a.data[0] * b.data[8] + a.data[4] * b.data[9] + a.data[8] * b.data[10] + a.data[12] * b.data[11]; - data[9] = a.data[1] * b.data[8] + a.data[5] * b.data[9] + a.data[9] * b.data[10] + a.data[13] * b.data[11]; - data[10]= a.data[2] * b.data[8] + a.data[6] * b.data[9] + a.data[10] * b.data[10] + a.data[14] * b.data[11]; - data[11]= 0.0f; - data[12]= a.data[0] * b.data[12] + a.data[4] * b.data[13] + a.data[8] * b.data[14] + a.data[12] * b.data[15]; - data[13]= a.data[1] * b.data[12] + a.data[5] * b.data[13] + a.data[9] * b.data[14] + a.data[13] * b.data[15]; - data[14]= a.data[2] * b.data[12] + a.data[6] * b.data[13] + a.data[10] * b.data[14] + a.data[14] * b.data[15]; - data[15]= 1.0f; - */ - } - //@} - - - /** @name Vector operations: */ - //@{ - - /** Transform 3d vector (w=0). */ - void TransformVec3(const Vec3 & restrict orig, Vec3 * restrict dest) const { - piDebugCheck(&orig != dest); - dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8]; - dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9]; - dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10]; - } - /** Transform 3d vector by the transpose (w=0). */ - void TransformVec3T(const Vec3 & restrict orig, Vec3 * restrict dest) const { - piDebugCheck(&orig != dest); - dest->x = orig.x * data[0] + orig.y * data[1] + orig.z * data[2]; - dest->y = orig.x * data[4] + orig.y * data[5] + orig.z * data[6]; - dest->z = orig.x * data[8] + orig.y * data[9] + orig.z * data[10]; - } - - /** Transform a 3d homogeneous vector, where the fourth coordinate is assumed to be 1. */ - void TransformPoint(const Vec3 & restrict orig, Vec3 * restrict dest) const { - piDebugCheck(&orig != dest); - dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12]; - dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13]; - dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14]; - } - - /** Transform a point, normalize it, and return w. 
*/ - scalar TransformPointAndNormalize(const Vec3 & restrict orig, Vec3 * restrict dest) const { - piDebugCheck(&orig != dest); - scalar w; - dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12]; - dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13]; - dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14]; - w = 1 / (orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15]); - *dest *= w; - return w; - } - - /** Transform a point and return w. */ - scalar TransformPointReturnW(const Vec3 & restrict orig, Vec3 * restrict dest) const { - piDebugCheck(&orig != dest); - dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12]; - dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13]; - dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14]; - return orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15]; - } - - /** Transform a normalized 3d point by a 4d matrix and return the resulting 4d vector. */ - void TransformVec4(const Vec3 & orig, Vec4 * dest) const { - dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12]; - dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13]; - dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14]; - dest->w = orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15]; - } - //@} - - /** @name Matrix analysis. */ - //@{ - - /** Get the ZYZ euler angles from the matrix. Assumes the matrix is orthonormal. */ - void GetEulerAnglesZYZ(scalar * s, scalar * t, scalar * r) const { - if( GetElem(2,2) < 1.0f ) { - if( GetElem(2,2) > -1.0f ) { - // cs*ct*cr-ss*sr -ss*ct*cr-cs*sr st*cr - // cs*ct*sr+ss*cr -ss*ct*sr+cs*cr st*sr - // -cs*st ss*st ct - *s = atan2(GetElem(1,2), -GetElem(0,2)); - *t = acos(GetElem(2,2)); - *r = atan2(GetElem(2,1), GetElem(2,0)); - } - else { - // -c(s-r) s(s-r) 0 - // s(s-r) c(s-r) 0 - // 0 0 -1 - *s = atan2(GetElem(0, 1), -GetElem(0, 0)); // = s-r - *t = PI; - *r = 0; - } - } - else { - // c(s+r) -s(s+r) 0 - // s(s+r) c(s+r) 0 - // 0 0 1 - *s = atan2(GetElem(0, 1), GetElem(0, 0)); // = s+r - *t = 0; - *r = 0; - } - } - - //@} - - MATHLIB_API friend PiStream & operator<< ( PiStream & s, Matrix & m ); - - /** Print to debug output. */ - void Print() const { - piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[0], data[4], data[8], data[12] ); - piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[1], data[5], data[9], data[13] ); - piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[2], data[6], data[10], data[14] ); - piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[3], data[7], data[11], data[15] ); - } - - -public: - - scalar data[16]; - -}; -#endif - - - - #endif // NV_MATH_MATRIX_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Matrix.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Matrix.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Matrix.cpp @@ -0,0 +1,441 @@ +// This code is in the public domain -- castanyo@yahoo.es + +#include "Matrix.inl" +#include "Vector.inl" + +#include "nvcore/Array.inl" + +#include + +#if !NV_CC_MSVC && !NV_OS_ORBIS +#include +#endif + +using namespace nv; + + +// Given a matrix a[1..n][1..n], this routine replaces it by the LU decomposition of a rowwise +// permutation of itself. a and n are input. 
a is output, arranged as in equation (2.3.14) above; +// indx[1..n] is an output vector that records the row permutation effected by the partial +// pivoting; d is output as -1 depending on whether the number of row interchanges was even +// or odd, respectively. This routine is used in combination with lubksb to solve linear equations +// or invert a matrix. +static bool ludcmp(float **a, int n, int *indx, float *d) +{ + const float TINY = 1.0e-20f; + + float * vv = (float*)alloca(sizeof(float) * n); // vv stores the implicit scaling of each row. + + *d = 1.0; // No row interchanges yet. + for (int i = 0; i < n; i++) { // Loop over rows to get the implicit scaling information. + + float big = 0.0; + for (int j = 0; j < n; j++) { + big = max(big, fabsf(a[i][j])); + } + if (big == 0) { + return false; // Singular matrix + } + + // No nonzero largest element. + vv[i] = 1.0f / big; // Save the scaling. + } + + for (int j = 0; j < n; j++) { // This is the loop over columns of Crout's method. + for (int i = 0; i < j; i++) { // This is equation (2.3.12) except for i = j. + float sum = a[i][j]; + for (int k = 0; k < i; k++) sum -= a[i][k]*a[k][j]; + a[i][j] = sum; + } + + int imax = -1; + float big = 0.0; // Initialize for the search for largest pivot element. + for (int i = j; i < n; i++) { // This is i = j of equation (2.3.12) and i = j+ 1 : : : N + float sum = a[i][j]; // of equation (2.3.13). + for (int k = 0; k < j; k++) { + sum -= a[i][k]*a[k][j]; + } + a[i][j]=sum; + + float dum = vv[i]*fabs(sum); + if (dum >= big) { + // Is the figure of merit for the pivot better than the best so far? + big = dum; + imax = i; + } + } + nvDebugCheck(imax != -1); + + if (j != imax) { // Do we need to interchange rows? + for (int k = 0; k < n; k++) { // Yes, do so... + swap(a[imax][k], a[j][k]); + } + *d = -(*d); // ...and change the parity of d. + vv[imax]=vv[j]; // Also interchange the scale factor. + } + + indx[j]=imax; + if (a[j][j] == 0.0) a[j][j] = TINY; + + // If the pivot element is zero the matrix is singular (at least to the precision of the + // algorithm). For some applications on singular matrices, it is desirable to substitute + // TINY for zero. + if (j != n-1) { // Now, finally, divide by the pivot element. + float dum = 1.0f / a[j][j]; + for (int i = j+1; i < n; i++) a[i][j] *= dum; + } + } // Go back for the next column in the reduction. + + return true; +} + + +// Solves the set of n linear equations Ax = b. Here a[1..n][1..n] is input, not as the matrix +// A but rather as its LU decomposition, determined by the routine ludcmp. indx[1..n] is input +// as the permutation vector returned by ludcmp. b[1..n] is input as the right-hand side vector +// B, and returns with the solution vector X. a, n, and indx are not modified by this routine +// and can be left in place for successive calls with different right-hand sides b. This routine takes +// into account the possibility that b will begin with many zero elements, so it is efficient for use +// in matrix inversion. +static void lubksb(float **a, int n, int *indx, float b[]) +{ + int ii = 0; + for (int i=0; i=0; i--) { // Now we do the backsubstitution, equation (2.3.7). + float sum = b[i]; + for (int j = i+1; j < n; j++) { + sum -= a[i][j]*b[j]; + } + b[i] = sum/a[i][i]; // Store a component of the solution vector X. + } // All done! 
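    // A minimal usage sketch, not taken from the NVTT sources themselves:
    // because ludcmp() above follows the classic Numerical Recipes form, the
    // parity value d it returns can also be combined with the diagonal of the
    // packed LU factors to recover the determinant of A, e.g. for a 4x4
    // system (variable names here are only illustrative):
    //
    //     float det = d;
    //     for (int k = 0; k < 4; k++) det *= a[k][k];
    //
    // solveLU() below simply chains the two routines: factor once with
    // ludcmp(), then back-substitute each right-hand side with lubksb().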
+} + + +bool nv::solveLU(const Matrix & A, const Vector4 & b, Vector4 * x) +{ + nvDebugCheck(x != NULL); + + float m[4][4]; + float *a[4] = {m[0], m[1], m[2], m[3]}; + int idx[4]; + float d; + + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + a[x][y] = A(x, y); + } + } + + // Create LU decomposition. + if (!ludcmp(a, 4, idx, &d)) { + // Singular matrix. + return false; + } + + // Init solution. + *x = b; + + // Do back substitution. + lubksb(a, 4, idx, x->component); + + return true; +} + +// @@ Not tested. +Matrix nv::inverseLU(const Matrix & A) +{ + Vector4 Ai[4]; + + solveLU(A, Vector4(1, 0, 0, 0), &Ai[0]); + solveLU(A, Vector4(0, 1, 0, 0), &Ai[1]); + solveLU(A, Vector4(0, 0, 1, 0), &Ai[2]); + solveLU(A, Vector4(0, 0, 0, 1), &Ai[3]); + + return Matrix(Ai[0], Ai[1], Ai[2], Ai[3]); +} + + + +bool nv::solveLU(const Matrix3 & A, const Vector3 & b, Vector3 * x) +{ + nvDebugCheck(x != NULL); + + float m[3][3]; + float *a[3] = {m[0], m[1], m[2]}; + int idx[3]; + float d; + + for (int y = 0; y < 3; y++) { + for (int x = 0; x < 3; x++) { + a[x][y] = A(x, y); + } + } + + // Create LU decomposition. + if (!ludcmp(a, 3, idx, &d)) { + // Singular matrix. + return false; + } + + // Init solution. + *x = b; + + // Do back substitution. + lubksb(a, 3, idx, x->component); + + return true; +} + + +bool nv::solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x) +{ + nvDebugCheck(x != NULL); + + *x = transform(inverseCramer(A), b); + + return true; // @@ Return false if determinant(A) == 0 ! +} + +bool nv::solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x) +{ + nvDebugCheck(x != NULL); + + const float det = A.determinant(); + if (equal(det, 0.0f)) { // @@ Use input epsilon. + return false; + } + + Matrix3 Ai = inverseCramer(A); + + *x = transform(Ai, b); + + return true; +} + + + +// Inverse using gaussian elimination. From Jon's code. +Matrix nv::inverse(const Matrix & m) { + + Matrix A = m; + Matrix B(identity); + + int i, j, k; + float max, t, det, pivot; + + det = 1.0; + for (i=0; i<4; i++) { /* eliminate in column i, below diag */ + max = -1.; + for (k=i; k<4; k++) /* find pivot for column i */ + if (fabs(A(k, i)) > max) { + max = fabs(A(k, i)); + j = k; + } + if (max<=0.) return B; /* if no nonzero pivot, PUNT */ + if (j!=i) { /* swap rows i and j */ + for (k=i; k<4; k++) + swap(A(i, k), A(j, k)); + for (k=0; k<4; k++) + swap(B(i, k), B(j, k)); + det = -det; + } + pivot = A(i, i); + det *= pivot; + for (k=i+1; k<4; k++) /* only do elems to right of pivot */ + A(i, k) /= pivot; + for (k=0; k<4; k++) + B(i, k) /= pivot; + /* we know that A(i, i) will be set to 1, so don't bother to do it */ + + for (j=i+1; j<4; j++) { /* eliminate in rows below i */ + t = A(j, i); /* we're gonna zero this guy */ + for (k=i+1; k<4; k++) /* subtract scaled row i from row j */ + A(j, k) -= A(i, k)*t; /* (ignore k<=i, we know they're 0) */ + for (k=0; k<4; k++) + B(j, k) -= B(i, k)*t; + } + } + + /*---------- backward elimination ----------*/ + + for (i=4-1; i>0; i--) { /* eliminate in column i, above diag */ + for (j=0; j max) { + max = fabs(A(k, i)); + j = k; + } + if (max<=0.) 
return B; /* if no nonzero pivot, PUNT */ + if (j!=i) { /* swap rows i and j */ + for (k=i; k<3; k++) + swap(A(i, k), A(j, k)); + for (k=0; k<3; k++) + swap(B(i, k), B(j, k)); + det = -det; + } + pivot = A(i, i); + det *= pivot; + for (k=i+1; k<3; k++) /* only do elems to right of pivot */ + A(i, k) /= pivot; + for (k=0; k<3; k++) + B(i, k) /= pivot; + /* we know that A(i, i) will be set to 1, so don't bother to do it */ + + for (j=i+1; j<3; j++) { /* eliminate in rows below i */ + t = A(j, i); /* we're gonna zero this guy */ + for (k=i+1; k<3; k++) /* subtract scaled row i from row j */ + A(j, k) -= A(i, k)*t; /* (ignore k<=i, we know they're 0) */ + for (k=0; k<3; k++) + B(j, k) -= B(i, k)*t; + } + } + + /*---------- backward elimination ----------*/ + + for (i=3-1; i>0; i--) { /* eliminate in column i, above diag */ + for (j=0; j. +// +// Returns determinant of A, and B=inverse(A) +// If matrix A is singular, returns 0 and leaves trash in B. +// +#define SWAP(a, b, t) {t = a; a = b; b = t;} +double invert(Mat4& B, const Mat4& m) +{ + Mat4 A = m; + int i, j, k; + double max, t, det, pivot; + + /*---------- forward elimination ----------*/ + + for (i=0; i<4; i++) /* put identity matrix in B */ + for (j=0; j<4; j++) + B(i, j) = (double)(i==j); + + det = 1.0; + for (i=0; i<4; i++) { /* eliminate in column i, below diag */ + max = -1.; + for (k=i; k<4; k++) /* find pivot for column i */ + if (fabs(A(k, i)) > max) { + max = fabs(A(k, i)); + j = k; + } + if (max<=0.) return 0.; /* if no nonzero pivot, PUNT */ + if (j!=i) { /* swap rows i and j */ + for (k=i; k<4; k++) + SWAP(A(i, k), A(j, k), t); + for (k=0; k<4; k++) + SWAP(B(i, k), B(j, k), t); + det = -det; + } + pivot = A(i, i); + det *= pivot; + for (k=i+1; k<4; k++) /* only do elems to right of pivot */ + A(i, k) /= pivot; + for (k=0; k<4; k++) + B(i, k) /= pivot; + /* we know that A(i, i) will be set to 1, so don't bother to do it */ + + for (j=i+1; j<4; j++) { /* eliminate in rows below i */ + t = A(j, i); /* we're gonna zero this guy */ + for (k=i+1; k<4; k++) /* subtract scaled row i from row j */ + A(j, k) -= A(i, k)*t; /* (ignore k<=i, we know they're 0) */ + for (k=0; k<4; k++) + B(j, k) -= B(i, k)*t; + } + } + + /*---------- backward elimination ----------*/ + + for (i=4-1; i>0; i--) { /* eliminate in column i, above diag */ + for (j=0; jx = orig.x * data[0] + orig.y * data[4] + orig.z * data[8]; + dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9]; + dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10]; +} +/** Transform 3d vector by the transpose (w=0). */ +void TransformVec3T(const Vec3 & restrict orig, Vec3 * restrict dest) const { + piDebugCheck(&orig != dest); + dest->x = orig.x * data[0] + orig.y * data[1] + orig.z * data[2]; + dest->y = orig.x * data[4] + orig.y * data[5] + orig.z * data[6]; + dest->z = orig.x * data[8] + orig.y * data[9] + orig.z * data[10]; +} + +/** Transform a 3d homogeneous vector, where the fourth coordinate is assumed to be 1. */ +void TransformPoint(const Vec3 & restrict orig, Vec3 * restrict dest) const { + piDebugCheck(&orig != dest); + dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12]; + dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13]; + dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14]; +} + +/** Transform a point, normalize it, and return w. 
*/ +float TransformPointAndNormalize(const Vec3 & restrict orig, Vec3 * restrict dest) const { + piDebugCheck(&orig != dest); + float w; + dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12]; + dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13]; + dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14]; + w = 1 / (orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15]); + *dest *= w; + return w; +} + +/** Transform a point and return w. */ +float TransformPointReturnW(const Vec3 & restrict orig, Vec3 * restrict dest) const { + piDebugCheck(&orig != dest); + dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12]; + dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13]; + dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14]; + return orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15]; +} + +/** Transform a normalized 3d point by a 4d matrix and return the resulting 4d vector. */ +void TransformVec4(const Vec3 & orig, Vec4 * dest) const { + dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12]; + dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13]; + dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14]; + dest->w = orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15]; +} +//@} + +/** @name Matrix analysis. */ +//@{ + +/** Get the ZYZ euler angles from the matrix. Assumes the matrix is orthonormal. */ +void GetEulerAnglesZYZ(float * s, float * t, float * r) const { + if( GetElem(2,2) < 1.0f ) { + if( GetElem(2,2) > -1.0f ) { + // cs*ct*cr-ss*sr -ss*ct*cr-cs*sr st*cr + // cs*ct*sr+ss*cr -ss*ct*sr+cs*cr st*sr + // -cs*st ss*st ct + *s = atan2(GetElem(1,2), -GetElem(0,2)); + *t = acos(GetElem(2,2)); + *r = atan2(GetElem(2,1), GetElem(2,0)); + } + else { + // -c(s-r) s(s-r) 0 + // s(s-r) c(s-r) 0 + // 0 0 -1 + *s = atan2(GetElem(0, 1), -GetElem(0, 0)); // = s-r + *t = PI; + *r = 0; + } + } + else { + // c(s+r) -s(s+r) 0 + // s(s+r) c(s+r) 0 + // 0 0 1 + *s = atan2(GetElem(0, 1), GetElem(0, 0)); // = s+r + *t = 0; + *r = 0; + } +} + +//@} + +MATHLIB_API friend PiStream & operator<< ( PiStream & s, Matrix & m ); + +/** Print to debug output. */ +void Print() const { + piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[0], data[4], data[8], data[12] ); + piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[1], data[5], data[9], data[13] ); + piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[2], data[6], data[10], data[14] ); + piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[3], data[7], data[11], data[15] ); +} + + +public: + + float data[16]; + +}; +#endif + + +#endif // NV_MATH_MATRIX_INL Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Montecarlo.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Montecarlo.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Montecarlo.h @@ -1,84 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#ifndef NV_MATH_MONTECARLO_H -#define NV_MATH_MONTECARLO_H - -#include -#include - -namespace nv -{ - -/// A random sample distribution. -class SampleDistribution -{ -public: - - // Sampling method. - enum Method { - Method_Random, - Method_Stratified, - Method_NRook - }; - - // Distribution functions. - enum Distribution { - Distribution_Uniform, - Distribution_Cosine - }; - - /// Constructor. 
- SampleDistribution(int num) - { - m_sampleArray.resize(num); - } - - void redistribute(Method method=Method_NRook, Distribution dist=Distribution_Cosine); - - /// Get parametric coordinates of the sample. - Vector2 sample(int i) { return m_sampleArray[i].uv; } - - /// Get sample direction. - Vector3 sampleDir(int i) { return m_sampleArray[i].dir; } - - /// Get number of samples. - uint sampleCount() const { return m_sampleArray.count(); } - -private: - - void redistributeRandom(const Distribution dist); - void redistributeStratified(const Distribution dist); - void multiStageNRooks(const int size, int* cells); - void redistributeNRook(const Distribution dist); - - - /// A sample of the random distribution. - struct Sample - { - /// Set sample given the 3d coordinates. - void setDir(float x, float y, float z) { - dir.set(x, y, z); - uv.set(acosf(z), atan2f(y, x)); - } - - /// Set sample given the 2d parametric coordinates. - void setUV(float u, float v) { - uv.set(u, v); - dir.set(sinf(u) * cosf(v), sinf(u) * sinf(v), cosf(u)); - } - - Vector2 uv; - Vector3 dir; - }; - - /// Random seed. - MTRand m_rand; - - /// Samples. - Array m_sampleArray; - -}; - -} // nv namespace - -#endif // NV_MATH_MONTECARLO_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Montecarlo.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Montecarlo.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Montecarlo.cpp @@ -1,156 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#include - -using namespace nv; - - -void SampleDistribution::redistribute(Method method/*=Method_NRook*/, Distribution dist/*=Distribution_Cosine*/) -{ - switch(method) - { - case Method_Random: - redistributeRandom(dist); - break; - case Method_Stratified: - redistributeStratified(dist); - break; - case Method_NRook: - redistributeNRook(dist); - break; - }; -} - -void SampleDistribution::redistributeRandom(const Distribution dist) -{ - const uint sampleCount = m_sampleArray.count(); - - // This is the worst method possible! - for(uint i = 0; i < sampleCount; i++) - { - float x = m_rand.getFloat(); - float y = m_rand.getFloat(); - - // Map uniform distribution in the square to the (hemi)sphere. - if( dist == Distribution_Uniform ) { - m_sampleArray[i].setUV(acosf(1 - 2 * x), 2 * PI * y); - } - else { - nvDebugCheck(dist == Distribution_Cosine); - m_sampleArray[i].setUV(acosf(sqrtf(x)), 2 * PI * y); - } - } -} - - -void SampleDistribution::redistributeStratified(const Distribution dist) -{ - const uint sampleCount = m_sampleArray.count(); - const uint sqrtSampleCount = uint(sqrtf(float(sampleCount))); - - nvDebugCheck(sqrtSampleCount*sqrtSampleCount == sampleCount); // Must use exact powers! - - // Create a uniform distribution of points on the hemisphere with low variance. - for(uint v = 0, i = 0; v < sqrtSampleCount; v++) { - for(uint u = 0; u < sqrtSampleCount; u++, i++) { - float x = (u + m_rand.getFloat()) / float(sqrtSampleCount); - float y = (v + m_rand.getFloat()) / float(sqrtSampleCount); - - // Map uniform distribution in the square to the (hemi)sphere. - if( dist == Distribution_Uniform ) { - m_sampleArray[i].setUV(acosf(1 - 2 * x), 2 * PI * y); - } - else { - nvDebugCheck(dist == Distribution_Cosine); - m_sampleArray[i].setUV(acosf(sqrtf(x)), 2 * PI * y); - } - } - } -} - - -/** Multi-Stage N-rooks Sampling Method. 
- * See: http://www.acm.org/jgt/papers/WangSung9/9 - */ -void SampleDistribution::multiStageNRooks(const int size, int* cells) -{ - if (size == 1) { - return; - } - - int size1 = size >> 1; - int size2 = size >> 1; - - if (size & 1) { - if (m_rand.getFloat() > 0.5) { - size1++; - } - else { - size2++; - } - } - - int* upper_cells = new int[size1]; - int* lower_cells = new int[size2]; - - int i, j; - for(i = 0, j = 0; i < size - 1; i += 2, j++) { - if (m_rand.get() & 1) { - upper_cells[j] = cells[i]; - lower_cells[j] = cells[i + 1]; - } - else { - upper_cells[j] = cells[i + 1]; - lower_cells[j] = cells[i]; - } - } - - if (size1 != size2) { - if (size1 > size2) { - upper_cells[j] = cells[i]; - } - else { - lower_cells[j] = cells[i]; - } - } - - multiStageNRooks(size1, upper_cells); - memcpy(cells, upper_cells, size1 * sizeof(int)); - delete [] upper_cells; - - multiStageNRooks(size2, lower_cells); - memcpy(cells + size1, lower_cells, size2 * sizeof(int)); - delete [] lower_cells; -} - - -void SampleDistribution::redistributeNRook(const Distribution dist) -{ - const uint sampleCount = m_sampleArray.count(); - - // Generate nrook cells - int * cells = new int[sampleCount]; - for(uint32 i = 0; i < sampleCount; i++) - { - cells[i] = i; - } - multiStageNRooks(sampleCount, cells); - - for(uint i = 0; i < sampleCount; i++) - { - float x = (i + m_rand.getFloat()) / sampleCount; - float y = (cells[i] + m_rand.getFloat()) / sampleCount; - - // Map uniform distribution in the square to the (hemi)sphere. - if( dist == Distribution_Uniform ) { - m_sampleArray[i].setUV(acosf(1 - 2 * x), 2 * PI * y); - } - else { - nvDebugCheck(dist == Distribution_Cosine); - m_sampleArray[i].setUV(acosf(sqrtf(x)), 2 * PI * y); - } - } - - delete [] cells; -} - Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/PackedFloat.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/PackedFloat.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/PackedFloat.h @@ -0,0 +1,79 @@ +// This code is in the public domain -- castano@gmail.com + +#pragma once +#ifndef NV_MATH_PACKEDFLOAT_H +#define NV_MATH_PACKEDFLOAT_H + +#include "nvmath.h" +#include "Vector.h" + +namespace nv +{ + + union FloatRGB9E5 { + uint32 v; + struct { + #if NV_BIG_ENDIAN + uint32 e : 5; + uint32 zm : 9; + uint32 ym : 9; + uint32 xm : 9; + #else + uint32 xm : 9; + uint32 ym : 9; + uint32 zm : 9; + uint32 e : 5; + #endif + }; + }; + + union FloatR11G11B10 { + uint32 v; + struct { + #if NV_BIG_ENDIAN + uint32 ze : 5; + uint32 zm : 5; + uint32 ye : 5; + uint32 ym : 6; + uint32 xe : 5; + uint32 xm : 6; + #else + uint32 xm : 6; + uint32 xe : 5; + uint32 ym : 6; + uint32 ye : 5; + uint32 zm : 5; + uint32 ze : 5; + #endif + }; + }; + + union FloatRGBE8 { + uint32 v; + struct { + #if NV_LITTLE_ENDIAN + uint8 r, g, b, e; + #else + uint8 e: 8; + uint8 b: 8; + uint8 g: 8; + uint8 r: 8; + #endif + }; + }; + + NVMATH_API Vector3 rgb9e5_to_vector3(FloatRGB9E5 v); + NVMATH_API FloatRGB9E5 vector3_to_rgb9e5(const Vector3 & v); + + NVMATH_API float float11_to_float32(uint v); + NVMATH_API float float10_to_float32(uint v); + + NVMATH_API Vector3 r11g11b10_to_vector3(FloatR11G11B10 v); + NVMATH_API FloatR11G11B10 vector3_to_r11g11b10(const Vector3 & v); + + NVMATH_API Vector3 rgbe8_to_vector3(FloatRGBE8 v); + NVMATH_API FloatRGBE8 vector3_to_rgbe8(const Vector3 & v); + +} // nv + +#endif // NV_MATH_PACKEDFLOAT_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/PackedFloat.cpp 
=================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/PackedFloat.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/PackedFloat.cpp @@ -0,0 +1,61 @@ +// This code is in the public domain -- Ignacio Castaño + +#include "PackedFloat.h" +#include "Vector.inl" +#include "ftoi.h" + +using namespace nv; + +Vector3 nv::rgb9e5_to_vector3(FloatRGB9E5 v) { +} + +FloatRGB9E5 nv::vector3_to_rgb9e5(const Vector3 & v) { +} + + +float nv::float11_to_float32(uint v) { +} + +float nv::float10_to_float32(uint v) { +} + +Vector3 nv::r11g11b10_to_vector3(FloatR11G11B10 v) { +} + +FloatR11G11B10 nv::vector3_to_r11g11b10(const Vector3 & v) { +} + +// These are based on: +// http://www.graphics.cornell.edu/~bjw/rgbe/rgbe.c +// While this may not be the best way to encode/decode RGBE8, I'm not making any changes to maintain compatibility. +FloatRGBE8 nv::vector3_to_rgbe8(const Vector3 & v) { + + float m = max3(v.x, v.y, v.z); + + FloatRGBE8 rgbe; + + if (m < 1e-32) { + rgbe.v = 0; + } + else { + int e; + float scale = frexpf(m, &e) * 256.0f / m; + rgbe.r = U8(ftoi_round(v.x * scale)); + rgbe.g = U8(ftoi_round(v.y * scale)); + rgbe.b = U8(ftoi_round(v.z * scale)); + rgbe.e = U8(e + 128); + } + + return rgbe; +} + + +Vector3 nv::rgbe8_to_vector3(FloatRGBE8 v) { + if (v.e != 0) { + float scale = ldexpf(1.0f, v.e-(int)(128+8)); // +8 to divide by 256. @@ Shouldn't we divide by 255 instead? + return scale * Vector3(float(v.r), float(v.g), float(v.b)); + } + + return Vector3(0); +} + Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Plane.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Plane.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Plane.h @@ -1,77 +1,45 @@ -// This code is in the public domain -- castanyo@yahoo.es +// This code is in the public domain -- Ignacio Castaño +#pragma once #ifndef NV_MATH_PLANE_H #define NV_MATH_PLANE_H -#include -#include +#include "nvmath.h" +#include "Vector.h" + +#if NV_USE_ALTIVEC +#undef vector +#endif namespace nv { - class Matrix; + class Matrix; + + class NVMATH_CLASS Plane + { + public: + Plane(); + Plane(float x, float y, float z, float w); + Plane(const Vector4 & v); + Plane(const Vector3 & v, float d); + Plane(const Vector3 & normal, const Vector3 & point); + Plane(const Vector3 & v0, const Vector3 & v1, const Vector3 & v2); + + const Plane & operator=(const Plane & v); + + Vector3 vector() const; + float offset() const; + + void operator*=(float s); + + Vector4 v; + }; + + Plane transformPlane(const Matrix &, const Plane &); + Vector3 planeIntersection(const Plane & a, const Plane & b, const Plane & c); - class NVMATH_CLASS Plane - { - public: - typedef Plane const & Arg; - - Plane(); - Plane(float x, float y, float z, float w); - Plane(Vector4::Arg v); - Plane(Vector3::Arg v, float d); - Plane(Vector3::Arg normal, Vector3::Arg point); - - const Plane & operator=(Plane::Arg v); - - Vector3 vector() const; - scalar offset() const; - - const Vector4 & asVector() const; - Vector4 & asVector(); - - void operator*=(scalar s); - - private: - Vector4 p; - }; - - inline Plane::Plane() {} - inline Plane::Plane(float x, float y, float z, float w) : p(x, y, z, w) {} - inline Plane::Plane(Vector4::Arg v) : p(v) {} - inline Plane::Plane(Vector3::Arg v, float d) : p(v, d) {} - inline Plane::Plane(Vector3::Arg normal, Vector3::Arg point) : p(normal, dot(normal, point)) {} - - inline const Plane & Plane::operator=(Plane::Arg v) { p = v.p; 
return *this; } - - inline Vector3 Plane::vector() const { return p.xyz(); } - inline scalar Plane::offset() const { return p.w(); } - - inline const Vector4 & Plane::asVector() const { return p; } - inline Vector4 & Plane::asVector() { return p; } - - // Normalize plane. - inline Plane normalize(Plane::Arg plane, float epsilon = NV_EPSILON) - { - const float len = length(plane.vector()); - nvDebugCheck(!isZero(len, epsilon)); - const float inv = 1.0f / len; - return Plane(plane.asVector() * inv); - } - - // Get the distance from the given point to this plane. - inline float distance(Plane::Arg plane, Vector3::Arg point) - { - return dot(plane.vector(), point) - plane.offset(); - } - - inline void Plane::operator*=(scalar s) - { - scale(p, s); - } - Plane transformPlane(const Matrix&, Plane::Arg); - } // nv namespace #endif // NV_MATH_PLANE_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Plane.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Plane.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Plane.cpp @@ -1,17 +1,27 @@ // This code is in the public domain -- castanyo@yahoo.es #include "Plane.h" -#include "Matrix.h" +#include "Plane.inl" +#include "Matrix.inl" namespace nv { - Plane transformPlane(const Matrix& m, Plane::Arg p) - { - Vector3 newVec = transformVector(m, p.vector()); - - Vector3 ptInPlane = p.offset() * p.vector(); - ptInPlane = transformPoint(m, ptInPlane); - - return Plane(newVec, ptInPlane); - } -} + Plane transformPlane(const Matrix & m, const Plane & p) + { + Vector3 newVec = transformVector(m, p.vector()); + + Vector3 ptInPlane = p.offset() * p.vector(); + ptInPlane = transformPoint(m, ptInPlane); + + return Plane(newVec, ptInPlane); + } + + Vector3 planeIntersection(const Plane & a, const Plane & b, const Plane & c) + { + return dot(a.vector(), cross(b.vector(), c.vector())) * ( + a.offset() * cross(b.vector(), c.vector()) + + c.offset() * cross(a.vector(), b.vector()) + + b.offset() * cross(c.vector(), a.vector())); + } + +} // nv namespace Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Plane.inl =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Plane.inl +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Plane.inl @@ -0,0 +1,49 @@ +// This code is in the public domain -- Ignacio Castaño + +#pragma once +#ifndef NV_MATH_PLANE_INL +#define NV_MATH_PLANE_INL + +#include "Plane.h" +#include "Vector.inl" + +namespace nv +{ + inline Plane::Plane() {} + inline Plane::Plane(float x, float y, float z, float w) : v(x, y, z, w) {} + inline Plane::Plane(const Vector4 & v) : v(v) {} + inline Plane::Plane(const Vector3 & v, float d) : v(v, d) {} + inline Plane::Plane(const Vector3 & normal, const Vector3 & point) : v(normal, -dot(normal, point)) {} + inline Plane::Plane(const Vector3 & v0, const Vector3 & v1, const Vector3 & v2) { + Vector3 n = cross(v1-v0, v2-v0); + float d = -dot(n, v0); + v = Vector4(n, d); + } + + inline const Plane & Plane::operator=(const Plane & p) { v = p.v; return *this; } + + inline Vector3 Plane::vector() const { return v.xyz(); } + inline float Plane::offset() const { return v.w; } + + // Normalize plane. + inline Plane normalize(const Plane & plane, float epsilon = NV_EPSILON) + { + const float len = length(plane.vector()); + const float inv = isZero(len, epsilon) ? 0 : 1.0f / len; + return Plane(plane.v * inv); + } + + // Get the signed distance from the given point to this plane. 
+ inline float distance(const Plane & plane, const Vector3 & point) + { + return dot(plane.vector(), point) + plane.offset(); + } + + inline void Plane::operator*=(float s) + { + v *= s; + } + +} // nv namespace + +#endif // NV_MATH_PLANE_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Quaternion.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Quaternion.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Quaternion.h @@ -1,128 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#ifndef NV_MATH_QUATERNION_H -#define NV_MATH_QUATERNION_H - -#include -#include - -namespace nv -{ - - class NVMATH_CLASS Quaternion - { - public: - typedef Quaternion const & Arg; - - Quaternion(); - explicit Quaternion(zero_t); - Quaternion(float x, float y, float z, float w); - Quaternion(Vector4::Arg v); - - const Quaternion & operator=(Quaternion::Arg v); - - scalar x() const; - scalar y() const; - scalar z() const; - scalar w() const; - - const Vector4 & asVector() const; - Vector4 & asVector(); - - private: - Vector4 q; - }; - - inline Quaternion::Quaternion() {} - inline Quaternion::Quaternion(zero_t) : q(zero) {} - inline Quaternion::Quaternion(float x, float y, float z, float w) : q(x, y, z, w) {} - inline Quaternion::Quaternion(Vector4::Arg v) : q(v) {} - - inline const Quaternion & Quaternion::operator=(Quaternion::Arg v) { q = v.q; return *this; } - - inline scalar Quaternion::x() const { return q.x(); } - inline scalar Quaternion::y() const { return q.y(); } - inline scalar Quaternion::z() const { return q.z(); } - inline scalar Quaternion::w() const { return q.w(); } - - inline const Vector4 & Quaternion::asVector() const { return q; } - inline Vector4 & Quaternion::asVector() { return q; } - - - inline Quaternion mul(Quaternion::Arg a, Quaternion::Arg b) - { - // @@ Efficient SIMD implementation? - return Quaternion( - + a.x() * b.w() + a.y()*b.z() - a.z()*b.y() + a.w()*b.x(), - - a.x() * b.z() + a.y()*b.w() + a.z()*b.x() + a.w()*b.y(), - + a.x() * b.y() - a.y()*b.x() + a.z()*b.w() + a.w()*b.z(), - - a.x() * b.x() - a.y()*b.y() - a.z()*b.z() + a.w()*b.w()); - } - - inline Quaternion scale(Quaternion::Arg q, float s) - { - return scale(q.asVector(), s); - } - inline Quaternion operator *(Quaternion::Arg q, float s) - { - return scale(q, s); - } - inline Quaternion operator *(float s, Quaternion::Arg q) - { - return scale(q, s); - } - - inline Quaternion scale(Quaternion::Arg q, Vector4::Arg s) - { - return scale(q.asVector(), s); - } - /*inline Quaternion operator *(Quaternion::Arg q, Vector4::Arg s) - { - return scale(q, s); - } - inline Quaternion operator *(Vector4::Arg s, Quaternion::Arg q) - { - return scale(q, s); - }*/ - - inline Quaternion conjugate(Quaternion::Arg q) - { - return scale(q, Vector4(-1, -1, -1, 1)); - } - - inline float length(Quaternion::Arg q) - { - return length(q.asVector()); - } - - inline bool isNormalized(Quaternion::Arg q, float epsilon = NV_NORMAL_EPSILON) - { - return equal(length(q), 1, epsilon); - } - - inline Quaternion normalize(Quaternion::Arg q, float epsilon = NV_EPSILON) - { - float l = length(q); - nvDebugCheck(!isZero(l, epsilon)); - Quaternion n = scale(q, 1.0f / l); - nvDebugCheck(isNormalized(n)); - return n; - } - - inline Quaternion inverse(Quaternion::Arg q) - { - return conjugate(normalize(q)); - } - - /// Create a rotation quaternion for @a angle alpha around normal vector @a v. 
- inline Quaternion axisAngle(Vector3::Arg v, float alpha) - { - float s = sinf(alpha * 0.5f); - float c = cosf(alpha * 0.5f); - return Quaternion(Vector4(v * s, c)); - } - - -} // nv namespace - -#endif // NV_MATH_QUATERNION_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Random.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Random.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Random.h @@ -1,368 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#ifndef NV_MATH_RANDOM_H -#define NV_MATH_RANDOM_H - -#include // nextPowerOfTwo -#include - -namespace nv -{ - -/// Interface of the random number generators. -class Rand -{ -public: - - virtual ~Rand() {} - - enum time_e { Time }; - - /// Provide a new seed. - virtual void seed( uint s ) { /* empty */ }; - - /// Get an integer random number. - virtual uint get() = 0; - - /// Get a random number on [0, max] interval. - uint getRange( uint max ) - { - uint n; - // uint mask = Bitmask( max ); - // do { n = Get() & mask; } while( n > max ); - uint np2 = nextPowerOfTwo( max ); - do { n = get() & (np2-1); } while( n > max ); - return n; - } - - /// Random number on [0.0, 1.0] interval. - float getFloat() - { - union - { - uint32 i; - float f; - } pun; - - pun.i = 0x3f800000UL | (get() & 0x007fffffUL); - return pun.f - 1.0f; - } - - /* - /// Random number on [0.0, 1.0] interval. - double getReal() - { - return double(get()) * (1.0/4294967295.0); // 2^32-1 - } - - /// Random number on [0.0, 1.0) interval. - double getRealExclusive() - { - return double(get()) * (1.0/4294967296.0); // 2^32 - } - */ - - /// Get the max value of the random number. - uint max() const { return 4294967295U; } - - // Get a random seed. - static uint randomSeed(); - -}; - - -/// Very simple random number generator with low storage requirements. -class SimpleRand : public Rand -{ -public: - - /// Constructor that uses the current time as the seed. - SimpleRand( time_e ) - { - seed(randomSeed()); - } - - /// Constructor that uses the given seed. - SimpleRand( uint s = 0 ) - { - seed(s); - } - - /// Set the given seed. - virtual void seed( uint s ) - { - current = s; - } - - /// Get a random number. - virtual uint get() - { - return current = current * 1103515245 + 12345; - } - -private: - - uint current; - -}; - - -/// Mersenne twister random number generator. -class MTRand : public Rand -{ -public: - - enum { N = 624 }; // length of state vector - enum { M = 397 }; - - /// Constructor that uses the current time as the seed. - MTRand( time_e ) - { - seed(randomSeed()); - } - - /// Constructor that uses the given seed. - MTRand( uint s = 0 ) - { - seed(s); - } - - /// Constructor that uses the given seeds. - NVMATH_API MTRand( const uint * seed_array, uint length ); - - - /// Provide a new seed. - virtual void seed( uint s ) - { - initialize(s); - reload(); - } - - /// Get a random number between 0 - 65536. 
- virtual uint get() - { - // Pull a 32-bit integer from the generator state - // Every other access function simply transforms the numbers extracted here - if( left == 0 ) { - reload(); - } - left--; - - uint s1; - s1 = *next++; - s1 ^= (s1 >> 11); - s1 ^= (s1 << 7) & 0x9d2c5680U; - s1 ^= (s1 << 15) & 0xefc60000U; - return ( s1 ^ (s1 >> 18) ); - }; - - -private: - - NVMATH_API void initialize( uint32 seed ); - NVMATH_API void reload(); - - uint hiBit( uint u ) const { return u & 0x80000000U; } - uint loBit( uint u ) const { return u & 0x00000001U; } - uint loBits( uint u ) const { return u & 0x7fffffffU; } - uint mixBits( uint u, uint v ) const { return hiBit(u) | loBits(v); } - uint twist( uint m, uint s0, uint s1 ) const { return m ^ (mixBits(s0,s1)>>1) ^ ((~loBit(s1)+1) & 0x9908b0dfU); } - -private: - - uint state[N]; // internal state - uint * next; // next value to get from state - int left; // number of values left before reload needed - -}; - - - -/** George Marsaglia's random number generator. - * Code based on Thatcher Ulrich public domain source code: - * http://cvs.sourceforge.net/viewcvs.py/tu-testbed/tu-testbed/base/tu_random.cpp?rev=1.7&view=auto - * - * PRNG code adapted from the complimentary-multiply-with-carry - * code in the article: George Marsaglia, "Seeds for Random Number - * Generators", Communications of the ACM, May 2003, Vol 46 No 5, - * pp90-93. - * - * The article says: - * - * "Any one of the choices for seed table size and multiplier will - * provide a RNG that has passed extensive tests of randomness, - * particularly those in [3], yet is simple and fast -- - * approximately 30 million random 32-bit integers per second on a - * 850MHz PC. The period is a*b^n, where a is the multiplier, n - * the size of the seed table and b=2^32-1. (a is chosen so that - * b is a primitive root of the prime a*b^n + 1.)" - * - * [3] Marsaglia, G., Zaman, A., and Tsang, W. Toward a universal - * random number generator. _Statistics and Probability Letters - * 8_ (1990), 35-39. - */ -class GMRand : public Rand -{ -public: - - enum { SEED_COUNT = 8 }; - -// const uint64 a = 123471786; // for SEED_COUNT=1024 -// const uint64 a = 123554632; // for SEED_COUNT=512 -// const uint64 a = 8001634; // for SEED_COUNT=255 -// const uint64 a = 8007626; // for SEED_COUNT=128 -// const uint64 a = 647535442; // for SEED_COUNT=64 -// const uint64 a = 547416522; // for SEED_COUNT=32 -// const uint64 a = 487198574; // for SEED_COUNT=16 -// const uint64 a = 716514398U; // for SEED_COUNT=8 - enum { a = 716514398U }; - - - GMRand( time_e ) - { - seed(randomSeed()); - } - - GMRand(uint s = 987654321) - { - seed(s); - } - - - /// Provide a new seed. - virtual void seed( uint s ) - { - c = 362436; - i = SEED_COUNT - 1; - - for(int i = 0; i < SEED_COUNT; i++) { - s = s ^ (s << 13); - s = s ^ (s >> 17); - s = s ^ (s << 5); - Q[i] = s; - } - } - - /// Get a random number between 0 - 65536. - virtual uint get() - { - const uint32 r = 0xFFFFFFFE; - - uint64 t; - uint32 x; - - i = (i + 1) & (SEED_COUNT - 1); - t = a * Q[i] + c; - c = uint32(t >> 32); - x = uint32(t + c); - - if( x < c ) { - x++; - c++; - } - - uint32 val = r - x; - Q[i] = val; - return val; - }; - - -private: - - uint32 c; - uint32 i; - uint32 Q[8]; - -}; - - -/** Random number implementation from the GNU Sci. Lib. (GSL). - * Adapted from Nicholas Chapman version: - * - * Copyright (C) 1996, 1997, 1998, 1999, 2000 James Theiler, Brian Gough - * This is the Unix rand48() generator. 
The generator returns the - * upper 32 bits from each term of the sequence, - * - * x_{n+1} = (a x_n + c) mod m - * - * using 48-bit unsigned arithmetic, with a = 0x5DEECE66D , c = 0xB - * and m = 2^48. The seed specifies the upper 32 bits of the initial - * value, x_1, with the lower 16 bits set to 0x330E. - * - * The theoretical value of x_{10001} is 244131582646046. - * - * The period of this generator is ? FIXME (probably around 2^48). - */ -class Rand48 : public Rand -{ -public: - - Rand48( time_e ) - { - seed(randomSeed()); - } - - Rand48( uint s = 0x1234ABCD ) - { - seed(s); - } - - - /** Set the given seed. */ - virtual void seed( uint s ) { - vstate.x0 = 0x330E; - vstate.x1 = uint16(s & 0xFFFF); - vstate.x2 = uint16((s >> 16) & 0xFFFF); - } - - /** Get a random number. */ - virtual uint get() { - - advance(); - - uint x1 = vstate.x1; - uint x2 = vstate.x2; - return (x2 << 16) + x1; - } - - -private: - - void advance() - { - /* work with unsigned long ints throughout to get correct integer - promotions of any unsigned short ints */ - const uint32 x0 = vstate.x0; - const uint32 x1 = vstate.x1; - const uint32 x2 = vstate.x2; - - uint32 a; - a = a0 * x0 + c0; - - vstate.x0 = uint16(a & 0xFFFF); - a >>= 16; - - /* although the next line may overflow we only need the top 16 bits - in the following stage, so it does not matter */ - - a += a0 * x1 + a1 * x0; - vstate.x1 = uint16(a & 0xFFFF); - - a >>= 16; - a += a0 * x2 + a1 * x1 + a2 * x0; - vstate.x2 = uint16(a & 0xFFFF); - } - - -private: - NVMATH_API static const uint16 a0, a1, a2, c0; - - struct rand48_state_t { - uint16 x0, x1, x2; - } vstate; - -}; - -} // nv namespace - -#endif // NV_MATH_RANDOM_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Random.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Random.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Random.cpp @@ -1,54 +0,0 @@ -// This code is in the public domain -- castanyo@yahoo.es - -#include -#include - -using namespace nv; - -// Statics -const uint16 Rand48::a0 = 0xE66D; -const uint16 Rand48::a1 = 0xDEEC; -const uint16 Rand48::a2 = 0x0005; -const uint16 Rand48::c0 = 0x000B; - - -/// Get a random seed based on the current time. -uint Rand::randomSeed() -{ - return (uint)time(NULL); -} - - -void MTRand::initialize( uint32 seed ) -{ - // Initialize generator state with seed - // See Knuth TAOCP Vol 2, 3rd Ed, p.106 for multiplier. - // In previous versions, most significant bits (MSBs) of the seed affect - // only MSBs of the state array. Modified 9 Jan 2002 by Makoto Matsumoto. 
- uint32 *s = state; - uint32 *r = state; - int i = 1; - *s++ = seed & 0xffffffffUL; - for( ; i < N; ++i ) - { - *s++ = ( 1812433253UL * ( *r ^ (*r >> 30) ) + i ) & 0xffffffffUL; - r++; - } -} - - -void MTRand::reload() -{ - // Generate N new values in state - // Made clearer and faster by Matthew Bellew (matthew.bellew@home.com) - uint32 *p = state; - int i; - for( i = N - M; i--; ++p ) - *p = twist( p[M], p[0], p[1] ); - for( i = M; --i; ++p ) - *p = twist( p[M-N], p[0], p[1] ); - *p = twist( p[M-N], p[0], state[0] ); - - left = N, next = state; -} - Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/SimdVector.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/SimdVector.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/SimdVector.h @@ -0,0 +1,12 @@ +// This code is in the public domain -- Ignacio Castaño + +#include "Vector.h" // Vector3, Vector4 + + +#if NV_USE_ALTIVEC +# include "SimdVector_VE.h" +#endif + +#if NV_USE_SSE +# include "SimdVector_SSE.h" +#endif Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/SimdVector_SSE.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/SimdVector_SSE.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/SimdVector_SSE.h @@ -0,0 +1,216 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + -------------------------------------------------------------------------- */ + +#ifndef NV_SIMD_VECTOR_SSE_H +#define NV_SIMD_VECTOR_SSE_H + +#include "nvcore/Memory.h" + +#include +#if (NV_USE_SSE > 1) +#include +#endif + +// See this for ideas: +// http://molecularmusings.wordpress.com/2011/10/18/simdifying-multi-platform-math/ + + +namespace nv { + +#define NV_SIMD_NATIVE NV_FORCEINLINE +#define NV_SIMD_INLINE inline + + class SimdVector + { + public: + __m128 vec; + + typedef SimdVector const& Arg; + + NV_SIMD_NATIVE SimdVector() {} + + NV_SIMD_NATIVE explicit SimdVector(__m128 v) : vec(v) {} + + NV_SIMD_NATIVE explicit SimdVector(float f) { + vec = _mm_set1_ps(f); + } + + NV_SIMD_NATIVE explicit SimdVector(const float * v) + { + vec = _mm_load_ps( v ); + } + + NV_SIMD_NATIVE SimdVector(float x, float y, float z, float w) + { + vec = _mm_setr_ps( x, y, z, w ); + } + + NV_SIMD_NATIVE SimdVector(const SimdVector & arg) : vec(arg.vec) {} + + NV_SIMD_NATIVE SimdVector & operator=(const SimdVector & arg) + { + vec = arg.vec; + return *this; + } + + NV_SIMD_INLINE float toFloat() const + { + NV_ALIGN_16 float f; + _mm_store_ss(&f, vec); + return f; + } + + NV_SIMD_INLINE Vector3 toVector3() const + { + NV_ALIGN_16 float c[4]; + _mm_store_ps( c, vec ); + return Vector3( c[0], c[1], c[2] ); + } + + NV_SIMD_INLINE Vector4 toVector4() const + { + NV_ALIGN_16 float c[4]; + _mm_store_ps( c, vec ); + return Vector4( c[0], c[1], c[2], c[3] ); + } + +#define SSE_SPLAT( a ) ((a) | ((a) << 2) | ((a) << 4) | ((a) << 6)) + NV_SIMD_NATIVE SimdVector splatX() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 0 ) ) ); } + NV_SIMD_NATIVE SimdVector splatY() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 1 ) ) ); } + NV_SIMD_NATIVE SimdVector splatZ() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 2 ) ) ); } + NV_SIMD_NATIVE SimdVector splatW() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 3 ) ) ); } +#undef SSE_SPLAT + + NV_SIMD_NATIVE SimdVector& operator+=( Arg v ) + { + vec = _mm_add_ps( vec, v.vec ); + return *this; + } + + NV_SIMD_NATIVE SimdVector& operator-=( Arg v ) + { + vec = _mm_sub_ps( vec, v.vec ); + return *this; + } + + NV_SIMD_NATIVE SimdVector& operator*=( Arg v ) + { + vec = _mm_mul_ps( vec, v.vec ); + return *this; + } + }; + + + NV_SIMD_NATIVE SimdVector operator+( SimdVector::Arg left, SimdVector::Arg right ) + { + return SimdVector( _mm_add_ps( left.vec, right.vec ) ); + } + + NV_SIMD_NATIVE SimdVector operator-( SimdVector::Arg left, SimdVector::Arg right ) + { + return SimdVector( _mm_sub_ps( left.vec, right.vec ) ); + } + + NV_SIMD_NATIVE SimdVector operator*( SimdVector::Arg left, SimdVector::Arg right ) + { + return SimdVector( _mm_mul_ps( left.vec, right.vec ) ); + } + + // Returns a*b + c + NV_SIMD_INLINE SimdVector multiplyAdd( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c ) + { + return SimdVector( _mm_add_ps( _mm_mul_ps( a.vec, b.vec ), c.vec ) ); + } + + // Returns -( a*b - c ) + NV_SIMD_INLINE SimdVector negativeMultiplySubtract( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c ) + { + return SimdVector( _mm_sub_ps( c.vec, _mm_mul_ps( a.vec, b.vec ) ) ); + } + + NV_SIMD_INLINE SimdVector reciprocal( SimdVector::Arg v ) + { + // get the reciprocal estimate + __m128 estimate = _mm_rcp_ps( v.vec ); + + // one round of Newton-Rhaphson refinement + __m128 diff = _mm_sub_ps( _mm_set1_ps( 1.0f ), _mm_mul_ps( estimate, v.vec ) ); + return SimdVector( _mm_add_ps( _mm_mul_ps( diff, estimate ), 
estimate ) ); + } + + NV_SIMD_NATIVE SimdVector min( SimdVector::Arg left, SimdVector::Arg right ) + { + return SimdVector( _mm_min_ps( left.vec, right.vec ) ); + } + + NV_SIMD_NATIVE SimdVector max( SimdVector::Arg left, SimdVector::Arg right ) + { + return SimdVector( _mm_max_ps( left.vec, right.vec ) ); + } + + NV_SIMD_INLINE SimdVector truncate( SimdVector::Arg v ) + { +#if (NV_USE_SSE == 1) + // convert to ints + __m128 input = v.vec; + __m64 lo = _mm_cvttps_pi32( input ); + __m64 hi = _mm_cvttps_pi32( _mm_movehl_ps( input, input ) ); + + // convert to floats + __m128 part = _mm_movelh_ps( input, _mm_cvtpi32_ps( input, hi ) ); + __m128 truncated = _mm_cvtpi32_ps( part, lo ); + + // clear out the MMX multimedia state to allow FP calls later + _mm_empty(); + return SimdVector( truncated ); +#else + // use SSE2 instructions + return SimdVector( _mm_cvtepi32_ps( _mm_cvttps_epi32( v.vec ) ) ); +#endif + } + + NV_SIMD_NATIVE SimdVector compareEqual( SimdVector::Arg left, SimdVector::Arg right ) + { + return SimdVector( _mm_cmpeq_ps( left.vec, right.vec ) ); + } + + NV_SIMD_INLINE SimdVector select( SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits ) + { + __m128 a = _mm_andnot_ps( bits.vec, off.vec ); + __m128 b = _mm_and_ps( bits.vec, on.vec ); + + return SimdVector( _mm_or_ps( a, b ) ); + } + + NV_SIMD_INLINE bool compareAnyLessThan( SimdVector::Arg left, SimdVector::Arg right ) + { + __m128 bits = _mm_cmplt_ps( left.vec, right.vec ); + int value = _mm_movemask_ps( bits ); + return value != 0; + } + +} // namespace nv + +#endif // NV_SIMD_VECTOR_SSE_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/SimdVector_VE.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/SimdVector_VE.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/SimdVector_VE.h @@ -0,0 +1,189 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + Copyright (c) 2016 Raptor Engineering, LLC + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + -------------------------------------------------------------------------- */ + +#ifndef NV_SIMD_VECTOR_VE_H +#define NV_SIMD_VECTOR_VE_H + +#ifndef __APPLE_ALTIVEC__ +#include +#undef bool +#endif + +namespace nv { + + class SimdVector + { + public: + vector float vec; + + typedef SimdVector Arg; + + SimdVector() {} + explicit SimdVector(float v) : vec(vec_splats(v)) {} + explicit SimdVector(vector float v) : vec(v) {} + SimdVector(const SimdVector & arg) : vec(arg.vec) {} + + SimdVector& operator=(const SimdVector & arg) + { + vec = arg.vec; + return *this; + } + + SimdVector(const float * v) + { + union { vector float v; float c[4]; } u; + u.c[0] = v[0]; + u.c[1] = v[1]; + u.c[2] = v[2]; + u.c[3] = v[3]; + vec = u.v; + } + + SimdVector(float x, float y, float z, float w) + { + union { vector float v; float c[4]; } u; + u.c[0] = x; + u.c[1] = y; + u.c[2] = z; + u.c[3] = w; + vec = u.v; + } + + float toFloat() const + { + union { vector float v; float c[4]; } u; + u.v = vec; + return u.c[0]; + } + + Vector3 toVector3() const + { + union { vector float v; float c[4]; } u; + u.v = vec; + return Vector3( u.c[0], u.c[1], u.c[2] ); + } + + Vector4 toVector4() const + { + union { vector float v; float c[4]; } u; + u.v = vec; + return Vector4( u.c[0], u.c[1], u.c[2], u.c[3] ); + } + + SimdVector splatX() const { return SimdVector( vec_splat( vec, 0 ) ); } + SimdVector splatY() const { return SimdVector( vec_splat( vec, 1 ) ); } + SimdVector splatZ() const { return SimdVector( vec_splat( vec, 2 ) ); } + SimdVector splatW() const { return SimdVector( vec_splat( vec, 3 ) ); } + + SimdVector& operator+=( Arg v ) + { + vec = vec_add( vec, v.vec ); + return *this; + } + + SimdVector& operator-=( Arg v ) + { + vec = vec_sub( vec, v.vec ); + return *this; + } + + SimdVector& operator*=( Arg v ) + { + vec = vec_madd( vec, v.vec, vec_splats( -0.0f ) ); + return *this; + } + }; + + inline SimdVector operator+( SimdVector::Arg left, SimdVector::Arg right ) + { + return SimdVector( vec_add( left.vec, right.vec ) ); + } + + inline SimdVector operator-( SimdVector::Arg left, SimdVector::Arg right ) + { + return SimdVector( vec_sub( left.vec, right.vec ) ); + } + + inline SimdVector operator*( SimdVector::Arg left, SimdVector::Arg right ) + { + return SimdVector( vec_madd( left.vec, right.vec, vec_splats( -0.0f ) ) ); + } + + // Returns a*b + c + inline SimdVector multiplyAdd( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c ) + { + return SimdVector( vec_madd( a.vec, b.vec, c.vec ) ); + } + + // Returns -( a*b - c ) + inline SimdVector negativeMultiplySubtract( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c ) + { + return SimdVector( vec_nmsub( a.vec, b.vec, c.vec ) ); + } + + inline SimdVector reciprocal( SimdVector::Arg v ) + { + // get the reciprocal estimate + vector float estimate = vec_re( v.vec ); + + // one round of Newton-Rhaphson refinement + vector float diff = vec_nmsub( estimate, v.vec, vec_splats( 1.0f ) ); + return SimdVector( vec_madd( diff, estimate, estimate ) ); + } + + inline SimdVector min( SimdVector::Arg left, SimdVector::Arg right ) + { + return SimdVector( vec_min( left.vec, right.vec ) ); + } + + inline SimdVector max( SimdVector::Arg left, SimdVector::Arg right ) + { + return SimdVector( vec_max( left.vec, right.vec ) ); + } + + inline SimdVector truncate( SimdVector::Arg v ) + { + return SimdVector( vec_trunc( v.vec ) ); + } + + inline SimdVector compareEqual( SimdVector::Arg left, SimdVector::Arg right ) + { + return SimdVector( ( vector float )vec_cmpeq( 
left.vec, right.vec ) ); + } + + inline SimdVector select( SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits ) + { + return SimdVector( vec_sel( off.vec, on.vec, ( vector unsigned int )bits.vec ) ); + } + + inline bool compareAnyLessThan( SimdVector::Arg left, SimdVector::Arg right ) + { + return vec_any_lt( left.vec, right.vec ) != 0; + } + +} // namespace nv + +#endif // NV_SIMD_VECTOR_VE_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/SphericalHarmonic.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/SphericalHarmonic.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/SphericalHarmonic.h @@ -3,415 +3,415 @@ #ifndef NV_MATH_SPHERICALHARMONIC_H #define NV_MATH_SPHERICALHARMONIC_H -#include +#include "nvmath.h" -namespace nv -{ - - NVMATH_API float legendrePolynomial( int l, int m, float x ) NV_CONST; - NVMATH_API float y( int l, int m, float theta, float phi ) NV_CONST; - NVMATH_API float y( int l, int m, Vector3::Arg v ) NV_CONST; - NVMATH_API float hy( int l, int m, float theta, float phi ) NV_CONST; - NVMATH_API float hy( int l, int m, Vector3::Arg v ) NV_CONST; - - class Sh; - float dot(const Sh & a, const Sh & b) NV_CONST; - - - /// Spherical harmonic class. - class Sh - { - friend class Sh2; - friend class ShMatrix; - public: - - /// Construct a spherical harmonic of the given order. - Sh(int o) : m_order(o) - { - m_elemArray = new float[basisNum()]; - } - - /// Copy constructor. - Sh(const Sh & sh) : m_order(sh.order()) - { - m_elemArray = new float[basisNum()]; - memcpy(m_elemArray, sh.m_elemArray, sizeof(float) * basisNum()); - } - - /// Destructor. - ~Sh() - { - delete [] m_elemArray; - m_elemArray = NULL; - } - - /// Get number of bands. - static int bandNum(int order) { - return order + 1; - } - - /// Get number of sh basis. - static int basisNum(int order) { - return (order + 1) * (order + 1); - } - - /// Get the index for the given coefficients. - static int index( int l, int m ) { - return l * l + l + m; - } - - /// Get sh order. - int order() const - { - return m_order; - } - - /// Get sh order. - int bandNum() const - { - return bandNum(m_order); - } - - /// Get sh order. - int basisNum() const - { - return basisNum(m_order); - } - - /// Get sh coefficient indexed by l,m. - float elem( int l, int m ) const - { - return m_elemArray[index(l, m)]; - } - - /// Get sh coefficient indexed by l,m. - float & elem( int l, int m ) - { - return m_elemArray[index(l, m)]; - } - - - /// Get sh coefficient indexed by i. - float elemAt( int i ) const { - return m_elemArray[i]; - } - - /// Get sh coefficient indexed by i. - float & elemAt( int i ) - { - return m_elemArray[i]; - } - - - /// Reset the sh coefficients. - void reset() - { - for( int i = 0; i < basisNum(); i++ ) { - m_elemArray[i] = 0.0f; - } - } - - /// Copy spherical harmonic. - void operator= ( const Sh & sh ) - { - nvDebugCheck(order() <= sh.order()); - - for(int i = 0; i < basisNum(); i++) { - m_elemArray[i] = sh.m_elemArray[i]; - } - } - - /// Add spherical harmonics. - void operator+= ( const Sh & sh ) - { - nvDebugCheck(order() == sh.order()); - - for(int i = 0; i < basisNum(); i++) { - m_elemArray[i] += sh.m_elemArray[i]; - } - } - - /// Substract spherical harmonics. - void operator-= ( const Sh & sh ) - { - nvDebugCheck(order() == sh.order()); - - for(int i = 0; i < basisNum(); i++) { - m_elemArray[i] -= sh.m_elemArray[i]; - } - } - - // Not exactly convolution, nor product. 
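// A minimal sketch of the flat coefficient layout used by the Sh class here: all bands
// share one array indexed by index(l, m) = l*l + l + m, so order 2 yields nine entries
// ordered (0,0) (1,-1) (1,0) (1,1) (2,-2) (2,-1) (2,0) (2,1) (2,2). The helper below is
// hypothetical and only restates that mapping.
#include <cassert>

inline int shFlatIndex(int l, int m) { return l * l + l + m; }

inline void checkShLayout()
{
    assert(shFlatIndex(0,  0) == 0);
    assert(shFlatIndex(1, -1) == 1 && shFlatIndex(1, 1) == 3);
    assert(shFlatIndex(2, -2) == 4 && shFlatIndex(2, 2) == 8);
}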
- void operator*= ( const Sh & sh ) - { - nvDebugCheck(order() == sh.order()); - - for(int i = 0; i < basisNum(); i++) { - m_elemArray[i] *= sh.m_elemArray[i]; - } - } - - /// Scale spherical harmonics. - void operator*= ( float f ) - { - for(int i = 0; i < basisNum(); i++) { - m_elemArray[i] *= f; - } - } - - /// Add scaled spherical harmonics. - void addScaled( const Sh & sh, float f ) - { - nvDebugCheck(order() == sh.order()); - - for(int i = 0; i < basisNum(); i++) { - m_elemArray[i] += sh.m_elemArray[i] * f; - } - } - - - /*/// Add a weighted sample to the sh coefficients. - void AddSample( const Vec3 & dir, const Color3f & color, float w=1.0f ) { - for(int l = 0; l <= order; l++) { - for(int m = -l; m <= l; m++) { - Color3f & elem = GetElem(l, m); - elem.Mad( elem, color, w * y(l, m, dir) ); - } - } - }*/ - - /// Evaluate - void eval(Vector3::Arg dir) - { - for(int l = 0; l <= m_order; l++) { - for(int m = -l; m <= l; m++) { - elem(l, m) = y(l, m, dir); - } - } - } - - - /// Evaluate the spherical harmonic function. - float sample(Vector3::Arg dir) const - { - Sh sh(order()); - sh.eval(dir); - - return dot(sh, *this); - } - - - protected: - - const int m_order; - float * m_elemArray; - - }; - - - /// Compute dot product of the spherical harmonics. - inline float dot(const Sh & a, const Sh & b) - { - nvDebugCheck(a.order() == b.order()); - - float sum = 0; - for( int i = 0; i < Sh::basisNum(a.order()); i++ ) { - sum += a.elemAt(i) * b.elemAt(i); - } - - return sum; - } - - - /// Second order spherical harmonic. - class Sh2 : public Sh - { - public: - - /// Constructor. - Sh2() : Sh(2) {} - - /// Copy constructor. - Sh2(const Sh2 & sh) : Sh(sh) {} - - /// Spherical harmonic resulting from projecting the clamped cosine transfer function to the SH basis. - void cosineTransfer() - { - const float c1 = 0.282095f; // K(0, 0) - const float c2 = 0.488603f; // K(1, 0) - const float c3 = 1.092548f; // sqrt(15.0f / PI) / 2.0f = K(2, -2) - const float c4 = 0.315392f; // sqrt(5.0f / PI) / 4.0f) = K(2, 0) - const float c5 = 0.546274f; // sqrt(15.0f / PI) / 4.0f) = K(2, 2) - - const float normalization = PI * 16.0f / 17.0f; - - const float const1 = c1 * normalization * 1.0f; - const float const2 = c2 * normalization * (2.0f / 3.0f); - const float const3 = c3 * normalization * (1.0f / 4.0f); - const float const4 = c4 * normalization * (1.0f / 4.0f); - const float const5 = c5 * normalization * (1.0f / 4.0f); - - m_elemArray[0] = const1; - - m_elemArray[1] = -const2; - m_elemArray[2] = const2; - m_elemArray[3] = -const2; - - m_elemArray[4] = const3; - m_elemArray[5] = -const3; - m_elemArray[6] = const4; - m_elemArray[7] = -const3; - m_elemArray[8] = const5; - } - }; - - +#include // memcpy -#if 0 -/// Spherical harmonic matrix. -class ShMatrix +namespace nv { -public: + class Vector3; + class Matrix; + + NVMATH_API float legendrePolynomial( int l, int m, float x ) NV_CONST; + NVMATH_API float shBasis( int l, int m, float theta, float phi ) NV_CONST; + NVMATH_API float shBasis( int l, int m, const Vector3 & v ) NV_CONST; + NVMATH_API float hshBasis( int l, int m, float theta, float phi ) NV_CONST; + NVMATH_API float hshBasis( int l, int m, const Vector3 & v ) NV_CONST; + + class Sh; + float dot(const Sh & a, const Sh & b) NV_CONST; + + + /// Spherical harmonic class. + class Sh + { + friend class Sh2; + friend class ShMatrix; + public: + + /// Construct a spherical harmonic of the given order. + Sh(int o) : m_order(o) + { + m_elemArray = new float[basisNum()]; + } + + /// Copy constructor. 
+ Sh(const Sh & sh) : m_order(sh.order()) + { + m_elemArray = new float[basisNum()]; + memcpy(m_elemArray, sh.m_elemArray, sizeof(float) * basisNum()); + } + + /// Destructor. + ~Sh() + { + delete [] m_elemArray; + m_elemArray = NULL; + } + + /// Get number of bands. + static int bandNum(int m_order) { + return m_order + 1; + } + + /// Get number of sh basis. + static int basisNum(int m_order) { + return (m_order + 1) * (m_order + 1); + } + + /// Get the index for the given coefficients. + static int index( int l, int m ) { + return l * l + l + m; + } + + /// Get sh order. + int order() const + { + return m_order; + } + + /// Get sh order. + int bandNum() const + { + return bandNum(m_order); + } + + /// Get sh order. + int basisNum() const + { + return basisNum(m_order); + } + + /// Get sh coefficient indexed by l,m. + float elem( int l, int m ) const + { + return m_elemArray[index(l, m)]; + } + + /// Get sh coefficient indexed by l,m. + float & elem( int l, int m ) + { + return m_elemArray[index(l, m)]; + } + + + /// Get sh coefficient indexed by i. + float elemAt( int i ) const { + return m_elemArray[i]; + } + + /// Get sh coefficient indexed by i. + float & elemAt( int i ) + { + return m_elemArray[i]; + } + + + /// Reset the sh coefficients. + void reset() + { + for( int i = 0; i < basisNum(); i++ ) { + m_elemArray[i] = 0.0f; + } + } + + /// Copy spherical harmonic. + void operator= ( const Sh & sh ) + { + nvDebugCheck(order() <= sh.order()); + + for(int i = 0; i < basisNum(); i++) { + m_elemArray[i] = sh.m_elemArray[i]; + } + } + + /// Add spherical harmonics. + void operator+= ( const Sh & sh ) + { + nvDebugCheck(order() == sh.order()); + + for(int i = 0; i < basisNum(); i++) { + m_elemArray[i] += sh.m_elemArray[i]; + } + } + + /// Substract spherical harmonics. + void operator-= ( const Sh & sh ) + { + nvDebugCheck(order() == sh.order()); + + for(int i = 0; i < basisNum(); i++) { + m_elemArray[i] -= sh.m_elemArray[i]; + } + } + + // Not exactly convolution, nor product. + void operator*= ( const Sh & sh ) + { + nvDebugCheck(order() == sh.order()); + + for(int i = 0; i < basisNum(); i++) { + m_elemArray[i] *= sh.m_elemArray[i]; + } + } + + /// Scale spherical harmonics. + void operator*= ( float f ) + { + for(int i = 0; i < basisNum(); i++) { + m_elemArray[i] *= f; + } + } + + /// Add scaled spherical harmonics. + void addScaled( const Sh & sh, float f ) + { + nvDebugCheck(order() == sh.order()); + + for(int i = 0; i < basisNum(); i++) { + m_elemArray[i] += sh.m_elemArray[i] * f; + } + } + + + /*/// Add a weighted sample to the sh coefficients. + void AddSample( const Vec3 & dir, const Color3f & color, float w=1.0f ) { + for(int l = 0; l <= order; l++) { + for(int m = -l; m <= l; m++) { + Color3f & elem = GetElem(l, m); + elem.Mad( elem, color, w * shBasis(l, m, dir) ); + } + } + }*/ + + /// Evaluate + void eval(const Vector3 & dir) + { + for(int l = 0; l <= m_order; l++) { + for(int m = -l; m <= l; m++) { + elem(l, m) = shBasis(l, m, dir); + } + } + } + + + /// Evaluate the spherical harmonic function. + float sample(const Vector3 & dir) const + { + Sh sh(order()); + sh.eval(dir); + + return dot(sh, *this); + } + + + protected: + + const int m_order; + float * m_elemArray; + + }; + + + /// Compute dot product of the spherical harmonics. 
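// A short sketch of how eval() and dot() combine, which is what Sh::sample() does
// internally: fill a temporary Sh with the basis values at a direction, then dot it
// with the coefficient set. The include path and the name evaluateSh are assumptions
// made for illustration.
#include "nvmath/SphericalHarmonic.h"
#include "nvmath/Vector.h"

inline float evaluateSh(const nv::Sh & coeffs, const nv::Vector3 & dir)
{
    nv::Sh basis(coeffs.order());    // same order as the coefficient set
    basis.eval(dir);                 // basis.elem(l, m) = shBasis(l, m, dir)
    return nv::dot(basis, coeffs);   // sum over all (l, m)
}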
+ inline float dot(const Sh & a, const Sh & b) + { + nvDebugCheck(a.order() == b.order()); + + float sum = 0; + for( int i = 0; i < Sh::basisNum(a.order()); i++ ) { + sum += a.elemAt(i) * b.elemAt(i); + } + + return sum; + } + + + /// Second order spherical harmonic. + class Sh2 : public Sh + { + public: + + /// Constructor. + Sh2() : Sh(2) {} + + /// Copy constructor. + Sh2(const Sh2 & sh) : Sh(sh) {} + + /// Spherical harmonic resulting from projecting the clamped cosine transfer function to the SH basis. + void cosineTransfer() + { + const float c1 = 0.282095f; // K(0, 0) + const float c2 = 0.488603f; // K(1, 0) + const float c3 = 1.092548f; // sqrt(15.0f / PI) / 2.0f = K(2, -2) + const float c4 = 0.315392f; // sqrt(5.0f / PI) / 4.0f) = K(2, 0) + const float c5 = 0.546274f; // sqrt(15.0f / PI) / 4.0f) = K(2, 2) + + const float normalization = PI * 16.0f / 17.0f; + + const float const1 = c1 * normalization * 1.0f; + const float const2 = c2 * normalization * (2.0f / 3.0f); + const float const3 = c3 * normalization * (1.0f / 4.0f); + const float const4 = c4 * normalization * (1.0f / 4.0f); + const float const5 = c5 * normalization * (1.0f / 4.0f); + + m_elemArray[0] = const1; + + m_elemArray[1] = -const2; + m_elemArray[2] = const2; + m_elemArray[3] = -const2; + + m_elemArray[4] = const3; + m_elemArray[5] = -const3; + m_elemArray[6] = const4; + m_elemArray[7] = -const3; + m_elemArray[8] = const5; + } + }; + + + + /// Spherical harmonic matrix. + class ShMatrix + { + public: + + /// Create an identity matrix of the given order. + ShMatrix(int o = 2) : m_order(o), m_identity(true) + { + nvCheck(m_order > 0); + m_e = new float[size()]; + m_band = new float *[bandNum()]; + setupBands(); + } + + /// Destroy and free matrix elements. + ~ShMatrix() + { + delete m_e; + delete m_band; + } + + /// Set identity matrix. + void setIdentity() + { + m_identity = true; + } + + /// Return true if this is an identity matrix, false in other case. + bool isIdentity() const { + return m_identity; + } + + /// Get number of bands of this matrix. + int bandNum() const + { + return m_order+1; + } + + /// Get total number of elements in the matrix. + int size() const + { + int size = 0; + for (int i = 0; i < bandNum(); i++) { + size += square(i * 2 + 1); + } + return size; + } + + /// Get element at the given raw index. + float element(int idx) const + { + return m_e[idx]; + } + + /// Get element at the given with the given indices. + float & element(int b, int x, int y) + { + nvDebugCheck(b >= 0); + nvDebugCheck(b < bandNum()); + return m_band[b][(b + y) * (b * 2 + 1) + (b + x)]; + } + + /// Get element at the given with the given indices. + float element(int b, int x, int y) const + { + nvDebugCheck(b >= 0); + nvDebugCheck(b < bandNum()); + return m_band[b][(b + y) * (b * 2 + 1) + (b + x)]; + } + + /// Copy matrix. + void copy(const ShMatrix & m) + { + nvDebugCheck(m_order == m.m_order); + memcpy(m_e, m.m_e, size() * sizeof(float)); + } + + /// Rotate the given coefficients. + /*void transform( const Sh & restrict source, Sh * restrict dest ) const { + nvCheck( &source != dest ); // Make sure there's no aliasing. + nvCheck( dest->m_order <= m_order ); + nvCheck( m_order <= source.m_order ); + + if (m_identity) { + *dest = source; + return; + } + + // Loop through each band. 
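// A hypothetical use of Sh2::cosineTransfer(): build the clamped-cosine transfer
// function once and scale an SH lighting environment by it with the per-coefficient
// product operator*= ("not exactly convolution, nor product"). Sketch only;
// irradianceFrom is not an NVTT function.
#include "nvmath/SphericalHarmonic.h"

inline nv::Sh2 irradianceFrom(const nv::Sh2 & lighting)
{
    nv::Sh2 transfer;
    transfer.cosineTransfer();   // fills the nine constants listed above
    nv::Sh2 result(lighting);
    result *= transfer;          // coefficient-wise scaling of the lighting SH
    return result;
}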
+ for (int l = 0; l <= dest->m_order; l++) { + + for (int mo = -l; mo <= l; mo++) { + + Color3f rgb = Color3f::Black; + + for( int mi = -l; mi <= l; mi++ ) { + rgb.Mad( rgb, source.elem(l, mi), elem(l, mo, mi) ); + } + + dest->elem(l, mo) = rgb; + } + } + }*/ + + + NVMATH_API void multiply( const ShMatrix &A, const ShMatrix &B ); + NVMATH_API void rotation( const Matrix & m ); + NVMATH_API void rotation( int axis, float angles ); + NVMATH_API void print(); + + + private: + + // @@ These could be static indices precomputed only once. + /// Setup the band pointers. + void setupBands() + { + int size = 0; + for( int i = 0; i < bandNum(); i++ ) { + m_band[i] = &m_e[size]; + size += square(i * 2 + 1); + } + } + + + private: + + // Matrix order. + const int m_order; + + // Identity flag for quick transform. + bool m_identity; - /// Create an identity matrix of the given order. - ShMatrix(int o = 2) : order(o), identity(true) - { - nvCheck(order > 0); - e = new float[Size()]; - band = new float *[GetBandNum()]; - setupBands(); - } - - /// Destroy and free matrix elements. - ~ShMatrix() - { - delete e; - delete band; - } - - /// Set identity matrix. - void setIdentity() - { - identity = true; - } - - /// Return true if this is an identity matrix, false in other case. - bool isIdentity() const { - return identity; - } - - /// Get number of bands of this matrix. - int bandNum() const - { - return order+1; - } - - /// Get total number of elements in the matrix. - int size() const - { - int size = 0; - for( int i = 0; i < bandNum(); i++ ) { - size += SQ(i * 2 + 1); - } - return size; - } - - /// Get element at the given raw index. - float elem(const int idx) const - { - return e[idx]; - } - - /// Get element at the given with the given indices. - float & elem( const int b, const int x, const int y ) - { - nvDebugCheck(b >= 0); - nvDebugCheck(b < bandNum()); - return band[b][(b + y) * (b * 2 + 1) + (b + x)]; - } - - /// Get element at the given with the given indices. - float elem( const int b, const int x, const int y ) const - { - nvDebugCheck(b >= 0); - nvDebugCheck(b < bandNum()); - return band[b][(b + y) * (b * 2 + 1) + (b + x)]; - } - - /** Copy matrix. */ - void Copy( const ShMatrix & m ) - { - nvDebugCheck(order == m.order); - memcpy(e, m.e, Size() * sizeof(float)); - } - - /** Rotate the given coefficients. */ - void transform( const Sh & restrict source, Sh * restrict dest ) const { - piCheck( &source != dest ); // Make sure there's no aliasing. - piCheck( dest->order <= order ); - piCheck( order <= source.order ); - - if( identity ) { - *dest = source; - return; - } - - // Loop through each band. - for( int l = 0; l <= dest->order; l++ ) { - - for( int mo = -l; mo <= l; mo++ ) { - - Color3f rgb = Color3f::Black; - - for( int mi = -l; mi <= l; mi++ ) { - rgb.Mad( rgb, source.elem(l, mi), elem(l, mo, mi) ); - } - - dest->elem(l, mo) = rgb; - } - } - } - - - MATHLIB_API void multiply( const ShMatrix &A, const ShMatrix &B ); - MATHLIB_API void rotation( const Matrix & m ); - MATHLIB_API void rotation( int axis, float angles ); - MATHLIB_API void print(); - - -private: - - // @@ These could be static indices precomputed only once. - /// Setup the band pointers. - void setupBands() - { - int size = 0; - for( int i = 0; i < bandNum(); i++ ) { - band[i] = &e[size]; - size += SQ(i * 2 + 1); - } - } - - -private: - - // Matrix order. - const int m_order; - - // Identity flag for quick transform. - bool m_identity; - - // Array of elements. - float * m_e; - - // Band pointers. 
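// ShMatrix above is block-diagonal: band b is a dense (2b+1) x (2b+1) block, so the
// total element count is the sum of (2b+1)^2 over the bands (1 + 9 + 25 = 35 for
// order 2), and element(b, x, y) with x, y in [-b, b] indexes into band b's block.
// The two helpers below are a standalone restatement of that layout, illustration
// only, not NVTT API.
inline int shMatrixSize(int order)
{
    int size = 0;
    for (int b = 0; b <= order; ++b)
        size += (2 * b + 1) * (2 * b + 1);
    return size;                              // 35 when order == 2
}

inline int shMatrixBandOffset(int b, int x, int y)
{
    return (b + y) * (2 * b + 1) + (b + x);   // matches m_band[b][...] above
}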
- float ** m_band; - -}; + // Array of elements. + float * m_e; -#endif // 0 + // Band pointers. + float ** m_band; + }; } // nv namespace Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/SphericalHarmonic.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/SphericalHarmonic.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/SphericalHarmonic.cpp @@ -1,6 +1,8 @@ // This code is in the public domain -- castanyo@yahoo.es -#include +#include "SphericalHarmonic.h" + +#include "Vector.h" using namespace nv; @@ -11,8 +13,10 @@ // Basic integer factorial. inline static int factorial( int v ) { - if (v == 0) { - return 1; + const static int fac_table[] = { 1, 1, 2, 6, 24, 120, 720, 5040, 40320, 362880, 3628800, 39916800 }; + + if(v <= 11){ + return fac_table[v]; } int result = v; @@ -80,7 +84,7 @@ template float legendre(float x); - template <> float legendre<0, 0>(float x) { + template <> float legendre<0, 0>(float ) { return 1; } @@ -171,7 +175,7 @@ * @param theta is the altitude, in the range [0, PI] * @param phi is the azimuth, in the range [0, 2*PI] */ -float nv::y( int l, int m, float theta, float phi ) +float nv::shBasis( int l, int m, float theta, float phi ) { if( m == 0 ) { // K(l, 0) = sqrt((2*l+1)/(4*PI)) @@ -193,11 +197,11 @@ * y = sin(theta)*sin(phi) * z = cos(theta) */ -float nv::y( int l, int m, Vector3::Arg v ) +float nv::shBasis( int l, int m, Vector3::Arg v ) { - float theta = acosf(v.z()); - float phi = atan2f(v.y(), v.x()); - return y( l, m, theta, phi ); + float theta = acosf(v.z); + float phi = atan2f(v.y, v.x); + return shBasis( l, m, theta, phi ); } @@ -208,7 +212,7 @@ * @param theta is the altitude, in the range [0, PI/2] * @param phi is the azimuth, in the range [0, 2*PI] */ -float nv::hy( int l, int m, float theta, float phi ) +float nv::hshBasis( int l, int m, float theta, float phi ) { if( m == 0 ) { // HK(l, 0) = sqrt((2*l+1)/(2*PI)) @@ -230,11 +234,11 @@ * y = sin(theta)*sin(phi) * z = cos(theta) */ -float nv::hy( int l, int m, Vector3::Arg v ) +float nv::hshBasis( int l, int m, Vector3::Arg v ) { - float theta = acosf(v.z()); - float phi = atan2f(v.y(), v.x()); - return y( l, m, theta, phi ); + float theta = acosf(v.z); + float phi = atan2f(v.y, v.x); + return hshBasis( l, m, theta, phi ); } Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/TriBox.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/TriBox.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/TriBox.cpp @@ -1,226 +0,0 @@ -/********************************************************/ -/* AABB-triangle overlap test code */ -/* by Tomas Akenine-Möller */ -/* Function: int triBoxOverlap(float boxcenter[3], */ -/* float boxhalfsize[3],float triverts[3][3]); */ -/* History: */ -/* 2001-03-05: released the code in its first version */ -/* 2001-06-18: changed the order of the tests, faster */ -/* */ -/* Acknowledgement: Many thanks to Pierre Terdiman for */ -/* suggestions and discussions on how to optimize code. */ -/* Thanks to David Hunt for finding a ">="-bug! 
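// A small sketch of the direction-to-angles conversion used by the Vector3 overloads
// of shBasis()/hshBasis() in SphericalHarmonic.cpp above: theta = acos(z) is the
// altitude from +z in [0, PI], phi = atan2(y, x) is the azimuth in the xy plane.
// toSpherical is a hypothetical helper; it assumes the input direction is normalized.
#include <cmath>

inline void toSpherical(float x, float y, float z, float & theta, float & phi)
{
    theta = std::acos(z);
    phi   = std::atan2(y, x);
}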
*/ -/********************************************************/ - -#include -#include - -using namespace nv; - -#define X 0 -#define Y 1 -#define Z 2 - -#define FINDMINMAX(x0,x1,x2,min,max) \ - min = max = x0; \ - if(x1max) max=x1;\ - if(x2max) max=x2; - - -static bool planeBoxOverlap(Vector3::Arg normal, Vector3::Arg vert, Vector3::Arg maxbox) // -NJMP- -{ - Vector3 vmin, vmax; - - float signs[3] = {1, 1, 1}; - if (normal.x() <= 0.0f) signs[0] = -1; - if (normal.y() <= 0.0f) signs[1] = -1; - if (normal.z() <= 0.0f) signs[2] = -1; - - Vector3 sign(signs[0], signs[1], signs[2]); - vmin = -scale(sign, maxbox) - vert; - vmax = scale(sign, maxbox) - vert; - - if (dot(normal, vmin) > 0.0f) return false; - if (dot(normal, vmax) >= 0.0f) return true; - - return false; -} - - -/*======================== X-tests ========================*/ -#define AXISTEST_X01(a, b, fa, fb) \ - p0 = a*v0.y() - b*v0.z(); \ - p2 = a*v2.y() - b*v2.z(); \ - if(p0rad || max<-rad) return false; - -#define AXISTEST_X2(a, b, fa, fb) \ - p0 = a*v0.y() - b*v0.z(); \ - p1 = a*v1.y() - b*v1.z(); \ - if(p0rad || max<-rad) return false; - -/*======================== Y-tests ========================*/ -#define AXISTEST_Y02(a, b, fa, fb) \ - p0 = -a*v0.x() + b*v0.z(); \ - p2 = -a*v2.x() + b*v2.z(); \ - if(p0rad || max<-rad) return false; - -#define AXISTEST_Y1(a, b, fa, fb) \ - p0 = -a*v0.x() + b*v0.z(); \ - p1 = -a*v1.x() + b*v1.z(); \ - if(p0rad || max<-rad) return false; - -/*======================== Z-tests ========================*/ - -#define AXISTEST_Z12(a, b, fa, fb) \ - p1 = a*v1.x() - b*v1.y(); \ - p2 = a*v2.x() - b*v2.y(); \ - if(p2rad || max<-rad) return false; - -#define AXISTEST_Z0(a, b, fa, fb) \ - p0 = a*v0.x() - b*v0.y(); \ - p1 = a*v1.x() - b*v1.y(); \ - if(p0rad || max<-rad) return false; - - -bool triBoxOverlap(Vector3::Arg boxcenter, Vector3::Arg boxhalfsize, const Triangle & tri) -{ - // use separating axis theorem to test overlap between triangle and box - // need to test for overlap in these directions: - // 1) the {x,y,z}-directions (actually, since we use the AABB of the triangle - // we do not even need to test these) - // 2) normal of the triangle - // 3) crossproduct(edge from tri, {x,y,z}-directin) - // this gives 3x3=9 more tests - Vector3 v0, v1, v2; - float min, max, p0, p1, p2, rad, fex, fey, fez; - Vector3 normal, e0, e1, e2; - - // This is the fastest branch on Sun. - // move everything so that the boxcenter is in (0,0,0) - v0 = tri.v[0] - boxcenter; - v1 = tri.v[1] - boxcenter; - v2 = tri.v[2] - boxcenter; - - // Compute triangle edges. 
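// A scalar restatement of the planeBoxOverlap() helper above (from the removed
// TriBox.cpp): a box centered at the origin with half-sizes h straddles the plane
// n.p = s iff |s| does not exceed the box's projection radius onto n.
// planeOverlapsBox is illustrative only, not part of the library.
#include <cmath>

inline bool planeOverlapsBox(float nx, float ny, float nz,   // plane normal
                             float s,                        // n . v for a point v on the plane
                             float hx, float hy, float hz)   // box half-sizes
{
    const float r = std::fabs(nx) * hx + std::fabs(ny) * hy + std::fabs(nz) * hz;
    return s >= -r && s <= r;
}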
- e0 = v1 - v0; // tri edge 0 - e1 = v2 - v1; // tri edge 1 - e2 = v0 - v2; // tri edge 2 - - // Bullet 3: - // test the 9 tests first (this was faster) - fex = fabsf(e0.x()); - fey = fabsf(e0.y()); - fez = fabsf(e0.z()); - AXISTEST_X01(e0.z(), e0.y(), fez, fey); - AXISTEST_Y02(e0.z(), e0.x(), fez, fex); - AXISTEST_Z12(e0.y(), e0.x(), fey, fex); - - fex = fabsf(e1.x()); - fey = fabsf(e1.y()); - fez = fabsf(e1.z()); - AXISTEST_X01(e1.z(), e1.y(), fez, fey); - AXISTEST_Y02(e1.z(), e1.x(), fez, fex); - AXISTEST_Z0(e1.y(), e1.x(), fey, fex); - - fex = fabsf(e2.x()); - fey = fabsf(e2.y()); - fez = fabsf(e2.z()); - AXISTEST_X2(e2.z(), e2.y(), fez, fey); - AXISTEST_Y1(e2.z(), e2.x(), fez, fex); - AXISTEST_Z12(e2.y(), e2.x(), fey, fex); - - // Bullet 1: - // first test overlap in the {x,y,z}-directions - // find min, max of the triangle each direction, and test for overlap in - // that direction -- this is equivalent to testing a minimal AABB around - // the triangle against the AABB - - // test in X-direction - FINDMINMAX(v0.x(), v1.x(), v2.x(), min, max); - if(min > boxhalfsize.x() || max < -boxhalfsize.x()) return false; - - // test in Y-direction - FINDMINMAX(v0.y(), v1.y(), v2.y(), min, max); - if(min > boxhalfsize.y() || max < -boxhalfsize.y()) return false; - - // test in Z-direction - FINDMINMAX(v0.z(), v1.z(), v2.z(), min, max); - if(min > boxhalfsize.z() || max < -boxhalfsize.z()) return false; - - // Bullet 2: - // test if the box intersects the plane of the triangle - // compute plane equation of triangle: normal*x+d=0 - normal = cross(e0, e1); - - return planeBoxOverlap(normal, v0, boxhalfsize); -} - - -bool triBoxOverlapNoBounds(Vector3::Arg boxcenter, Vector3::Arg boxhalfsize, const Triangle & tri) -{ - // use separating axis theorem to test overlap between triangle and box - // need to test for overlap in these directions: - // 1) the {x,y,z}-directions (actually, since we use the AABB of the triangle - // we do not even need to test these) - // 2) normal of the triangle - // 3) crossproduct(edge from tri, {x,y,z}-directin) - // this gives 3x3=9 more tests - Vector3 v0, v1, v2; - float min, max, p0, p1, p2, rad, fex, fey, fez; - Vector3 normal, e0, e1, e2; - - // This is the fastest branch on Sun. - // move everything so that the boxcenter is in (0,0,0) - v0 = tri.v[0] - boxcenter; - v1 = tri.v[1] - boxcenter; - v2 = tri.v[2] - boxcenter; - - // Compute triangle edges. 
- e0 = v1 - v0; // tri edge 0 - e1 = v2 - v1; // tri edge 1 - e2 = v0 - v2; // tri edge 2 - - // Bullet 3: - // test the 9 tests first (this was faster) - fex = fabsf(e0.x()); - fey = fabsf(e0.y()); - fez = fabsf(e0.z()); - AXISTEST_X01(e0.z(), e0.y(), fez, fey); - AXISTEST_Y02(e0.z(), e0.x(), fez, fex); - AXISTEST_Z12(e0.y(), e0.x(), fey, fex); - - fex = fabsf(e1.x()); - fey = fabsf(e1.y()); - fez = fabsf(e1.z()); - AXISTEST_X01(e1.z(), e1.y(), fez, fey); - AXISTEST_Y02(e1.z(), e1.x(), fez, fex); - AXISTEST_Z0(e1.y(), e1.x(), fey, fex); - - fex = fabsf(e2.x()); - fey = fabsf(e2.y()); - fez = fabsf(e2.z()); - AXISTEST_X2(e2.z(), e2.y(), fez, fey); - AXISTEST_Y1(e2.z(), e2.x(), fez, fex); - AXISTEST_Z12(e2.y(), e2.x(), fey, fex); - - // Bullet 2: - // test if the box intersects the plane of the triangle - // compute plane equation of triangle: normal*x+d=0 - normal = cross(e0, e1); - - return planeBoxOverlap(normal, v0, boxhalfsize); -} Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Triangle.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Triangle.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Triangle.h @@ -1,81 +0,0 @@ -// This code is in the public domain -- Ignacio Castaño - -#ifndef NV_MATH_TRIANGLE_H -#define NV_MATH_TRIANGLE_H - -#include -#include -#include - -namespace nv -{ - - /// Triangle class with three vertices. - class Triangle - { - public: - Triangle() {}; - - Triangle(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2) - { - v[0] = v0; - v[1] = v1; - v[2] = v2; - } - - /// Get the bounds of the triangle. - Box bounds() const - { - Box bounds; - bounds.clearBounds(); - bounds.addPointToBounds(v[0]); - bounds.addPointToBounds(v[1]); - bounds.addPointToBounds(v[2]); - return bounds; - } - - Vector4 plane() const - { - Vector3 n = cross(v[1]-v[0], v[2]-v[0]); - return Vector4(n, dot(n, v[0])); - } - - Vector3 v[3]; - }; - - - // Tomas Akenine-Möller box-triangle test. - NVMATH_API bool triBoxOverlap(Vector3::Arg boxcenter, Vector3::Arg boxhalfsize, const Triangle & triangle); - NVMATH_API bool triBoxOverlapNoBounds(Vector3::Arg boxcenter, Vector3::Arg boxhalfsize, const Triangle & triangle); - - - // Moller ray triangle test. - NVMATH_API bool rayTest_Moller(const Triangle & t, Vector3::Arg orig, Vector3::Arg dir, float * out_t, float * out_u, float * out_v); - - inline bool rayTest(const Triangle & t, Vector3::Arg orig, Vector3::Arg dir, float * out_t, float * out_u, float * out_v) - { - return rayTest_Moller(t, orig, dir, out_t, out_u, out_v); - } - - inline bool overlap(const Triangle & t, const Box & b) - { - Vector3 center = b.center(); - Vector3 extents = b.extents(); - return triBoxOverlap(center, extents, t); - } - - inline bool overlap(const Box & b, const Triangle & t) - { - return overlap(t, b); - } - - inline bool overlapNoBounds(const Triangle & t, const Box & b) - { - Vector3 center = b.center(); - Vector3 extents = b.extents(); - return triBoxOverlapNoBounds(center, extents, t); - } - -} // nv namespace - -#endif // NV_MATH_TRIANGLE_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Triangle.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Triangle.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Triangle.cpp @@ -1,168 +0,0 @@ -// This code is in the public domain -- Ignacio Castaño - -#include - -using namespace nv; - - -/// Tomas Möller, barycentric ray-triangle test. 
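// The removed Triangle::plane() above packs an (unnormalized) plane as Vector4(n, d)
// with n = cross(v1 - v0, v2 - v0) and d = dot(n, v0), so points p on the plane
// satisfy dot(n, p) == d. A standalone sketch of testing a point against that
// representation (planeSide is illustrative, not library code):
inline float planeSide(const float n[3], float d, const float p[3])
{
    return n[0] * p[0] + n[1] * p[1] + n[2] * p[2] - d;   // > 0 in front, < 0 behind
}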
-bool rayTest_Moller(const Triangle & t, Vector3::Arg orig, Vector3::Arg dir, float * out_t, float * out_u, float * out_v) -{ - // find vectors for two edges sharing vert0 - Vector3 e1 = t.v[1] - t.v[0]; - Vector3 e2 = t.v[2] - t.v[0]; - - // begin calculating determinant - also used to calculate U parameter - Vector3 pvec = cross(dir, e2); - - // if determinant is near zero, ray lies in plane of triangle - float det = dot(e1, pvec); - if (det < -NV_EPSILON) { - return false; - } - - // calculate distance from vert0 to ray origin - Vector3 tvec = orig - t.v[0]; - - // calculate U parameter and test bounds - float u = dot(tvec, pvec); - if( u < 0.0f || u > det ) { - return false; - } - - // prepare to test V parameter - Vector3 qvec = cross(tvec, e1); - - // calculate V parameter and test bounds - float v = dot(dir, qvec); - if (v < 0.0f || u + v > det) { - return false; - } - - // calculate t, scale parameters, ray intersects triangle - float inv_det = 1.0f / det; - *out_t = dot(e2, qvec) * inv_det; - *out_u = u * inv_det; // v - *out_v = v * inv_det; // 1-(u+v) - - return true; -} - - - - - -#if 0 - - -// IC: This code is adapted from my Pi.MathLib code, based on Moller-Trumbore triangle test. -FXVector3 edge1, edge2, pvec, tvec, qvec; - -edge1 = tri.V1 - tri.V0; -edge2 = tri.V2 - tri.V0; - -pvec.Cross(ray.Direction, edge2); - -float det = FXVector3.Dot(edge1, pvec); - -// calculate distance from vert0 to ray origin. -FXVector3 tvec = ray.Origin - vert0; - -if( det < 0 ) -{ - // calculate U parameter and test bounds. - float u = FXVector3.Dot(tvec, pvec); - if (u > 0.0 || u < det) - { - return false; - } - - // prepare to test V parameter. - qvec.Cross(tvec, edge1); - - // calculate V parameter and test bounds. - float v = FXVector3.Dot(dir, qvec); - - return v <= 0.0 && u + v >= det; -} -else -{ - // calculate U parameter and test bounds. - float u = FXVector3.Dot(tvec, pvec); - if (u < 0.0 || u > det) - { - return false; - } - - // prepare to test V parameter. - qvec.Cross(tvec, edge1); - - // calculate V parameter and test bounds. - float v = FXVector3.Dot(dir, qvec); - - return v >= 0.0 && u + v <= det; -} - - - -/** - * Dan Sunday, parametric ray-triangle test. - */ -// Output: *I = intersection point (when it exists) -// Return: -1 = triangle is degenerate (a segment or point) -// 0 = disjoint (no intersect) -// 1 = intersect in unique point I1 -// 2 = are in the same plane -bool RayTriangleTest( const Vec3 &p0, const Vec3 &p1, - const Vec3 &v0, const Vec3 &v1, const Vec3 &v2, const Vec3 &n, - Vec3 &I ) { - Vec3 u, v; // triangle vectors - Vec3 dir, w0, w; // ray vectors - float r, a, b; // params to calc ray-plane intersect - - // get triangle edge vectors and plane normal - u.Sub( v1, v0 ); - v.Sub( v2, v0 ); - - dir.Sub( p1, p0 ); // ray direction vector - w0.Sub( p0, v0 ); - a = Vec3DotProduct( n, w0 ); - b = Vec3DotProduct( n, dir ); - - if( fabs(b) < TI_EPSILON ) // ray is parallel to triangle plane - return false; - - - // get intersect point of ray with triangle plane - r = -a / b; - if( r < 0.0f ) // ray goes away from triangle - return false; // => no intersect - - // for a segment, also test if (r > 1.0) => no intersect - - I.Mad( p0, dir, r ); // intersect point of ray and plane - - // is I inside T? 
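// In the Möller test above (removed together with Triangle.h), a hit reports the ray
// parameter t and barycentric weights, so in the usual Möller-Trumbore convention the
// hit point is orig + t*dir and any per-vertex attribute is blended with the same
// weights. A tiny sketch of that blend (interpolate is illustrative only):
inline float interpolate(float a0, float a1, float a2, float u, float v)
{
    return (1.0f - u - v) * a0 + u * a1 + v * a2;
}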
- float uu, uv, vv, wu, wv, D; - uu = Vec3DotProduct( u, u ); - uv = Vec3DotProduct( u, v ); - vv = Vec3DotProduct( v, v ); - w = I - v0; - wu = Vec3DotProduct( w, u ); - wv = Vec3DotProduct( w, v ); - D = uv * uv - uu * vv; - - // get and test parametric coords - float s, t; - s = (uv * wv - vv * wu) / D; - if( s<0.0 || s > 1.0) // I is outside T - return false; - t = (uv * wu - uu * wv) / D; - if( t<0.0 || (s + t) > 1.0) // I is outside T - return false; - - return true; // I is in T -} - - -#endif // 0 Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Vector.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Vector.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Vector.h @@ -1,805 +1,149 @@ // This code is in the public domain -- castanyo@yahoo.es +#pragma once #ifndef NV_MATH_VECTOR_H #define NV_MATH_VECTOR_H -#include -#include // min, max +#include "nvmath.h" namespace nv { - -enum zero_t { zero }; -enum identity_t { identity }; - -// I should probably use templates. -typedef float scalar; - -class NVMATH_CLASS Vector2 -{ -public: - typedef Vector2 const & Arg; - - Vector2(); - explicit Vector2(zero_t); - explicit Vector2(scalar f); - Vector2(scalar x, scalar y); - Vector2(Vector2::Arg v); - - const Vector2 & operator=(Vector2::Arg v); - - scalar x() const; - scalar y() const; - - scalar component(uint idx) const; - - const scalar * ptr() const; - - void set(scalar x, scalar y); - - Vector2 operator-() const; - void operator+=(Vector2::Arg v); - void operator-=(Vector2::Arg v); - void operator*=(scalar s); - void operator*=(Vector2::Arg v); - - friend bool operator==(Vector2::Arg a, Vector2::Arg b); - friend bool operator!=(Vector2::Arg a, Vector2::Arg b); - -private: - scalar m_x, m_y; -}; - - -class NVMATH_CLASS Vector3 -{ -public: - typedef Vector3 const & Arg; - - Vector3(); - explicit Vector3(zero_t); - Vector3(scalar x, scalar y, scalar z); - Vector3(Vector2::Arg v, scalar z); - Vector3(Vector3::Arg v); - - const Vector3 & operator=(Vector3::Arg v); - - scalar x() const; - scalar y() const; - scalar z() const; - - Vector2 xy() const; - - scalar component(uint idx) const; - - const scalar * ptr() const; - - void set(scalar x, scalar y, scalar z); - - Vector3 operator-() const; - void operator+=(Vector3::Arg v); - void operator-=(Vector3::Arg v); - void operator*=(scalar s); - void operator/=(scalar s); - void operator*=(Vector3::Arg v); - - friend bool operator==(Vector3::Arg a, Vector3::Arg b); - friend bool operator!=(Vector3::Arg a, Vector3::Arg b); - -private: - scalar m_x, m_y, m_z; -}; - - -class NVMATH_CLASS Vector4 -{ -public: - typedef Vector4 const & Arg; - - Vector4(); - explicit Vector4(zero_t); - Vector4(scalar x, scalar y, scalar z, scalar w); - Vector4(Vector2::Arg v, scalar z, scalar w); - Vector4(Vector3::Arg v, scalar w); - Vector4(Vector4::Arg v); -// Vector4(const Quaternion & v); - - const Vector4 & operator=(Vector4::Arg v); - - scalar x() const; - scalar y() const; - scalar z() const; - scalar w() const; - - Vector2 xy() const; - Vector3 xyz() const; - - scalar component(uint idx) const; - - const scalar * ptr() const; - - void set(scalar x, scalar y, scalar z, scalar w); - - Vector4 operator-() const; - void operator+=(Vector4::Arg v); - void operator-=(Vector4::Arg v); - void operator*=(scalar s); - void operator*=(Vector4::Arg v); - - friend bool operator==(Vector4::Arg a, Vector4::Arg b); - friend bool operator!=(Vector4::Arg a, Vector4::Arg b); - -private: - scalar m_x, m_y, 
m_z, m_w; -}; - - -// Vector2 - -inline Vector2::Vector2() {} -inline Vector2::Vector2(zero_t) : m_x(0.0f), m_y(0.0f) {} -inline Vector2::Vector2(scalar f) : m_x(f), m_y(f) {} -inline Vector2::Vector2(scalar x, scalar y) : m_x(x), m_y(y) {} -inline Vector2::Vector2(Vector2::Arg v) : m_x(v.x()), m_y(v.y()) {} - -inline const Vector2 & Vector2::operator=(Vector2::Arg v) -{ - m_x = v.x(); - m_y = v.y(); - return *this; -} - -inline scalar Vector2::x() const { return m_x; } -inline scalar Vector2::y() const { return m_y; } - -inline scalar Vector2::component(uint idx) const -{ - nvDebugCheck(idx < 2); - if (idx == 0) return x(); - if (idx == 1) return y(); - nvAssume(false); - return 0.0f; -} - -inline const scalar * Vector2::ptr() const -{ - return &m_x; -} - -inline void Vector2::set(scalar x, scalar y) -{ - m_x = x; - m_y = y; -} - -inline Vector2 Vector2::operator-() const -{ - return Vector2(-m_x, -m_y); -} - -inline void Vector2::operator+=(Vector2::Arg v) -{ - m_x += v.m_x; - m_y += v.m_y; -} - -inline void Vector2::operator-=(Vector2::Arg v) -{ - m_x -= v.m_x; - m_y -= v.m_y; -} - -inline void Vector2::operator*=(scalar s) -{ - m_x *= s; - m_y *= s; -} - -inline void Vector2::operator*=(Vector2::Arg v) -{ - m_x *= v.m_x; - m_y *= v.m_y; -} - -inline bool operator==(Vector2::Arg a, Vector2::Arg b) -{ - return a.m_x == b.m_x && a.m_y == b.m_y; -} -inline bool operator!=(Vector2::Arg a, Vector2::Arg b) -{ - return a.m_x != b.m_x || a.m_y != b.m_y; -} - - -// Vector3 - -inline Vector3::Vector3() {} -inline Vector3::Vector3(zero_t) : m_x(0.0f), m_y(0.0f), m_z(0.0f) {} -inline Vector3::Vector3(scalar x, scalar y, scalar z) : m_x(x), m_y(y), m_z(z) {} -inline Vector3::Vector3(Vector2::Arg v, scalar z) : m_x(v.x()), m_y(v.y()), m_z(z) {} -inline Vector3::Vector3(Vector3::Arg v) : m_x(v.x()), m_y(v.y()), m_z(v.z()) {} - -inline const Vector3 & Vector3::operator=(Vector3::Arg v) -{ - m_x = v.m_x; - m_y = v.m_y; - m_z = v.m_z; - return *this; -} - -inline scalar Vector3::x() const { return m_x; } -inline scalar Vector3::y() const { return m_y; } -inline scalar Vector3::z() const { return m_z; } - -inline Vector2 Vector3::xy() const -{ - return Vector2(m_x, m_y); -} - -inline scalar Vector3::component(uint idx) const -{ - nvDebugCheck(idx < 3); - if (idx == 0) return x(); - if (idx == 1) return y(); - if (idx == 2) return z(); - nvAssume(false); - return 0.0f; -} - -inline const scalar * Vector3::ptr() const -{ - return &m_x; -} - -inline void Vector3::set(scalar x, scalar y, scalar z) -{ - m_x = x; - m_y = y; - m_z = z; -} - -inline Vector3 Vector3::operator-() const -{ - return Vector3(-m_x, -m_y, -m_z); -} - -inline void Vector3::operator+=(Vector3::Arg v) -{ - m_x += v.m_x; - m_y += v.m_y; - m_z += v.m_z; -} - -inline void Vector3::operator-=(Vector3::Arg v) -{ - m_x -= v.m_x; - m_y -= v.m_y; - m_z -= v.m_z; -} - -inline void Vector3::operator*=(scalar s) -{ - m_x *= s; - m_y *= s; - m_z *= s; -} - -inline void Vector3::operator/=(scalar s) -{ - float is = 1.0f / s; - m_x *= is; - m_y *= is; - m_z *= is; -} - -inline void Vector3::operator*=(Vector3::Arg v) -{ - m_x *= v.m_x; - m_y *= v.m_y; - m_z *= v.m_z; -} - -inline bool operator==(Vector3::Arg a, Vector3::Arg b) -{ - return a.m_x == b.m_x && a.m_y == b.m_y && a.m_z == b.m_z; -} -inline bool operator!=(Vector3::Arg a, Vector3::Arg b) -{ - return a.m_x != b.m_x || a.m_y != b.m_y || a.m_z != b.m_z; -} - - -// Vector4 - -inline Vector4::Vector4() {} -inline Vector4::Vector4(zero_t) : m_x(0.0f), m_y(0.0f), m_z(0.0f), m_w(0.0f) {} -inline 
Vector4::Vector4(scalar x, scalar y, scalar z, scalar w) : m_x(x), m_y(y), m_z(z), m_w(w) {} -inline Vector4::Vector4(Vector2::Arg v, scalar z, scalar w) : m_x(v.x()), m_y(v.y()), m_z(z), m_w(w) {} -inline Vector4::Vector4(Vector3::Arg v, scalar w) : m_x(v.x()), m_y(v.y()), m_z(v.z()), m_w(w) {} -inline Vector4::Vector4(Vector4::Arg v) : m_x(v.x()), m_y(v.y()), m_z(v.z()), m_w(v.w()) {} - -inline const Vector4 & Vector4::operator=(const Vector4 & v) -{ - m_x = v.m_x; - m_y = v.m_y; - m_z = v.m_z; - m_w = v.m_w; - return *this; -} - -inline scalar Vector4::x() const { return m_x; } -inline scalar Vector4::y() const { return m_y; } -inline scalar Vector4::z() const { return m_z; } -inline scalar Vector4::w() const { return m_w; } - -inline Vector2 Vector4::xy() const -{ - return Vector2(m_x, m_y); -} - -inline Vector3 Vector4::xyz() const -{ - return Vector3(m_x, m_y, m_z); -} - -inline scalar Vector4::component(uint idx) const -{ - nvDebugCheck(idx < 4); - if (idx == 0) return x(); - if (idx == 1) return y(); - if (idx == 2) return z(); - if (idx == 3) return w(); - nvAssume(false); - return 0.0f; -} - -inline const scalar * Vector4::ptr() const -{ - return &m_x; -} - -inline void Vector4::set(scalar x, scalar y, scalar z, scalar w) -{ - m_x = x; - m_y = y; - m_z = z; - m_w = w; -} - -inline Vector4 Vector4::operator-() const -{ - return Vector4(-m_x, -m_y, -m_z, -m_w); -} - -inline void Vector4::operator+=(Vector4::Arg v) -{ - m_x += v.m_x; - m_y += v.m_y; - m_z += v.m_z; - m_w += v.m_w; -} - -inline void Vector4::operator-=(Vector4::Arg v) -{ - m_x -= v.m_x; - m_y -= v.m_y; - m_z -= v.m_z; - m_w -= v.m_w; -} - -inline void Vector4::operator*=(scalar s) -{ - m_x *= s; - m_y *= s; - m_z *= s; - m_w *= s; -} - -inline void Vector4::operator*=(Vector4::Arg v) -{ - m_x *= v.m_x; - m_y *= v.m_y; - m_z *= v.m_z; - m_w *= v.m_w; -} - -inline bool operator==(Vector4::Arg a, Vector4::Arg b) -{ - return a.m_x == b.m_x && a.m_y == b.m_y && a.m_z == b.m_z && a.m_w == b.m_w; -} -inline bool operator!=(Vector4::Arg a, Vector4::Arg b) -{ - return a.m_x != b.m_x || a.m_y != b.m_y || a.m_z != b.m_z || a.m_w != b.m_w; -} - - - -// Functions - - -// Vector2 - -inline Vector2 add(Vector2::Arg a, Vector2::Arg b) -{ - return Vector2(a.x() + b.x(), a.y() + b.y()); -} -inline Vector2 operator+(Vector2::Arg a, Vector2::Arg b) -{ - return add(a, b); -} - -inline Vector2 sub(Vector2::Arg a, Vector2::Arg b) -{ - return Vector2(a.x() - b.x(), a.y() - b.y()); -} -inline Vector2 operator-(Vector2::Arg a, Vector2::Arg b) -{ - return sub(a, b); -} - -inline Vector2 scale(Vector2::Arg v, scalar s) -{ - return Vector2(v.x() * s, v.y() * s); -} - -inline Vector2 scale(Vector2::Arg v, Vector2::Arg s) -{ - return Vector2(v.x() * s.x(), v.y() * s.y()); -} - -inline Vector2 operator*(Vector2::Arg v, scalar s) -{ - return scale(v, s); -} - -inline Vector2 operator*(Vector2::Arg v1, Vector2::Arg v2) -{ - return Vector2(v1.x()*v2.x(), v1.y()*v2.y()); -} - -inline Vector2 operator*(scalar s, Vector2::Arg v) -{ - return scale(v, s); -} - -inline scalar dot(Vector2::Arg a, Vector2::Arg b) -{ - return a.x() * b.x() + a.y() * b.y(); -} - -inline scalar length_squared(Vector2::Arg v) -{ - return v.x() * v.x() + v.y() * v.y(); -} - -inline scalar length(Vector2::Arg v) -{ - return sqrtf(length_squared(v)); -} - -inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON) -{ - return equal(v1.x(), v2.x(), epsilon) && equal(v1.y(), v2.y(), epsilon); -} - -inline Vector2 min(Vector2::Arg a, Vector2::Arg b) -{ - return 
Vector2(min(a.x(), b.x()), min(a.y(), b.y())); -} - -inline Vector2 max(Vector2::Arg a, Vector2::Arg b) -{ - return Vector2(max(a.x(), b.x()), max(a.y(), b.y())); -} - -inline bool isValid(Vector2::Arg v) -{ - return isFinite(v.x()) && isFinite(v.y()); -} - - -// Vector3 - -inline Vector3 add(Vector3::Arg a, Vector3::Arg b) -{ - return Vector3(a.x() + b.x(), a.y() + b.y(), a.z() + b.z()); -} -inline Vector3 add(Vector3::Arg a, float b) -{ - return Vector3(a.x() + b, a.y() + b, a.z() + b); -} -inline Vector3 operator+(Vector3::Arg a, Vector3::Arg b) -{ - return add(a, b); -} -inline Vector3 operator+(Vector3::Arg a, float b) -{ - return add(a, b); -} - -inline Vector3 sub(Vector3::Arg a, Vector3::Arg b) -{ - return Vector3(a.x() - b.x(), a.y() - b.y(), a.z() - b.z()); -} -inline Vector3 sub(Vector3::Arg a, float b) -{ - return Vector3(a.x() - b, a.y() - b, a.z() - b); -} -inline Vector3 operator-(Vector3::Arg a, Vector3::Arg b) -{ - return sub(a, b); -} -inline Vector3 operator-(Vector3::Arg a, float b) -{ - return sub(a, b); -} - -inline Vector3 cross(Vector3::Arg a, Vector3::Arg b) -{ - return Vector3(a.y() * b.z() - a.z() * b.y(), a.z() * b.x() - a.x() * b.z(), a.x() * b.y() - a.y() * b.x()); -} - -inline Vector3 scale(Vector3::Arg v, scalar s) -{ - return Vector3(v.x() * s, v.y() * s, v.z() * s); -} - -inline Vector3 scale(Vector3::Arg v, Vector3::Arg s) -{ - return Vector3(v.x() * s.x(), v.y() * s.y(), v.z() * s.z()); -} - -inline Vector3 operator*(Vector3::Arg v, scalar s) -{ - return scale(v, s); -} - -inline Vector3 operator*(scalar s, Vector3::Arg v) -{ - return scale(v, s); -} - -inline Vector3 operator*(Vector3::Arg v, Vector3::Arg s) -{ - return scale(v, s); -} - -inline Vector3 operator/(Vector3::Arg v, scalar s) -{ - return scale(v, 1.0f/s); -} - -inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, scalar s) -{ - return Vector3(a.x() + b.x() * s, a.y() + b.y() * s, a.z() + b.z() * s); -} - -inline Vector3 lerp(Vector3::Arg v1, Vector3::Arg v2, scalar t) -{ - const scalar s = 1.0f - t; - return Vector3(v1.x() * s + t * v2.x(), v1.y() * s + t * v2.y(), v1.z() * s + t * v2.z()); -} - -inline scalar dot(Vector3::Arg a, Vector3::Arg b) -{ - return a.x() * b.x() + a.y() * b.y() + a.z() * b.z(); -} - -inline scalar length_squared(Vector3::Arg v) -{ - return v.x() * v.x() + v.y() * v.y() + v.z() * v.z(); -} - -inline scalar length(Vector3::Arg v) -{ - return sqrtf(length_squared(v)); -} - -inline bool isNormalized(Vector3::Arg v, float epsilon = NV_NORMAL_EPSILON) -{ - return equal(length(v), 1, epsilon); -} - -inline Vector3 normalize(Vector3::Arg v, float epsilon = NV_EPSILON) -{ - float l = length(v); - nvDebugCheck(!isZero(l, epsilon)); - Vector3 n = scale(v, 1.0f / l); - nvDebugCheck(isNormalized(n)); - return n; -} - -inline Vector3 normalizeSafe(Vector3::Arg v, Vector3::Arg fallback, float epsilon = NV_EPSILON) -{ - float l = length(v); - if (isZero(l, epsilon)) { - return fallback; - } - return scale(v, 1.0f / l); -} - -inline bool equal(Vector3::Arg v1, Vector3::Arg v2, float epsilon = NV_EPSILON) -{ - return equal(v1.x(), v2.x(), epsilon) && equal(v1.y(), v2.y(), epsilon) && equal(v1.z(), v2.z(), epsilon); -} - -inline Vector3 min(Vector3::Arg a, Vector3::Arg b) -{ - return Vector3(min(a.x(), b.x()), min(a.y(), b.y()), min(a.z(), b.z())); -} - -inline Vector3 max(Vector3::Arg a, Vector3::Arg b) -{ - return Vector3(max(a.x(), b.x()), max(a.y(), b.y()), max(a.z(), b.z())); -} - -inline Vector3 clamp(Vector3::Arg v, float min, float max) -{ - return Vector3(clamp(v.x(), 
min, max), clamp(v.y(), min, max), clamp(v.z(), min, max)); -} - -inline bool isValid(Vector3::Arg v) -{ - return isFinite(v.x()) && isFinite(v.y()) && isFinite(v.z()); -} - -/* -Vector3 transform(Quaternion, vector3); -Vector3 transform_point(matrix34, vector3); -Vector3 transform_vector(matrix34, vector3); -Vector3 transform_point(matrix44, vector3); -Vector3 transform_vector(matrix44, vector3); -*/ - -// Vector4 - -inline Vector4 add(Vector4::Arg a, Vector4::Arg b) -{ - return Vector4(a.x() + b.x(), a.y() + b.y(), a.z() + b.z(), a.w() + b.w()); -} -inline Vector4 operator+(Vector4::Arg a, Vector4::Arg b) -{ - return add(a, b); -} - -inline Vector4 sub(Vector4::Arg a, Vector4::Arg b) -{ - return Vector4(a.x() - b.x(), a.y() - b.y(), a.z() - b.z(), a.w() - b.w()); -} -inline Vector4 operator-(Vector4::Arg a, Vector4::Arg b) -{ - return sub(a, b); -} - -inline Vector4 scale(Vector4::Arg v, scalar s) -{ - return Vector4(v.x() * s, v.y() * s, v.z() * s, v.w() * s); -} - -inline Vector4 scale(Vector4::Arg v, Vector4::Arg s) -{ - return Vector4(v.x() * s.x(), v.y() * s.y(), v.z() * s.z(), v.w() * s.w()); -} - -inline Vector4 operator*(Vector4::Arg v, scalar s) -{ - return scale(v, s); -} - -inline Vector4 operator*(scalar s, Vector4::Arg v) -{ - return scale(v, s); -} - -inline Vector4 operator/(Vector4::Arg v, scalar s) -{ - return scale(v, 1.0f/s); -} - -inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, scalar s) -{ - return Vector4(a.x() + b.x() * s, a.y() + b.y() * s, a.z() + b.z() * s, a.w() + b.w() * s); -} - -inline scalar dot(Vector4::Arg a, Vector4::Arg b) -{ - return a.x() * b.x() + a.y() * b.y() + a.z() * b.z() + a.w() * b.w(); -} - -inline scalar length_squared(Vector4::Arg v) -{ - return v.x() * v.x() + v.y() * v.y() + v.z() * v.z() + v.w() * v.w(); -} - -inline scalar length(Vector4::Arg v) -{ - return sqrtf(length_squared(v)); -} - -inline bool isNormalized(Vector4::Arg v, float epsilon = NV_NORMAL_EPSILON) -{ - return equal(length(v), 1, epsilon); -} - -inline Vector4 normalize(Vector4::Arg v, float epsilon = NV_EPSILON) -{ - float l = length(v); - nvDebugCheck(!isZero(l, epsilon)); - Vector4 n = scale(v, 1.0f / l); - nvDebugCheck(isNormalized(n)); - return n; -} - -inline Vector4 normalizeSafe(Vector4::Arg v, Vector4::Arg fallback, float epsilon = NV_EPSILON) -{ - float l = length(v); - if (isZero(l, epsilon)) { - return fallback; - } - return scale(v, 1.0f / l); -} - -inline bool equal(Vector4::Arg v1, Vector4::Arg v2, float epsilon = NV_EPSILON) -{ - return equal(v1.x(), v2.x(), epsilon) && equal(v1.y(), v2.y(), epsilon) && equal(v1.z(), v2.z(), epsilon) && equal(v1.w(), v2.w(), epsilon); -} - -inline Vector4 min(Vector4::Arg a, Vector4::Arg b) -{ - return Vector4(min(a.x(), b.x()), min(a.y(), b.y()), min(a.z(), b.z()), min(a.w(), b.w())); -} - -inline Vector4 max(Vector4::Arg a, Vector4::Arg b) -{ - return Vector4(max(a.x(), b.x()), max(a.y(), b.y()), max(a.z(), b.z()), max(a.w(), b.w())); -} - -inline bool isValid(Vector4::Arg v) -{ - return isFinite(v.x()) && isFinite(v.y()) && isFinite(v.z()) && isFinite(v.w()); -} - - - -/* -vector4 transform(matrix34, vector4); -vector4 transform(matrix44, vector4); -*/ - -/* -Quaternion mul(Quaternion, Quaternion); // rotational composition -Quaternion conjugate(Quaternion); -Quaternion inverse(Quaternion); -Quaternion axis_angle(const Vector3 & v, scalar s); -*/ - -/* -matrix34 add(matrix34, matrix34); // note: implicit '1' stays as '1' -matrix34 operator+(matrix34, matrix34); -matrix34 sub(matrix34, matrix34); // note: 
implicit '1' stays as '1' -matrix34 operator-(matrix34, matrix34); -matrix34 mul(matrix34, matrix34); -matrix34 operator*(matrix34, matrix34); -matrix34 mul(matrix34, quaternion4); // rotation multiplication -matrix34 operator*(matrix34, quaternion4); // rotation multiplication -matrix34 translation(vector3); -matrix34 rotation(quaternion4); -matrix34 rotation(vector3, scalar); // axis/angle - -matrix44 add(matrix44, matrix44); -matrix44 operator+(matrix44, matrix44); -matrix44 sub(matrix44, matrix44); -matrix44 operator-(matrix44, matrix44); -matrix44 mul(matrix44, matrix44); -matrix44 operator*(matrix44, matrix44); -matrix44 mul(matrix44, quaternion4); // rotation multiplication -matrix44 operator*(matrix44, quaternion4); // rotation multiplication -matrix44 invert(matrix34); -matrix44 invert(matrix44); -matrix44 transpose(matrix34); -matrix44 transpose(matrix44); -*/ + class NVMATH_CLASS Vector2 + { + public: + typedef Vector2 const & Arg; + + Vector2(); + explicit Vector2(float f); + Vector2(float x, float y); + Vector2(Vector2::Arg v); + + //template explicit Vector2(const T & v) : x(v.x), y(v.y) {} + //template operator T() const { return T(x, y); } + + const Vector2 & operator=(Vector2::Arg v); + + const float * ptr() const; + + void set(float x, float y); + + Vector2 operator-() const; + void operator+=(Vector2::Arg v); + void operator-=(Vector2::Arg v); + void operator*=(float s); + void operator*=(Vector2::Arg v); + + friend bool operator==(Vector2::Arg a, Vector2::Arg b); + friend bool operator!=(Vector2::Arg a, Vector2::Arg b); + + union { + struct { + float x, y; + }; + float component[2]; + }; + }; + + class NVMATH_CLASS Vector3 + { + public: + typedef Vector3 const & Arg; + + Vector3(); + explicit Vector3(float x); + //explicit Vector3(int x) : x(float(x)), y(float(x)), z(float(x)) {} + Vector3(float x, float y, float z); + Vector3(Vector2::Arg v, float z); + Vector3(Vector3::Arg v); + + //template explicit Vector3(const T & v) : x(v.x), y(v.y), z(v.z) {} + //template operator T() const { return T(x, y, z); } + + const Vector3 & operator=(Vector3::Arg v); + + Vector2 xy() const; + + const float * ptr() const; + + void set(float x, float y, float z); + + Vector3 operator-() const; + void operator+=(Vector3::Arg v); + void operator-=(Vector3::Arg v); + void operator*=(float s); + void operator/=(float s); + void operator*=(Vector3::Arg v); + void operator/=(Vector3::Arg v); + + friend bool operator==(Vector3::Arg a, Vector3::Arg b); + friend bool operator!=(Vector3::Arg a, Vector3::Arg b); + + union { + struct { + float x, y, z; + }; + float component[3]; + }; + }; + + class NVMATH_CLASS Vector4 + { + public: + typedef Vector4 const & Arg; + + Vector4(); + explicit Vector4(float x); + Vector4(float x, float y, float z, float w); + Vector4(Vector2::Arg v, float z, float w); + Vector4(Vector2::Arg v, Vector2::Arg u); + Vector4(Vector3::Arg v, float w); + Vector4(Vector4::Arg v); + // Vector4(const Quaternion & v); + + //template explicit Vector4(const T & v) : x(v.x), y(v.y), z(v.z), w(v.w) {} + //template operator T() const { return T(x, y, z, w); } + + const Vector4 & operator=(Vector4::Arg v); + + Vector2 xy() const; + Vector2 zw() const; + Vector3 xyz() const; + + const float * ptr() const; + + void set(float x, float y, float z, float w); + + Vector4 operator-() const; + void operator+=(Vector4::Arg v); + void operator-=(Vector4::Arg v); + void operator*=(float s); + void operator/=(float s); + void operator*=(Vector4::Arg v); + void operator/=(Vector4::Arg v); + + friend 
bool operator==(Vector4::Arg a, Vector4::Arg b); + friend bool operator!=(Vector4::Arg a, Vector4::Arg b); + + union { + struct { + float x, y, z, w; + }; + float component[4]; + }; + }; } // nv namespace +// If we had these functions, they would be ambiguous, the compiler would not know which one to pick: +//template Vector2 to(const T & v) { return Vector2(v.x, v.y); } +//template Vector3 to(const T & v) { return Vector3(v.x, v.y, v.z); } +//template Vector4 to(const T & v) { return Vector4(v.x, v.y, v.z, v.z); } + +// We could use a cast operator so that we could infer the expected type, but that doesn't work the same way in all compilers and produces horrible error messages. + +// Instead we simply have explicit casts: +template T to(const nv::Vector2 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector2)); return T(v.x, v.y); } +template T to(const nv::Vector3 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector3)); return T(v.x, v.y, v.z); } +template T to(const nv::Vector4 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector4)); return T(v.x, v.y, v.z, v.w); } + #endif // NV_MATH_VECTOR_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Vector.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Vector.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Vector.cpp @@ -0,0 +1,4 @@ +// This code is in the public domain -- castanyo@yahoo.es + +#include "Vector.h" +#include "Vector.inl" Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/Vector.inl =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/Vector.inl +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/Vector.inl @@ -0,0 +1,919 @@ +// This code is in the public domain -- castanyo@yahoo.es + +#pragma once +#ifndef NV_MATH_VECTOR_INL +#define NV_MATH_VECTOR_INL + +#include "Vector.h" +#include "nvcore/Utils.h" // min, max +#include "nvcore/Hash.h" // hash + +namespace nv +{ + + // Helpers to convert vector types. Assume T has x,y members and 2 argument constructor. + //template T to(Vector2::Arg v) { return T(v.x, v.y); } + + // Helpers to convert vector types. Assume T has x,y,z members and 3 argument constructor. + //template T to(Vector3::Arg v) { return T(v.x, v.y, v.z); } + + // Helpers to convert vector types. Assume T has x,y,z members and 3 argument constructor. 
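The explicit to<T>() casts added at the end of Vector.h above trade implicit conversion operators for an explicit call guarded by a compile-time size check. Purely as an illustration (not part of the patch), here is a standalone sketch of the same pattern; Vec3 and Float3 are hypothetical stand-ins for nv::Vector3 and an engine-side type with identical layout, and static_assert plays the role of NV_COMPILER_CHECK:

    #include <cstdio>

    struct Vec3   { float x, y, z; };   // stands in for nv::Vector3
    struct Float3 { float x, y, z; };   // hypothetical target type with the same layout

    template <typename T>
    T to(const Vec3 & v)
    {
        // Guard against converting to a type with a different size/layout.
        static_assert(sizeof(T) == sizeof(Vec3), "layout mismatch");
        return T{ v.x, v.y, v.z };
    }

    int main()
    {
        Vec3 v = { 1.0f, 2.0f, 3.0f };
        Float3 f = to<Float3>(v);       // the target type is spelled out, so there is no ambiguity
        std::printf("%g %g %g\n", f.x, f.y, f.z);
        return 0;
    }

Because the caller names the destination type at the call site, the deduction ambiguity described in the header's comment never arises.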
+ //template T to(Vector4::Arg v) { return T(v.x, v.y, v.z, v.w); } + + + // Vector2 + inline Vector2::Vector2() {} + inline Vector2::Vector2(float f) : x(f), y(f) {} + inline Vector2::Vector2(float x, float y) : x(x), y(y) {} + inline Vector2::Vector2(Vector2::Arg v) : x(v.x), y(v.y) {} + + inline const Vector2 & Vector2::operator=(Vector2::Arg v) + { + x = v.x; + y = v.y; + return *this; + } + + inline const float * Vector2::ptr() const + { + return &x; + } + + inline void Vector2::set(float x, float y) + { + this->x = x; + this->y = y; + } + + inline Vector2 Vector2::operator-() const + { + return Vector2(-x, -y); + } + + inline void Vector2::operator+=(Vector2::Arg v) + { + x += v.x; + y += v.y; + } + + inline void Vector2::operator-=(Vector2::Arg v) + { + x -= v.x; + y -= v.y; + } + + inline void Vector2::operator*=(float s) + { + x *= s; + y *= s; + } + + inline void Vector2::operator*=(Vector2::Arg v) + { + x *= v.x; + y *= v.y; + } + + inline bool operator==(Vector2::Arg a, Vector2::Arg b) + { + return a.x == b.x && a.y == b.y; + } + inline bool operator!=(Vector2::Arg a, Vector2::Arg b) + { + return a.x != b.x || a.y != b.y; + } + + + // Vector3 + inline Vector3::Vector3() {} + inline Vector3::Vector3(float f) : x(f), y(f), z(f) {} + inline Vector3::Vector3(float x, float y, float z) : x(x), y(y), z(z) {} + inline Vector3::Vector3(Vector2::Arg v, float z) : x(v.x), y(v.y), z(z) {} + inline Vector3::Vector3(Vector3::Arg v) : x(v.x), y(v.y), z(v.z) {} + + inline const Vector3 & Vector3::operator=(Vector3::Arg v) + { + x = v.x; + y = v.y; + z = v.z; + return *this; + } + + + inline Vector2 Vector3::xy() const + { + return Vector2(x, y); + } + + inline const float * Vector3::ptr() const + { + return &x; + } + + inline void Vector3::set(float x, float y, float z) + { + this->x = x; + this->y = y; + this->z = z; + } + + inline Vector3 Vector3::operator-() const + { + return Vector3(-x, -y, -z); + } + + inline void Vector3::operator+=(Vector3::Arg v) + { + x += v.x; + y += v.y; + z += v.z; + } + + inline void Vector3::operator-=(Vector3::Arg v) + { + x -= v.x; + y -= v.y; + z -= v.z; + } + + inline void Vector3::operator*=(float s) + { + x *= s; + y *= s; + z *= s; + } + + inline void Vector3::operator/=(float s) + { + float is = 1.0f / s; + x *= is; + y *= is; + z *= is; + } + + inline void Vector3::operator*=(Vector3::Arg v) + { + x *= v.x; + y *= v.y; + z *= v.z; + } + + inline void Vector3::operator/=(Vector3::Arg v) + { + x /= v.x; + y /= v.y; + z /= v.z; + } + + inline bool operator==(Vector3::Arg a, Vector3::Arg b) + { + return a.x == b.x && a.y == b.y && a.z == b.z; + } + inline bool operator!=(Vector3::Arg a, Vector3::Arg b) + { + return a.x != b.x || a.y != b.y || a.z != b.z; + } + + + // Vector4 + inline Vector4::Vector4() {} + inline Vector4::Vector4(float f) : x(f), y(f), z(f), w(f) {} + inline Vector4::Vector4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {} + inline Vector4::Vector4(Vector2::Arg v, float z, float w) : x(v.x), y(v.y), z(z), w(w) {} + inline Vector4::Vector4(Vector2::Arg v, Vector2::Arg u) : x(v.x), y(v.y), z(u.x), w(u.y) {} + inline Vector4::Vector4(Vector3::Arg v, float w) : x(v.x), y(v.y), z(v.z), w(w) {} + inline Vector4::Vector4(Vector4::Arg v) : x(v.x), y(v.y), z(v.z), w(v.w) {} + + inline const Vector4 & Vector4::operator=(const Vector4 & v) + { + x = v.x; + y = v.y; + z = v.z; + w = v.w; + return *this; + } + + inline Vector2 Vector4::xy() const + { + return Vector2(x, y); + } + + inline Vector2 Vector4::zw() const + { + return 
Vector2(z, w); + } + + inline Vector3 Vector4::xyz() const + { + return Vector3(x, y, z); + } + + inline const float * Vector4::ptr() const + { + return &x; + } + + inline void Vector4::set(float x, float y, float z, float w) + { + this->x = x; + this->y = y; + this->z = z; + this->w = w; + } + + inline Vector4 Vector4::operator-() const + { + return Vector4(-x, -y, -z, -w); + } + + inline void Vector4::operator+=(Vector4::Arg v) + { + x += v.x; + y += v.y; + z += v.z; + w += v.w; + } + + inline void Vector4::operator-=(Vector4::Arg v) + { + x -= v.x; + y -= v.y; + z -= v.z; + w -= v.w; + } + + inline void Vector4::operator*=(float s) + { + x *= s; + y *= s; + z *= s; + w *= s; + } + + inline void Vector4::operator/=(float s) + { + x /= s; + y /= s; + z /= s; + w /= s; + } + + inline void Vector4::operator*=(Vector4::Arg v) + { + x *= v.x; + y *= v.y; + z *= v.z; + w *= v.w; + } + + inline void Vector4::operator/=(Vector4::Arg v) + { + x /= v.x; + y /= v.y; + z /= v.z; + w /= v.w; + } + + inline bool operator==(Vector4::Arg a, Vector4::Arg b) + { + return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; + } + inline bool operator!=(Vector4::Arg a, Vector4::Arg b) + { + return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; + } + + + + // Functions + + + // Vector2 + + inline Vector2 add(Vector2::Arg a, Vector2::Arg b) + { + return Vector2(a.x + b.x, a.y + b.y); + } + inline Vector2 operator+(Vector2::Arg a, Vector2::Arg b) + { + return add(a, b); + } + + inline Vector2 sub(Vector2::Arg a, Vector2::Arg b) + { + return Vector2(a.x - b.x, a.y - b.y); + } + inline Vector2 operator-(Vector2::Arg a, Vector2::Arg b) + { + return sub(a, b); + } + + inline Vector2 scale(Vector2::Arg v, float s) + { + return Vector2(v.x * s, v.y * s); + } + + inline Vector2 scale(Vector2::Arg v, Vector2::Arg s) + { + return Vector2(v.x * s.x, v.y * s.y); + } + + inline Vector2 operator*(Vector2::Arg v, float s) + { + return scale(v, s); + } + + inline Vector2 operator*(Vector2::Arg v1, Vector2::Arg v2) + { + return Vector2(v1.x*v2.x, v1.y*v2.y); + } + + inline Vector2 operator*(float s, Vector2::Arg v) + { + return scale(v, s); + } + + inline Vector2 operator/(Vector2::Arg v, float s) + { + return scale(v, 1.0f/s); + } + + inline Vector2 lerp(Vector2::Arg v1, Vector2::Arg v2, float t) + { + const float s = 1.0f - t; + return Vector2(v1.x * s + t * v2.x, v1.y * s + t * v2.y); + } + + inline float dot(Vector2::Arg a, Vector2::Arg b) + { + return a.x * b.x + a.y * b.y; + } + + inline float lengthSquared(Vector2::Arg v) + { + return v.x * v.x + v.y * v.y; + } + + inline float length(Vector2::Arg v) + { + return sqrtf(lengthSquared(v)); + } + + inline float distance(Vector2::Arg a, Vector2::Arg b) + { + return length(a - b); + } + + inline float inverseLength(Vector2::Arg v) + { + return 1.0f / sqrtf(lengthSquared(v)); + } + + inline bool isNormalized(Vector2::Arg v, float epsilon = NV_NORMAL_EPSILON) + { + return equal(length(v), 1, epsilon); + } + + inline Vector2 normalize(Vector2::Arg v, float epsilon = NV_EPSILON) + { + float l = length(v); + nvDebugCheck(!isZero(l, epsilon)); + Vector2 n = scale(v, 1.0f / l); + nvDebugCheck(isNormalized(n)); + return n; + } + + inline Vector2 normalizeSafe(Vector2::Arg v, Vector2::Arg fallback, float epsilon = NV_EPSILON) + { + float l = length(v); + if (isZero(l, epsilon)) { + return fallback; + } + return scale(v, 1.0f / l); + } + + // Safe, branchless normalization from Andy Firth. All error checking ommitted. 
+ // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/ + inline Vector2 normalizeFast(Vector2::Arg v) + { + const float very_small_float = 1.0e-037f; + float l = very_small_float + length(v); + return scale(v, 1.0f / l); + } + + inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON) + { + return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon); + } + + inline Vector2 min(Vector2::Arg a, Vector2::Arg b) + { + return Vector2(min(a.x, b.x), min(a.y, b.y)); + } + + inline Vector2 max(Vector2::Arg a, Vector2::Arg b) + { + return Vector2(max(a.x, b.x), max(a.y, b.y)); + } + + inline Vector2 clamp(Vector2::Arg v, float min, float max) + { + return Vector2(clamp(v.x, min, max), clamp(v.y, min, max)); + } + + inline Vector2 saturate(Vector2::Arg v) + { + return Vector2(saturate(v.x), saturate(v.y)); + } + + inline bool isFinite(Vector2::Arg v) + { + return isFinite(v.x) && isFinite(v.y); + } + + inline Vector2 validate(Vector2::Arg v, Vector2::Arg fallback = Vector2(0.0f)) + { + if (!isFinite(v)) return fallback; + Vector2 vf = v; + nv::floatCleanup(vf.component, 2); + return vf; + } + + // Note, this is the area scaled by 2! + inline float triangleArea(Vector2::Arg v0, Vector2::Arg v1) + { + return (v0.x * v1.y - v0.y * v1.x); // * 0.5f; + } + inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c) + { + // IC: While it may be appealing to use the following expression: + //return (c.x * a.y + a.x * b.y + b.x * c.y - b.x * a.y - c.x * b.y - a.x * c.y); // * 0.5f; + + // That's actually a terrible idea. Small triangles far from the origin can end up producing fairly large floating point + // numbers and the results becomes very unstable and dependent on the order of the factors. + + // Instead, it's preferable to substract the vertices first, and multiply the resulting small values together. The result + // in this case is always much more accurate (as long as the triangle is small) and less dependent of the location of + // the triangle. 
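The comment above, and the subtract-first form returned right after it, are about catastrophic cancellation: the expanded cross product mixes very large intermediate terms whose tiny difference is the answer. A standalone snippet (not taken from the NVTT sources) that makes the effect visible for a small triangle translated far from the origin:

    #include <cstdio>

    // Twice the signed area of triangle (a, b, c), computed two ways in 32-bit floats.
    static float areaExpanded(float ax, float ay, float bx, float by, float cx, float cy)
    {
        return cx * ay + ax * by + bx * cy - bx * ay - cx * by - ax * cy;
    }
    static float areaShifted(float ax, float ay, float bx, float by, float cx, float cy)
    {
        return (ax - cx) * (by - cy) - (ay - cy) * (bx - cx);   // subtract the vertices first
    }

    int main()
    {
        const float d = 16384.0f;   // translate a tiny triangle far from the origin
        float e = areaExpanded(d, d, d + 0.5f, d, d, d + 0.5f);
        float s = areaShifted (d, d, d + 0.5f, d, d, d + 0.5f);
        std::printf("expanded: %g  subtract-first: %g  (exact: 0.25)\n", e, s);
        return 0;
    }

With plain float evaluation the expanded form typically prints 0 while the subtract-first form prints the exact 0.25, which is the behaviour the comment warns about.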
+ + //return ((a.x - c.x) * (b.y - c.y) - (a.y - c.y) * (b.x - c.x)); // * 0.5f; + return triangleArea(a-c, b-c); + } + + + template <> + inline uint hash(const Vector2 & v, uint h) + { + return sdbmFloatHash(v.component, 2, h); + } + + + + // Vector3 + + inline Vector3 add(Vector3::Arg a, Vector3::Arg b) + { + return Vector3(a.x + b.x, a.y + b.y, a.z + b.z); + } + inline Vector3 add(Vector3::Arg a, float b) + { + return Vector3(a.x + b, a.y + b, a.z + b); + } + inline Vector3 operator+(Vector3::Arg a, Vector3::Arg b) + { + return add(a, b); + } + inline Vector3 operator+(Vector3::Arg a, float b) + { + return add(a, b); + } + + inline Vector3 sub(Vector3::Arg a, Vector3::Arg b) + { + return Vector3(a.x - b.x, a.y - b.y, a.z - b.z); + } + inline Vector3 sub(Vector3::Arg a, float b) + { + return Vector3(a.x - b, a.y - b, a.z - b); + } + inline Vector3 operator-(Vector3::Arg a, Vector3::Arg b) + { + return sub(a, b); + } + inline Vector3 operator-(Vector3::Arg a, float b) + { + return sub(a, b); + } + + inline Vector3 cross(Vector3::Arg a, Vector3::Arg b) + { + return Vector3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); + } + + inline Vector3 scale(Vector3::Arg v, float s) + { + return Vector3(v.x * s, v.y * s, v.z * s); + } + + inline Vector3 scale(Vector3::Arg v, Vector3::Arg s) + { + return Vector3(v.x * s.x, v.y * s.y, v.z * s.z); + } + + inline Vector3 operator*(Vector3::Arg v, float s) + { + return scale(v, s); + } + + inline Vector3 operator*(float s, Vector3::Arg v) + { + return scale(v, s); + } + + inline Vector3 operator*(Vector3::Arg v, Vector3::Arg s) + { + return scale(v, s); + } + + inline Vector3 operator/(Vector3::Arg v, float s) + { + return scale(v, 1.0f/s); + } + + /*inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, float s) + { + return Vector3(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s); + }*/ + + inline Vector3 lerp(Vector3::Arg v1, Vector3::Arg v2, float t) + { + const float s = 1.0f - t; + return Vector3(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z); + } + + inline float dot(Vector3::Arg a, Vector3::Arg b) + { + return a.x * b.x + a.y * b.y + a.z * b.z; + } + + inline float lengthSquared(Vector3::Arg v) + { + return v.x * v.x + v.y * v.y + v.z * v.z; + } + + inline float length(Vector3::Arg v) + { + return sqrtf(lengthSquared(v)); + } + + inline float distance(Vector3::Arg a, Vector3::Arg b) + { + return length(a - b); + } + + inline float distanceSquared(Vector3::Arg a, Vector3::Arg b) + { + return lengthSquared(a - b); + } + + inline float inverseLength(Vector3::Arg v) + { + return 1.0f / sqrtf(lengthSquared(v)); + } + + inline bool isNormalized(Vector3::Arg v, float epsilon = NV_NORMAL_EPSILON) + { + return equal(length(v), 1, epsilon); + } + + inline Vector3 normalize(Vector3::Arg v, float epsilon = NV_EPSILON) + { + float l = length(v); + nvDebugCheck(!isZero(l, epsilon)); + Vector3 n = scale(v, 1.0f / l); + nvDebugCheck(isNormalized(n)); + return n; + } + + inline Vector3 normalizeSafe(Vector3::Arg v, Vector3::Arg fallback, float epsilon = NV_EPSILON) + { + float l = length(v); + if (isZero(l, epsilon)) { + return fallback; + } + return scale(v, 1.0f / l); + } + + // Safe, branchless normalization from Andy Firth. All error checking ommitted. 
+ // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/ + inline Vector3 normalizeFast(Vector3::Arg v) + { + const float very_small_float = 1.0e-037f; + float l = very_small_float + length(v); + return scale(v, 1.0f / l); + } + + inline bool equal(Vector3::Arg v1, Vector3::Arg v2, float epsilon = NV_EPSILON) + { + return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon); + } + + inline Vector3 min(Vector3::Arg a, Vector3::Arg b) + { + return Vector3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z)); + } + + inline Vector3 max(Vector3::Arg a, Vector3::Arg b) + { + return Vector3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z)); + } + + inline Vector3 clamp(Vector3::Arg v, float min, float max) + { + return Vector3(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max)); + } + + inline Vector3 saturate(Vector3::Arg v) + { + return Vector3(saturate(v.x), saturate(v.y), saturate(v.z)); + } + + inline Vector3 floor(Vector3::Arg v) + { + return Vector3(floorf(v.x), floorf(v.y), floorf(v.z)); + } + + inline Vector3 ceil(Vector3::Arg v) + { + return Vector3(ceilf(v.x), ceilf(v.y), ceilf(v.z)); + } + + inline bool isFinite(Vector3::Arg v) + { + return isFinite(v.x) && isFinite(v.y) && isFinite(v.z); + } + + inline Vector3 validate(Vector3::Arg v, Vector3::Arg fallback = Vector3(0.0f)) + { + if (!isFinite(v)) return fallback; + Vector3 vf = v; + nv::floatCleanup(vf.component, 3); + return vf; + } + + inline Vector3 reflect(Vector3::Arg v, Vector3::Arg n) + { + return v - (2 * dot(v, n)) * n; + } + + template <> + inline uint hash(const Vector3 & v, uint h) + { + return sdbmFloatHash(v.component, 3, h); + } + + + // Vector4 + + inline Vector4 add(Vector4::Arg a, Vector4::Arg b) + { + return Vector4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); + } + inline Vector4 operator+(Vector4::Arg a, Vector4::Arg b) + { + return add(a, b); + } + + inline Vector4 sub(Vector4::Arg a, Vector4::Arg b) + { + return Vector4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); + } + inline Vector4 operator-(Vector4::Arg a, Vector4::Arg b) + { + return sub(a, b); + } + + inline Vector4 scale(Vector4::Arg v, float s) + { + return Vector4(v.x * s, v.y * s, v.z * s, v.w * s); + } + + inline Vector4 scale(Vector4::Arg v, Vector4::Arg s) + { + return Vector4(v.x * s.x, v.y * s.y, v.z * s.z, v.w * s.w); + } + + inline Vector4 operator*(Vector4::Arg v, float s) + { + return scale(v, s); + } + + inline Vector4 operator*(float s, Vector4::Arg v) + { + return scale(v, s); + } + + inline Vector4 operator*(Vector4::Arg v, Vector4::Arg s) + { + return scale(v, s); + } + + inline Vector4 operator/(Vector4::Arg v, float s) + { + return scale(v, 1.0f/s); + } + + /*inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s) + { + return Vector4(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s, a.w + b.w * s); + }*/ + + inline Vector4 lerp(Vector4::Arg v1, Vector4::Arg v2, float t) + { + const float s = 1.0f - t; + return Vector4(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z, v1.w * s + t * v2.w); + } + + inline float dot(Vector4::Arg a, Vector4::Arg b) + { + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; + } + + inline float lengthSquared(Vector4::Arg v) + { + return v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w; + } + + inline float length(Vector4::Arg v) + { + return sqrtf(lengthSquared(v)); + } + + inline float inverseLength(Vector4::Arg v) + { + return 1.0f / sqrtf(lengthSquared(v)); + } + + inline bool isNormalized(Vector4::Arg v, float epsilon = 
NV_NORMAL_EPSILON) + { + return equal(length(v), 1, epsilon); + } + + inline Vector4 normalize(Vector4::Arg v, float epsilon = NV_EPSILON) + { + float l = length(v); + nvDebugCheck(!isZero(l, epsilon)); + Vector4 n = scale(v, 1.0f / l); + nvDebugCheck(isNormalized(n)); + return n; + } + + inline Vector4 normalizeSafe(Vector4::Arg v, Vector4::Arg fallback, float epsilon = NV_EPSILON) + { + float l = length(v); + if (isZero(l, epsilon)) { + return fallback; + } + return scale(v, 1.0f / l); + } + + // Safe, branchless normalization from Andy Firth. All error checking ommitted. + // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/ + inline Vector4 normalizeFast(Vector4::Arg v) + { + const float very_small_float = 1.0e-037f; + float l = very_small_float + length(v); + return scale(v, 1.0f / l); + } + + inline bool equal(Vector4::Arg v1, Vector4::Arg v2, float epsilon = NV_EPSILON) + { + return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon) && equal(v1.w, v2.w, epsilon); + } + + inline Vector4 min(Vector4::Arg a, Vector4::Arg b) + { + return Vector4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w)); + } + + inline Vector4 max(Vector4::Arg a, Vector4::Arg b) + { + return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); + } + + inline Vector4 clamp(Vector4::Arg v, float min, float max) + { + return Vector4(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max), clamp(v.w, min, max)); + } + + inline Vector4 saturate(Vector4::Arg v) + { + return Vector4(saturate(v.x), saturate(v.y), saturate(v.z), saturate(v.w)); + } + + inline bool isFinite(Vector4::Arg v) + { + return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w); + } + + inline Vector4 validate(Vector4::Arg v, Vector4::Arg fallback = Vector4(0.0f)) + { + if (!isFinite(v)) return fallback; + Vector4 vf = v; + nv::floatCleanup(vf.component, 4); + return vf; + } + + template <> + inline uint hash(const Vector4 & v, uint h) + { + return sdbmFloatHash(v.component, 4, h); + } + + +#if NV_OS_IOS // LLVM is not happy with implicit conversion of immediate constants to float + + //int: + + inline Vector2 scale(Vector2::Arg v, int s) + { + return Vector2(v.x * s, v.y * s); + } + + inline Vector2 operator*(Vector2::Arg v, int s) + { + return scale(v, s); + } + + inline Vector2 operator*(int s, Vector2::Arg v) + { + return scale(v, s); + } + + inline Vector2 operator/(Vector2::Arg v, int s) + { + return scale(v, 1.0f/s); + } + + inline Vector3 scale(Vector3::Arg v, int s) + { + return Vector3(v.x * s, v.y * s, v.z * s); + } + + inline Vector3 operator*(Vector3::Arg v, int s) + { + return scale(v, s); + } + + inline Vector3 operator*(int s, Vector3::Arg v) + { + return scale(v, s); + } + + inline Vector3 operator/(Vector3::Arg v, int s) + { + return scale(v, 1.0f/s); + } + + inline Vector4 scale(Vector4::Arg v, int s) + { + return Vector4(v.x * s, v.y * s, v.z * s, v.w * s); + } + + inline Vector4 operator*(Vector4::Arg v, int s) + { + return scale(v, s); + } + + inline Vector4 operator*(int s, Vector4::Arg v) + { + return scale(v, s); + } + + inline Vector4 operator/(Vector4::Arg v, int s) + { + return scale(v, 1.0f/s); + } + + //double: + + inline Vector3 operator*(Vector3::Arg v, double s) + { + return scale(v, (float)s); + } + + inline Vector3 operator*(double s, Vector3::Arg v) + { + return scale(v, (float)s); + } + + inline Vector3 operator/(Vector3::Arg v, double s) + { + return scale(v, 1.f/((float)s)); + } + +#endif //NV_OS_IOS + +} // nv 
namespace + +#endif // NV_MATH_VECTOR_INL Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/ftoi.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/ftoi.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/ftoi.h @@ -0,0 +1,256 @@ +// This code is in the public domain -- castano@gmail.com + +#pragma once +#ifndef NV_MATH_FTOI_H +#define NV_MATH_FTOI_H + +#include "nvmath/nvmath.h" + +#include + +namespace nv +{ + // Optimized float to int conversions. See: + // http://cbloomrants.blogspot.com/2009/01/01-17-09-float-to-int.html + // http://www.stereopsis.com/sree/fpu2006.html + // http://assemblyrequired.crashworks.org/2009/01/12/why-you-should-never-cast-floats-to-ints/ + // http://chrishecker.com/Miscellaneous_Technical_Articles#Floating_Point + + + union DoubleAnd64 { + uint64 i; + double d; + }; + + static const double floatutil_xs_doublemagic = (6755399441055744.0); // 2^52 * 1.5 + static const double floatutil_xs_doublemagicdelta = (1.5e-8); // almost .5f = .5f + 1e^(number of exp bit) + static const double floatutil_xs_doublemagicroundeps = (0.5f - floatutil_xs_doublemagicdelta); // almost .5f = .5f - 1e^(number of exp bit) + + NV_FORCEINLINE int ftoi_round_xs(double val, double magic) { +#if 1 + DoubleAnd64 dunion; + dunion.d = val + magic; + return (int32) dunion.i; // just cast to grab the bottom bits +#else + val += magic; + return ((int*)&val)[0]; // @@ Assumes little endian. +#endif + } + + NV_FORCEINLINE int ftoi_round_xs(float val) { + return ftoi_round_xs(val, floatutil_xs_doublemagic); + } + + NV_FORCEINLINE int ftoi_floor_xs(float val) { + return ftoi_round_xs(val - floatutil_xs_doublemagicroundeps, floatutil_xs_doublemagic); + } + + NV_FORCEINLINE int ftoi_ceil_xs(float val) { + return ftoi_round_xs(val + floatutil_xs_doublemagicroundeps, floatutil_xs_doublemagic); + } + + NV_FORCEINLINE int ftoi_trunc_xs(float val) { + return (val<0) ? ftoi_ceil_xs(val) : ftoi_floor_xs(val); + } + +#if NV_CPU_X86 || NV_CPU_X86_64 + + NV_FORCEINLINE int ftoi_round_sse(float f) { + return _mm_cvt_ss2si(_mm_set_ss(f)); + } + + NV_FORCEINLINE int ftoi_trunc_sse(float f) { + return _mm_cvtt_ss2si(_mm_set_ss(f)); + } + +#endif + + + +#if NV_USE_SSE + + NV_FORCEINLINE int ftoi_round(float val) { + return ftoi_round_sse(val); + } + + NV_FORCEINLINE int ftoi_trunc(float f) { + return ftoi_trunc_sse(f); + } + + // We can probably do better than this. See for example: + // http://dss.stephanierct.com/DevBlog/?p=8 + NV_FORCEINLINE int ftoi_floor(float val) { + return ftoi_round(floorf(val)); + } + + NV_FORCEINLINE int ftoi_ceil(float val) { + return ftoi_round(ceilf(val)); + } + +#else + + // In theory this should work with any double floating point math implementation, but it appears that MSVC produces incorrect code + // when SSE2 is targeted and fast math is enabled (/arch:SSE2 & /fp:fast). These problems go away with /fp:precise, which is the default mode. + + NV_FORCEINLINE int ftoi_round(float val) { + return ftoi_round_xs(val); + } + + NV_FORCEINLINE int ftoi_floor(float val) { + return ftoi_floor_xs(val); + } + + NV_FORCEINLINE int ftoi_ceil(float val) { + return ftoi_ceil_xs(val); + } + + NV_FORCEINLINE int ftoi_trunc(float f) { + return ftoi_trunc_xs(f); + } + +#endif + + + inline void test_ftoi() { + + // Round to nearest integer. 
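The ftoi_*_xs() helpers above use the classic "double magic" trick: adding 2^52 * 1.5 to a value in int32 range forces the rounded integer part into the low 32 bits of the double's mantissa, where a plain cast can read it back, and the rounding follows the FPU's default round-to-nearest-even mode (which is what the checks that follow expect, e.g. 1.5 -> 2 and 2.5 -> 2). A standalone sketch of the trick (not part of the patch), assuming the default rounding mode and no fast-math reassociation:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Round-to-nearest-even float -> int via the 2^52 * 1.5 magic constant.
    static int ftoi_round_magic(double val)
    {
        const double magic = 6755399441055744.0;   // 2^52 * 1.5
        val += magic;                              // rounded integer now sits in the low mantissa bits
        std::uint64_t bits;
        std::memcpy(&bits, &val, sizeof bits);     // well-defined way to grab the bit pattern
        return static_cast<std::int32_t>(bits);    // low 32 bits hold the result (negatives included)
    }

    int main()
    {
        std::printf("%d %d %d %d\n",
                    ftoi_round_magic(0.6),    // 1
                    ftoi_round_magic(-0.7),   // -1
                    ftoi_round_magic(1.5),    // 2, ties round to even
                    ftoi_round_magic(2.5));   // 2
        return 0;
    }

As the comment in ftoi.h notes for MSVC with /arch:SSE2 and /fp:fast, aggressive floating-point optimization can break the magic addition, so the trick only holds under standard floating-point semantics.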
+ nvCheck(ftoi_round(0.1f) == 0); + nvCheck(ftoi_round(0.6f) == 1); + nvCheck(ftoi_round(-0.2f) == 0); + nvCheck(ftoi_round(-0.7f) == -1); + nvCheck(ftoi_round(10.1f) == 10); + nvCheck(ftoi_round(10.6f) == 11); + nvCheck(ftoi_round(-90.1f) == -90); + nvCheck(ftoi_round(-90.6f) == -91); + + nvCheck(ftoi_round(0) == 0); + nvCheck(ftoi_round(1) == 1); + nvCheck(ftoi_round(-1) == -1); + + nvCheck(ftoi_round(0.5f) == 0); // How are midpoints rounded? Bankers rounding. + nvCheck(ftoi_round(1.5f) == 2); + nvCheck(ftoi_round(2.5f) == 2); + nvCheck(ftoi_round(3.5f) == 4); + nvCheck(ftoi_round(4.5f) == 4); + nvCheck(ftoi_round(-0.5f) == 0); + nvCheck(ftoi_round(-1.5f) == -2); + + + // Truncation (round down if > 0, round up if < 0). + nvCheck(ftoi_trunc(0.1f) == 0); + nvCheck(ftoi_trunc(0.6f) == 0); + nvCheck(ftoi_trunc(-0.2f) == 0); + nvCheck(ftoi_trunc(-0.7f) == 0); // @@ When using /arch:SSE2 in Win32, msvc produce wrong code for this one. It is skipping the addition. + nvCheck(ftoi_trunc(1.99f) == 1); + nvCheck(ftoi_trunc(-1.2f) == -1); + + // Floor (round down). + nvCheck(ftoi_floor(0.1f) == 0); + nvCheck(ftoi_floor(0.6f) == 0); + nvCheck(ftoi_floor(-0.2f) == -1); + nvCheck(ftoi_floor(-0.7f) == -1); + nvCheck(ftoi_floor(1.99f) == 1); + nvCheck(ftoi_floor(-1.2f) == -2); + + nvCheck(ftoi_floor(0) == 0); + nvCheck(ftoi_floor(1) == 1); + nvCheck(ftoi_floor(-1) == -1); + nvCheck(ftoi_floor(2) == 2); + nvCheck(ftoi_floor(-2) == -2); + + // Ceil (round up). + nvCheck(ftoi_ceil(0.1f) == 1); + nvCheck(ftoi_ceil(0.6f) == 1); + nvCheck(ftoi_ceil(-0.2f) == 0); + nvCheck(ftoi_ceil(-0.7f) == 0); + nvCheck(ftoi_ceil(1.99f) == 2); + nvCheck(ftoi_ceil(-1.2f) == -1); + + nvCheck(ftoi_ceil(0) == 0); + nvCheck(ftoi_ceil(1) == 1); + nvCheck(ftoi_ceil(-1) == -1); + nvCheck(ftoi_ceil(2) == 2); + nvCheck(ftoi_ceil(-2) == -2); + } + + + + + + // Safe versions using standard casts. + + inline int iround(float f) + { + return int(floorf(f + 0.5f)); + } + + inline int iround(double f) + { + return int(::floor(f + 0.5)); + } + + inline int ifloor(float f) + { + return int(floorf(f)); + } + + inline int iceil(float f) + { + return int(ceilf(f)); + } + + + + // I'm always confused about which quantizer to use. I think we should choose a quantizer based on how the values are expanded later and this is generally using the 'exact endpoints' rule. + // Some notes from cbloom: http://cbloomrants.blogspot.com/2011/07/07-26-11-pixel-int-to-float-options.html + + // Quantize a float in the [0,1] range, using exact end points or uniform bins. + inline float quantizeFloat(float x, uint bits, bool exactEndPoints = true) { + nvDebugCheck(bits <= 16); + + float range = float(1 << bits); + if (exactEndPoints) { + return floorf(x * (range-1) + 0.5f) / (range-1); + } + else { + return (floorf(x * range) + 0.5f) / range; + } + } + + + // This is the most common rounding mode: + // + // 0 1 2 3 + // |___|_______|_______|___| + // 0 1 + // + // You get that if you take the unit floating point number multiply by 'N-1' and round to nearest. That is, `i = round(f * (N-1))`. 
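quantizeFloat() just above selects between the two usual [0, 1] quantizers, and the surrounding comment describes the round(f * (N-1)) / divide-by-(N-1) convention that goes with exact end points. A small worked comparison (not part of the patch) for bits = 2, i.e. four codes:

    #include <cmath>
    #include <cstdio>

    // The same arithmetic as quantizeFloat() above, spelled out for bits = 2.
    int main()
    {
        const float x = 0.7f;
        const float range = 4.0f;                                                // 1 << 2

        float exact   = std::floor(x * (range - 1.0f) + 0.5f) / (range - 1.0f);  // codes at 0, 1/3, 2/3, 1
        float uniform = (std::floor(x * range) + 0.5f) / range;                  // codes at the bin centres

        std::printf("exact endpoints: %f\nuniform bins:    %f\n", exact, uniform);
        return 0;
    }

This prints roughly 0.666667 and 0.625000: with exact end points the representable values are i / (N - 1), so 0.0 and 1.0 are themselves codes and the round/reconstruct pair applies directly, whereas with uniform bins every input maps to the centre of one of N equal bins and the end points are never hit exactly.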
+ // You reconstruct the original float dividing by 'N-1': `f = i / (N-1)` + + + // 0 1 2 3 + // |_____|_____|_____|_____| + // 0 1 + + /*enum BinningMode { + RoundMode_ExactEndPoints, + RoundMode_UniformBins, + };*/ + + template + inline uint unitFloatToFixed(float f) { + return ftoi_round(f * ((1<(f); + } + + inline uint16 unitFloatToFixed16(float f) { + return (uint16)unitFloatToFixed<16>(f); + } + + +} // nv + +#endif // NV_MATH_FTOI_H Index: ps/trunk/libraries/source/nvtt/src/src/nvmath/nvmath.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvmath/nvmath.h +++ ps/trunk/libraries/source/nvtt/src/src/nvmath/nvmath.h @@ -1,13 +1,26 @@ // This code is in the public domain -- castanyo@yahoo.es +#pragma once #ifndef NV_MATH_H #define NV_MATH_H -#include -#include +#include "nvcore/nvcore.h" +#include "nvcore/Debug.h" // nvDebugCheck +#include "nvcore/Utils.h" // max, clamp #include +#if NV_OS_WIN32 || NV_OS_XBOX +#include // finite, isnan +#endif + +#if NV_CPU_X86 || NV_CPU_X86_64 + //#include + #include +#endif + + + // Function linkage #if NVMATH_SHARED #ifdef NVMATH_EXPORTS @@ -22,142 +35,295 @@ #define NVMATH_CLASS #endif // NVMATH_SHARED +// Set some reasonable defaults. +#ifndef NV_USE_ALTIVEC +# define NV_USE_ALTIVEC NV_CPU_PPC +//# define NV_USE_ALTIVEC defined(__VEC__) +#endif + +#ifndef NV_USE_SSE +# if NV_CPU_X86_64 + // x64 always supports at least SSE2 +# define NV_USE_SSE 2 +# elif NV_CC_MSVC && defined(_M_IX86_FP) + // Also on x86 with the /arch:SSE flag in MSVC. +# define NV_USE_SSE _M_IX86_FP // 1=SSE, 2=SS2 +# elif defined(__SSE__) +# define NV_USE_SSE 1 +# elif defined(__SSE2__) +# define NV_USE_SSE 2 +# else + // Otherwise we assume no SSE. +# define NV_USE_SSE 0 +# endif +#endif + + +// Internally set NV_USE_SIMD when either altivec or sse is available. +#if NV_USE_ALTIVEC && NV_USE_SSE +# error "Cannot enable both altivec and sse!" +#endif + + + #ifndef PI -#define PI float(3.1415926535897932384626433833) +#define PI float(3.1415926535897932384626433833) #endif -#define NV_EPSILON (0.0001f) -#define NV_NORMAL_EPSILON (0.001f) +#define NV_EPSILON (0.0001f) +#define NV_NORMAL_EPSILON (0.001f) /* -#define SQ(r) ((r)*(r)) +#define SQ(r) ((r)*(r)) -#define SIGN_BITMASK 0x80000000 +#define SIGN_BITMASK 0x80000000 /// Integer representation of a floating-point value. -#define IR(x) ((uint32 &)(x)) +#define IR(x) ((uint32 &)(x)) /// Absolute integer representation of a floating-point value -#define AIR(x) (IR(x) & 0x7fffffff) +#define AIR(x) (IR(x) & 0x7fffffff) /// Floating-point representation of an integer value. -#define FR(x) ((float&)(x)) +#define FR(x) ((float&)(x)) /// Integer-based comparison of a floating point value. /// Don't use it blindly, it can be faster or slower than the FPU comparison, depends on the context. 
-#define IS_NEGATIVE_FLOAT(x) (IR(x)&SIGN_BITMASK) +#define IS_NEGATIVE_FLOAT(x) (IR(x)&SIGN_BITMASK) */ -inline double sqrt_assert(const double f) +extern "C" inline double sqrt_assert(const double f) { - nvDebugCheck(f >= 0.0f); - return sqrt(f); + nvDebugCheck(f >= 0.0f); + return sqrt(f); } inline float sqrtf_assert(const float f) { - nvDebugCheck(f >= 0.0f); - return sqrtf(f); + nvDebugCheck(f >= 0.0f); + return sqrtf(f); } -inline double acos_assert(const double f) +extern "C" inline double acos_assert(const double f) { - nvDebugCheck(f >= -1.0f && f <= 1.0f); - return acos(f); + nvDebugCheck(f >= -1.0f && f <= 1.0f); + return acos(f); } inline float acosf_assert(const float f) { - nvDebugCheck(f >= -1.0f && f <= 1.0f); - return acosf(f); + nvDebugCheck(f >= -1.0f && f <= 1.0f); + return acosf(f); } -inline double asin_assert(const double f) +extern "C" inline double asin_assert(const double f) { - nvDebugCheck(f >= -1.0f && f <= 1.0f); - return asin(f); + nvDebugCheck(f >= -1.0f && f <= 1.0f); + return asin(f); } inline float asinf_assert(const float f) { - nvDebugCheck(f >= -1.0f && f <= 1.0f); - return asinf(f); + nvDebugCheck(f >= -1.0f && f <= 1.0f); + return asinf(f); } // Replace default functions with asserting ones. +#if !NV_CC_MSVC || (NV_CC_MSVC && (_MSC_VER < 1700)) // IC: Apparently this was causing problems in Visual Studio 2012. See Issue 194: https://code.google.com/p/nvidia-texture-tools/issues/detail?id=194 #define sqrt sqrt_assert #define sqrtf sqrtf_assert #define acos acos_assert #define acosf acosf_assert #define asin asin_assert #define asinf asinf_assert - -#if NV_OS_WIN32 -#include #endif -namespace nv +#if NV_CC_MSVC +NV_FORCEINLINE float log2f(float x) { -inline float toRadian(float degree) { return degree * (PI / 180.0f); } -inline float toDegree(float radian) { return radian * (180.0f / PI); } - -inline bool equal(const float f0, const float f1, const float epsilon = NV_EPSILON) -{ - return fabs(f0-f1) <= epsilon; + nvCheck(x >= 0); + return logf(x) / logf(2.0f); } - -inline bool isZero(const float f, const float epsilon = NV_EPSILON) +NV_FORCEINLINE float exp2f(float x) { - return fabs(f) <= epsilon; + return powf(2.0f, x); } +#endif -inline bool isFinite(const float f) +namespace nv { -#if NV_OS_WIN32 - return _finite(f) != 0; -#elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD - return isfinite(f); + inline float toRadian(float degree) { return degree * (PI / 180.0f); } + inline float toDegree(float radian) { return radian * (180.0f / PI); } + + // Robust floating point comparisons: + // http://realtimecollisiondetection.net/blog/?p=89 + inline bool equal(const float f0, const float f1, const float epsilon = NV_EPSILON) + { + //return fabs(f0-f1) <= epsilon; + return fabs(f0-f1) <= epsilon * max3(1.0f, fabsf(f0), fabsf(f1)); + } + + inline bool isZero(const float f, const float epsilon = NV_EPSILON) + { + return fabs(f) <= epsilon; + } + + inline bool isFinite(const float f) + { +#if NV_OS_WIN32 || NV_OS_XBOX + return _finite(f) != 0; +#elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_NETBSD || NV_OS_OPENBSD || NV_OS_ORBIS + return isfinite(f); #elif NV_OS_LINUX - return finitef(f); + return finitef(f); #else -# error "isFinite not supported" +# error "isFinite not supported" #endif -//return std::isfinite (f); -//return finite (f); -} - -inline bool isNan(const float f) -{ -#if NV_OS_WIN32 - return _isnan(f) != 0; -#elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD - return isnan(f); -#elif NV_OS_LINUX - return isnanf(f); + //return std::isfinite (f); 
+ //return finite (f); + } + + inline bool isNan(const float f) + { +#if NV_OS_WIN32 || NV_OS_XBOX + return _isnan(f) != 0; +#elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_NETBSD || NV_OS_OPENBSD || NV_OS_ORBIS || NV_OS_LINUX + return isnan(f); #else -# error "isNan not supported" +# error "isNan not supported" #endif -} - -inline uint log2(uint i) -{ - uint value = 0; - while( i >>= 1 ) { - value++; - } - return value; -} + } -inline float lerp(float f0, float f1, float t) -{ - const float s = 1.0f - t; - return f0 * s + f1 * t; -} + inline uint log2(uint i) + { + uint value = 0; + while( i >>= 1 ) { + value++; + } + return value; + } + + inline float lerp(float f0, float f1, float t) + { + const float s = 1.0f - t; + return f0 * s + f1 * t; + } + + inline float square(float f) { return f * f; } + inline int square(int i) { return i * i; } + + inline float cube(float f) { return f * f * f; } + inline int cube(int i) { return i * i * i; } + + inline float frac(float f) + { + return f - floor(f); + } + + inline float floatRound(float f) + { + return floorf(f + 0.5f); + } + + // Eliminates negative zeros from a float array. + inline void floatCleanup(float * fp, int n) + { + for (int i = 0; i < n; i++) { + //nvDebugCheck(isFinite(fp[i])); + union { float f; uint32 i; } x = { fp[i] }; + if (x.i == 0x80000000) fp[i] = 0.0f; + } + } + + inline float saturate(float f) { + return clamp(f, 0.0f, 1.0f); + } + + inline float linearstep(float edge0, float edge1, float x) { + // Scale, bias and saturate x to 0..1 range + return saturate((x - edge0) / (edge1 - edge0)); + } + + inline float smoothstep(float edge0, float edge1, float x) { + x = linearstep(edge0, edge1, x); + + // Evaluate polynomial + return x*x*(3 - 2*x); + } + + inline int sign(float a) + { + return (a > 0) - (a < 0); + //if (a > 0.0f) return 1; + //if (a < 0.0f) return -1; + //return 0; + } + + union Float754 { + unsigned int raw; + float value; + struct { + #if NV_BIG_ENDIAN + unsigned int negative:1; + unsigned int biasedexponent:8; + unsigned int mantissa:23; + #else + unsigned int mantissa:23; + unsigned int biasedexponent:8; + unsigned int negative:1; + #endif + } field; + }; + + // Return the exponent of x ~ Floor(Log2(x)) + inline int floatExponent(float x) + { + Float754 f; + f.value = x; + return (f.field.biasedexponent - 127); + } + + + // FloatRGB9E5 + union Float3SE { + uint32 v; + struct { + #if NV_BIG_ENDIAN + uint32 e : 5; + uint32 zm : 9; + uint32 ym : 9; + uint32 xm : 9; + #else + uint32 xm : 9; + uint32 ym : 9; + uint32 zm : 9; + uint32 e : 5; + #endif + }; + }; + + // FloatR11G11B10 + union Float3PK { + uint32 v; + struct { + #if NV_BIG_ENDIAN + uint32 ze : 5; + uint32 zm : 5; + uint32 ye : 5; + uint32 ym : 6; + uint32 xe : 5; + uint32 xm : 6; + #else + uint32 xm : 6; + uint32 xe : 5; + uint32 ym : 6; + uint32 ye : 5; + uint32 zm : 5; + uint32 ze : 5; + #endif + }; + }; -inline float square(float f) -{ - return f * f; -} } // nv Index: ps/trunk/libraries/source/nvtt/src/src/nvthread/Atomic.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvthread/Atomic.h +++ ps/trunk/libraries/source/nvtt/src/src/nvthread/Atomic.h @@ -0,0 +1,408 @@ +// This code is in the public domain -- castanyo@yahoo.es + +#ifndef NV_THREAD_ATOMIC_H +#define NV_THREAD_ATOMIC_H + +#include "nvthread.h" + +#include "nvcore/Debug.h" + + +#if NV_CC_MSVC + +#include // Already included by nvthread.h + +#pragma intrinsic(_InterlockedIncrement, _InterlockedDecrement) +#pragma 
intrinsic(_InterlockedCompareExchange, _InterlockedExchange) +//#pragma intrinsic(_InterlockedExchangeAdd64) + +/* +extern "C" +{ + #pragma intrinsic(_InterlockedIncrement, _InterlockedDecrement) + LONG __cdecl _InterlockedIncrement(long volatile *Addend); + LONG __cdecl _InterlockedDecrement(long volatile *Addend); + + #pragma intrinsic(_InterlockedCompareExchange, _InterlockedExchange) + LONG __cdecl _InterlockedCompareExchange(long volatile * Destination, long Exchange, long Compared); + LONG __cdecl _InterlockedExchange(long volatile * Target, LONG Value); +} +*/ + +#endif // NV_CC_MSVC + +#if NV_CC_CLANG && POSH_CPU_STRONGARM +// LLVM/Clang do not yet have functioning atomics as of 2.1 +// #include +#endif + +//ACS: need this if we want to use Apple's atomics. +/* +#if NV_OS_IOS || NV_OS_DARWIN +// for iOS & OSX we use apple's atomics +#include "libkern/OSAtomic.h" +#endif +*/ + +namespace nv { + + // Load and stores. + inline uint32 loadRelaxed(const uint32 * ptr) { return *ptr; } + inline void storeRelaxed(uint32 * ptr, uint32 value) { *ptr = value; } + + inline uint32 loadAcquire(const volatile uint32 * ptr) + { + nvDebugCheck((intptr_t(ptr) & 3) == 0); + +#if POSH_CPU_X86 || POSH_CPU_X86_64 + uint32 ret = *ptr; // on x86, loads are Acquire + nvCompilerReadBarrier(); + return ret; +#elif POSH_CPU_STRONGARM || POSH_CPU_AARCH64 + // need more specific cpu type for armv7? + // also utilizes a full barrier + // currently treating laod like x86 - this could be wrong + + // this is the easiest but slowest way to do this + nvCompilerReadWriteBarrier(); + uint32 ret = *ptr; // replace with ldrex? + nvCompilerReadWriteBarrier(); + return ret; +#elif POSH_CPU_PPC64 + // need more specific cpu type for ppc64? + // also utilizes a full barrier + // currently treating load like x86 - this could be wrong + + // this is the easiest but slowest way to do this + nvCompilerReadWriteBarrier(); + uint32 ret = *ptr; // replace with ldrex? + nvCompilerReadWriteBarrier(); + return ret; +#else +#error "Not implemented" +#endif + } + + inline void storeRelease(volatile uint32 * ptr, uint32 value) + { + nvDebugCheck((intptr_t(ptr) & 3) == 0); + nvDebugCheck((intptr_t(&value) & 3) == 0); + +#if POSH_CPU_X86 || POSH_CPU_X86_64 + nvCompilerWriteBarrier(); + *ptr = value; // on x86, stores are Release + //nvCompilerWriteBarrier(); // @@ IC: Where does this barrier go? In nvtt it was after, in Witness before. Not sure which one is right. +#elif POSH_CPU_STRONGARM || POSH_CPU_AARCH64 + // this is the easiest but slowest way to do this + nvCompilerReadWriteBarrier(); + *ptr = value; //strex? + nvCompilerReadWriteBarrier(); +#elif POSH_CPU_PPC64 + // this is the easiest but slowest way to do this + nvCompilerReadWriteBarrier(); + *ptr = value; //strex? + nvCompilerReadWriteBarrier(); +#else +#error "Atomics not implemented." +#endif + } + + + template + inline void storeReleasePointer(volatile T * pTo, T from) + { + NV_COMPILER_CHECK(sizeof(T) == sizeof(intptr_t)); + nvDebugCheck((((intptr_t)pTo) % sizeof(intptr_t)) == 0); + nvDebugCheck((((intptr_t)&from) % sizeof(intptr_t)) == 0); + nvCompilerWriteBarrier(); + *pTo = from; // on x86, stores are Release + } + + template + inline T loadAcquirePointer(volatile T * ptr) + { + NV_COMPILER_CHECK(sizeof(T) == sizeof(intptr_t)); + nvDebugCheck((((intptr_t)ptr) % sizeof(intptr_t)) == 0); + T ret = *ptr; // on x86, loads are Acquire + nvCompilerReadBarrier(); + return ret; + } + + + // Atomics. @@ Assuming sequential memory order? 
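The loadAcquire/storeRelease helpers and their pointer-sized variants above exist to publish data safely from one thread to another. For illustration only (this sketch is not part of the patch), the same publication pattern written with std::atomic, which is the C++11 equivalent these pre-C++11 helpers emulate:

    #include <atomic>
    #include <cstdio>
    #include <thread>

    struct Payload { int a; int b; };

    static Payload g_data;
    static std::atomic<Payload *> g_published{ nullptr };

    static void producer()
    {
        g_data.a = 1;                                            // plain writes...
        g_data.b = 2;
        g_published.store(&g_data, std::memory_order_release);   // ...made visible by the release store
    }

    static void consumer()
    {
        Payload * p = nullptr;
        while ((p = g_published.load(std::memory_order_acquire)) == nullptr) {
            // spin until the producer publishes; the acquire load pairs with the release store
        }
        std::printf("%d %d\n", p->a, p->b);                      // guaranteed to observe 1 and 2
    }

    int main()
    {
        std::thread t1(producer);
        std::thread t2(consumer);
        t1.join();
        t2.join();
        return 0;
    }

storeReleasePointer()/loadAcquirePointer() above implement the store and load halves of this pattern by hand, relying, as their comments note, on x86's strong ordering plus compiler barriers instead of std::atomic.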
+ +#if NV_CC_MSVC + NV_COMPILER_CHECK(sizeof(uint32) == sizeof(long)); + + // Returns incremented value. + inline uint32 atomicIncrement(uint32 * value) + { + nvDebugCheck((intptr_t(value) & 3) == 0); + return uint32(_InterlockedIncrement((long *)value)); + } + + // Returns decremented value. + inline uint32 atomicDecrement(uint32 * value) + { + nvDebugCheck((intptr_t(value) & 3) == 0); + return uint32(_InterlockedDecrement((long *)value)); + } + + // Returns added value. + inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) { + nvDebugCheck((intptr_t(value) & 3) == 0); + return uint32(_InterlockedExchangeAdd((long*)value, (long)value_to_add)) + value_to_add; + } + + // Returns original value before addition. + inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add) { + nvDebugCheck((intptr_t(value) & 3) == 0); + return uint32(_InterlockedExchangeAdd((long*)value, (long)value_to_add)); + } + + + + + // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'. + // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated. + // @@ Is this strong or weak? Does InterlockedCompareExchange have spurious failures? + inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired) + { + nvDebugCheck((intptr_t(value) & 3) == 0); + long result = _InterlockedCompareExchange((long *)value, (long)desired, (long)expected); + return result == (long)expected; + } + + + inline uint32 atomicSwap(uint32 * value, uint32 desired) + { + nvDebugCheck((intptr_t(value) & 3) == 0); + return (uint32)_InterlockedExchange((long *)value, (long)desired); + } + + + +#elif NV_CC_CLANG && (NV_OS_IOS || NV_OS_DARWIN) + + //ACS: Use Apple's atomics instead? I don't know if these are better in any way; there are non-barrier versions too. There's no OSAtomicSwap32 tho' + /* + inline uint32 atomicIncrement(uint32 * value) + { + nvDebugCheck((intptr_t(value) & 3) == 0); + return (uint32)OSAtomicIncrement32Barrier((int32_t *)value); + } + + inline uint32 atomicDecrement(uint32 * value) + { + nvDebugCheck((intptr_t(value) & 3) == 0); + return (uint32)OSAtomicDecrement32Barrier((int32_t *)value); + } + + // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'. + // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated. + // @@ Is this strong or weak? + inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired) + { + nvDebugCheck((intptr_t(value) & 3) == 0); + return OSAtomicCompareAndSwap32Barrier((int32_t)expected, (int32_t)desired, (int32_t *)value); + } + */ + + // Returns incremented value. + inline uint32 atomicIncrement(uint32 * value) { + nvDebugCheck((intptr_t(value) & 3) == 0); + return __sync_add_and_fetch(value, 1); + } + + // Returns decremented value. + inline uint32 atomicDecrement(uint32 * value) { + nvDebugCheck((intptr_t(value) & 3) == 0); + return __sync_sub_and_fetch(value, 1); + } + + // Returns added value. + inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) { + nvDebugCheck((intptr_t(value) & 3) == 0); + return __sync_add_and_fetch(value, value_to_add); + } + + // Returns original value before addition. + inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add) { + nvDebugCheck((intptr_t(value) & 3) == 0); + return __sync_fetch_and_add(value, value_to_add); + } + + + // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'. + // @@ C++0x style CAS? 
Unlike the C++0x version, 'expected' is not passed by reference and not mutated. + // @@ Is this strong or weak? + inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired) + { + nvDebugCheck((intptr_t(value) & 3) == 0); + return __sync_bool_compare_and_swap(value, expected, desired); + } + + inline uint32 atomicSwap(uint32 * value, uint32 desired) + { + nvDebugCheck((intptr_t(value) & 3) == 0); + // this is confusingly named, it doesn't actually do a test but always sets + return __sync_lock_test_and_set(value, desired); + } + + + + +#elif NV_CC_CLANG && POSH_CPU_STRONGARM + + inline uint32 atomicIncrement(uint32 * value) + { + nvDebugCheck((intptr_t(value) & 3) == 0); + + // this should work in LLVM eventually, but not as of 2.1 + // return (uint32)AtomicIncrement((long *)value); + + // in the mean time, + register uint32 result; + asm volatile ( + "1: ldrexb %0, [%1] \n\t" + "add %0, %0, #1 \n\t" + "strexb r1, %0, [%1] \n\t" + "cmp r1, #0 \n\t" + "bne 1b" + : "=&r" (result) + : "r"(value) + : "r1" + ); + return result; + + } + + inline uint32 atomicDecrement(uint32 * value) + { + nvDebugCheck((intptr_t(value) & 3) == 0); + + // this should work in LLVM eventually, but not as of 2.1: + // return (uint32)sys::AtomicDecrement((long *)value); + + // in the mean time, + + register uint32 result; + asm volatile ( + "1: ldrexb %0, [%1] \n\t" + "sub %0, %0, #1 \n\t" + "strexb r1, %0, [%1] \n\t" + "cmp r1, #0 \n\t" + "bne 1b" + : "=&r" (result) + : "r"(value) + : "r1" + ); + return result; + + } + +#elif NV_CC_GNUC + // Many alternative implementations at: + // http://www.memoryhole.net/kyle/2007/05/atomic_incrementing.html + + // Returns incremented value. + inline uint32 atomicIncrement(uint32 * value) { + nvDebugCheck((intptr_t(value) & 3) == 0); + return __sync_add_and_fetch(value, 1); + } + + // Returns decremented value. + inline uint32 atomicDecrement(uint32 * value) { + nvDebugCheck((intptr_t(value) & 3) == 0); + return __sync_sub_and_fetch(value, 1); + } + + // Returns added value. + inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) { + nvDebugCheck((intptr_t(value) & 3) == 0); + return __sync_add_and_fetch(value, value_to_add); + } + + // Returns original value before addition. + inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add) { + nvDebugCheck((intptr_t(value) & 3) == 0); + return __sync_fetch_and_add(value, value_to_add); + } + + // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'. + // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated. + // @@ Is this strong or weak? + inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired) + { + nvDebugCheck((intptr_t(value) & 3) == 0); + return __sync_bool_compare_and_swap(value, expected, desired); + } + + inline uint32 atomicSwap(uint32 * value, uint32 desired) + { + nvDebugCheck((intptr_t(value) & 3) == 0); + // this is confusingly named, it doesn't actually do a test but always sets + return __sync_lock_test_and_set(value, desired); + } + +#else +#error "Atomics not implemented." + +#endif + + + + + // It would be nice to have C++0x-style atomic types, but I'm not in the mood right now. Only uint32 supported so far. 
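atomicCompareAndSwap() has the same shape on every path above (compare *value with expected, store desired on success, and return whether the swap happened), which is exactly what a retry loop needs for read-modify-write operations that have no dedicated primitive. A sketch of a hypothetical atomicMax() helper built on it; this is not part of the patch, and it assumes the nvtt src/ directory is on the include path along with nvcore's uint32 typedef:

    #include "nvthread/Atomic.h"   // nv::atomicCompareAndSwap, nv::loadRelaxed (declared above)

    // Atomically raise *value to at least 'candidate' and return the value left in memory.
    inline uint32 atomicMax(uint32 * value, uint32 candidate)
    {
        while (true) {
            uint32 current = nv::loadRelaxed(value);                  // snapshot
            if (current >= candidate) {
                return current;                                       // already large enough, nothing to do
            }
            if (nv::atomicCompareAndSwap(value, current, candidate)) {
                return candidate;                                     // the swap won the race
            }
            // Another thread modified *value between the snapshot and the CAS; retry.
        }
    }

The loop terminates because every failed CAS means some other thread made progress, which is the usual lock-free argument for this construction.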
+#if 0 + template + void increment(T * value); + + template + void decrement(T * value); + + template <> + void increment(uint32 * value) { + } + + template <> + void increment(uint64 * value) { + } + + + + template + class Atomic + { + public: + explicit Atomic() : m_value() { } + explicit Atomic( T val ) : m_value(val) { } + ~Atomic() { } + + T loadRelaxed() const { return m_value; } + void storeRelaxed(T val) { m_value = val; } + + //T loadAcquire() const volatile { return nv::loadAcquire(&m_value); } + //void storeRelease(T val) volatile { nv::storeRelease(&m_value, val); } + + void increment() /*volatile*/ { nv::atomicIncrement(m_value); } + void decrement() /*volatile*/ { nv::atomicDecrement(m_value); } + + void compareAndStore(T oldVal, T newVal) { nv::atomicCompareAndStore(&m_value, oldVal, newVal); } + T compareAndExchange(T oldVal, T newVal) { nv::atomicCompareAndStore(&m_value, oldVal, newVal); } + T exchange(T newVal) { nv::atomicExchange(&m_value, newVal); } + + private: + // don't provide operator = or == ; make the client write Store( Load() ) + NV_FORBID_COPY(Atomic); + + NV_COMPILER_CHECK(sizeof(T) == sizeof(uint32) || sizeof(T) == sizeof(uint64)); + + T m_value; + }; +#endif + +} // nv namespace + + +#endif // NV_THREADS_ATOMICS_H Index: ps/trunk/libraries/source/nvtt/src/src/nvthread/CMakeLists.txt =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvthread/CMakeLists.txt +++ ps/trunk/libraries/source/nvtt/src/src/nvthread/CMakeLists.txt @@ -0,0 +1,28 @@ +PROJECT(nvthread) + +SET(THREAD_SRCS + nvthread.h nvthread.cpp + Atomic.h + Event.h Event.cpp + Mutex.h Mutex.cpp + ParallelFor.h ParallelFor.cpp + Thread.h Thread.cpp + ThreadPool.h ThreadPool.cpp) + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# targets +ADD_DEFINITIONS(-DNVTHREAD_EXPORTS) + +IF(NVTHREAD_SHARED) + ADD_LIBRARY(nvthread SHARED ${THREAD_SRCS}) +ELSE(NVTHREAD_SHARED) + ADD_LIBRARY(nvthread ${THREAD_SRCS}) +ENDIF(NVTHREAD_SHARED) + +TARGET_LINK_LIBRARIES(nvthread ${LIBS} nvcore) + +INSTALL(TARGETS nvthread + RUNTIME DESTINATION ${BINDIR} + LIBRARY DESTINATION ${LIBDIR} + ARCHIVE DESTINATION ${LIBDIR}) Index: ps/trunk/libraries/source/nvtt/src/src/nvthread/Event.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvthread/Event.h +++ ps/trunk/libraries/source/nvtt/src/src/nvthread/Event.h @@ -0,0 +1,34 @@ +// This code is in the public domain -- castano@gmail.com + +#pragma once +#ifndef NV_THREAD_EVENT_H +#define NV_THREAD_EVENT_H + +#include "nvthread.h" + +#include "nvcore/Ptr.h" + +namespace nv +{ + // This is intended to be used by a single waiter thread. + class NVTHREAD_CLASS Event + { + NV_FORBID_COPY(Event); + public: + Event(); + ~Event(); + + void post(); + void wait(); // Wait resets the event. 
+ + static void post(Event * events, uint count); + static void wait(Event * events, uint count); + + private: + struct Private; + AutoPtr m; + }; + +} // nv namespace + +#endif // NV_THREAD_EVENT_H Index: ps/trunk/libraries/source/nvtt/src/src/nvthread/Event.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvthread/Event.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvthread/Event.cpp @@ -0,0 +1,97 @@ +// This code is in the public domain -- castano@gmail.com + +#include "Event.h" + +#if NV_OS_WIN32 +#include "Win32.h" +#elif NV_OS_USE_PTHREAD +#include +#endif + +using namespace nv; + +#if NV_OS_WIN32 + +struct Event::Private { + HANDLE handle; +}; + +Event::Event() : m(new Private) { + m->handle = CreateEvent(NULL, FALSE, FALSE, NULL); +} + +Event::~Event() { + CloseHandle(m->handle); +} + +void Event::post() { + SetEvent(m->handle); +} + +void Event::wait() { + WaitForSingleObject(m->handle, INFINITE); +} + +#elif NV_OS_USE_PTHREAD + +struct Event::Private { + pthread_cond_t pt_cond; + pthread_mutex_t pt_mutex; + int count; + int wait_count; +}; + +Event::Event() : m(new Private) { + m->count=0; + m->wait_count=0; + pthread_mutex_init(&m->pt_mutex, NULL); + pthread_cond_init(&m->pt_cond, NULL); +} + +Event::~Event() { + pthread_cond_destroy(&m->pt_cond); + pthread_mutex_destroy(&m->pt_mutex); +} + +void Event::post() { + pthread_mutex_lock(&m->pt_mutex); + + m->count++; + + //ACS: move this after the unlock? + if(m->wait_count>0) { + pthread_cond_signal(&m->pt_cond); + } + + pthread_mutex_unlock(&m->pt_mutex); +} + +void Event::wait() { + pthread_mutex_lock(&m->pt_mutex); + + while(m->count==0) { + m->wait_count++; + pthread_cond_wait(&m->pt_cond, &m->pt_mutex); + m->wait_count--; + } + m->count--; + + pthread_mutex_unlock(&m->pt_mutex); +} + +#endif // NV_OS_UNIX + + +/*static*/ void Event::post(Event * events, uint count) { + for (uint i = 0; i < count; i++) { + events[i].post(); + } +} + +/*static*/ void Event::wait(Event * events, uint count) { + // @@ Use wait for multiple objects in win32? + + for (uint i = 0; i < count; i++) { + events[i].wait(); + } +} Index: ps/trunk/libraries/source/nvtt/src/src/nvthread/Mutex.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvthread/Mutex.h +++ ps/trunk/libraries/source/nvtt/src/src/nvthread/Mutex.h @@ -0,0 +1,47 @@ +// This code is in the public domain -- castano@gmail.com + +#pragma once +#ifndef NV_THREAD_MUTEX_H +#define NV_THREAD_MUTEX_H + +#include "nvthread.h" + +#include "nvcore/Ptr.h" + +namespace nv +{ + + class NVTHREAD_CLASS Mutex + { + NV_FORBID_COPY(Mutex); + public: + Mutex (const char * name); + ~Mutex (); + + void lock(); + bool tryLock(); + void unlock(); + + private: + struct Private; + AutoPtr m; + }; + + + // Templated lock that can be used with any mutex. 
+ template + class Lock + { + NV_FORBID_COPY(Lock); + public: + + Lock (M & m) : m_mutex (m) { m_mutex.lock(); } + ~Lock () { m_mutex.unlock(); } + + private: + M & m_mutex; + }; + +} // nv namespace + +#endif // NV_THREAD_MUTEX_H Index: ps/trunk/libraries/source/nvtt/src/src/nvthread/Mutex.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvthread/Mutex.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvthread/Mutex.cpp @@ -0,0 +1,129 @@ +// This code is in the public domain -- castano@gmail.com + +#include "Mutex.h" + +#if NV_OS_WIN32 + +#include "Win32.h" + +#elif NV_OS_USE_PTHREAD + +#include +#include // EBUSY + +#endif // NV_OS + +#if NV_USE_TELEMETRY +#include +extern HTELEMETRY tmContext; +#endif + +using namespace nv; + + +#if NV_OS_WIN32 + +struct Mutex::Private { + CRITICAL_SECTION mutex; + const char * name; +}; + + +Mutex::Mutex (const char * name) : m(new Private) +{ + InitializeCriticalSection(&m->mutex); + m->name = name; +#if NV_USE_TELEMETRY + tmLockName(tmContext, this, name); +#endif +} + +Mutex::~Mutex () +{ + DeleteCriticalSection(&m->mutex); +} + +void Mutex::lock() +{ +#if NV_USE_TELEMETRY + TmU64 matcher; + tmTryLockEx(tmContext, &matcher, 100/*0.1 ms*/, __FILE__, __LINE__, this, "blocked"); +#endif + + EnterCriticalSection(&m->mutex); + +#if NV_USE_TELEMETRY + tmEndTryLockEx(tmContext, matcher, __FILE__, __LINE__, this, TMLR_SUCCESS); + tmSetLockState(tmContext, this, TMLS_LOCKED, "acquired"); +#endif +} + +bool Mutex::tryLock() +{ +#if NV_USE_TELEMETRY + TmU64 matcher; + tmTryLockEx(tmContext, &matcher, 100/*0.1 ms*/, __FILE__, __LINE__, this, "blocked"); + if (TryEnterCriticalSection(&m->mutex) != 0) { + tmEndTryLockEx(tmContext, matcher, __FILE__, __LINE__, this, TMLR_SUCCESS); + tmSetLockState(tmContext, this, TMLS_LOCKED, "acquired"); + return true; + } + else { + tmEndTryLockEx(tmContext, matcher, __FILE__, __LINE__, this, TMLR_FAILED); + return false; + } +#else + return TryEnterCriticalSection(&m->mutex) != 0; +#endif +} + +void Mutex::unlock() +{ +#if NV_USE_TELEMETRY + tmSetLockState(tmContext, this, TMLS_RELEASED, "released"); +#endif + + LeaveCriticalSection(&m->mutex); +} + +#elif NV_OS_USE_PTHREAD + +struct Mutex::Private { + pthread_mutex_t mutex; + const char * name; +}; + + +Mutex::Mutex (const char * name) : m(new Private) +{ + int result = pthread_mutex_init(&m->mutex, NULL); + m->name = name; + nvDebugCheck(result == 0); +} + +Mutex::~Mutex () +{ + int result = pthread_mutex_destroy(&m->mutex); + nvDebugCheck(result == 0); +} + +void Mutex::lock() +{ + int result = pthread_mutex_lock(&m->mutex); + nvDebugCheck(result == 0); +} + +bool Mutex::tryLock() +{ + int result = pthread_mutex_trylock(&m->mutex); + nvDebugCheck(result == 0 || result == EBUSY); + return result == 0; +} + +void Mutex::unlock() +{ + int result = pthread_mutex_unlock(&m->mutex); + nvDebugCheck(result == 0); +} + +#endif // NV_OS_UNIX Index: ps/trunk/libraries/source/nvtt/src/src/nvthread/ParallelFor.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvthread/ParallelFor.h +++ ps/trunk/libraries/source/nvtt/src/src/nvthread/ParallelFor.h @@ -0,0 +1,181 @@ +// This code is in the public domain -- Ignacio Castaño + +#pragma once +#ifndef NV_THREAD_PARALLELFOR_H +#define NV_THREAD_PARALLELFOR_H + +#include "nvthread.h" +//#include "Atomic.h" // atomic + +namespace nv +{ + class Thread; + class ThreadPool; + + typedef void ForTask(void * context, /*int 
tid,*/ int idx); // @@ It would be nice to have the thread index as an argument here. + + struct ParallelFor { + ParallelFor(ForTask * task, void * context); + ~ParallelFor(); + + void run(uint count, uint step = 1); + + // Invariant: + ForTask * task; + void * context; + ThreadPool * pool; + + // State: + uint count; + uint step; + /*atomic*/ uint idx; + }; + + +#if NV_CC_CPP11 + + template + void sequential_for(uint count, F f) { + for (uint i = 0; i < count; i++) { + f(i); + } + } + + + template + void parallel_for(uint count, uint step, F f) { + // Transform lambda into function pointer. + auto lambda = [](void* context, /*int tid, */int idx) { + F & f = *reinterpret_cast(context); + f(/*tid, */idx); + }; + + ParallelFor pf(lambda, &f); + pf.run(count, step); + } + + + template + void parallel_for(uint count, F f) { + parallel_for(count, /*step=*/1, f); + } + + + template + void parallel_for_if(uint count, uint step, bool condition, F f) { + if (condition) { + parallel_for(count, step, f); + } + else { + sequential_for(count, f); + } + } + + +#if 0 + template + void parallel_for_each(Array & array, uint step, F f) { + // Transform lambda into function pointer. + auto lambda = [](void* context, int idx) { + F & f = *reinterpret_cast(context); + f(array[idx]); + }; + + ParallelFor pf(lambda, &f); + pf.run(count, step); + } +#endif + + +#endif // NV_CC_CPP11 + + +/* + +#include "nvthread/Mutex.h" +#include "nvcore/Array.inl" + + template + struct ParallelOutputStream { +#if 0 + // In its most basic implementation the parallel stream is simply a single array protected by a mutex. + Parallel_Output_Stream(uint producer_count) {} + + void reset() { final_array.clear(); } + void append(uint producer_id, const T & t) { Lock(mutex); final_array.append(t); } + nv::Array & finalize() { return final_array; } + + nv::Mutex mutex; + nv::Array final_array; + +#elif 0 + // Another simple implementation is to have N arrays that are merged at the end. + ParallelOutputStream(uint producer_count) : producer_count(producer_count) { + partial_array = new Array[producer_count]; + } + + void reset() { + for (int i = 0; i < producer_count; i++) { + partial_array[i].clear(); + } + } + + void append(uint producer_id, const T & t) { + nvCheck(producer_id < producer_count); + partial_array[producer_id].append(t); + } + + nv::Array & finalize() { + for (int i = 1; i < producer_count; i++) { + partial_array->append(partial_array[i]); + partial_array[i].clear(); + } + return *partial_array; + } + + uint producer_count; + nv::Array * partial_array; +#else + ParallelOutputStream(uint producer_count) : producer_count(producer_count) { + partial_array = new PartialArray[producer_count]; + } + + // But a more sophisticated implementation keeps N short arrays that are merged as they get full. This preserves partial order. + struct PartialArray { // Make sure this is aligned to cache lines. We want producers to access their respective arrays without conflicts. + uint count; + T data[32]; // Pick size to minimize wasted space considering cache line alignment? + }; + + const uint producer_count; + PartialArray * partial_array; + + // @@ Make sure mutex and partial_array are not in the same cache line! 
+ + nv::Mutex mutex; + nv::Array final_array; + + void append(uint producer_id, const T & t) { + if (partial_array[producer_id].count == 32) { + partial_array[producer_id].count = 0; + Lock(mutex); + final_array.append(partial_array[producer_id].data, 32); + } + + partial_array[producer_id].data[partial_array[producer_id].count++] = t; + } + nv::Array & finalize() { + for (int i = 0; i < producer_count; i++) { + final_array.append(partial_array[producer_id].data, partial_array[producer_id].count); + } + return final_array; + } +#endif + }; + +*/ + + +} // nv namespace + + +#endif // NV_THREAD_PARALLELFOR_H Index: ps/trunk/libraries/source/nvtt/src/src/nvthread/ParallelFor.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvthread/ParallelFor.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvthread/ParallelFor.cpp @@ -0,0 +1,61 @@ +// This code is in the public domain -- Ignacio Castaño + +#include "ParallelFor.h" +#include "Thread.h" +#include "Atomic.h" +#include "ThreadPool.h" + +#include "nvcore/Utils.h" // toI32 + +using namespace nv; + +#define ENABLE_PARALLEL_FOR 1 + +static void worker(void * arg, int tid) { + ParallelFor * owner = (ParallelFor *)arg; + + while(true) { + uint new_idx = atomicFetchAndAdd(&owner->idx, owner->step); + if (new_idx >= owner->count) { + break; + } + + const uint count = min(owner->count, new_idx + owner->step); + for (uint i = new_idx; i < count; i++) { + owner->task(owner->context, /*tid, */i); + } + } +} + + +ParallelFor::ParallelFor(ForTask * task, void * context) : task(task), context(context) { +#if ENABLE_PARALLEL_FOR + pool = ThreadPool::acquire(); +#endif +} + +ParallelFor::~ParallelFor() { +#if ENABLE_PARALLEL_FOR + ThreadPool::release(pool); +#endif +} + +void ParallelFor::run(uint count, uint step/*= 1*/) { +#if ENABLE_PARALLEL_FOR + storeRelease(&this->count, count); + storeRelease(&this->step, step); + + // Init atomic counter to zero. + storeRelease(&idx, 0); + + // Start threads. 
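+    // Every pool thread executes worker() above: each atomicFetchAndAdd on idx
+    // hands the calling thread the next `step` indices, and a thread stops once
+    // idx has moved past count, so the whole [0, count) range is covered
+    // exactly once.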
+ pool->run(worker, this); + + nvDebugCheck(idx >= count); +#else + for (int i = 0; i < toI32(count); i++) { + task(context, i); + } +#endif +} + Index: ps/trunk/libraries/source/nvtt/src/src/nvthread/Thread.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvthread/Thread.h +++ ps/trunk/libraries/source/nvtt/src/src/nvthread/Thread.h @@ -0,0 +1,42 @@ +// This code is in the public domain -- castano@gmail.com + +#pragma once +#ifndef NV_THREAD_THREAD_H +#define NV_THREAD_THREAD_H + +#include "nvthread.h" + +#include "nvcore/Ptr.h" // AutoPtr + +namespace nv +{ + typedef void ThreadFunc(void * arg); + + class NVTHREAD_CLASS Thread + { + NV_FORBID_COPY(Thread); + public: + Thread(); + Thread(const char * name); + ~Thread(); + + void setName(const char * name); + + void start(ThreadFunc * func, void * arg); + void wait(); + + bool isRunning() const; + + static void spinWait(uint count); + static void yield(); + static void sleep(uint ms); + + static void wait(Thread * threads, uint count); + + struct Private; + AutoPtr p; + }; + +} // nv namespace + +#endif // NV_THREAD_THREAD_H Index: ps/trunk/libraries/source/nvtt/src/src/nvthread/Thread.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvthread/Thread.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvthread/Thread.cpp @@ -0,0 +1,210 @@ +// This code is in the public domain -- castano@gmail.com + +#include "Thread.h" + +#if NV_OS_WIN32 + #include "Win32.h" +#elif NV_OS_USE_PTHREAD + #include + #include // usleep +#endif + +#if NV_USE_TELEMETRY +#include +extern HTELEMETRY tmContext; +#endif + + +using namespace nv; + +struct Thread::Private +{ +#if NV_OS_WIN32 + HANDLE thread; +#elif NV_OS_USE_PTHREAD + pthread_t thread; +#endif + + ThreadFunc * func; + void * arg; + const char * name; +}; + + +#if NV_OS_WIN32 + +unsigned long __stdcall threadFunc(void * arg) { + Thread::Private * thread = (Thread::Private *)arg; + thread->func(thread->arg); + return 0; +} + +// SetThreadName implementation from msdn: +// http://msdn.microsoft.com/en-us/library/xcb2z8hs.aspx + +const DWORD MS_VC_EXCEPTION=0x406D1388; + +#pragma pack(push,8) +typedef struct tagTHREADNAME_INFO +{ + DWORD dwType; // Must be 0x1000. + LPCSTR szName; // Pointer to name (in user addr space). + DWORD dwThreadID; // Thread ID (-1=caller thread). + DWORD dwFlags; // Reserved for future use, must be zero. 
+} THREADNAME_INFO; +#pragma pack(pop) + +static void setThreadName(DWORD dwThreadID, const char* threadName) +{ + THREADNAME_INFO info; + info.dwType = 0x1000; + info.szName = threadName; + info.dwThreadID = dwThreadID; + info.dwFlags = 0; + + __try + { + RaiseException( MS_VC_EXCEPTION, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info ); + } + __except(EXCEPTION_EXECUTE_HANDLER) + { + } +} + + +#elif NV_OS_USE_PTHREAD + +extern "C" void * threadFunc(void * arg) { + Thread::Private * thread = (Thread::Private *)arg; + thread->func(thread->arg); + pthread_exit(0); +} + +#endif + + +Thread::Thread() : p(new Private) +{ + p->thread = 0; + p->name = NULL; +} + +Thread::Thread(const char * name) : p(new Private) +{ + p->thread = 0; + p->name = name; +} + +Thread::~Thread() +{ + nvDebugCheck(p->thread == 0); +} + +void Thread::setName(const char * name) +{ + nvCheck(p->name == NULL); + p->name = name; +} + +void Thread::start(ThreadFunc * func, void * arg) +{ + p->func = func; + p->arg = arg; + +#if NV_OS_WIN32 + DWORD threadId; + p->thread = CreateThread(NULL, 0, threadFunc, p.ptr(), 0, &threadId); + //p->thread = (HANDLE)_beginthreadex (0, 0, threadFunc, p.ptr(), 0, NULL); // @@ So that we can call CRT functions... + nvDebugCheck(p->thread != NULL); + if (p->name != NULL) { + setThreadName(threadId, p->name); + #if NV_USE_TELEMETRY + tmThreadName(tmContext, threadId, p->name); + #endif + } +#elif NV_OS_ORBIS + int ret = scePthreadCreate(&p->thread, NULL, threadFunc, p.ptr(), p->name ? p->name : "nv::Thread"); + nvDebugCheck(ret == 0); + // use any non-system core + scePthreadSetaffinity(p->thread, 0x3F); + scePthreadSetprio(p->thread, (SCE_KERNEL_PRIO_FIFO_DEFAULT + SCE_KERNEL_PRIO_FIFO_HIGHEST) / 2); +#elif NV_OS_USE_PTHREAD + int result = pthread_create(&p->thread, NULL, threadFunc, p.ptr()); + nvDebugCheck(result == 0); +#endif +} + +void Thread::wait() +{ +#if NV_OS_WIN32 + DWORD status = WaitForSingleObject (p->thread, INFINITE); + nvCheck (status == WAIT_OBJECT_0); + BOOL ok = CloseHandle (p->thread); + p->thread = NULL; + nvCheck (ok); +#elif NV_OS_USE_PTHREAD + int result = pthread_join(p->thread, NULL); + p->thread = 0; + nvDebugCheck(result == 0); +#endif +} + +bool Thread::isRunning () const +{ +#if NV_OS_WIN32 + return p->thread != NULL; +#elif NV_OS_USE_PTHREAD + return p->thread != 0; +#endif +} + +/*static*/ void Thread::spinWait(uint count) +{ + for (uint i = 0; i < count; i++) {} +} + +/*static*/ void Thread::yield() +{ +#if NV_OS_WIN32 + SwitchToThread(); +#elif NV_OS_USE_PTHREAD + int result = sched_yield(); + nvDebugCheck(result == 0); +#endif +} + +/*static*/ void Thread::sleep(uint ms) +{ +#if NV_OS_WIN32 + Sleep(ms); +#elif NV_OS_USE_PTHREAD + usleep(1000 * ms); +#endif +} + +/*static*/ void Thread::wait(Thread * threads, uint count) +{ +/*#if NV_OS_WIN32 + // @@ Is there any advantage in doing this? 
+ nvDebugCheck(count < MAXIMUM_WAIT_OBJECTS); + + HANDLE * handles = new HANDLE[count]; + for (uint i = 0; i < count; i++) { + handles[i] = threads->p->thread; + } + + DWORD result = WaitForMultipleObjects(count, handles, TRUE, INFINITE); + + for (uint i = 0; i < count; i++) { + CloseHandle (threads->p->thread); + threads->p->thread = 0; + } + + delete [] handles; +#else*/ + for (uint i = 0; i < count; i++) { + threads[i].wait(); + } +//#endif +} + Index: ps/trunk/libraries/source/nvtt/src/src/nvthread/ThreadPool.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvthread/ThreadPool.h +++ ps/trunk/libraries/source/nvtt/src/src/nvthread/ThreadPool.h @@ -0,0 +1,86 @@ +// This code is in the public domain -- castano@gmail.com + +#pragma once +#ifndef NV_THREAD_THREADPOOL_H +#define NV_THREAD_THREADPOOL_H + +#include "nvthread.h" + +#include "Event.h" +#include "Thread.h" + +// The thread pool creates one worker thread for each physical core. +// The threads are idle waiting for their start events so that they do not consume any resources while inactive. +// The thread pool runs the same function in all worker threads, the idea is to use this as the foundation of a custom task scheduler. +// When the thread pool starts, the main thread continues running, but the common use case is to inmmediately wait for the termination events of the worker threads. +// @@ The start and wait methods could probably be merged. +// It may be running the thread function on the invoking thread to avoid thread switches. + +namespace nv { + + class Thread; + class Event; + + typedef void ThreadTask(void * context, int id); + + class ThreadPool { + NV_FORBID_COPY(ThreadPool); + public: + + static void setup(uint workerCount, bool useThreadAffinity, bool useCallingThread); + + static ThreadPool * acquire(); + static void release(ThreadPool *); + + ThreadPool(uint workerCount = processorCount(), bool useThreadAffinity = true, bool useCallingThread = false); + ~ThreadPool(); + + void run(ThreadTask * func, void * arg); + + void start(ThreadTask * func, void * arg); + void wait(); + + //NV_THREAD_LOCAL static uint threadId; + + private: + + static void workerFunc(void * arg); + + bool useThreadAffinity; + bool useCallingThread; + uint workerCount; + + Thread * workers; + Event * startEvents; + Event * finishEvents; + + uint allIdle; + + // Current function: + ThreadTask * func; + void * arg; + }; + + +#if NV_CC_CPP11 + + template + void thread_pool_run(F f) { + // Transform lambda into function pointer. 
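+        // The lambda below captures nothing, so it converts to a plain
+        // ThreadTask function pointer; the caller's functor is passed through
+        // the void* context argument instead of being captured.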
+ auto lambda = [](void* context, int id) { + F & f = *reinterpret_cast(context); + f(id); + }; + + ThreadPool * pool = ThreadPool::acquire(); + pool->run(lambda, &f); + ThreadPool::release(pool); + } + +#endif // NV_CC_CPP11 + + +} // namespace nv + + +#endif // NV_THREAD_THREADPOOL_H Index: ps/trunk/libraries/source/nvtt/src/src/nvthread/ThreadPool.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvthread/ThreadPool.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvthread/ThreadPool.cpp @@ -0,0 +1,180 @@ +// This code is in the public domain -- castano@gmail.com + +#include "ThreadPool.h" +#include "Mutex.h" +#include "Thread.h" +#include "Atomic.h" + +#include "nvcore/Utils.h" +#include "nvcore/StrLib.h" + +#if NV_USE_TELEMETRY +#include +extern HTELEMETRY tmContext; +#endif + + +// Most of the time it's not necessary to protect the thread pool, but if it doesn't add a significant overhead, then it'd be safer to do it. +#define PROTECT_THREAD_POOL 1 + + +using namespace nv; + +#if PROTECT_THREAD_POOL +Mutex s_pool_mutex("thread pool"); +#endif + +AutoPtr s_pool; + + +/*static*/ void ThreadPool::setup(uint workerCount, bool useThreadAffinity, bool useCallingThread) { +#if PROTECT_THREAD_POOL + Lock lock(s_pool_mutex); +#endif + + s_pool = new ThreadPool(workerCount, useThreadAffinity, useCallingThread); +} + +/*static*/ ThreadPool * ThreadPool::acquire() +{ +#if PROTECT_THREAD_POOL + s_pool_mutex.lock(); // @@ If same thread tries to lock twice, this should assert. +#endif + + if (s_pool == NULL) { + ThreadPool * p = new ThreadPool; + nvDebugCheck(s_pool == p); + } + + return s_pool.ptr(); +} + +/*static*/ void ThreadPool::release(ThreadPool * pool) +{ + nvDebugCheck(pool == s_pool); + + // Make sure the threads of the pool are idle. + s_pool->wait(); + +#if PROTECT_THREAD_POOL + s_pool_mutex.unlock(); +#endif +} + + + + +/*static*/ void ThreadPool::workerFunc(void * arg) { + uint i = U32((uintptr_t)arg); // This is OK, because workerCount should always be much smaller than 2^32 + + //ThreadPool::threadId = i; + + if (s_pool->useThreadAffinity) { + lockThreadToProcessor(s_pool->useCallingThread + i); + } + + while(true) + { + s_pool->startEvents[i].wait(); + + ThreadTask * func = loadAcquirePointer(&s_pool->func); + + if (func == NULL) { + return; + } + + { +#if NV_USE_TELEMETRY + tmZoneFiltered(tmContext, 20, TMZF_NONE, "worker"); +#endif + func(s_pool->arg, s_pool->useCallingThread + i); + } + + s_pool->finishEvents[i].post(); + } +} + + +ThreadPool::ThreadPool(uint workerCount/*=processorCount()*/, bool useThreadAffinity/*=true*/, bool useCallingThread/*=false*/) +{ + s_pool = this; // Worker threads need this to be initialized before they start. + + this->useThreadAffinity = useThreadAffinity; + this->workerCount = workerCount; + this->useCallingThread = useCallingThread; + + uint threadCount = workerCount - useCallingThread; + + workers = new Thread[threadCount]; + + startEvents = new Event[threadCount]; + finishEvents = new Event[threadCount]; + + nvCompilerWriteBarrier(); // @@ Use a memory fence? + + if (useCallingThread && useThreadAffinity) { + lockThreadToProcessor(0); // Calling thread always locked to processor 0. + } + + for (uint i = 0; i < threadCount; i++) { + StringBuilder name; + name.format("worker %d", i); + workers[i].setName(name.release()); // @Leak + workers[i].start(workerFunc, (void *)i); + } + + allIdle = true; +} + +ThreadPool::~ThreadPool() +{ + // Set threads to terminate. 
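+    // workerFunc returns as soon as it loads a NULL task, so posting NULL here
+    // acts as the shutdown signal for every worker thread.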
+ start(NULL, NULL); + + // Wait until threads actually exit. + Thread::wait(workers, workerCount - useCallingThread); + + delete [] workers; + delete [] startEvents; + delete [] finishEvents; +} + +void ThreadPool::run(ThreadTask * func, void * arg) +{ + // Wait until threads are idle. + wait(); + + start(func, arg); + + if (useCallingThread) { + func(arg, 0); + } + + wait(); +} + +void ThreadPool::start(ThreadTask * func, void * arg) +{ + // Wait until threads are idle. + wait(); + + // Set our desired function. + storeReleasePointer(&this->func, func); + storeReleasePointer(&this->arg, arg); + + allIdle = false; + + // Resume threads. + Event::post(startEvents, workerCount - useCallingThread); +} + +void ThreadPool::wait() +{ + if (!allIdle) + { + // Wait for threads to complete. + Event::wait(finishEvents, workerCount - useCallingThread); + + allIdle = true; + } +} Index: ps/trunk/libraries/source/nvtt/src/src/nvthread/Win32.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvthread/Win32.h +++ ps/trunk/libraries/source/nvtt/src/src/nvthread/Win32.h @@ -0,0 +1,9 @@ +// This code is in the public domain -- castano@gmail.com + +// Never include this from a header file. + +#define WIN32_LEAN_AND_MEAN +#define VC_EXTRALEAN +#define _WIN32_WINNT 0x0400 // for SwitchToThread, TryEnterCriticalSection +#include +//#include // for _beginthreadex \ No newline at end of file Index: ps/trunk/libraries/source/nvtt/src/src/nvthread/nvthread.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvthread/nvthread.h +++ ps/trunk/libraries/source/nvtt/src/src/nvthread/nvthread.h @@ -0,0 +1,105 @@ +// This code is in the public domain -- castanyo@yahoo.es + +#pragma once +#ifndef NV_THREAD_H +#define NV_THREAD_H + +#include "nvcore/nvcore.h" + +// Function linkage +#if NVTHREAD_SHARED +#ifdef NVTHREAD_EXPORTS +#define NVTHREAD_API DLL_EXPORT +#define NVTHREAD_CLASS DLL_EXPORT_CLASS +#else +#define NVTHREAD_API DLL_IMPORT +#define NVTHREAD_CLASS DLL_IMPORT +#endif +#else // NVMATH_SHARED +#define NVTHREAD_API +#define NVTHREAD_CLASS +#endif // NVMATH_SHARED + + +// Compiler barriers. +// See: http://en.wikipedia.org/wiki/Memory_ordering +#if NV_CC_MSVC + +#include + +#pragma intrinsic(_WriteBarrier) +#define nvCompilerWriteBarrier _WriteBarrier + +#pragma intrinsic(_ReadWriteBarrier) +#define nvCompilerReadWriteBarrier _ReadWriteBarrier + +#if _MSC_VER >= 1400 // ReadBarrier is VC2005 +#pragma intrinsic(_ReadBarrier) +#define nvCompilerReadBarrier _ReadBarrier +#else +#define nvCompilerReadBarrier _ReadWriteBarrier +#endif + +#elif NV_CC_GNUC + +#define nvCompilerReadWriteBarrier() asm volatile("" ::: "memory"); +#define nvCompilerWriteBarrier nvCompilerReadWriteBarrier +#define nvCompilerReadBarrier nvCompilerReadWriteBarrier + +#elif NV_CC_CLANG && NV_CPU_ARM +// thanks to Autor Artur Bac for +inline void sync_synchronize() { asm volatile( "dmb;"); } + +/* this is not yet supported by LLVM 2.1 but it is planned +#define nvCompilerReadWriteBarrier() MemoryFence() + */ + + +// JBeilin: from what i read this should do the trick for ARM +// however this might also be wrong and dumb. +//#define nvCompilerReadWriteBarrier() asm volatile( "dmb;"); +#define nvCompilerReadWriteBarrier() nvCompilerReadWriteBarrier() +#define nvCompilerWriteBarrier nvCompilerReadWriteBarrier +#define nvCompilerReadBarrier nvCompilerReadWriteBarrier + + +#endif // NV_CC_MSVC + + +// @@ Memory barriers / fences. 
+ +// @@ Atomics. + + +/* Wrap this up: +#define YieldProcessor() __asm { rep nop } +#define YieldProcessor _mm_pause +#define YieldProcessor __yield + +BOOL WINAPI SwitchToThread(void); +*/ + + +namespace nv +{ + //void initThreadingSystemInfo(); + + // Reentrant. + uint processorCount(); + uint logicalProcessorCount(); + uint physicalProcessorCount(); + + // Locks the current thread to the given logical processor index. + void lockThreadToProcessor(int idx); + void unlockThreadToProcessor(); + + uint threadId(); + +} // nv namespace + + + + + + +#endif // NV_THREAD_H Index: ps/trunk/libraries/source/nvtt/src/src/nvthread/nvthread.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvthread/nvthread.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvthread/nvthread.cpp @@ -0,0 +1,334 @@ +// This code is in the public domain -- Ignacio Castaño + +#include "nvthread.h" + +#include "Thread.h" + +#if NV_OS_WIN32 +#include "Win32.h" +#elif NV_OS_UNIX +#include +#if !NV_OS_LINUX +#include +#endif +#include +#elif NV_OS_DARWIN +#import +#import +#import +#import + +//#include + +#include +#include +#include +#include +#include +#endif + +using namespace nv; + +#if NV_OS_WIN32 + +typedef BOOL(WINAPI *LPFN_GSI)(LPSYSTEM_INFO); +typedef BOOL(WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL); + +static bool isWow64() { + LPFN_ISWOW64PROCESS fnIsWow64Process = (LPFN_ISWOW64PROCESS)GetProcAddress(GetModuleHandle("kernel32"), "IsWow64Process"); + + BOOL wow64 = FALSE; + + if (NULL != fnIsWow64Process) { + if (!fnIsWow64Process(GetCurrentProcess(), &wow64)) { + // If error, assume false. + } + } + + return wow64 != 0; +} + +static void getSystemInfo(SYSTEM_INFO * sysinfo) { + BOOL success = FALSE; + + if (isWow64()) { + LPFN_GSI fnGetNativeSystemInfo = (LPFN_GSI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetNativeSystemInfo"); + + if (fnGetNativeSystemInfo != NULL) { + success = fnGetNativeSystemInfo(sysinfo); + } + } + + if (!success) { + GetSystemInfo(sysinfo); + } +} + +#endif // NV_OS_WIN32 + +// Find the number of logical processors in the system. +// Based on: http://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine +uint nv::processorCount() { +#if NV_OS_WIN32 + SYSTEM_INFO sysinfo; + getSystemInfo(&sysinfo); + //return sysinfo.dwNumberOfProcessors; + + // Respect process affinity mask? + DWORD_PTR pam, sam; + GetProcessAffinityMask(GetCurrentProcess(), &pam, &sam); + + // Count number of bits set in the processor affinity mask. + uint count = 0; + for (int i = 0; i < sizeof(DWORD_PTR) * 8; i++) { + if (pam & (DWORD_PTR(1) << i)) count++; + } + nvDebugCheck(count <= sysinfo.dwNumberOfProcessors); + + return count; +#elif NV_OS_ORBIS + return 6; +#elif NV_OS_XBOX + return 3; // or 6? +#elif NV_OS_LINUX || NV_OS_NETBSD // Linux, Solaris, & AIX + return sysconf(_SC_NPROCESSORS_ONLN); +#elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD + int numCPU; + int mib[4]; + size_t len = sizeof(numCPU); + + // set the mib for hw.ncpu + mib[0] = CTL_HW; + +#if NV_OS_OPENBSD || NV_OS_FREEBSD + mib[1] = HW_NCPU; +#else + mib[1] = HW_AVAILCPU; +#endif + + // get the number of CPUs from the system + sysctl(mib, 2, &numCPU, &len, NULL, 0); + + if (numCPU < 1) { + mib[1] = HW_NCPU; + sysctl( mib, 2, &numCPU, &len, NULL, 0 ); + + if (numCPU < 1) { + return 1; // Assume single core. + } + } + + return numCPU; +#else + return 1; // Assume single core. 
+#endif +} + + +uint nv::threadId() { +#if NV_OS_WIN32 + return GetCurrentThreadId(); +#else + return 0; // @@ +#endif +} + + +// @@ If we are using less worker threads than processors and hyperthreading is available, we probably want to enumerate the logical processors +// so that the first cores of each processor goes first. This way, if say, we leave 2 hardware threads free, then we still have one worker +// thread on each physical processor. + +// I believe that currently logical processors are enumerated in physical order, that is: +// 0 = thread a in physical core 0 +// 1 = thread b in physical core 0 +// 2 = thread a in physical core 1 +// ... and so on ... +// I'm not sure we can actually rely on that. And in any case we should start detecting the number of physical processors, which appears to be a pain +// to do in a way that's compatible with newer i7 processors. + +void nv::lockThreadToProcessor(int idx) { +#if NV_OS_WIN32 + //nvDebugCheck(idx < hardwareThreadCount()); +#if 0 + DWORD_PTR tam = 1 << idx; +#else + DWORD_PTR pam, sam; + BOOL rc = GetProcessAffinityMask(GetCurrentProcess(), &pam, &sam); + + // Find the idx's bit set. + uint pidx = 0; + DWORD_PTR tam = 0; + for (int i = 0; i < sizeof(DWORD_PTR) * 8; i++) { + DWORD_PTR mask = DWORD_PTR(1) << i; + if (pam & mask) { + if (pidx == idx) { + tam = mask; + break; + } + pidx++; + } + } + + nvDebugCheck(tam != 0); +#endif + + SetThreadAffinityMask(GetCurrentThread(), tam); +#else + // @@ NOP +#endif +} + + +void nv::unlockThreadToProcessor() { +#if NV_OS_WIN32 + DWORD_PTR pam, sam; + BOOL rc = GetProcessAffinityMask(GetCurrentProcess(), &pam, &sam); + SetThreadAffinityMask(GetCurrentThread(), pam); +#else + // @@ NOP +#endif +} + +uint nv::logicalProcessorCount() { + return processorCount(); +} + + +#if NV_OS_WIN32 + +struct LOGICALPROCESSORDATA +{ + unsigned int nLargestStandardFunctionNumber; + unsigned int nLargestExtendedFunctionNumber; + int nLogicalProcessorCount; + int nLocalApicId; + int nCPUcore; + int nProcessorId; + int nApicIdCoreIdSize; + int nNC; + int nMNC; + int nCPUCoresperProcessor; + int nThreadsperCPUCore; + int nProcId; + int nCoreId; + bool CmpLegacy; + bool HTT; +}; + +#define MAX_NUMBER_OF_LOGICAL_PROCESSORS 96 +#define MAX_NUMBER_OF_PHYSICAL_PROCESSORS 8 +#define MAX_NUMBER_OF_IOAPICS 16 +static LOGICALPROCESSORDATA LogicalProcessorMap[MAX_NUMBER_OF_LOGICAL_PROCESSORS]; +static int PhysProcIds[MAX_NUMBER_OF_PHYSICAL_PROCESSORS + MAX_NUMBER_OF_IOAPICS]; + +static void gatherProcessorData(LOGICALPROCESSORDATA * p) { + + int CPUInfo[4] = { 0, 0, 0, 0 }; + __cpuid(CPUInfo, 0); + + p->nLargestStandardFunctionNumber = CPUInfo[0]; + + // Get the information associated with each valid Id + for (uint i = 0; i <= p->nLargestStandardFunctionNumber; ++i) { + __cpuid(CPUInfo, i); + + // Interpret CPU feature information. + if (i == 1) { + // Some of the bits of LocalApicId represent the CPU core + // within a processor and other bits represent the processor ID. + p->nLocalApicId = (CPUInfo[1] >> 24) & 0xff; + p->HTT = (CPUInfo[3] >> 28) & 0x1; + // recalculate later after 0x80000008 + p->nLogicalProcessorCount = (CPUInfo[1] >> 16) & 0x0FF; + } + } + + // Calling __cpuid with 0x80000000 as the InfoType argument + // gets the number of valid extended IDs. + __cpuid(CPUInfo, 0x80000000); + p->nLargestExtendedFunctionNumber = CPUInfo[0]; + + // Get the information associated with each extended ID. 
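+    // Only leaf 0x80000008 is used below: its ECX value packs the core count
+    // minus one (NC, bits 7:0) and ApicIdCoreIdSize (bits 15:12), which feed
+    // the MNC computation that follows.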
+ for (uint i = 0x80000000; i <= p->nLargestExtendedFunctionNumber; ++i) { + __cpuid(CPUInfo, i); + if (i == 0x80000008) { + p->nApicIdCoreIdSize = (CPUInfo[2] >> 12) & 0xF; + p->nNC = (CPUInfo[2]) & 0x0FF; + } + } + + // MNC + // A value of zero for ApicIdCoreIdSize indicates that MNC is derived by this + // legacy formula: MNC = NC + 1 + // A non-zero value of ApicIdCoreIdSize means that MNC is 2^ApicIdCoreIdSize + if (p->nApicIdCoreIdSize) { + p->nMNC = 2; + for (uint j = p->nApicIdCoreIdSize - 1; j > 0; j--) { + p->nMNC = p->nMNC * 2; + } + } + else { + p->nMNC = p->nNC + 1; + } + + // If HTT==0, then LogicalProcessorCount is reserved, and the CPU contains + // one CPU core and the CPU core is single-threaded. + // If HTT==1 and CmpLegacy==1, LogicalProcessorCount represents the number of + // CPU cores per processor, where each CPU core is single-threaded. If HTT==1 + // and CmpLegacy==0, then LogicalProcessorCount is the number of threads per + // processor, which is the number of cores times the number of threads per core. + // The number of cores is NC+1. + p->nCPUCoresperProcessor = p->nNC + 1; + p->nThreadsperCPUCore = (p->HTT == 0 ? 1 : (p->CmpLegacy == 1 ? 1 : p->nLogicalProcessorCount / p->nCPUCoresperProcessor )); + + // Calculate a mask for the core IDs + uint mask = 1; + uint numbits = 1; + if (p->nApicIdCoreIdSize) { + numbits = p->nApicIdCoreIdSize; + for (uint j = p->nApicIdCoreIdSize; j > 1; j--) { + mask = (mask << 1) + 1; + } + } + p->nProcId = (p->nLocalApicId & ~mask) >> numbits; + p->nCoreId = p->nLocalApicId & mask; +} + + +uint nv::physicalProcessorCount() { + + uint lpc = logicalProcessorCount(); + + // Get info about each logical processor. + for (uint i = 0; i < lpc; i++) { + // Make sure thread doesn't change processor while we gather it's data. + lockThreadToProcessor(i); + + gatherProcessorData(&LogicalProcessorMap[i]); + } + + unlockThreadToProcessor(); + + memset(PhysProcIds, 0, sizeof(PhysProcIds)); + for (uint i = 0; i < lpc; i++) { + PhysProcIds[LogicalProcessorMap[i].nProcId]++; + } + + uint pc = 0; + for (uint i = 0; i < (MAX_NUMBER_OF_PHYSICAL_PROCESSORS + MAX_NUMBER_OF_IOAPICS); i++) { + if (PhysProcIds[i] != 0) { + pc++; + } + } + + return pc; +} + +#else + +uint nv::physicalProcessorCount() { + // @@ Assume the same. + return processorCount(); +} + +#endif Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/BlockCompressor.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/BlockCompressor.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/BlockCompressor.h @@ -0,0 +1,55 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#ifndef NVTT_BLOCKCOMPRESSOR_H +#define NVTT_BLOCKCOMPRESSOR_H + +#include "Compressor.h" + + +namespace nv +{ + struct ColorBlock; + class Vector4; + + struct ColorBlockCompressor : public CompressorInterface + { + virtual void compress(nvtt::AlphaMode alphaMode, uint w, uint h, uint d, const float * rgba, nvtt::TaskDispatcher * dispatcher, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); + + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0; + virtual uint blockSize() const = 0; + }; + + struct FloatColorCompressor : public CompressorInterface + { + virtual void compress(nvtt::AlphaMode alphaMode, uint w, uint h, uint d, const float * rgba, nvtt::TaskDispatcher * dispatcher, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); + + virtual void compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0; + virtual uint blockSize() const = 0; + }; + +} // nv namespace + + +#endif // NVTT_BLOCKCOMPRESSOR_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/BlockCompressor.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/BlockCompressor.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/BlockCompressor.cpp @@ -0,0 +1,335 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "BlockCompressor.h" +#include "OutputOptions.h" +#include "TaskDispatcher.h" + +#include "nvimage/Image.h" +#include "nvimage/ColorBlock.h" +#include "nvimage/BlockDXT.h" + +#include "nvmath/Vector.inl" + +#include "nvcore/Memory.h" + +#include // placement new + + +using namespace nv; +using namespace nvtt; + +/* +// OpenMP +#if defined(HAVE_OPENMP) +#include +#endif + +void ColorBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, const float * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ + const uint bs = blockSize(); + const uint bw = (w + 3) / 4; + const uint bh = (h + 3) / 4; + +#if defined(HAVE_OPENMP) + bool singleThreaded = false; +#else + bool singleThreaded = true; +#endif + + // Use a single thread to compress small textures. + if (bw * bh < 16) singleThreaded = true; + + if (singleThreaded) + { + nvDebugCheck(bs <= 16); + uint8 mem[16]; // @@ Output one row at a time! + + for (int y = 0; y < int(h); y += 4) { + for (uint x = 0; x < w; x += 4) { + + ColorBlock rgba; + rgba.init(w, h, data, x, y); + + compressBlock(rgba, alphaMode, compressionOptions, mem); + + if (outputOptions.outputHandler != NULL) { + outputOptions.outputHandler->writeData(mem, bs); + } + } + } + } +#if defined(HAVE_OPENMP) + else + { + const uint size = bs * bw * bh; + uint8 * mem = new uint8[size]; + + #pragma omp parallel + { + #pragma omp for + for (int i = 0; i < int(bw*bh); i++) + { + const uint x = i % bw; + const uint y = i / bw; + + ColorBlock rgba; + rgba.init(w, h, data, 4*x, 4*y); + + uint8 * ptr = mem + (y * bw + x) * bs; + compressBlock(rgba, alphaMode, compressionOptions, ptr); + } // omp for + } // omp parallel + + if (outputOptions.outputHandler != NULL) { + outputOptions.outputHandler->writeData(mem, size); + } + + delete [] mem; + } +#endif +} +*/ + + +struct CompressorContext +{ + nvtt::AlphaMode alphaMode; + uint w, h, d; + const float * data; + const nvtt::CompressionOptions::Private * compressionOptions; + + uint bw, bh, bs; + uint8 * mem; + CompressorInterface * compressor; +}; + + +// Each task compresses one block. +void ColorBlockCompressorTask(void * data, int i) +{ + CompressorContext * d = (CompressorContext *) data; + + uint x = i % d->bw; + uint y = i / d->bw; + + //for (uint x = 0; x < d->bw; x++) + { + ColorBlock rgba; + rgba.init(d->w, d->h, d->data, 4*x, 4*y); + + uint8 * ptr = d->mem + (y * d->bw + x) * d->bs; + ((ColorBlockCompressor *) d->compressor)->compressBlock(rgba, d->alphaMode, *d->compressionOptions, ptr); + } +} + +void ColorBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, uint d, const float * data, nvtt::TaskDispatcher * dispatcher, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ + nvDebugCheck(d == 1); + + CompressorContext context; + context.alphaMode = alphaMode; + context.w = w; + context.h = h; + context.d = d; + context.data = data; + context.compressionOptions = &compressionOptions; + + context.bs = blockSize(); + context.bw = (w + 3) / 4; + context.bh = (h + 3) / 4; + + context.compressor = this; + + SequentialTaskDispatcher sequential; + + // Use a single thread to compress small textures. 
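+    // (fewer than four rows of 4x4 blocks: the per-block dispatch overhead
+    // would likely outweigh any parallel speedup, so fall back to the
+    // sequential dispatcher)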
+ if (context.bh < 4) dispatcher = &sequential; + +#if _DEBUG + dispatcher = &sequential; +#endif + + const uint count = context.bw * context.bh; + const uint size = context.bs * count; + context.mem = new uint8[size]; + + dispatcher->dispatch(ColorBlockCompressorTask, &context, count); + + outputOptions.writeData(context.mem, size); + + delete [] context.mem; +} + + +#if 0 +// Each task compresses one block. +void ColorSetCompressorTask(void * data, int i) +{ + CompressorContext * d = (CompressorContext *) data; + + uint x = i % d->bw; + uint y = i / d->bw; + + //for (uint x = 0; x < d->bw; x++) + { + ColorSet set; + set.setColors(d->data, d->w, d->h, x * 4, y * 4); + + uint8 * ptr = d->mem + (y * d->bw + x) * d->bs; + ((ColorSetCompressor *)d->compressor)->compressBlock(set, d->alphaMode, *d->compressionOptions, ptr); + } +} + + +void ColorSetCompressor::compress(AlphaMode alphaMode, uint w, uint h, uint d, const float * data, nvtt::TaskDispatcher * dispatcher, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) +{ + nvDebugCheck(d == 1); + + CompressorContext context; + context.alphaMode = alphaMode; + context.w = w; + context.h = h; + context.data = data; + context.compressionOptions = &compressionOptions; + + context.bs = blockSize(); + context.bw = (w + 3) / 4; + context.bh = (h + 3) / 4; + + context.compressor = this; + + SequentialTaskDispatcher sequential; + + // Use a single thread to compress small textures. + if (context.bh < 4) dispatcher = &sequential; + +#if _DEBUG + dispatcher = &sequential; +#endif + + const uint count = context.bw * context.bh; + const uint size = context.bs * count; + context.mem = new uint8[size]; + + dispatcher->dispatch(ColorSetCompressorTask, &context, count); + + outputOptions.writeData(context.mem, size); + + delete [] context.mem; +} +#endif // 0 + + +// Each task compresses one block. +void FloatColorCompressorTask(void * data, int i) +{ + CompressorContext * d = (CompressorContext *) data; + + // Copy image to block. + const uint block_x = (i % d->bw); + const uint block_y = (i / d->bw); + + const uint src_x_offset = block_x * 4; + const uint src_y_offset = block_y * 4; + + const float * r = (const float *)d->data + d->w * d->h * d->d * 0; + const float * g = (const float *)d->data + d->w * d->h * d->d * 1; + const float * b = (const float *)d->data + d->w * d->h * d->d * 2; + const float * a = (const float *)d->data + d->w * d->h * d->d * 3; + + Vector4 colors[16]; + float weights[16]; + + const uint block_w = min(d->w - block_x * 4U, 4U); + const uint block_h = min(d->h - block_y * 4U, 4U); + + uint x, y; + for (y = 0; y < block_h; y++) { + for (x = 0; x < block_w; x++) { + uint dst_idx = 4 * y + x; + uint src_idx = (y + src_y_offset) * d->w + (x + src_x_offset); + colors[dst_idx].x = r[src_idx]; + colors[dst_idx].y = g[src_idx]; + colors[dst_idx].z = b[src_idx]; + colors[dst_idx].w = a[src_idx]; + weights[dst_idx] = (d->alphaMode == nvtt::AlphaMode_Transparency) ? a[src_idx] : 1.0f; + } + for (; x < 4; x++) { + uint dst_idx = 4 * y + x; + colors[dst_idx] = Vector4(0); + weights[dst_idx] = 0.0f; + } + } + for (; y < 4; y++) { + for (x = 0; x < 4; x++) { + uint dst_idx = 4 * y + x; + colors[dst_idx] = Vector4(0); + weights[dst_idx] = 0.0f; + } + } + + // Compress block. 
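+    // Each task writes into its own d->bs-byte slice of the shared output
+    // buffer, so no synchronization between tasks is required here.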
+ uint8 * output = d->mem + (block_y * d->bw + block_x) * d->bs; + ((FloatColorCompressor *)d->compressor)->compressBlock(colors, weights, *d->compressionOptions, output); +} + + +void FloatColorCompressor::compress(AlphaMode alphaMode, uint w, uint h, uint d, const float * data, nvtt::TaskDispatcher * dispatcher, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) +{ + nvDebugCheck(d == 1); // @@ Add support for compressed 3D textures. + + CompressorContext context; + context.alphaMode = alphaMode; + context.w = w; + context.h = h; + context.d = d; + context.data = data; + context.compressionOptions = &compressionOptions; + + context.bs = blockSize(); + context.bw = (w + 3) / 4; + context.bh = (h + 3) / 4; + + context.compressor = this; + + SequentialTaskDispatcher sequential; + + // Use a single thread to compress small textures. + if (context.bh < 4) dispatcher = &sequential; + +#if _DEBUG + dispatcher = &sequential; +#endif + + const uint count = context.bw * context.bh; + const uint size = context.bs * count; + context.mem = new uint8[size]; + + dispatcher->dispatch(FloatColorCompressorTask, &context, count); + + outputOptions.writeData(context.mem, size); + + delete [] context.mem; +} Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CMakeLists.txt =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CMakeLists.txt +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CMakeLists.txt @@ -3,115 +3,64 @@ ADD_SUBDIRECTORY(squish) SET(NVTT_SRCS - nvtt.h - nvtt.cpp - Compressor.h - Compressor.cpp - nvtt_wrapper.h - nvtt_wrapper.cpp - CompressDXT.h - CompressDXT.cpp - CompressRGB.h - CompressRGB.cpp - QuickCompressDXT.h - QuickCompressDXT.cpp - OptimalCompressDXT.h - OptimalCompressDXT.cpp - SingleColorLookup.h - CompressionOptions.h - CompressionOptions.cpp - InputOptions.h - InputOptions.cpp - OutputOptions.h - OutputOptions.cpp - cuda/CudaUtils.h - cuda/CudaUtils.cpp - cuda/CudaMath.h - cuda/Bitmaps.h - cuda/CudaCompressDXT.h - cuda/CudaCompressDXT.cpp) - -IF(CUDA_FOUND) - ADD_DEFINITIONS(-DHAVE_CUDA) - WRAP_CUDA(CUDA_SRCS cuda/CompressKernel.cu) - SET(NVTT_SRCS ${NVTT_SRCS} ${CUDA_SRCS}) - SET(LIBS ${LIBS} ${CUDA_LIBRARIES}) - INCLUDE_DIRECTORIES(${CUDA_INCLUDE_PATH}) -ENDIF(CUDA_FOUND) + nvtt.h nvtt.cpp + nvtt_wrapper.h nvtt_wrapper.cpp + ClusterFit.h ClusterFit.cpp + Compressor.h + BlockCompressor.h BlockCompressor.cpp + CompressorDX9.h CompressorDX9.cpp + CompressorDX10.h CompressorDX10.cpp + CompressorDX11.h CompressorDX11.cpp + CompressorDXT1.h CompressorDXT1.cpp + CompressorDXT5_RGBM.h CompressorDXT5_RGBM.cpp + CompressorRGB.h CompressorRGB.cpp + Context.h Context.cpp + QuickCompressDXT.h QuickCompressDXT.cpp + OptimalCompressDXT.h OptimalCompressDXT.cpp + SingleColorLookup.h SingleColorLookup.cpp + CompressionOptions.h CompressionOptions.cpp + InputOptions.h InputOptions.cpp + OutputOptions.h OutputOptions.cpp + TaskDispatcher.h #TaskDispatcher.cpp + Surface.h Surface.cpp + CubeSurface.h CubeSurface.cpp + cuda/CudaUtils.h cuda/CudaUtils.cpp + cuda/CudaMath.h + cuda/BitmapTable.h + cuda/CudaCompressorDXT.h cuda/CudaCompressorDXT.cpp) + +IF (CUDA_FOUND) + ADD_DEFINITIONS(-DHAVE_CUDA) + CUDA_COMPILE(CUDA_SRCS cuda/CompressKernel.cu) + SET(NVTT_SRCS ${NVTT_SRCS} ${CUDA_SRCS}) + SET(LIBS ${LIBS} ${CUDA_LIBRARIES}) + INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) +ENDIF (CUDA_FOUND) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) ADD_DEFINITIONS(-DNVTT_EXPORTS) IF(NVTT_SHARED) - 
IF(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,origin") - ENDIF(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + IF(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") + SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,origin") + ENDIF(CMAKE_SYSTEM_NAME MATCHES "FreeBSD") - ADD_DEFINITIONS(-DNVTT_SHARED=1) - ADD_LIBRARY(nvtt SHARED ${NVTT_SRCS}) + ADD_DEFINITIONS(-DNVTT_SHARED=1) + ADD_LIBRARY(nvtt SHARED ${NVTT_SRCS}) ELSE(NVTT_SHARED) - ADD_LIBRARY(nvtt ${NVTT_SRCS}) + ADD_LIBRARY(nvtt ${NVTT_SRCS}) ENDIF(NVTT_SHARED) -TARGET_LINK_LIBRARIES(nvtt ${LIBS} nvcore nvmath nvimage squish) +TARGET_LINK_LIBRARIES(nvtt ${LIBS} nvcore nvimage nvthread squish bc6h bc7 nvmath) INSTALL(TARGETS nvtt - RUNTIME DESTINATION ${BINDIR} - LIBRARY DESTINATION ${LIBDIR} - ARCHIVE DESTINATION ${LIBDIR}) + RUNTIME DESTINATION ${BINDIR} + LIBRARY DESTINATION ${LIBDIR} + ARCHIVE DESTINATION ${LIBDIR}) INSTALL(FILES nvtt.h DESTINATION include/nvtt) - -# test executables -ADD_EXECUTABLE(nvcompress tools/compress.cpp tools/cmdline.h) -TARGET_LINK_LIBRARIES(nvcompress nvcore nvmath nvimage nvtt) - -ADD_EXECUTABLE(nvdecompress tools/decompress.cpp tools/cmdline.h) -TARGET_LINK_LIBRARIES(nvdecompress nvcore nvmath nvimage) - -ADD_EXECUTABLE(nvddsinfo tools/ddsinfo.cpp tools/cmdline.h) -TARGET_LINK_LIBRARIES(nvddsinfo nvcore nvmath nvimage) - -ADD_EXECUTABLE(nvimgdiff tools/imgdiff.cpp tools/cmdline.h) -TARGET_LINK_LIBRARIES(nvimgdiff nvcore nvmath nvimage) - -ADD_EXECUTABLE(nvassemble tools/assemble.cpp tools/cmdline.h) -TARGET_LINK_LIBRARIES(nvassemble nvcore nvmath nvimage) - -ADD_EXECUTABLE(filtertest tests/filtertest.cpp tools/cmdline.h) -TARGET_LINK_LIBRARIES(filtertest nvcore nvmath nvimage) - -ADD_EXECUTABLE(nvzoom tools/resize.cpp tools/cmdline.h) -TARGET_LINK_LIBRARIES(nvzoom nvcore nvmath nvimage) - -INSTALL(TARGETS nvcompress nvdecompress nvddsinfo nvimgdiff nvassemble nvzoom DESTINATION bin) - -# UI tools -IF(QT4_FOUND AND NOT MSVC) - SET(QT_USE_QTOPENGL TRUE) - INCLUDE_DIRECTORIES(${QT_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) - - SET(SRCS - tools/main.cpp - tools/configdialog.h - tools/configdialog.cpp) - - SET(LIBS - nvtt - ${QT_QTCORE_LIBRARY} - ${QT_QTGUI_LIBRARY} - ${QT_QTOPENGL_LIBRARY}) - - QT4_WRAP_UI(UICS tools/configdialog.ui) - QT4_WRAP_CPP(MOCS tools/configdialog.h) - #QT4_ADD_RESOURCES(RCCS tools/configdialog.rc) - - ADD_EXECUTABLE(nvcompressui MACOSX_BUNDLE ${SRCS} ${UICS} ${MOCS}) - TARGET_LINK_LIBRARIES(nvcompressui ${LIBS}) - -ENDIF(QT4_FOUND AND NOT MSVC) - - +#ADD_SUBDIRECTORY(tools) +#ADD_SUBDIRECTORY(tests) Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/ClusterFit.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/ClusterFit.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/ClusterFit.h @@ -0,0 +1,83 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + Copyright (c) 2006 Ignacio Castano icastano@nvidia.com + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice 
and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#ifndef NVTT_CLUSTERFIT_H +#define NVTT_CLUSTERFIT_H + +#include "nvmath/SimdVector.h" +#include "nvmath/Vector.h" +#include "nvcore/Memory.h" + +// Use SIMD version if altivec or SSE are available. +#define NVTT_USE_SIMD (NV_USE_ALTIVEC || NV_USE_SSE) +//#define NVTT_USE_SIMD 0 + +namespace nv { + + struct ColorSet; + + class ClusterFit + { + public: + ClusterFit(); + + //void setColorSet(const ColorSet * set); + void setColorSet(const Vector3 * colors, const float * weights, int count); + + void setColorWeights(const Vector4 & w); + float bestError() const; + + bool compress3(Vector3 * start, Vector3 * end); + bool compress4(Vector3 * start, Vector3 * end); + + private: + + uint m_count; + + // IC: Color and weight arrays are larger than necessary to avoid compiler warning. + + #if NVTT_USE_SIMD + NV_ALIGN_16 SimdVector m_weighted[17]; // color | weight + SimdVector m_metric; // vec3 + SimdVector m_metricSqr; // vec3 + SimdVector m_xxsum; // color | weight + SimdVector m_xsum; // color | weight (wsum) + SimdVector m_besterror; // scalar + #else + Vector3 m_weighted[17]; + float m_weights[17]; + Vector3 m_metric; + Vector3 m_metricSqr; + Vector3 m_xxsum; + Vector3 m_xsum; + float m_wsum; + float m_besterror; + #endif + }; + +} // nv namespace + +#endif // NVTT_CLUSTERFIT_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/ClusterFit.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/ClusterFit.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/ClusterFit.cpp @@ -0,0 +1,660 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + Copyright (c) 2006 Ignacio Castano icastano@nvidia.com + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + -------------------------------------------------------------------------- */ + +#include "ClusterFit.h" +#include "nvmath/Fitting.h" +#include "nvmath/Vector.inl" +#include "nvmath/ftoi.h" +#include "nvimage/ColorBlock.h" + +#include // FLT_MAX + +using namespace nv; + +ClusterFit::ClusterFit() +{ +} + +#if 0 // @@ Deprecate. Do not use color set directly. +void ClusterFit::setColorSet(const ColorSet * set) +{ + // initialise the best error +#if NVTT_USE_SIMD + m_besterror = SimdVector( FLT_MAX ); + Vector3 metric = m_metric.toVector3(); +#else + m_besterror = FLT_MAX; + Vector3 metric = m_metric; +#endif + + // cache some values + m_count = set->colorCount; + + Vector3 values[16]; + for (uint i = 0; i < m_count; i++) + { + values[i] = set->colors[i].xyz(); + } + + Vector3 principal = Fit::computePrincipalComponent_PowerMethod(m_count, values, set->weights, metric); + //Vector3 principal = Fit::computePrincipalComponent_EigenSolver(m_count, values, set->weights, metric); + + // build the list of values + int order[16]; + float dps[16]; + for (uint i = 0; i < m_count; ++i) + { + dps[i] = dot(values[i], principal); + order[i] = i; + } + + // stable sort + for (uint i = 0; i < m_count; ++i) + { + for (uint j = i; j > 0 && dps[j] < dps[j - 1]; --j) + { + swap(dps[j], dps[j - 1]); + swap(order[j], order[j - 1]); + } + } + + // weight all the points +#if NVTT_USE_SIMD + m_xxsum = SimdVector( 0.0f ); + m_xsum = SimdVector( 0.0f ); +#else + m_xxsum = Vector3(0.0f); + m_xsum = Vector3(0.0f); + m_wsum = 0.0f; +#endif + + for (uint i = 0; i < m_count; ++i) + { + int p = order[i]; +#if NVTT_USE_SIMD + NV_ALIGN_16 Vector4 tmp(values[p], 1); + m_weighted[i] = SimdVector(tmp.component) * SimdVector(set->weights[p]); + m_xxsum += m_weighted[i] * m_weighted[i]; + m_xsum += m_weighted[i]; +#else + m_weighted[i] = values[p] * set->weights[p]; + m_xxsum += m_weighted[i] * m_weighted[i]; + m_xsum += m_weighted[i]; + m_weights[i] = set->weights[p]; + m_wsum += m_weights[i]; +#endif + } +} +#endif // 0 + + +void ClusterFit::setColorSet(const Vector3 * colors, const float * weights, int count) +{ + // initialise the best error +#if NVTT_USE_SIMD + m_besterror = SimdVector( FLT_MAX ); + Vector3 metric = m_metric.toVector3(); +#else + m_besterror = FLT_MAX; + Vector3 metric = m_metric; +#endif + + m_count = count; + + Vector3 principal = Fit::computePrincipalComponent_PowerMethod(count, colors, weights, metric); + //Vector3 principal = Fit::computePrincipalComponent_EigenSolver(count, colors, weights, metric); + + // build the list of values + int order[16]; + float dps[16]; + for (uint i = 0; i < m_count; ++i) + { + dps[i] = dot(colors[i], principal); + order[i] = i; + } + + // stable sort + for (uint i = 0; i < m_count; ++i) + { + for (uint j = i; j > 0 && dps[j] < dps[j - 1]; --j) + { + swap(dps[j], dps[j - 1]); + swap(order[j], order[j - 1]); + } + } + + // weight all the points +#if NVTT_USE_SIMD + m_xxsum = SimdVector( 0.0f ); + m_xsum = SimdVector( 0.0f ); +#else + m_xxsum = Vector3(0.0f); + m_xsum = Vector3(0.0f); + m_wsum = 0.0f; +#endif + + for (uint i = 0; i < m_count; ++i) + { + int p = order[i]; +#if NVTT_USE_SIMD + NV_ALIGN_16 Vector4 tmp(colors[p], 1); + m_weighted[i] = SimdVector(tmp.component) * SimdVector(weights[p]); + m_xxsum += m_weighted[i] * m_weighted[i]; + m_xsum += m_weighted[i]; +#else + m_weighted[i] = colors[p] * weights[p]; + m_xxsum += m_weighted[i] * m_weighted[i]; + m_xsum += m_weighted[i]; + m_weights[i] = weights[p]; + m_wsum += m_weights[i]; +#endif + } +} + + + +void 
ClusterFit::setColorWeights(Vector4::Arg w) +{ +#if NVTT_USE_SIMD + NV_ALIGN_16 Vector4 tmp(w.xyz(), 1); + m_metric = SimdVector(tmp.component); +#else + m_metric = w.xyz(); +#endif + m_metricSqr = m_metric * m_metric; +} + +float ClusterFit::bestError() const +{ +#if NVTT_USE_SIMD + SimdVector x = m_xxsum * m_metricSqr; + SimdVector error = m_besterror + x.splatX() + x.splatY() + x.splatZ(); + return error.toFloat(); +#else + return m_besterror + dot(m_xxsum, m_metricSqr); +#endif + +} + +#if NVTT_USE_SIMD + +bool ClusterFit::compress3( Vector3 * start, Vector3 * end ) +{ + const int count = m_count; + const SimdVector one = SimdVector(1.0f); + const SimdVector zero = SimdVector(0.0f); + const SimdVector half(0.5f, 0.5f, 0.5f, 0.25f); + const SimdVector two = SimdVector(2.0); + const SimdVector grid( 31.0f, 63.0f, 31.0f, 0.0f ); + const SimdVector gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); + + // declare variables + SimdVector beststart = SimdVector( 0.0f ); + SimdVector bestend = SimdVector( 0.0f ); + SimdVector besterror = SimdVector( FLT_MAX ); + + SimdVector x0 = zero; + + int b0 = 0, b1 = 0; + + // check all possible clusters for this total order + for( int c0 = 0; c0 <= count; c0++) + { + SimdVector x1 = zero; + + for( int c1 = 0; c1 <= count-c0; c1++) + { + const SimdVector x2 = m_xsum - x1 - x0; + + //Vector3 alphax_sum = x0 + x1 * 0.5f; + //float alpha2_sum = w0 + w1 * 0.25f; + const SimdVector alphax_sum = multiplyAdd(x1, half, x0); // alphax_sum, alpha2_sum + const SimdVector alpha2_sum = alphax_sum.splatW(); + + //const Vector3 betax_sum = x2 + x1 * 0.5f; + //const float beta2_sum = w2 + w1 * 0.25f; + const SimdVector betax_sum = multiplyAdd(x1, half, x2); // betax_sum, beta2_sum + const SimdVector beta2_sum = betax_sum.splatW(); + + //const float alphabeta_sum = w1 * 0.25f; + const SimdVector alphabeta_sum = (x1 * half).splatW(); // alphabeta_sum + + // const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + const SimdVector factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) ); + + SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor; + SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor; + + // clamp to the grid + a = min( one, max( zero, a ) ); + b = min( one, max( zero, b ) ); + a = truncate( multiplyAdd( grid, a, half ) ) * gridrcp; + b = truncate( multiplyAdd( grid, b, half ) ) * gridrcp; + + // compute the error (we skip the constant xxsum) + SimdVector e1 = multiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); + SimdVector e2 = negativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum ); + SimdVector e3 = negativeMultiplySubtract( b, betax_sum, e2 ); + SimdVector e4 = multiplyAdd( two, e3, e1 ); + + // apply the metric to the error term + SimdVector e5 = e4 * m_metricSqr; + SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ(); + + // keep the solution if it wins + if( compareAnyLessThan( error, besterror ) ) + { + besterror = error; + beststart = a; + bestend = b; + b0 = c0; + b1 = c1; + } + + x1 += m_weighted[c0+c1]; + } + + x0 += m_weighted[c0]; + } + + // save the block if necessary + if( compareAnyLessThan( besterror, m_besterror ) ) + { + + *start = beststart.toVector3(); + *end = bestend.toVector3(); + + // save the error + m_besterror = besterror; + + return true; + } + + return false; +} + +bool ClusterFit::compress4( Vector3 * start, Vector3 * end ) +{ + const int count = m_count; + 
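+    // compress4 enumerates every split of the principal-axis-ordered colors
+    // into four clusters (interpolation weights 1, 2/3, 1/3, 0), solves for the
+    // best endpoints of each split in closed form, and keeps the pair with the
+    // lowest metric-weighted error.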
const SimdVector one = SimdVector(1.0f); + const SimdVector zero = SimdVector(0.0f); + const SimdVector half = SimdVector(0.5f); + const SimdVector two = SimdVector(2.0); + const SimdVector onethird( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f ); + const SimdVector twothirds( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f ); + const SimdVector twonineths = SimdVector( 2.0f/9.0f ); + const SimdVector grid( 31.0f, 63.0f, 31.0f, 0.0f ); + const SimdVector gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); + + // declare variables + SimdVector beststart = SimdVector( 0.0f ); + SimdVector bestend = SimdVector( 0.0f ); + SimdVector besterror = SimdVector( FLT_MAX ); + + SimdVector x0 = zero; + int b0 = 0, b1 = 0, b2 = 0; + + // check all possible clusters for this total order + for( int c0 = 0; c0 <= count; c0++) + { + SimdVector x1 = zero; + + for( int c1 = 0; c1 <= count-c0; c1++) + { + SimdVector x2 = zero; + + for( int c2 = 0; c2 <= count-c0-c1; c2++) + { + const SimdVector x3 = m_xsum - x2 - x1 - x0; + + //const Vector3 alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f); + //const float alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f); + const SimdVector alphax_sum = multiplyAdd(x2, onethird, multiplyAdd(x1, twothirds, x0)); // alphax_sum, alpha2_sum + const SimdVector alpha2_sum = alphax_sum.splatW(); + + //const Vector3 betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f); + //const float beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f); + const SimdVector betax_sum = multiplyAdd(x2, twothirds, multiplyAdd(x1, onethird, x3)); // betax_sum, beta2_sum + const SimdVector beta2_sum = betax_sum.splatW(); + + //const float alphabeta_sum = (w1 + w2) * (2.0f/9.0f); + const SimdVector alphabeta_sum = twonineths*( x1 + x2 ).splatW(); // alphabeta_sum + + //const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + const SimdVector factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) ); + + SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor; + SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor; + + // clamp to the grid + a = min( one, max( zero, a ) ); + b = min( one, max( zero, b ) ); + a = truncate( multiplyAdd( grid, a, half ) ) * gridrcp; + b = truncate( multiplyAdd( grid, b, half ) ) * gridrcp; + + // compute the error (we skip the constant xxsum) + SimdVector e1 = multiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); + SimdVector e2 = negativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum ); + SimdVector e3 = negativeMultiplySubtract( b, betax_sum, e2 ); + SimdVector e4 = multiplyAdd( two, e3, e1 ); + +#if 1 + // apply the metric to the error term + SimdVector e5 = e4 * m_metricSqr; + SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ(); +#else + // @@ Is there a horizontal max SIMD instruction? 
+ SimdVector error = e4.splatX() + e4.splatY() + e4.splatZ(); + error *= two; + error += max(max(e4.splatX(), e4.splatY()), e4.splatZ()); + error -= min(min(e4.splatX(), e4.splatY()), e4.splatZ()); + +#endif + + // keep the solution if it wins + if (compareAnyLessThan(error, besterror)) + { + besterror = error; + beststart = a; + bestend = b; + b0 = c0; + b1 = c1; + b2 = c2; + } + + x2 += m_weighted[c0+c1+c2]; + } + + x1 += m_weighted[c0+c1]; + } + + x0 += m_weighted[c0]; + } + + // save the block if necessary + if (compareAnyLessThan(besterror, m_besterror)) + { + *start = beststart.toVector3(); + *end = bestend.toVector3(); + + // save the error + m_besterror = besterror; + + return true; + } + + return false; +} + +#else + +inline Vector3 round565(const Vector3 & v) { + uint r = ftoi_trunc(v.x * 31.0f); + float r0 = float(((r+0) << 3) | ((r+0) >> 2)); + float r1 = float(((r+1) << 3) | ((r+1) >> 2)); + if (fabs(v.x - r1) < fabs(v.x - r0)) r = min(r+1, 31U); + r = (r << 3) | (r >> 2); + + uint g = ftoi_trunc(v.y * 63.0f); + float g0 = float(((g+0) << 2) | ((g+0) >> 4)); + float g1 = float(((g+1) << 2) | ((g+1) >> 4)); + if (fabs(v.y - g1) < fabs(v.y - g0)) g = min(g+1, 63U); + g = (g << 2) | (g >> 4); + + uint b = ftoi_trunc(v.z * 31.0f); + float b0 = float(((b+0) << 3) | ((b+0) >> 2)); + float b1 = float(((b+1) << 3) | ((b+1) >> 2)); + if (fabs(v.z - b1) < fabs(v.z - b0)) b = min(b+1, 31U); + + b = (b << 3) | (b >> 2); + + return Vector3(float(r)/255, float(g)/255, float(b)/255); +} + +bool ClusterFit::compress3(Vector3 * start, Vector3 * end) +{ + const uint count = m_count; + const Vector3 grid( 31.0f, 63.0f, 31.0f ); + const Vector3 gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f ); + + // declare variables + Vector3 beststart( 0.0f ); + Vector3 bestend( 0.0f ); + float besterror = FLT_MAX; + + Vector3 x0(0.0f); + float w0 = 0.0f; + + int b0 = 0, b1 = 0; + + // check all possible clusters for this total order + for (uint c0 = 0; c0 <= count; c0++) + { + Vector3 x1(0.0f); + float w1 = 0.0f; + + for (uint c1 = 0; c1 <= count-c0; c1++) + { + float w2 = m_wsum - w0 - w1; + + // These factors could be entirely precomputed. 
+ float const alpha2_sum = w0 + w1 * 0.25f; + float const beta2_sum = w2 + w1 * 0.25f; + float const alphabeta_sum = w1 * 0.25f; + float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + Vector3 const alphax_sum = x0 + x1 * 0.5f; + Vector3 const betax_sum = m_xsum - alphax_sum; + + Vector3 a = (alphax_sum*beta2_sum - betax_sum*alphabeta_sum) * factor; + Vector3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor; + + // clamp to the grid + a = clamp(a, 0, 1); + b = clamp(b, 0, 1); +#if 1 + a = floor(grid * a + 0.5f) * gridrcp; + b = floor(grid * b + 0.5f) * gridrcp; +#else + + //int ar = ftoi_round(31 * a.x); ar = (ar << 3) | (ar >> 2); a.x = float(ar) / 255.0f; + //int ag = ftoi_round(63 * a.y); ar = (ag << 2) | (ag >> 4); a.y = float(ag) / 255.0f; + //int ab = ftoi_round(31 * a.z); ar = (ab << 3) | (ab >> 2); a.z = float(ab) / 255.0f; + //int br = ftoi_round(31 * b.x); br = (br << 3) | (br >> 2); b.x = float(br) / 255.0f; + //int bg = ftoi_round(63 * b.y); br = (bg << 2) | (bg >> 4); b.y = float(bg) / 255.0f; + //int bb = ftoi_round(31 * b.z); br = (bb << 3) | (bb >> 2); b.z = float(bb) / 255.0f; + + /*a = floor(a * grid + 0.5f); + a.x = (a.x * 8 + floorf(a.x / 4)) / 255.0f; + a.y = (a.y * 4 + floorf(a.y / 16)) / 255.0f; + a.z = (a.z * 8 + floorf(a.z / 4)) / 255.0f; + + b = floor(b * grid + 0.5f); + b.x = (b.x * 8 + floorf(b.x / 4)) / 255.0f; + b.y = (b.y * 4 + floorf(b.y / 16)) / 255.0f; + b.z = (b.z * 8 + floorf(b.z / 4)) / 255.0f;*/ + + a = round565(a); + b = round565(b); +#endif + + // compute the error + Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum ); + + // apply the metric to the error term + float error = dot(e1, m_metricSqr); + + // keep the solution if it wins + if (error < besterror) + { + besterror = error; + beststart = a; + bestend = b; + b0 = c0; + b1 = c1; + } + + x1 += m_weighted[c0+c1]; + w1 += m_weights[c0+c1]; + } + + x0 += m_weighted[c0]; + w0 += m_weights[c0]; + } + + // save the block if necessary + if( besterror < m_besterror ) + { + + *start = beststart; + *end = bestend; + + // save the error + m_besterror = besterror; + + return true; + } + + return false; +} + +bool ClusterFit::compress4(Vector3 * start, Vector3 * end) +{ + const uint count = m_count; + const Vector3 grid( 31.0f, 63.0f, 31.0f ); + const Vector3 gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f ); + + // declare variables + Vector3 beststart( 0.0f ); + Vector3 bestend( 0.0f ); + float besterror = FLT_MAX; + + Vector3 x0(0.0f); + float w0 = 0.0f; + int b0 = 0, b1 = 0, b2 = 0; + + // check all possible clusters for this total order + for (uint c0 = 0; c0 <= count; c0++) + { + Vector3 x1(0.0f); + float w1 = 0.0f; + + for (uint c1 = 0; c1 <= count-c0; c1++) + { + Vector3 x2(0.0f); + float w2 = 0.0f; + + for (uint c2 = 0; c2 <= count-c0-c1; c2++) + { + float w3 = m_wsum - w0 - w1 - w2; + + float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f); + float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f); + float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f); + float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f); + Vector3 const betax_sum = m_xsum - alphax_sum; + + Vector3 a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor; + Vector3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor; + + // clamp to the grid + a = clamp(a, 0, 1); + b = clamp(b, 0, 1); +#if 0 
+ a = floor(a * grid + 0.5f) * gridrcp; + b = floor(b * grid + 0.5f) * gridrcp; +#else + //int ar = ftoi_round(31 * a.x); ar = (ar << 3) | (ar >> 2); a.x = float(ar) / 255.0f; + //int ag = ftoi_round(63 * a.y); ar = (ag << 2) | (ag >> 4); a.y = float(ag) / 255.0f; + //int ab = ftoi_round(31 * a.z); ar = (ab << 3) | (ab >> 2); a.z = float(ab) / 255.0f; + //int br = ftoi_round(31 * b.x); br = (br << 3) | (br >> 2); b.x = float(br) / 255.0f; + //int bg = ftoi_round(63 * b.y); br = (bg << 2) | (bg >> 4); b.y = float(bg) / 255.0f; + //int bb = ftoi_round(31 * b.z); br = (bb << 3) | (bb >> 2); b.z = float(bb) / 255.0f; + + /* + a = floor(a * grid + 0.5f); + a.x = (a.x * 8 + floorf(a.x / 4)) / 255.0f; + a.y = (a.y * 4 + floorf(a.y / 16)) / 255.0f; + a.z = (a.z * 8 + floorf(a.z / 4)) / 255.0f; + + b = floor(b * grid + 0.5f); + b.x = (b.x * 8 + floorf(b.x / 4)) / 255.0f; + b.y = (b.y * 4 + floorf(b.y / 16)) / 255.0f; + b.z = (b.z * 8 + floorf(b.z / 4)) / 255.0f; + */ + + a = round565(a); + b = round565(b); +#endif + // @@ It would be much more accurate to evaluate the error exactly. + + // compute the error + Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum ); + + // apply the metric to the error term + float error = dot( e1, m_metricSqr ); + + // keep the solution if it wins + if (error < besterror) + { + besterror = error; + beststart = a; + bestend = b; + b0 = c0; + b1 = c1; + b2 = c2; + } + + x2 += m_weighted[c0+c1+c2]; + w2 += m_weights[c0+c1+c2]; + } + + x1 += m_weighted[c0+c1]; + w1 += m_weights[c0+c1]; + } + + x0 += m_weighted[c0]; + w0 += m_weights[c0]; + } + + // save the block if necessary + if (besterror < m_besterror) + { + *start = beststart; + *end = bestend; + + // save the error + m_besterror = besterror; + + return true; + } + + return false; +} + +#endif // NVTT_USE_SIMD Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressDXT.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressDXT.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressDXT.h @@ -1,87 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
- -#ifndef NV_TT_COMPRESSDXT_H -#define NV_TT_COMPRESSDXT_H - -#include -#include "nvtt.h" - -namespace nv -{ - class Image; - class FloatImage; - - class FastCompressor - { - public: - FastCompressor(); - ~FastCompressor(); - - void setImage(const Image * image, nvtt::AlphaMode alphaMode); - - void compressDXT1(const nvtt::OutputOptions::Private & outputOptions); - void compressDXT1a(const nvtt::OutputOptions::Private & outputOptions); - void compressDXT3(const nvtt::OutputOptions::Private & outputOptions); - void compressDXT5(const nvtt::OutputOptions::Private & outputOptions); - void compressDXT5n(const nvtt::OutputOptions::Private & outputOptions); - - private: - const Image * m_image; - nvtt::AlphaMode m_alphaMode; - }; - - class SlowCompressor - { - public: - SlowCompressor(); - ~SlowCompressor(); - - void setImage(const Image * image, nvtt::AlphaMode alphaMode); - - void compressDXT1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressDXT1a(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressDXT3(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressDXT5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressDXT5n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressBC4(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressBC5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - - private: - const Image * m_image; - nvtt::AlphaMode m_alphaMode; - }; - - // External compressors. -#if defined(HAVE_S3QUANT) - void s3CompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions); -#endif - -#if defined(HAVE_ATITC) - void atiCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions); -#endif - -} // nv namespace - - -#endif // NV_TT_COMPRESSDXT_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressDXT.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressDXT.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressDXT.cpp @@ -1,597 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include - -#include -#include -#include - -#include "nvtt.h" -#include "CompressDXT.h" -#include "QuickCompressDXT.h" -#include "OptimalCompressDXT.h" -#include "CompressionOptions.h" -#include "OutputOptions.h" - -// squish -#include "squish/colourset.h" -//#include "squish/clusterfit.h" -#include "squish/fastclusterfit.h" -#include "squish/weightedclusterfit.h" - - -// s3_quant -#if defined(HAVE_S3QUANT) -#include "s3tc/s3_quant.h" -#endif - -// ati tc -#if defined(HAVE_ATITC) -#include "atitc/ATI_Compress.h" -#endif - -//#include - -using namespace nv; -using namespace nvtt; - - -nv::FastCompressor::FastCompressor() : m_image(NULL), m_alphaMode(AlphaMode_None) -{ -} - -nv::FastCompressor::~FastCompressor() -{ -} - -void nv::FastCompressor::setImage(const Image * image, nvtt::AlphaMode alphaMode) -{ - m_image = image; - m_alphaMode = alphaMode; -} - -void nv::FastCompressor::compressDXT1(const OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT1 block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - rgba.init(m_image, x, y); - - QuickCompress::compressDXT1(rgba, &block); - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::FastCompressor::compressDXT1a(const OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT1 block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - rgba.init(m_image, x, y); - - QuickCompress::compressDXT1a(rgba, &block); - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::FastCompressor::compressDXT3(const nvtt::OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT3 block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - rgba.init(m_image, x, y); - - QuickCompress::compressDXT3(rgba, &block); - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::FastCompressor::compressDXT5(const nvtt::OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT5 block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - rgba.init(m_image, x, y); - - QuickCompress::compressDXT5(rgba, &block, 0); - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::FastCompressor::compressDXT5n(const nvtt::OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT5 block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - rgba.init(m_image, x, y); - - rgba.swizzleDXT5n(); - - QuickCompress::compressDXT5(rgba, &block, 0); - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, 
sizeof(block)); - } - } - } -} - - -nv::SlowCompressor::SlowCompressor() : m_image(NULL), m_alphaMode(AlphaMode_None) -{ -} - -nv::SlowCompressor::~SlowCompressor() -{ -} - -void nv::SlowCompressor::setImage(const Image * image, nvtt::AlphaMode alphaMode) -{ - m_image = image; - m_alphaMode = alphaMode; -} - -void nv::SlowCompressor::compressDXT1(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT1 block; - - squish::WeightedClusterFit fit; - //squish::ClusterFit fit; - //squish::FastClusterFit fit; - fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z()); - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - - rgba.init(m_image, x, y); - - if (rgba.isSingleColor()) - { - OptimalCompress::compressDXT1(rgba.color(0), &block); - } - else - { - squish::ColourSet colours((uint8 *)rgba.colors(), 0, true); - fit.SetColourSet(&colours, squish::kDxt1); - fit.Compress(&block); - } - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::SlowCompressor::compressDXT1a(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT1 block; - - squish::WeightedClusterFit fit; - fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z()); - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - - rgba.init(m_image, x, y); - - bool anyAlpha = false; - bool allAlpha = true; - - for (uint i = 0; i < 16; i++) - { - if (rgba.color(i).a < 128) anyAlpha = true; - else allAlpha = false; - } - - if ((!anyAlpha && rgba.isSingleColor() || allAlpha)) - { - OptimalCompress::compressDXT1a(rgba.color(0), &block); - } - else - { - squish::ColourSet colours((uint8 *)rgba.colors(), squish::kDxt1|squish::kWeightColourByAlpha); - fit.SetColourSet(&colours, squish::kDxt1); - fit.Compress(&block); - } - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::SlowCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT3 block; - - squish::WeightedClusterFit fit; - //squish::FastClusterFit fit; - fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z()); - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - - rgba.init(m_image, x, y); - - // Compress explicit alpha. - OptimalCompress::compressDXT3A(rgba, &block.alpha); - - // Compress color. 
- if (rgba.isSingleColor()) - { - OptimalCompress::compressDXT1(rgba.color(0), &block.color); - } - else - { - squish::ColourSet colours((uint8 *)rgba.colors(), squish::kWeightColourByAlpha); - fit.SetColourSet(&colours, 0); - fit.Compress(&block.color); - } - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - -void nv::SlowCompressor::compressDXT5(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT5 block; - - squish::WeightedClusterFit fit; - fit.SetMetric(compressionOptions.colorWeight.x(), compressionOptions.colorWeight.y(), compressionOptions.colorWeight.z()); - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - - rgba.init(m_image, x, y); - - // Compress alpha. - if (compressionOptions.quality == Quality_Highest) - { - OptimalCompress::compressDXT5A(rgba, &block.alpha); - } - else - { - QuickCompress::compressDXT5A(rgba, &block.alpha); - } - - // Compress color. - if (rgba.isSingleColor()) - { - OptimalCompress::compressDXT1(rgba.color(0), &block.color); - } - else - { - squish::ColourSet colours((uint8 *)rgba.colors(), squish::kWeightColourByAlpha); - fit.SetColourSet(&colours, 0); - fit.Compress(&block.color); - } - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::SlowCompressor::compressDXT5n(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - BlockDXT5 block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - - rgba.init(m_image, x, y); - - rgba.swizzleDXT5n(); - - // Compress X. - if (compressionOptions.quality == Quality_Highest) - { - OptimalCompress::compressDXT5A(rgba, &block.alpha); - } - else - { - QuickCompress::compressDXT5A(rgba, &block.alpha); - } - - // Compress Y. 
- OptimalCompress::compressDXT1G(rgba, &block.color); - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::SlowCompressor::compressBC4(const CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock rgba; - AlphaBlockDXT5 block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - - rgba.init(m_image, x, y); - - if (compressionOptions.quality == Quality_Highest) - { - OptimalCompress::compressDXT5A(rgba, &block); - } - else - { - QuickCompress::compressDXT5A(rgba, &block); - } - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -void nv::SlowCompressor::compressBC5(const CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) -{ - const uint w = m_image->width(); - const uint h = m_image->height(); - - ColorBlock xcolor; - ColorBlock ycolor; - - BlockATI2 block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - - xcolor.init(m_image, x, y); - xcolor.splatX(); - - ycolor.init(m_image, x, y); - ycolor.splatY(); - - if (compressionOptions.quality == Quality_Highest) - { - OptimalCompress::compressDXT5A(xcolor, &block.x); - OptimalCompress::compressDXT5A(ycolor, &block.y); - } - else - { - QuickCompress::compressDXT5A(xcolor, &block.x); - QuickCompress::compressDXT5A(ycolor, &block.y); - } - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&block, sizeof(block)); - } - } - } -} - - -#if defined(HAVE_S3QUANT) - -void nv::s3CompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions) -{ - const uint w = image->width(); - const uint h = image->height(); - - float error = 0.0f; - - BlockDXT1 dxtBlock3; - BlockDXT1 dxtBlock4; - ColorBlock block; - - for (uint y = 0; y < h; y += 4) { - for (uint x = 0; x < w; x += 4) { - block.init(image, x, y); - - // Init rgb block. - RGBBlock rgbBlock; - rgbBlock.n = 16; - for (uint i = 0; i < 16; i++) { - rgbBlock.colorChannel[i][0] = clamp(float(block.color(i).r) / 255.0f, 0.0f, 1.0f); - rgbBlock.colorChannel[i][1] = clamp(float(block.color(i).g) / 255.0f, 0.0f, 1.0f); - rgbBlock.colorChannel[i][2] = clamp(float(block.color(i).b) / 255.0f, 0.0f, 1.0f); - } - rgbBlock.weight[0] = 1.0f; - rgbBlock.weight[1] = 1.0f; - rgbBlock.weight[2] = 1.0f; - - rgbBlock.inLevel = 4; - CodeRGBBlock(&rgbBlock); - - // Copy results to DXT block. - dxtBlock4.col0.r = rgbBlock.endPoint[0][0]; - dxtBlock4.col0.g = rgbBlock.endPoint[0][1]; - dxtBlock4.col0.b = rgbBlock.endPoint[0][2]; - - dxtBlock4.col1.r = rgbBlock.endPoint[1][0]; - dxtBlock4.col1.g = rgbBlock.endPoint[1][1]; - dxtBlock4.col1.b = rgbBlock.endPoint[1][2]; - - dxtBlock4.setIndices(rgbBlock.index); - - if (dxtBlock4.col0.u < dxtBlock4.col1.u) { - swap(dxtBlock4.col0.u, dxtBlock4.col1.u); - dxtBlock4.indices ^= 0x55555555; - } - - uint error4 = blockError(block, dxtBlock4); - - rgbBlock.inLevel = 3; - - CodeRGBBlock(&rgbBlock); - - // Copy results to DXT block. 
- dxtBlock3.col0.r = rgbBlock.endPoint[0][0]; - dxtBlock3.col0.g = rgbBlock.endPoint[0][1]; - dxtBlock3.col0.b = rgbBlock.endPoint[0][2]; - - dxtBlock3.col1.r = rgbBlock.endPoint[1][0]; - dxtBlock3.col1.g = rgbBlock.endPoint[1][1]; - dxtBlock3.col1.b = rgbBlock.endPoint[1][2]; - - dxtBlock3.setIndices(rgbBlock.index); - - if (dxtBlock3.col0.u > dxtBlock3.col1.u) { - swap(dxtBlock3.col0.u, dxtBlock3.col1.u); - dxtBlock3.indices ^= (~dxtBlock3.indices >> 1) & 0x55555555; - } - - uint error3 = blockError(block, dxtBlock3); - - if (error3 < error4) { - error += error3; - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&dxtBlock3, sizeof(dxtBlock3)); - } - } - else { - error += error4; - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(&dxtBlock4, sizeof(dxtBlock4)); - } - } - } - } - - printf("error = %f\n", error/((w+3)/4 * (h+3)/4)); -} - -#endif // defined(HAVE_S3QUANT) - - -#if defined(HAVE_ATITC) - -void nv::atiCompressDXT1(const Image * image, const OutputOptions::Private & outputOptions) -{ - // Init source texture - ATI_TC_Texture srcTexture; - srcTexture.dwSize = sizeof(srcTexture); - srcTexture.dwWidth = image->width(); - srcTexture.dwHeight = image->height(); - srcTexture.dwPitch = image->width() * 4; - srcTexture.format = ATI_TC_FORMAT_ARGB_8888; - srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture); - srcTexture.pData = (ATI_TC_BYTE*) image->pixels(); - - // Init dest texture - ATI_TC_Texture destTexture; - destTexture.dwSize = sizeof(destTexture); - destTexture.dwWidth = image->width(); - destTexture.dwHeight = image->height(); - destTexture.dwPitch = 0; - destTexture.format = ATI_TC_FORMAT_DXT1; - destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture); - destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize); - - // Compress - ATI_TC_ConvertTexture(&srcTexture, &destTexture, NULL, NULL, NULL, NULL); - - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize); - } -} - -#endif // defined(HAVE_ATITC) Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressRGB.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressRGB.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressRGB.h @@ -1,39 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#ifndef NV_TT_COMPRESSRGB_H -#define NV_TT_COMPRESSRGB_H - -#include "nvtt.h" - -namespace nv -{ - class Image; - - // Pixel format converter. - void compressRGB(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions); - -} // nv namespace - - -#endif // NV_TT_COMPRESSDXT_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressRGB.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressRGB.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressRGB.cpp @@ -1,140 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include - -#include -#include -#include - -#include "CompressRGB.h" -#include "CompressionOptions.h" -#include "OutputOptions.h" - -using namespace nv; -using namespace nvtt; - -namespace -{ - - inline uint computePitch(uint w, uint bitsize) - { - uint p = w * ((bitsize + 7) / 8); - - // Align to 32 bits. - return ((p + 3) / 4) * 4; - } - - inline void convert_to_a8r8g8b8(const void * src, void * dst, uint w) - { - memcpy(dst, src, 4 * w); - } - - inline void convert_to_x8r8g8b8(const void * src, void * dst, uint w) - { - memcpy(dst, src, 4 * w); - } - -} // namespace - - -// Pixel format converter. 
-void nv::compressRGB(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions) -{ - nvCheck(image != NULL); - - const uint w = image->width(); - const uint h = image->height(); - - const uint bitCount = compressionOptions.bitcount; - nvCheck(bitCount == 8 || bitCount == 16 || bitCount == 24 || bitCount == 32); - - const uint byteCount = bitCount / 8; - - const uint rmask = compressionOptions.rmask; - uint rshift, rsize; - PixelFormat::maskShiftAndSize(rmask, &rshift, &rsize); - - const uint gmask = compressionOptions.gmask; - uint gshift, gsize; - PixelFormat::maskShiftAndSize(gmask, &gshift, &gsize); - - const uint bmask = compressionOptions.bmask; - uint bshift, bsize; - PixelFormat::maskShiftAndSize(bmask, &bshift, &bsize); - - const uint amask = compressionOptions.amask; - uint ashift, asize; - PixelFormat::maskShiftAndSize(amask, &ashift, &asize); - - // Determine pitch. - uint pitch = computePitch(w, compressionOptions.bitcount); - - uint8 * dst = (uint8 *)::malloc(pitch + 4); - - for (uint y = 0; y < h; y++) - { - const Color32 * src = image->scanline(y); - - if (bitCount == 32 && rmask == 0xFF0000 && gmask == 0xFF00 && bmask == 0xFF && amask == 0xFF000000) - { - convert_to_a8r8g8b8(src, dst, w); - } - else if (bitCount == 32 && rmask == 0xFF0000 && gmask == 0xFF00 && bmask == 0xFF && amask == 0) - { - convert_to_x8r8g8b8(src, dst, w); - } - else - { - // Generic pixel format conversion. - for (uint x = 0; x < w; x++) - { - uint c = 0; - c |= PixelFormat::convert(src[x].r, 8, rsize) << rshift; - c |= PixelFormat::convert(src[x].g, 8, gsize) << gshift; - c |= PixelFormat::convert(src[x].b, 8, bsize) << bshift; - c |= PixelFormat::convert(src[x].a, 8, asize) << ashift; - - // Output one byte at a time. - for (uint i = 0; i < byteCount; i++) - { - *(dst + x * byteCount + i) = (c >> (i * 8)) & 0xFF; - } - } - - // Zero padding. - for (uint x = w * byteCount; x < pitch; x++) - { - *(dst + x) = 0; - } - } - - if (outputOptions.outputHandler != NULL) - { - outputOptions.outputHandler->writeData(dst, pitch); - } - } - - ::free(dst); -} - Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressionOptions.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressionOptions.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressionOptions.h @@ -1,61 +1,80 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#ifndef NV_TT_COMPRESSIONOPTIONS_H -#define NV_TT_COMPRESSIONOPTIONS_H - -#include -#include -#include "nvtt.h" - -namespace nvtt -{ - - struct CompressionOptions::Private - { - Format format; - - Quality quality; - - nv::Vector4 colorWeight; - - // Pixel format description. - uint bitcount; - uint rmask; - uint gmask; - uint bmask; - uint amask; - - nv::String externalCompressor; - - // Quantization. - bool enableColorDithering; - bool enableAlphaDithering; - bool binaryAlpha; - int alphaThreshold; // reference value used for binary alpha quantization. - }; - -} // nvtt namespace - - -#endif // NV_TT_COMPRESSIONOPTIONS_H +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#ifndef NV_TT_COMPRESSIONOPTIONS_H +#define NV_TT_COMPRESSIONOPTIONS_H + +#include "nvtt.h" +#include "nvmath/Vector.h" +#include "nvcore/StrLib.h" + +namespace nvtt +{ + + struct CompressionOptions::Private + { + Format format; + + Quality quality; + + nv::Vector4 colorWeight; + + // Pixel format description. + uint bitcount; + uint rmask; + uint gmask; + uint bmask; + uint amask; + uint8 rsize; + uint8 gsize; + uint8 bsize; + uint8 asize; + + PixelType pixelType; + uint pitchAlignment; + + nv::String externalCompressor; + + // Quantization. + bool enableColorDithering; + bool enableAlphaDithering; + bool binaryAlpha; + int alphaThreshold; // reference value used for binary alpha quantization. 
+ + Decoder decoder; + + uint getBitCount() const + { + if (format == Format_RGBA) { + if (bitcount != 0) return bitcount; + else return rsize + gsize + bsize + asize; + } + return 0; + } + }; + +} // nvtt namespace + + +#endif // NV_TT_COMPRESSIONOPTIONS_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressionOptions.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressionOptions.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressionOptions.cpp @@ -1,143 +1,273 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include "nvtt.h" -#include "CompressionOptions.h" - -using namespace nv; -using namespace nvtt; - - -/// Constructor. Sets compression options to the default values. -CompressionOptions::CompressionOptions() : m(*new CompressionOptions::Private()) -{ - reset(); -} - - -/// Destructor. -CompressionOptions::~CompressionOptions() -{ - delete &m; -} - - -/// Set default compression options. -void CompressionOptions::reset() -{ - m.format = Format_DXT1; - m.quality = Quality_Normal; - m.colorWeight.set(1.0f, 1.0f, 1.0f, 1.0f); - - m.bitcount = 32; - m.bmask = 0x000000FF; - m.gmask = 0x0000FF00; - m.rmask = 0x00FF0000; - m.amask = 0xFF000000; - - m.enableColorDithering = false; - m.enableAlphaDithering = false; - m.binaryAlpha = false; - m.alphaThreshold = 127; -} - - -/// Set desired compression format. -void CompressionOptions::setFormat(Format format) -{ - m.format = format; -} - - -/// Set compression quality settings. -void CompressionOptions::setQuality(Quality quality) -{ - m.quality = quality; -} - - -/// Set the weights of each color channel. -/// The choice for these values is subjective. In many case uniform color weights -/// (1.0, 1.0, 1.0) work very well. A popular choice is to use the NTSC luma encoding -/// weights (0.2126, 0.7152, 0.0722), but I think that blue contributes to our -/// perception more than a 7%. A better choice in my opinion is (3, 4, 2). -void CompressionOptions::setColorWeights(float red, float green, float blue, float alpha/*=1.0f*/) -{ -// float total = red + green + blue; -// float x = red / total; -// float y = green / total; -// m.colorWeight.set(x, y, 1.0f - x - y); - m.colorWeight.set(red, green, blue, alpha); -} - - -/// Set color mask to describe the RGB/RGBA format. 
-void CompressionOptions::setPixelFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask) -{ - // Validate arguments. - nvCheck(bitcount == 8 || bitcount == 16 || bitcount == 24 || bitcount == 32); - nvCheck((rmask & gmask) == 0); - nvCheck((rmask & bmask) == 0); - nvCheck((rmask & amask) == 0); - nvCheck((gmask & bmask) == 0); - nvCheck((gmask & amask) == 0); - nvCheck((bmask & amask) == 0); - - if (bitcount != 32) - { - uint maxMask = (1 << bitcount); - nvCheck(maxMask > rmask); - nvCheck(maxMask > gmask); - nvCheck(maxMask > bmask); - nvCheck(maxMask > amask); - } - - m.bitcount = bitcount; - m.rmask = rmask; - m.gmask = gmask; - m.bmask = bmask; - m.amask = amask; -} - -/// Use external compressor. -void CompressionOptions::setExternalCompressor(const char * name) -{ - m.externalCompressor = name; -} - -/// Set quantization options. -/// @warning Do not enable dithering unless you know what you are doing. Quantization -/// introduces errors. It's better to let the compressor quantize the result to -/// minimize the error, instead of quantizing the data before handling it to -/// the compressor. -void CompressionOptions::setQuantization(bool colorDithering, bool alphaDithering, bool binaryAlpha, int alphaThreshold/*= 127*/) -{ - nvCheck(alphaThreshold >= 0 && alphaThreshold < 256); - m.enableColorDithering = colorDithering; - m.enableAlphaDithering = alphaDithering; - m.binaryAlpha = binaryAlpha; - m.alphaThreshold = alphaThreshold; -} - - - +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include "CompressionOptions.h" +#include "nvimage/DirectDrawSurface.h" +#include "nvmath/Vector.inl" + +using namespace nv; +using namespace nvtt; + + +/// Constructor. Sets compression options to the default values. +CompressionOptions::CompressionOptions() : m(*new CompressionOptions::Private()) +{ + reset(); +} + + +/// Destructor. +CompressionOptions::~CompressionOptions() +{ + delete &m; +} + + +/// Set default compression options. 
+void CompressionOptions::reset() +{ + m.format = Format_DXT1; + m.quality = Quality_Normal; + m.colorWeight.set(1.0f, 1.0f, 1.0f, 1.0f); + + m.bitcount = 32; + m.bmask = 0x000000FF; + m.gmask = 0x0000FF00; + m.rmask = 0x00FF0000; + m.amask = 0xFF000000; + + m.rsize = 8; + m.gsize = 8; + m.bsize = 8; + m.asize = 8; + + m.pixelType = PixelType_UnsignedNorm; + m.pitchAlignment = 1; + + m.enableColorDithering = false; + m.enableAlphaDithering = false; + m.binaryAlpha = false; + m.alphaThreshold = 127; + + m.decoder = Decoder_D3D10; +} + + +/// Set desired compression format. +void CompressionOptions::setFormat(Format format) +{ + m.format = format; +} + + +/// Set compression quality settings. +void CompressionOptions::setQuality(Quality quality) +{ + m.quality = quality; +} + + +/// Set the weights of each color channel. +/// The choice for these values is subjective. In most cases uniform color weights +/// (1.0, 1.0, 1.0) work very well. A popular choice is to use the NTSC luma encoding +/// weights (0.2126, 0.7152, 0.0722), but I think that blue contributes to our +/// perception more than a 7%. A better choice in my opinion is (3, 4, 2). +void CompressionOptions::setColorWeights(float red, float green, float blue, float alpha/*=1.0f*/) +{ +// float total = red + green + blue; +// float x = red / total; +// float y = green / total; +// m.colorWeight.set(x, y, 1.0f - x - y); + m.colorWeight.set(red, green, blue, alpha); +} + + +/// Set color mask to describe the RGB/RGBA format. +void CompressionOptions::setPixelFormat(uint bitCount, uint rmask, uint gmask, uint bmask, uint amask) +{ + // Validate arguments. + nvCheck(bitCount <= 32); + nvCheck((rmask & gmask) == 0); + nvCheck((rmask & bmask) == 0); + nvCheck((rmask & amask) == 0); + nvCheck((gmask & bmask) == 0); + nvCheck((gmask & amask) == 0); + nvCheck((bmask & amask) == 0); + + if (bitCount != 32) + { + uint maxMask = (1 << bitCount); + nvCheck(maxMask > rmask); + nvCheck(maxMask > gmask); + nvCheck(maxMask > bmask); + nvCheck(maxMask > amask); + } + + m.bitcount = bitCount; + m.rmask = rmask; + m.gmask = gmask; + m.bmask = bmask; + m.amask = amask; + + m.rsize = 0; + m.gsize = 0; + m.bsize = 0; + m.asize = 0; +} + +void CompressionOptions::setPixelFormat(uint8 rsize, uint8 gsize, uint8 bsize, uint8 asize) +{ + nvCheck(rsize <= 32 && gsize <= 32 && bsize <= 32 && asize <= 32); + + m.bitcount = 0; + m.rmask = 0; + m.gmask = 0; + m.bmask = 0; + m.amask = 0; + + m.rsize = rsize; + m.gsize = gsize; + m.bsize = bsize; + m.asize = asize; +} + +/// Set pixel type. +void CompressionOptions::setPixelType(PixelType pixelType) +{ + m.pixelType = pixelType; +} + + +/// Set pitch alignment in bytes. +void CompressionOptions::setPitchAlignment(int pitchAlignment) +{ + nvDebugCheck(pitchAlignment > 0 && isPowerOfTwo(pitchAlignment)); + m.pitchAlignment = pitchAlignment; +} + + +/// Use external compressor. +void CompressionOptions::setExternalCompressor(const char * name) +{ + m.externalCompressor = name; +} + +/// Set quantization options. +/// @warning Do not enable dithering unless you know what you are doing. Quantization +/// introduces errors. It's better to let the compressor quantize the result to +/// minimize the error, instead of quantizing the data before handling it to +/// the compressor. 
+void CompressionOptions::setQuantization(bool colorDithering, bool alphaDithering, bool binaryAlpha, int alphaThreshold/*= 127*/) +{ + nvCheck(alphaThreshold >= 0 && alphaThreshold < 256); + m.enableColorDithering = colorDithering; + m.enableAlphaDithering = alphaDithering; + m.binaryAlpha = binaryAlpha; + m.alphaThreshold = alphaThreshold; +} + +/// Set target decoder to optimize for. +void CompressionOptions::setTargetDecoder(Decoder decoder) +{ + m.decoder = decoder; +} + + + +// Translate to and from D3D formats. +unsigned int CompressionOptions::d3d9Format() const +{ + if (m.format == Format_RGB) { + if (m.pixelType == PixelType_UnsignedNorm) { + + uint bitcount = m.bitcount; + uint rmask = m.rmask; + uint gmask = m.gmask; + uint bmask = m.bmask; + uint amask = m.amask; + + if (bitcount == 0) { + bitcount = m.rsize + m.gsize + m.bsize + m.asize; + rmask = ((1 << m.rsize) - 1) << (m.asize + m.bsize + m.gsize); + gmask = ((1 << m.gsize) - 1) << (m.asize + m.bsize); + bmask = ((1 << m.bsize) - 1) << m.asize; + amask = ((1 << m.asize) - 1) << 0; + } + + if (bitcount <= 32) { + return nv::findD3D9Format(bitcount, rmask, gmask, bmask, amask); + } + else { + //if (m.rsize == 16 && m.gsize == 16 && m.bsize == 0 && m.asize == 0) return D3DFMT_G16R16; + if (m.rsize == 16 && m.gsize == 16 && m.bsize == 16 && m.asize == 16) return D3DFMT_A16B16G16R16; + } + } + else if (m.pixelType == PixelType_Float) { + if (m.rsize == 16 && m.gsize == 0 && m.bsize == 0 && m.asize == 0) return D3DFMT_R16F; + if (m.rsize == 32 && m.gsize == 0 && m.bsize == 0 && m.asize == 0) return D3DFMT_R32F; + if (m.rsize == 16 && m.gsize == 16 && m.bsize == 0 && m.asize == 0) return D3DFMT_G16R16F; + if (m.rsize == 32 && m.gsize == 32 && m.bsize == 0 && m.asize == 0) return D3DFMT_G32R32F; + if (m.rsize == 16 && m.gsize == 16 && m.bsize == 16 && m.asize == 16) return D3DFMT_A16B16G16R16F; + if (m.rsize == 32 && m.gsize == 32 && m.bsize == 32 && m.asize == 32) return D3DFMT_A32B32G32R32F; + } + + return 0; + } + else { + uint d3d9_formats[] = { + 0, // Format_RGB, + FOURCC_DXT1, // Format_DXT1 + FOURCC_DXT1, // Format_DXT1a + FOURCC_DXT3, // Format_DXT3 + FOURCC_DXT5, // Format_DXT5 + FOURCC_DXT5, // Format_DXT5n + FOURCC_ATI1, // Format_BC4 + FOURCC_ATI2, // Format_BC5 + FOURCC_DXT1, // Format_DXT1n + 0, // Format_CTX1 + MAKEFOURCC('B', 'C', '6', 'H'), // Format_BC6 + MAKEFOURCC('B', 'C', '7', 'L'), // Format_BC7 + //FOURCC_ATI2, // Format_BC5_Luma + FOURCC_DXT5, // Format_BC3_RGBM + }; + + NV_COMPILER_CHECK(NV_ARRAY_SIZE(d3d9_formats) == Format_Count); + + return d3d9_formats[m.format]; + } +} + +/* +bool CompressionOptions::setDirect3D9Format(unsigned int format) +{ +} + +unsigned int CompressionOptions::dxgiFormat() const +{ +} + +bool CompressionOptions::setDXGIFormat(unsigned int format) +{ +} +*/ Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/Compressor.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/Compressor.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/Compressor.h @@ -1,80 +1,41 @@ -// Copyright NVIDIA Corporation 2008 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is 
furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#ifndef NV_TT_COMPRESSOR_H -#define NV_TT_COMPRESSOR_H - -#include - -#include - -#include "nvtt.h" - -namespace nv -{ - class Image; -} - -namespace nvtt -{ - struct Mipmap; - - struct Compressor::Private - { - Private() {} - - bool compress(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const; - int estimateSize(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions) const; - - private: - - bool outputHeader(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const; - bool compressMipmaps(uint f, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const; - - bool initMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f, uint m) const; - - int findExactMipmap(const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f) const; - int findClosestMipmap(const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f) const; - - void downsampleMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions) const; - void scaleMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions, uint w, uint h, uint d) const; - void processInputImage(Mipmap & mipmap, const InputOptions::Private & inputOptions) const; - void quantizeMipmap(Mipmap & mipmap, const CompressionOptions::Private & compressionOptions) const; - bool compressMipmap(const Mipmap & mipmap, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const; - - - - public: - - bool cudaSupported; - bool cudaEnabled; - int cudaDevice; - - nv::AutoPtr cuda; - - }; - -} // nvtt namespace - - -#endif // NV_TT_COMPRESSOR_H +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#ifndef NVTT_COMPRESSOR_H +#define NVTT_COMPRESSOR_H + +#include "nvtt.h" +#include "nvcore/nvcore.h" // uint + +namespace nv +{ + struct CompressorInterface + { + virtual ~CompressorInterface() {} + virtual void compress(nvtt::AlphaMode alphaMode, uint w, uint h, uint d, const float * rgba, nvtt::TaskDispatcher * dispatcher, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) = 0; + }; + +} // nv namespace + +#endif // NVTT_COMPRESSOR_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/Compressor.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/Compressor.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/Compressor.cpp @@ -1,853 +0,0 @@ -// Copyright NVIDIA Corporation 2008 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "Compressor.h" -#include "InputOptions.h" -#include "CompressionOptions.h" -#include "OutputOptions.h" - -#include "CompressDXT.h" -#include "CompressRGB.h" -#include "cuda/CudaUtils.h" -#include "cuda/CudaCompressDXT.h" - - -using namespace nv; -using namespace nvtt; - - -namespace -{ - - static int blockSize(Format format) - { - if (format == Format_DXT1 || format == Format_DXT1a) { - return 8; - } - else if (format == Format_DXT3) { - return 16; - } - else if (format == Format_DXT5 || format == Format_DXT5n) { - return 16; - } - else if (format == Format_BC4) { - return 8; - } - else if (format == Format_BC5) { - return 16; - } - return 0; - } - - inline uint computePitch(uint w, uint bitsize) - { - uint p = w * ((bitsize + 7) / 8); - - // Align to 32 bits. 
- return ((p + 3) / 4) * 4; - } - - static int computeImageSize(uint w, uint h, uint d, uint bitCount, Format format) - { - if (format == Format_RGBA) { - return d * h * computePitch(w, bitCount); - } - else { - // @@ Handle 3D textures. DXT and VTC have different behaviors. - return ((w + 3) / 4) * ((h + 3) / 4) * blockSize(format); - } - } - -} // namespace - -namespace nvtt -{ - // Mipmap could be: - // - a pointer to an input image. - // - a fixed point image. - // - a floating point image. - struct Mipmap - { - Mipmap() : m_inputImage(NULL) {} - ~Mipmap() {} - - // Reference input image. - void setFromInput(const InputOptions::Private & inputOptions, uint idx) - { - m_inputImage = inputOptions.image(idx); - m_fixedImage = NULL; - m_floatImage = NULL; - } - - // Assign and take ownership of given image. - void setImage(FloatImage * image) - { - m_inputImage = NULL; - m_fixedImage = NULL; - m_floatImage = image; - } - - - // Convert linear float image to fixed image ready for compression. - void toFixedImage(const InputOptions::Private & inputOptions) - { - if (m_floatImage != NULL) // apfaffe - We should check that we have a float image, if so convert it! - { - if (inputOptions.isNormalMap || inputOptions.outputGamma == 1.0f) - { - m_fixedImage = m_floatImage->createImage(); - } - else - { - m_fixedImage = m_floatImage->createImageGammaCorrect(inputOptions.outputGamma); - } - } - } - - // Convert input image to linear float image. - void toFloatImage(const InputOptions::Private & inputOptions) - { - if (m_floatImage == NULL) - { - nvDebugCheck(this->asFixedImage() != NULL); - - m_floatImage = new FloatImage(this->asFixedImage()); - - if (inputOptions.isNormalMap) - { - // Expand normals to [-1, 1] range. - // floatImage->expandNormals(0); - } - else if (inputOptions.inputGamma != 1.0f) - { - // Convert to linear space. - m_floatImage->toLinear(0, 3, inputOptions.inputGamma); - } - } - } - - const FloatImage * asFloatImage() const - { - return m_floatImage.ptr(); - } - - FloatImage * asFloatImage() - { - return m_floatImage.ptr(); - } - - const Image * asFixedImage() const - { - // - apfaffe - switched logic to return the 'processed image' rather than the input! - if (m_fixedImage != NULL && m_fixedImage.ptr() != NULL) - { - return m_fixedImage.ptr(); - } - return m_inputImage; - } - - Image * asMutableFixedImage() - { - if (m_inputImage != NULL) - { - // Do not modify input image, create a copy. - m_fixedImage = new Image(*m_inputImage); - m_inputImage = NULL; - } - return m_fixedImage.ptr(); - } - - - private: - const Image * m_inputImage; - AutoPtr m_fixedImage; - AutoPtr m_floatImage; - }; - -} // nvtt namespace - - -Compressor::Compressor() : m(*new Compressor::Private()) -{ - // CUDA initialization. - m.cudaSupported = cuda::isHardwarePresent(); - m.cudaEnabled = false; - m.cudaDevice = -1; - - enableCudaAcceleration(m.cudaSupported); -} - -Compressor::~Compressor() -{ - enableCudaAcceleration(false); - delete &m; -} - - -/// Enable CUDA acceleration. -void Compressor::enableCudaAcceleration(bool enable) -{ - if (m.cudaSupported) - { - if (m.cudaEnabled && !enable) - { - m.cudaEnabled = false; - m.cuda = NULL; - - if (m.cudaDevice != -1) - { - // Exit device. - cuda::exitDevice(); - } - } - else if (!m.cudaEnabled && enable) - { - // Init the CUDA device. This may return -1 if CUDA was already initialized by the app. - m.cudaEnabled = cuda::initDevice(&m.cudaDevice); - - if (m.cudaEnabled) - { - // Create compressor if initialization succeeds. 
- m.cuda = new CudaCompressor(); - - // But cleanup if failed. - if (!m.cuda->isValid()) - { - enableCudaAcceleration(false); - } - } - } - } -} - -/// Check if CUDA acceleration is enabled. -bool Compressor::isCudaAccelerationEnabled() const -{ - return m.cudaEnabled; -} - - -/// Compress the input texture with the given compression options. -bool Compressor::process(const InputOptions & inputOptions, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const -{ - return m.compress(inputOptions.m, compressionOptions.m, outputOptions.m); -} - - -/// Estimate the size of compressing the input with the given options. -int Compressor::estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions) const -{ - return m.estimateSize(inputOptions.m, compressionOptions.m); -} - - - - -bool Compressor::Private::compress(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const -{ - // Make sure enums match. - nvStaticCheck(FloatImage::WrapMode_Clamp == (FloatImage::WrapMode)WrapMode_Clamp); - nvStaticCheck(FloatImage::WrapMode_Mirror == (FloatImage::WrapMode)WrapMode_Mirror); - nvStaticCheck(FloatImage::WrapMode_Repeat == (FloatImage::WrapMode)WrapMode_Repeat); - - // Get output handler. - if (!outputOptions.openFile()) - { - if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_FileOpen); - return false; - } - - inputOptions.computeTargetExtents(); - - // Output DDS header. - if (!outputHeader(inputOptions, compressionOptions, outputOptions)) - { - return false; - } - - for (uint f = 0; f < inputOptions.faceCount; f++) - { - if (!compressMipmaps(f, inputOptions, compressionOptions, outputOptions)) - { - return false; - } - } - - outputOptions.closeFile(); - - return true; -} - - -// Output DDS header. -bool Compressor::Private::outputHeader(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const -{ - // Output DDS header. 
- if (outputOptions.outputHandler == NULL || !outputOptions.outputHeader) - { - return true; - } - - DDSHeader header; - - header.setWidth(inputOptions.targetWidth); - header.setHeight(inputOptions.targetHeight); - - int mipmapCount = inputOptions.realMipmapCount(); - nvDebugCheck(mipmapCount > 0); - - header.setMipmapCount(mipmapCount); - - if (inputOptions.textureType == TextureType_2D) { - header.setTexture2D(); - } - else if (inputOptions.textureType == TextureType_Cube) { - header.setTextureCube(); - } - /*else if (inputOptions.textureType == TextureType_3D) { - header.setTexture3D(); - header.setDepth(inputOptions.targetDepth); - }*/ - - if (compressionOptions.format == Format_RGBA) - { - header.setPitch(computePitch(inputOptions.targetWidth, compressionOptions.bitcount)); - header.setPixelFormat(compressionOptions.bitcount, compressionOptions.rmask, compressionOptions.gmask, compressionOptions.bmask, compressionOptions.amask); - } - else - { - header.setLinearSize(computeImageSize(inputOptions.targetWidth, inputOptions.targetHeight, inputOptions.targetDepth, compressionOptions.bitcount, compressionOptions.format)); - - if (compressionOptions.format == Format_DXT1 || compressionOptions.format == Format_DXT1a) { - header.setFourCC('D', 'X', 'T', '1'); - if (inputOptions.isNormalMap) header.setNormalFlag(true); - } - else if (compressionOptions.format == Format_DXT3) { - header.setFourCC('D', 'X', 'T', '3'); - } - else if (compressionOptions.format == Format_DXT5) { - header.setFourCC('D', 'X', 'T', '5'); - } - else if (compressionOptions.format == Format_DXT5n) { - header.setFourCC('D', 'X', 'T', '5'); - if (inputOptions.isNormalMap) header.setNormalFlag(true); - } - else if (compressionOptions.format == Format_BC4) { - header.setFourCC('A', 'T', 'I', '1'); - } - else if (compressionOptions.format == Format_BC5) { - header.setFourCC('A', 'T', 'I', '2'); - if (inputOptions.isNormalMap) header.setNormalFlag(true); - } - } - - // Swap bytes if necessary. - header.swapBytes(); - - uint headerSize = 128; - if (header.hasDX10Header()) - { - nvStaticCheck(sizeof(DDSHeader) == 128 + 20); - headerSize = 128 + 20; - } - - bool writeSucceed = outputOptions.outputHandler->writeData(&header, headerSize); - if (!writeSucceed && outputOptions.errorHandler != NULL) - { - outputOptions.errorHandler->error(Error_FileWrite); - } - - return writeSucceed; -} - - -bool Compressor::Private::compressMipmaps(uint f, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const -{ - uint w = inputOptions.targetWidth; - uint h = inputOptions.targetHeight; - uint d = inputOptions.targetDepth; - - Mipmap mipmap; - - const uint mipmapCount = inputOptions.realMipmapCount(); - nvDebugCheck(mipmapCount > 0); - - for (uint m = 0; m < mipmapCount; m++) - { - if (outputOptions.outputHandler) - { - int size = computeImageSize(w, h, d, compressionOptions.bitcount, compressionOptions.format); - outputOptions.outputHandler->beginImage(size, w, h, d, f, m); - } - - // @@ Where to do the color transform? - // - Color transform may not be linear, so we cannot do before computing mipmaps. - // - Should be done in linear space, that is, after gamma correction. 
- - if (!initMipmap(mipmap, inputOptions, w, h, d, f, m)) - { - if (outputOptions.errorHandler != NULL) - { - outputOptions.errorHandler->error(Error_InvalidInput); - return false; - } - } - - quantizeMipmap(mipmap, compressionOptions); - - compressMipmap(mipmap, inputOptions, compressionOptions, outputOptions); - - // Compute extents of next mipmap: - w = max(1U, w / 2); - h = max(1U, h / 2); - d = max(1U, d / 2); - } - - return true; -} - -bool Compressor::Private::initMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f, uint m) const -{ - // Find image from input. - int inputIdx = findExactMipmap(inputOptions, w, h, d, f); - - if ((inputIdx == -1 || inputOptions.convertToNormalMap) && m != 0) - { - // Generate from last, when mipmap not found, or normal map conversion enabled. - downsampleMipmap(mipmap, inputOptions); - } - else - { - if (inputIdx != -1) - { - // If input mipmap found, then get from input. - mipmap.setFromInput(inputOptions, inputIdx); - } - else - { - // If not found, resize closest mipmap. - inputIdx = findClosestMipmap(inputOptions, w, h, d, f); - - if (inputIdx == -1) - { - return false; - } - - mipmap.setFromInput(inputOptions, inputIdx); - - scaleMipmap(mipmap, inputOptions, w, h, d); - } - - processInputImage(mipmap, inputOptions); - } - - // Convert linear float image to fixed image ready for compression. - mipmap.toFixedImage(inputOptions); - - return true; -} - -int Compressor::Private::findExactMipmap(const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f) const -{ - for (int m = 0; m < int(inputOptions.mipmapCount); m++) - { - int idx = f * inputOptions.mipmapCount + m; - const InputOptions::Private::InputImage & inputImage = inputOptions.images[idx]; - - if (inputImage.width == int(w) && inputImage.height == int(h) && inputImage.depth == int(d)) - { - if (inputImage.data != NULL) - { - return idx; - } - return -1; - } - else if (inputImage.width < int(w) || inputImage.height < int(h) || inputImage.depth < int(d)) - { - return -1; - } - } - - return -1; -} - -int Compressor::Private::findClosestMipmap(const InputOptions::Private & inputOptions, uint w, uint h, uint d, uint f) const -{ - int bestIdx = -1; - - for (int m = 0; m < int(inputOptions.mipmapCount); m++) - { - int idx = f * inputOptions.mipmapCount + m; - const InputOptions::Private::InputImage & inputImage = inputOptions.images[idx]; - - if (inputImage.data != NULL) - { - int difference = (inputImage.width - w) + (inputImage.height - h) + (inputImage.depth - d); - - if (difference < 0) - { - if (bestIdx == -1) - { - bestIdx = idx; - } - - return bestIdx; - } - - bestIdx = idx; - } - } - - return bestIdx; -} - -// Create mipmap from the given image. -void Compressor::Private::downsampleMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions) const -{ - // Make sure that floating point linear representation is available. - mipmap.toFloatImage(inputOptions); - - const FloatImage * floatImage = mipmap.asFloatImage(); - - if (inputOptions.mipmapFilter == MipmapFilter_Box) - { - // Use fast downsample. 
- mipmap.setImage(floatImage->fastDownSample()); - } - else if (inputOptions.mipmapFilter == MipmapFilter_Triangle) - { - TriangleFilter filter; - mipmap.setImage(floatImage->downSample(filter, (FloatImage::WrapMode)inputOptions.wrapMode)); - } - else /*if (inputOptions.mipmapFilter == MipmapFilter_Kaiser)*/ - { - nvDebugCheck(inputOptions.mipmapFilter == MipmapFilter_Kaiser); - KaiserFilter filter(inputOptions.kaiserWidth); - filter.setParameters(inputOptions.kaiserAlpha, inputOptions.kaiserStretch); - mipmap.setImage(floatImage->downSample(filter, (FloatImage::WrapMode)inputOptions.wrapMode)); - } - - // Normalize mipmap. - if ((inputOptions.isNormalMap || inputOptions.convertToNormalMap) && inputOptions.normalizeMipmaps) - { - normalizeNormalMap(mipmap.asFloatImage()); - } -} - - -void Compressor::Private::scaleMipmap(Mipmap & mipmap, const InputOptions::Private & inputOptions, uint w, uint h, uint d) const -{ - mipmap.toFloatImage(inputOptions); - - // @@ Add more filters. - // @@ Select different filters for downscaling and reconstruction. - - // Resize image. - BoxFilter boxFilter; - mipmap.setImage(mipmap.asFloatImage()->resize(boxFilter, w, h, (FloatImage::WrapMode)inputOptions.wrapMode)); -} - - -// Process an input image: Convert to normal map, normalize, or convert to linear space. -void Compressor::Private::processInputImage(Mipmap & mipmap, const InputOptions::Private & inputOptions) const -{ - if (inputOptions.convertToNormalMap) - { - mipmap.toFixedImage(inputOptions); - - Vector4 heightScale = inputOptions.heightFactors; - mipmap.setImage(createNormalMap(mipmap.asFixedImage(), (FloatImage::WrapMode)inputOptions.wrapMode, heightScale, inputOptions.bumpFrequencyScale)); - } - else if (inputOptions.isNormalMap) - { - if (inputOptions.normalizeMipmaps) - { - // If floating point image available, normalize in place. - if (mipmap.asFloatImage() == NULL) - { - FloatImage * floatImage = new FloatImage(mipmap.asFixedImage()); - normalizeNormalMap(floatImage); - mipmap.setImage(floatImage); - } - else - { - normalizeNormalMap(mipmap.asFloatImage()); - mipmap.setImage(mipmap.asFloatImage()); - } - } - } - else - { - if (inputOptions.inputGamma != inputOptions.outputGamma) - { - mipmap.toFloatImage(inputOptions); - } - } -} - - -// Quantize the given mipmap according to the compression options. 
-void Compressor::Private::quantizeMipmap(Mipmap & mipmap, const CompressionOptions::Private & compressionOptions) const -{ - nvDebugCheck(mipmap.asFixedImage() != NULL); - - if (compressionOptions.binaryAlpha) - { - if (compressionOptions.enableAlphaDithering) - { - Quantize::FloydSteinberg_BinaryAlpha(mipmap.asMutableFixedImage(), compressionOptions.alphaThreshold); - } - else - { - Quantize::BinaryAlpha(mipmap.asMutableFixedImage(), compressionOptions.alphaThreshold); - } - } - - if (compressionOptions.enableColorDithering || compressionOptions.enableAlphaDithering) - { - uint rsize = 8; - uint gsize = 8; - uint bsize = 8; - uint asize = 8; - - if (compressionOptions.enableColorDithering) - { - if (compressionOptions.format >= Format_DXT1 && compressionOptions.format <= Format_DXT5) - { - rsize = 5; - gsize = 6; - bsize = 5; - } - else if (compressionOptions.format == Format_RGB) - { - uint rshift, gshift, bshift; - PixelFormat::maskShiftAndSize(compressionOptions.rmask, &rshift, &rsize); - PixelFormat::maskShiftAndSize(compressionOptions.gmask, &gshift, &gsize); - PixelFormat::maskShiftAndSize(compressionOptions.bmask, &bshift, &bsize); - } - } - - if (compressionOptions.enableAlphaDithering) - { - if (compressionOptions.format == Format_DXT3) - { - asize = 4; - } - else if (compressionOptions.format == Format_RGB) - { - uint ashift; - PixelFormat::maskShiftAndSize(compressionOptions.amask, &ashift, &asize); - } - } - - if (compressionOptions.binaryAlpha) - { - asize = 8; // Already quantized. - } - - Quantize::FloydSteinberg(mipmap.asMutableFixedImage(), rsize, gsize, bsize, asize); - } -} - - -// Compress the given mipmap. -bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const -{ - const Image * image = mipmap.asFixedImage(); - nvDebugCheck(image != NULL); - - FastCompressor fast; - fast.setImage(image, inputOptions.alphaMode); - - SlowCompressor slow; - slow.setImage(image, inputOptions.alphaMode); - - const bool useCuda = cudaEnabled && image->width() * image->height() >= 512; - - if (compressionOptions.format == Format_RGBA || compressionOptions.format == Format_RGB) - { - compressRGB(image, outputOptions, compressionOptions); - } - else if (compressionOptions.format == Format_DXT1) - { -#if defined(HAVE_S3QUANT) - if (compressionOptions.externalCompressor == "s3") - { - s3CompressDXT1(image, outputOptions); - } - else -#endif - -#if defined(HAVE_ATITC) - if (compressionOptions.externalCompressor == "ati") - { - atiCompressDXT1(image, outputOptions); - } - else -#endif - if (compressionOptions.quality == Quality_Fastest) - { - fast.compressDXT1(outputOptions); - } - else - { - if (useCuda) - { - nvDebugCheck(cudaSupported); - cuda->setImage(image, inputOptions.alphaMode); - cuda->compressDXT1(compressionOptions, outputOptions); - } - else - { - slow.compressDXT1(compressionOptions, outputOptions); - } - } - } - else if (compressionOptions.format == Format_DXT1a) - { - if (compressionOptions.quality == Quality_Fastest) - { - fast.compressDXT1a(outputOptions); - } - else - { - if (useCuda) - { - nvDebugCheck(cudaSupported); - /*cuda*/slow.compressDXT1a(compressionOptions, outputOptions); - } - else - { - slow.compressDXT1a(compressionOptions, outputOptions); - } - } - } - else if (compressionOptions.format == Format_DXT3) - { - if (compressionOptions.quality == Quality_Fastest) - { - fast.compressDXT3(outputOptions); - } - 
else - { - if (useCuda) - { - nvDebugCheck(cudaSupported); - cuda->setImage(image, inputOptions.alphaMode); - cuda->compressDXT3(compressionOptions, outputOptions); - } - else - { - slow.compressDXT3(compressionOptions, outputOptions); - } - } - } - else if (compressionOptions.format == Format_DXT5) - { - if (compressionOptions.quality == Quality_Fastest) - { - fast.compressDXT5(outputOptions); - } - else - { - if (useCuda) - { - nvDebugCheck(cudaSupported); - cuda->setImage(image, inputOptions.alphaMode); - cuda->compressDXT5(compressionOptions, outputOptions); - } - else - { - slow.compressDXT5(compressionOptions, outputOptions); - } - } - } - else if (compressionOptions.format == Format_DXT5n) - { - if (compressionOptions.quality == Quality_Fastest) - { - fast.compressDXT5n(outputOptions); - } - else - { - slow.compressDXT5n(compressionOptions, outputOptions); - } - } - else if (compressionOptions.format == Format_BC4) - { - slow.compressBC4(compressionOptions, outputOptions); - } - else if (compressionOptions.format == Format_BC5) - { - slow.compressBC5(compressionOptions, outputOptions); - } - - return true; -} - - -int Compressor::Private::estimateSize(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions) const -{ - const Format format = compressionOptions.format; - const uint bitCount = compressionOptions.bitcount; - - inputOptions.computeTargetExtents(); - - uint mipmapCount = inputOptions.realMipmapCount(); - - int size = 0; - - for (uint f = 0; f < inputOptions.faceCount; f++) - { - uint w = inputOptions.targetWidth; - uint h = inputOptions.targetHeight; - uint d = inputOptions.targetDepth; - - for (uint m = 0; m < mipmapCount; m++) - { - size += computeImageSize(w, h, d, bitCount, format); - - // Compute extents of next mipmap: - w = max(1U, w / 2); - h = max(1U, h / 2); - d = max(1U, d / 2); - } - } - - return size; -} Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX10.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX10.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX10.h @@ -0,0 +1,71 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
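+// CPU block compressors for the DX10-era one- and two-channel formats:
+// BC4 (ATI1, 8 bytes per 4x4 block) and BC5 (ATI2, 16 bytes per 4x4 block),
+// each provided in a fast and a production-quality variant.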
+ +#ifndef NVTT_COMPRESSORDX10_H +#define NVTT_COMPRESSORDX10_H + +#include "BlockCompressor.h" + +namespace nv +{ + struct ColorBlock; + + // Fast CPU compressors. + struct FastCompressorBC4 : public ColorBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; } + }; + + struct FastCompressorBC5 : public ColorBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + + + // Production CPU compressors. + struct ProductionCompressorBC4 : public ColorBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; } + }; + + struct ProductionCompressorBC5 : public ColorBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + + /*struct ProductionCompressorBC5_Luma : public ColorSetCompressor + { + virtual void compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + };*/ + + +} // nv namespace + + +#endif // NVTT_COMPRESSORDX10_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX10.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX10.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX10.cpp @@ -0,0 +1,122 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
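+// BC4/BC5 block compression: each channel of the 4x4 tile is copied into the
+// alpha slot of an AlphaBlock4x4 and encoded with the DXT5 alpha codec,
+// via QuickCompress for the fast variants and OptimalCompress for the
+// production variants.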
+ +#include "CompressorDX10.h" +#include "QuickCompressDXT.h" +#include "OptimalCompressDXT.h" + +#include "nvtt.h" + +#include "nvimage/ColorBlock.h" +#include "nvimage/BlockDXT.h" + +#include "nvmath/ftoi.h" + +#include // placement new + +using namespace nv; +using namespace nvtt; + + +void FastCompressorBC4::compressBlock(ColorBlock & src, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockATI1 * block = new(output) BlockATI1; + + AlphaBlock4x4 tmp; + tmp.init(src, 0); // Copy red to alpha + QuickCompress::compressDXT5A(tmp, &block->alpha); +} + +void FastCompressorBC5::compressBlock(ColorBlock & src, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockATI2 * block = new(output) BlockATI2; + + AlphaBlock4x4 tmp; + + tmp.init(src, 0); // Copy red to alpha + QuickCompress::compressDXT5A(tmp, &block->x); + + tmp.init(src, 1); // Copy green to alpha + QuickCompress::compressDXT5A(tmp, &block->y); +} + + +void ProductionCompressorBC4::compressBlock(ColorBlock & src, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockATI1 * block = new(output) BlockATI1; + + AlphaBlock4x4 tmp; + tmp.init(src, 0); // Copy red to alpha + OptimalCompress::compressDXT5A(tmp, &block->alpha); +} + +void ProductionCompressorBC5::compressBlock(ColorBlock & src, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockATI2 * block = new(output) BlockATI2; + + AlphaBlock4x4 tmp; + + tmp.init(src, 0); // Copy red to alpha + OptimalCompress::compressDXT5A(tmp, &block->x); + + tmp.init(src, 1); // Copy green to alpha + OptimalCompress::compressDXT5A(tmp, &block->y); +} + + +#if 0 +void ProductionCompressorBC5_Luma::compressBlock(ColorSet & set, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockATI2 * block = new(output) BlockATI2; + + AlphaBlock4x4 tmp; + tmp.init(set, /*channel=*/0); + OptimalCompress::compressDXT5A(tmp, &block->x); + + // Decode block->x + AlphaBlock4x4 decoded; + block->x.decodeBlock(&decoded); + + const float R = 1.0f / 256.0f; // Maximum residual that we can represent. @@ Tweak this. + + // Compute residual block. + for (int i = 0; i < 16; i++) { + float in = set.color(i).x; // [0,1] + float out = float(decoded.alpha[i]) / 255.0f; // [0,1] + + float residual = (out - in); // [-1,1], but usually [-R,R] + + // Normalize residual to [-1,1] range. + residual /= R; + + // Pack in [0,1] range. 
+ residual = residual * 0.5f + 0.5f; + + tmp.alpha[i] = nv::ftoi_round(nv::saturate(residual) * 255.0f); + } + + OptimalCompress::compressDXT5A(tmp, &block->y); + +} +#endif // 0 \ No newline at end of file Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX11.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX11.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX11.h @@ -0,0 +1,46 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#ifndef NVTT_COMPRESSORDX11_H +#define NVTT_COMPRESSORDX11_H + +#include "BlockCompressor.h" + +namespace nv +{ + struct CompressorBC6 : public FloatColorCompressor + { + virtual void compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + + struct CompressorBC7 : public FloatColorCompressor + { + virtual void compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + +} // nv namespace + + +#endif // NVTT_COMPRESSORDX11_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX11.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX11.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX11.cpp @@ -0,0 +1,102 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include "CompressorDX11.h" + +#include "nvtt.h" +#include "CompressionOptions.h" +#include "nvimage/ColorBlock.h" +#include "nvmath/Half.h" +#include "nvmath/Vector.inl" + +#include "bc6h/zoh.h" +#include "bc7/avpcl.h" + +#include // memset + +using namespace nv; +using namespace nvtt; + + +void CompressorBC6::compressBlock(const Vector4 colors[16], const float weights[16], const CompressionOptions::Private & compressionOptions, void * output) +{ + // !!!UNDONE: support channel weights + // !!!UNDONE: set flags once, not per block (this is especially sketchy since block compression is multithreaded...) + + if (compressionOptions.pixelType == PixelType_UnsignedFloat || + compressionOptions.pixelType == PixelType_UnsignedNorm || + compressionOptions.pixelType == PixelType_UnsignedInt) + { + ZOH::Utils::FORMAT = ZOH::UNSIGNED_F16; + } + else + { + ZOH::Utils::FORMAT = ZOH::SIGNED_F16; + } + + // Convert NVTT's tile struct to ZOH's, and convert float to half. + ZOH::Tile zohTile(4, 4); + memset(zohTile.data, 0, sizeof(zohTile.data)); + memset(zohTile.importance_map, 0, sizeof(zohTile.importance_map)); + for (uint y = 0; y < 4; ++y) + { + for (uint x = 0; x < 4; ++x) + { + Vector4 color = colors[4*y+x]; + uint16 rHalf = to_half(color.x); + uint16 gHalf = to_half(color.y); + uint16 bHalf = to_half(color.z); + zohTile.data[y][x].x = ZOH::Tile::half2float(rHalf); + zohTile.data[y][x].y = ZOH::Tile::half2float(gHalf); + zohTile.data[y][x].z = ZOH::Tile::half2float(bHalf); + zohTile.importance_map[y][x] = weights[4*y+x]; + } + } + + ZOH::compress(zohTile, (char *)output); +} + +void CompressorBC7::compressBlock(const Vector4 colors[16], const float weights[16], const CompressionOptions::Private & compressionOptions, void * output) +{ + // !!!UNDONE: support channel weights + // !!!UNDONE: set flags once, not per block (this is especially sketchy since block compression is multithreaded...) + + AVPCL::mode_rgb = false; + AVPCL::flag_premult = false; //(alphaMode == AlphaMode_Premultiplied); + AVPCL::flag_nonuniform = false; + AVPCL::flag_nonuniform_ati = false; + + // Convert NVTT's tile struct to AVPCL's. 
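+    // AVPCL operates on 8-bit UNORM data, so the [0,1] float inputs are scaled
+    // to [0,255]; the per-pixel importance weights are not forwarded and every
+    // texel gets an importance of 1.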
+ AVPCL::Tile avpclTile(4, 4); + memset(avpclTile.data, 0, sizeof(avpclTile.data)); + for (uint y = 0; y < 4; ++y) { + for (uint x = 0; x < 4; ++x) { + Vector4 color = colors[4*y+x]; + avpclTile.data[y][x] = color * 255.0f; + avpclTile.importance_map[y][x] = 1.0f; //weights[4*y+x]; + } + } + + AVPCL::compress(avpclTile, (char *)output); +} Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX9.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX9.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX9.h @@ -0,0 +1,156 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#ifndef NVTT_COMPRESSORDX9_H +#define NVTT_COMPRESSORDX9_H + +#include "BlockCompressor.h" + +namespace nv +{ + struct ColorBlock; + + // Fast CPU compressors. + struct FastCompressorDXT1 : public ColorBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; } + }; + + struct FastCompressorDXT1a : public ColorBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; } + }; + + struct FastCompressorDXT3 : public ColorBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + + struct FastCompressorDXT5 : public ColorBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + + struct FastCompressorDXT5n : public ColorBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + + + // Normal CPU compressors. 
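+    // Higher-quality CPU compressors for the DXT formats. The "#if 1" below selects
+    // the newer float-based CompressorDXT1 (a FloatColorCompressor) over the legacy
+    // squish-based ColorBlockCompressor variant.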
+#if 1 + struct CompressorDXT1 : public FloatColorCompressor + { + virtual void compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; } + }; +#else + struct CompressorDXT1 : public ColorBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; } + }; +#endif + + struct CompressorDXT1a : public ColorBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; } + }; + + struct CompressorDXT1_Luma : public ColorBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; } + }; + + struct CompressorDXT3 : public ColorBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + + struct CompressorDXT5 : public ColorBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + + struct CompressorDXT5n : public ColorBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + + struct CompressorBC3_RGBM : public FloatColorCompressor + { + virtual void compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; } + }; + + + // External compressors. 
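+    // Wrappers around optional third-party encoders (ATI Compress, squish, D3DX, stb);
+    // each one is compiled in only when the matching HAVE_* macro is defined.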
+#if defined(HAVE_ATITC) + struct AtiCompressorDXT1 : public CompressorInterface + { + virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); + }; + + struct AtiCompressorDXT5 : public CompressorInterface + { + virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); + }; +#endif + +#if defined(HAVE_SQUISH) + struct SquishCompressorDXT1 : public CompressorInterface + { + virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); + }; +#endif + +#if defined(HAVE_D3DX) + struct D3DXCompressorDXT1 : public CompressorInterface + { + virtual void compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); + }; +#endif + +#if defined(HAVE_STB) + struct StbCompressorDXT1 : public ColorBlockCompressor + { + virtual void compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; } + }; +#endif + +} // nv namespace + + +#endif // NVTT_COMPRESSORDX9_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX9.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX9.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDX9.cpp @@ -0,0 +1,499 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
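+// CPU compressors for the DX9-era formats (DXT1, DXT1a, DXT3, DXT5, DXT5n).
+// Fast paths use QuickCompress; the normal paths combine squish's weighted
+// cluster fit with an optimal single-color fallback, and the optional
+// external encoders are wrapped at the end of the file.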
+ +#include "CompressorDX9.h" +#include "QuickCompressDXT.h" +#include "OptimalCompressDXT.h" +#include "CompressionOptions.h" +#include "OutputOptions.h" +#include "ClusterFit.h" +#include "CompressorDXT1.h" +#include "CompressorDXT5_RGBM.h" + +// squish +#include "squish/colourset.h" +#include "squish/weightedclusterfit.h" + +#include "nvtt.h" + +#include "nvimage/Image.h" +#include "nvimage/ColorBlock.h" +#include "nvimage/BlockDXT.h" + +#include "nvmath/Vector.inl" +#include "nvmath/Color.inl" + +#include "nvcore/Memory.h" + +#include // placement new + +// s3_quant +#if defined(HAVE_S3QUANT) +#include "s3tc/s3_quant.h" +#endif + +// ati tc +#if defined(HAVE_ATITC) +typedef int BOOL; +typedef _W64 unsigned long ULONG_PTR; +typedef ULONG_PTR DWORD_PTR; +#include "atitc/ATI_Compress.h" +#endif + +// squish +#if defined(HAVE_SQUISH) +//#include "squish/squish.h" +#include "squish-1.10/squish.h" +#endif + +// d3dx +#if defined(HAVE_D3DX) +#include +#endif + +// stb +#if defined(HAVE_STB) +#define STB_DEFINE +#include "stb/stb_dxt.h" +#endif + +using namespace nv; +using namespace nvtt; + + +void FastCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockDXT1 * block = new(output) BlockDXT1; + QuickCompress::compressDXT1(rgba, block); +} + +void FastCompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockDXT1 * block = new(output) BlockDXT1; + QuickCompress::compressDXT1a(rgba, block); +} + +void FastCompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockDXT3 * block = new(output) BlockDXT3; + QuickCompress::compressDXT3(rgba, block); +} + +void FastCompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockDXT5 * block = new(output) BlockDXT5; + QuickCompress::compressDXT5(rgba, block); +} + +void FastCompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + rgba.swizzle(4, 1, 5, 0); // 0xFF, G, 0, R + + BlockDXT5 * block = new(output) BlockDXT5; + QuickCompress::compressDXT5(rgba, block); +} + + +#if 1 + +void CompressorDXT1::compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + compress_dxt1(colors, weights, compressionOptions.colorWeight.xyz(), /*three_color_mode*/true, (BlockDXT1 *)output); +} + +#else +void CompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + nvsquish::WeightedClusterFit fit; + fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z); + + if (rgba.isSingleColor()) + { + BlockDXT1 * block = new(output) BlockDXT1; + OptimalCompress::compressDXT1(rgba.color(0), block); + } + else + { + nvsquish::ColourSet colours((uint8 *)rgba.colors(), 0); + fit.SetColourSet(&colours, nvsquish::kDxt1); + fit.Compress(output); + } +} +#endif + +void CompressorDXT1a::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + uint 
alphaMask = 0; + for (uint i = 0; i < 16; i++) + { + if (rgba.color(i).a == 0) alphaMask |= (3 << (i * 2)); // Set two bits for each color. + } + + const bool isSingleColor = rgba.isSingleColor(); + + if (isSingleColor) + { + BlockDXT1 * block = new(output) BlockDXT1; + OptimalCompress::compressDXT1a(rgba.color(0), alphaMask, block); + } + else + { + nvsquish::WeightedClusterFit fit; + fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z); + + int flags = nvsquish::kDxt1; + if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha; + + nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); + fit.SetColourSet(&colours, nvsquish::kDxt1); + + fit.Compress(output); + } +} + +void CompressorDXT1_Luma::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockDXT1 * block = new(output) BlockDXT1; + OptimalCompress::compressDXT1_Luma(rgba, block); +} + +void CompressorDXT3::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockDXT3 * block = new(output) BlockDXT3; + + // Compress explicit alpha. + OptimalCompress::compressDXT3A(rgba, &block->alpha); + + // Compress color. + if (rgba.isSingleColor()) + { + OptimalCompress::compressDXT1(rgba.color(0), &block->color); + } + else + { + nvsquish::WeightedClusterFit fit; + fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z); + + int flags = 0; + if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha; + + nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); + fit.SetColourSet(&colours, 0); + fit.Compress(&block->color); + } +} + +void CompressorDXT5::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockDXT5 * block = new(output) BlockDXT5; + + // Compress alpha. + if (compressionOptions.quality == Quality_Highest) + { + OptimalCompress::compressDXT5A(rgba, &block->alpha); + } + else + { + QuickCompress::compressDXT5A(rgba, &block->alpha); + } + + // Compress color. + if (rgba.isSingleColor()) + { + OptimalCompress::compressDXT1(rgba.color(0), &block->color); + } + else + { + nvsquish::WeightedClusterFit fit; + fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z); + + int flags = 0; + if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha; + + nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); + fit.SetColourSet(&colours, 0); + fit.Compress(&block->color); + } +} + + +void CompressorDXT5n::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + BlockDXT5 * block = new(output) BlockDXT5; + + // Compress Y. + if (compressionOptions.quality == Quality_Highest) + { + OptimalCompress::compressDXT1G(rgba, &block->color); + } + else + { + if (rgba.isSingleColor(Color32(0, 0xFF, 0, 0))) // Mask all but green channel. + { + OptimalCompress::compressDXT1G(rgba.color(0).g, &block->color); + } + else + { + ColorBlock tile = rgba; + tile.swizzle(4, 1, 5, 3); // leave alpha in alpha channel. 
+ + nvsquish::WeightedClusterFit fit; + fit.SetMetric(0, 1, 0); + + int flags = 0; + if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha; + + nvsquish::ColourSet colours((uint8 *)tile.colors(), flags); + fit.SetColourSet(&colours, 0); + fit.Compress(&block->color); + } + } + + rgba.swizzle(4, 1, 5, 0); // 1, G, 0, R + + // Compress X. + if (compressionOptions.quality == Quality_Highest) + { + OptimalCompress::compressDXT5A(rgba, &block->alpha); + } + else + { + QuickCompress::compressDXT5A(rgba, &block->alpha); + } +} + + +void CompressorBC3_RGBM::compressBlock(const Vector4 colors[16], const float weights[16], const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + float min_m = 0.25f; // @@ Get from compression options. + compress_dxt5_rgbm(colors, weights, min_m, (BlockDXT5 *)output); +} + + +#if defined(HAVE_ATITC) + +void AtiCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ + nvDebugCheck(d == 1); + + // Init source texture + ATI_TC_Texture srcTexture; + srcTexture.dwSize = sizeof(srcTexture); + srcTexture.dwWidth = w; + srcTexture.dwHeight = h; + if (inputFormat == nvtt::InputFormat_BGRA_8UB) + { + srcTexture.dwPitch = w * 4; + srcTexture.format = ATI_TC_FORMAT_ARGB_8888; + } + else + { + // @@ Floating point input is not swizzled. + srcTexture.dwPitch = w * 16; + srcTexture.format = ATI_TC_FORMAT_ARGB_32F; + } + srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture); + srcTexture.pData = (ATI_TC_BYTE*) data; + + // Init dest texture + ATI_TC_Texture destTexture; + destTexture.dwSize = sizeof(destTexture); + destTexture.dwWidth = w; + destTexture.dwHeight = h; + destTexture.dwPitch = 0; + destTexture.format = ATI_TC_FORMAT_DXT1; + destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture); + destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize); + + ATI_TC_CompressOptions options; + options.dwSize = sizeof(options); + options.bUseChannelWeighting = false; + options.bUseAdaptiveWeighting = false; + options.bDXT1UseAlpha = false; + options.nCompressionSpeed = ATI_TC_Speed_Normal; + options.bDisableMultiThreading = false; + //options.bDisableMultiThreading = true; + + // Compress + ATI_TC_ConvertTexture(&srcTexture, &destTexture, &options, NULL, NULL, NULL); + + if (outputOptions.outputHandler != NULL) { + outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize); + } + + mem::free(destTexture.pData); +} + +void AtiCompressorDXT5::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ + nvDebugCheck(d == 1); + + // Init source texture + ATI_TC_Texture srcTexture; + srcTexture.dwSize = sizeof(srcTexture); + srcTexture.dwWidth = w; + srcTexture.dwHeight = h; + if (inputFormat == nvtt::InputFormat_BGRA_8UB) + { + srcTexture.dwPitch = w * 4; + srcTexture.format = ATI_TC_FORMAT_ARGB_8888; + } + else + { + srcTexture.dwPitch = w * 16; + srcTexture.format = ATI_TC_FORMAT_ARGB_32F; + } + srcTexture.dwDataSize = ATI_TC_CalculateBufferSize(&srcTexture); + srcTexture.pData = (ATI_TC_BYTE*) data; + + // Init dest texture + ATI_TC_Texture destTexture; + destTexture.dwSize = sizeof(destTexture); + destTexture.dwWidth = w; + 
destTexture.dwHeight = h; + destTexture.dwPitch = 0; + destTexture.format = ATI_TC_FORMAT_DXT5; + destTexture.dwDataSize = ATI_TC_CalculateBufferSize(&destTexture); + destTexture.pData = (ATI_TC_BYTE*) mem::malloc(destTexture.dwDataSize); + + // Compress + ATI_TC_ConvertTexture(&srcTexture, &destTexture, NULL, NULL, NULL, NULL); + + if (outputOptions.outputHandler != NULL) { + outputOptions.outputHandler->writeData(destTexture.pData, destTexture.dwDataSize); + } + + mem::free(destTexture.pData); +} + +#endif // defined(HAVE_ATITC) + +#if defined(HAVE_SQUISH) + +void SquishCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ + nvDebugCheck(d == 1); + nvDebugCheck(false); + +#pragma message(NV_FILE_LINE "TODO: Convert input to fixed point ABGR format instead of ARGB") + /* + Image img(*image); + int count = img.width() * img.height(); + for (int i = 0; i < count; i++) + { + Color32 c = img.pixel(i); + img.pixel(i) = Color32(c.b, c.g, c.r, c.a); + } + + int size = squish::GetStorageRequirements(img.width(), img.height(), squish::kDxt1); + void * blocks = mem::malloc(size); + + squish::CompressImage((const squish::u8 *)img.pixels(), img.width(), img.height(), blocks, squish::kDxt1 | squish::kColourClusterFit); + + if (outputOptions.outputHandler != NULL) { + outputOptions.outputHandler->writeData(blocks, size); + } + + mem::free(blocks); + */ +} + +#endif // defined(HAVE_SQUISH) + + +#if defined(HAVE_D3DX) + +void D3DXCompressorDXT1::compress(nvtt::InputFormat inputFormat, nvtt::AlphaMode alphaMode, uint w, uint h, uint d, void * data, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ + nvDebugCheck(d == 1); + + IDirect3D9 * d3d = Direct3DCreate9(D3D_SDK_VERSION); + + D3DPRESENT_PARAMETERS presentParams; + ZeroMemory(&presentParams, sizeof(presentParams)); + presentParams.Windowed = TRUE; + presentParams.SwapEffect = D3DSWAPEFFECT_COPY; + presentParams.BackBufferWidth = 8; + presentParams.BackBufferHeight = 8; + presentParams.BackBufferFormat = D3DFMT_UNKNOWN; + + HRESULT err; + + IDirect3DDevice9 * device = NULL; + err = d3d->CreateDevice(D3DADAPTER_DEFAULT, D3DDEVTYPE_REF, GetDesktopWindow(), D3DCREATE_SOFTWARE_VERTEXPROCESSING, &presentParams, &device); + + IDirect3DTexture9 * texture = NULL; + err = D3DXCreateTexture(device, w, h, 1, 0, D3DFMT_DXT1, D3DPOOL_SYSTEMMEM, &texture); + + IDirect3DSurface9 * surface = NULL; + err = texture->GetSurfaceLevel(0, &surface); + + RECT rect; + rect.left = 0; + rect.top = 0; + rect.bottom = h; + rect.right = w; + + if (inputFormat == nvtt::InputFormat_BGRA_8UB) + { + err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, data, D3DFMT_A8R8G8B8, w * 4, NULL, &rect, D3DX_DEFAULT, 0); + } + else + { + err = D3DXLoadSurfaceFromMemory(surface, NULL, NULL, data, D3DFMT_A32B32G32R32F, w * 16, NULL, &rect, D3DX_DEFAULT, 0); + } + + if (err != D3DERR_INVALIDCALL && err != D3DXERR_INVALIDDATA) + { + D3DLOCKED_RECT rect; + ZeroMemory(&rect, sizeof(rect)); + + err = surface->LockRect(&rect, NULL, D3DLOCK_READONLY); + + if (outputOptions.outputHandler != NULL) { + int size = rect.Pitch * ((h + 3) / 4); + outputOptions.outputHandler->writeData(rect.pBits, size); + } + + err = surface->UnlockRect(); + } + + surface->Release(); + device->Release(); + d3d->Release(); +} + +#endif // defined(HAVE_D3DX) + + +#if 
defined(HAVE_STB) + +void StbCompressorDXT1::compressBlock(ColorBlock & rgba, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + rgba.swizzle(2, 1, 0, 3); // Swap R and B + stb_compress_dxt_block((unsigned char *)output, (unsigned char *)rgba.colors(), 0, 0); +} + + +#endif // defined(HAVE_STB) Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDXT1.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDXT1.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDXT1.h @@ -0,0 +1,23 @@ + +namespace nv { + + class Color32; + struct ColorBlock; + struct BlockDXT1; + class Vector3; + class Vector4; + + // All these functions return MSE. + + float compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output); + float compress_dxt1_single_color_optimal(const Vector3 & color, BlockDXT1 * output); + + float compress_dxt1_single_color(const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output); + float compress_dxt1_least_squares_fit(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output); + float compress_dxt1_bounding_box_exhaustive(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, int search_limit, BlockDXT1 * output); + void compress_dxt1_cluster_fit(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output); + + + float compress_dxt1(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output); + +} Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDXT1.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDXT1.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDXT1.cpp @@ -0,0 +1,799 @@ + +#include "CompressorDXT1.h" +#include "SingleColorLookup.h" +#include "ClusterFit.h" + +#include "nvimage/ColorBlock.h" +#include "nvimage/BlockDXT.h" + +#include "nvmath/Color.inl" +#include "nvmath/Vector.inl" +#include "nvmath/Fitting.h" +#include "nvmath/ftoi.h" + +#include "nvcore/Utils.h" // swap + +#include // memset +#include // FLT_MAX + + +using namespace nv; + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Color conversion functions. 
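+// midpoints5/midpoints6 store the midpoint between consecutive 5-bit and 6-bit
+// channel values after 565 bit-expansion; vector3_to_color16 truncates and then
+// rounds up whenever the input lies above the midpoint, i.e. round-to-nearest
+// in the expanded 8-bit space (see the commented-out init_tables below).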
+ +static const float midpoints5[32] = { + 0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f, + 0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, 1.0f +}; + +static const float midpoints6[64] = { + 0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f, + 0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f, + 0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f, + 0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f, 0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, 1.0f +}; + +/*void init_tables() { + for (int i = 0; i < 31; i++) { + float f0 = float(((i+0) << 3) | ((i+0) >> 2)) / 255.0f; + float f1 = float(((i+1) << 3) | ((i+1) >> 2)) / 255.0f; + midpoints5[i] = (f0 + f1) * 0.5; + } + midpoints5[31] = 1.0f; + + for (int i = 0; i < 63; i++) { + float f0 = float(((i+0) << 2) | ((i+0) >> 4)) / 255.0f; + float f1 = float(((i+1) << 2) | ((i+1) >> 4)) / 255.0f; + midpoints6[i] = (f0 + f1) * 0.5; + } + midpoints6[63] = 1.0f; +}*/ + +static Color16 vector3_to_color16(const Vector3 & v) { + // Truncate. + uint r = ftoi_trunc(clamp(v.x * 31.0f, 0.0f, 31.0f)); + uint g = ftoi_trunc(clamp(v.y * 63.0f, 0.0f, 63.0f)); + uint b = ftoi_trunc(clamp(v.z * 31.0f, 0.0f, 31.0f)); + + // Round exactly according to 565 bit-expansion. + r += (v.x > midpoints5[r]); + g += (v.y > midpoints6[g]); + b += (v.z > midpoints5[b]); + + return Color16((r << 11) | (g << 5) | b); +} + + +static Color32 bitexpand_color16_to_color32(Color16 c16) { + Color32 c32; + //c32.b = (c16.b << 3) | (c16.b >> 2); + //c32.g = (c16.g << 2) | (c16.g >> 4); + //c32.r = (c16.r << 3) | (c16.r >> 2); + //c32.a = 0xFF; + + c32.u = ((c16.u << 3) & 0xf8) | ((c16.u << 5) & 0xfc00) | ((c16.u << 8) & 0xf80000); + c32.u |= (c32.u >> 5) & 0x070007; + c32.u |= (c32.u >> 6) & 0x000300; + + return c32; +} + +/*static Color32 bitexpand_color16_to_color32(int r, int g, int b) { + Color32 c32; + c32.b = (b << 3) | (b >> 2); + c32.g = (g << 2) | (g >> 4); + c32.r = (r << 3) | (r >> 2); + c32.a = 0xFF; + return c32; +}*/ + +static Color16 truncate_color32_to_color16(Color32 c32) { + Color16 c16; + c16.b = (c32.b >> 3); + c16.g = (c32.g >> 2); + c16.r = (c32.r >> 3); + return c16; +} + +/*inline Vector3 r5g6b5_to_vector3(int r, int g, int b) +{ + Vector3 c; + c.x = float((r << 3) | (r >> 2)); + c.y = float((g << 2) | (g >> 4)); + c.z = float((b << 3) | (b >> 2)); + return c; +}*/ + +inline Vector3 color_to_vector3(Color32 c) +{ + const float scale = 1.0f / 255.0f; + return Vector3(c.r * scale, c.g * scale, c.b * scale); +} + +inline Color32 vector3_to_color(Vector3 v) +{ + Color32 color; + color.r = U8(ftoi_round(saturate(v.x) * 255)); + color.g = U8(ftoi_round(saturate(v.y) * 255)); + color.b = U8(ftoi_round(saturate(v.z) * 255)); + color.a = 255; + return color; +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Input block processing. 
+ +inline static void color_block_to_vector_block(const ColorBlock & rgba, Vector3 block[16]) +{ + for (int i = 0; i < 16; i++) + { + const Color32 c = rgba.color(i); + block[i] = Vector3(c.r, c.g, c.b); + } +} + +// Find first valid color. +static bool find_valid_color_rgb(const Vector3 * colors, const float * weights, int count, Vector3 * valid_color) +{ + for (int i = 0; i < count; i++) { + if (weights[i] > 0.0f) { + *valid_color = colors[i]; + return true; + } + } + + // No valid colors. + return false; +} + +static bool is_single_color_rgb(const Vector3 * colors, const float * weights, int count, Vector3 color) +{ + for (int i = 0; i < count; i++) { + if (weights[i] > 0.0f) { + if (colors[i] != color) return false; + } + } + + return true; +} + +// Find similar colors and combine them together. +static int reduce_colors(const Vector4 * input_colors, const float * input_weights, Vector3 * colors, float * weights) +{ + int n = 0; + for (int i = 0; i < 16; i++) + { + Vector3 ci = input_colors[i].xyz(); + float wi = input_weights[i]; + + if (wi > 0) { + // Find matching color. + int j; + for (j = 0; j < n; j++) { + if (equal(colors[j].x, ci.x) && equal(colors[j].y, ci.y) && equal(colors[j].z, ci.z)) { + weights[j] += wi; + break; + } + } + + // No match found. Add new color. + if (j == n) { + colors[n] = ci; + weights[n] = wi; + n++; + } + } + } + + nvDebugCheck(n <= 16); + + return n; +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Error evaluation. + +// Different ways of estimating the error. +/*static float evaluate_mse(const Vector3 & p, const Vector3 & c) { + //return (square(p.x-c.x) * w2.x + square(p.y-c.y) * w2.y + square(p.z-c.z) * w2.z); + Vector3 d = (p - c); + return dot(d, d); +}*/ + +static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) { + //return (square(p.x-c.x) * w2.x + square(p.y-c.y) * w2.y + square(p.z-c.z) * w2.z); + Vector3 d = (p - c) * w; + return dot(d, d); +} + +/*static float evaluate_mse(const Vector3 & p, const Vector3 & c, const Vector3 & w) { + return ww.x * square(p.x-c.x) + ww.y * square(p.y-c.y) + ww.z * square(p.z-c.z); +}*/ + +static int evaluate_mse(const Color32 & p, const Color32 & c) { + return (square(int(p.r)-c.r) + square(int(p.g)-c.g) + square(int(p.b)-c.b)); +} + +static float evaluate_mse(const Vector3 palette[4], const Vector3 & c, const Vector3 & w) { + float e0 = evaluate_mse(palette[0], c, w); + float e1 = evaluate_mse(palette[1], c, w); + float e2 = evaluate_mse(palette[2], c, w); + float e3 = evaluate_mse(palette[3], c, w); + return min(min(e0, e1), min(e2, e3)); +} + +static int evaluate_mse(const Color32 palette[4], const Color32 & c) { + int e0 = evaluate_mse(palette[0], c); + int e1 = evaluate_mse(palette[1], c); + int e2 = evaluate_mse(palette[2], c); + int e3 = evaluate_mse(palette[3], c); + return min(min(e0, e1), min(e2, e3)); +} + +// Returns MSE error in [0-255] range. +static int evaluate_mse(const BlockDXT1 * output, Color32 color, int index) { + Color32 palette[4]; + output->evaluatePalette(palette, /*d3d9=*/false); + + return evaluate_mse(palette[index], color); +} + +// Returns weighted MSE error in [0-255] range. 
+static float evaluate_palette_error(Color32 palette[4], const Color32 * colors, const float * weights, int count) { + + float total = 0.0f; + for (int i = 0; i < count; i++) { + total += weights[i] * evaluate_mse(palette, colors[i]); + } + + return total; +} + +#if 0 +static float evaluate_mse(const BlockDXT1 * output, const Vector3 colors[16]) { + Color32 palette[4]; + output->evaluatePalette(palette, /*d3d9=*/false); + + // convert palette to float. + Vector3 vector_palette[4]; + for (int i = 0; i < 4; i++) { + vector_palette[i] = color_to_vector3(palette[i]); + } + + // evaluate error for each index. + float error = 0.0f; + for (int i = 0; i < 16; i++) { + int index = (output->indices >> (2*i)) & 3; // @@ Is this the right order? + error += evaluate_mse(vector_palette[index], colors[i]); + } + + return error; +} +#endif + +static float evaluate_mse(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, const BlockDXT1 * output) { + Color32 palette[4]; + output->evaluatePalette(palette, /*d3d9=*/false); + + // convert palette to float. + Vector3 vector_palette[4]; + for (int i = 0; i < 4; i++) { + vector_palette[i] = color_to_vector3(palette[i]); + } + + // evaluate error for each index. + float error = 0.0f; + for (int i = 0; i < 16; i++) { + int index = (output->indices >> (2 * i)) & 3; + error += input_weights[i] * evaluate_mse(vector_palette[index], input_colors[i].xyz(), color_weights); + } + return error; +} + + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Palette evaluation. + +static void evaluate_palette4(Color32 palette[4]) { + palette[2].r = (2 * palette[0].r + palette[1].r) / 3; + palette[2].g = (2 * palette[0].g + palette[1].g) / 3; + palette[2].b = (2 * palette[0].b + palette[1].b) / 3; + palette[3].r = (2 * palette[1].r + palette[0].r) / 3; + palette[3].g = (2 * palette[1].g + palette[0].g) / 3; + palette[3].b = (2 * palette[1].b + palette[0].b) / 3; +} + +static void evaluate_palette3(Color32 palette[4]) { + palette[2].r = (palette[0].r + palette[1].r) / 2; + palette[2].g = (palette[0].g + palette[1].g) / 2; + palette[2].b = (palette[0].b + palette[1].b) / 2; + palette[3].r = 0; + palette[3].g = 0; + palette[3].b = 0; +} + +static void evaluate_palette(Color16 c0, Color16 c1, Color32 palette[4]) { + palette[0] = bitexpand_color16_to_color32(c0); + palette[1] = bitexpand_color16_to_color32(c1); + if (c0.u > c1.u) { + evaluate_palette4(palette); + } + else { + evaluate_palette3(palette); + } +} + +static void evaluate_palette(Color16 c0, Color16 c1, Vector3 palette[4]) { + Color32 palette32[4]; + evaluate_palette(c0, c1, palette32); + + for (int i = 0; i < 4; i++) { + palette[i] = color_to_vector3(palette32[i]); + } +} + +static void evaluate_palette3(Color16 c0, Color16 c1, Vector3 palette[4]) { + nvDebugCheck(c0.u > c1.u); + + Color32 palette32[4]; + evaluate_palette(c0, c1, palette32); + + for (int i = 0; i < 4; i++) { + palette[i] = color_to_vector3(palette32[i]); + } +} + + + + + +static uint compute_indices4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { + + uint indices = 0; + for (int i = 0; i < 16; i++) { + float d0 = evaluate_mse(palette[0], input_colors[i].xyz(), color_weights); + float d1 = evaluate_mse(palette[1], input_colors[i].xyz(), color_weights); + float d2 = evaluate_mse(palette[2], input_colors[i].xyz(), color_weights); + float d3 = evaluate_mse(palette[3], input_colors[i].xyz(), color_weights); + + uint b0 = 
d0 > d3; + uint b1 = d1 > d2; + uint b2 = d0 > d2; + uint b3 = d1 > d3; + uint b4 = d2 > d3; + + uint x0 = b1 & b2; + uint x1 = b0 & b3; + uint x2 = b0 & b4; + + indices |= (x2 | ((x0 | x1) << 1)) << (2 * i); + } + + return indices; +} + + +static uint compute_indices(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 palette[4]) { + + uint indices = 0; + for (int i = 0; i < 16; i++) { + float d0 = evaluate_mse(palette[0], input_colors[i].xyz(), color_weights); + float d1 = evaluate_mse(palette[1], input_colors[i].xyz(), color_weights); + float d2 = evaluate_mse(palette[2], input_colors[i].xyz(), color_weights); + float d3 = evaluate_mse(palette[3], input_colors[i].xyz(), color_weights); + + uint index; + if (d0 < d1 && d0 < d2 && d0 < d3) index = 0; + else if (d1 < d2 && d1 < d3) index = 1; + else if (d2 < d3) index = 2; + else index = 3; + + indices |= index << (2 * i); + } + + return indices; +} + + +static void output_block3(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block) +{ + Color16 color0 = vector3_to_color16(v0); + Color16 color1 = vector3_to_color16(v1); + + if (color0.u > color1.u) { + swap(color0, color1); + } + + Vector3 palette[4]; + evaluate_palette(color0, color1, palette); + + block->col0 = color0; + block->col1 = color1; + block->indices = compute_indices(input_colors, color_weights, palette); +} + +static void output_block4(const Vector4 input_colors[16], const Vector3 & color_weights, const Vector3 & v0, const Vector3 & v1, BlockDXT1 * block) +{ + Color16 color0 = vector3_to_color16(v0); + Color16 color1 = vector3_to_color16(v1); + + if (color0.u < color1.u) { + swap(color0, color1); + } + + Vector3 palette[4]; + evaluate_palette(color0, color1, palette); + + block->col0 = color0; + block->col1 = color1; + block->indices = compute_indices4(input_colors, color_weights, palette); +} + + + + + +// Single color compressor, based on: +// https://mollyrocket.com/forums/viewtopic.php?t=392 +static void compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output) +{ + output->col0.r = OMatch5[c.r][0]; + output->col0.g = OMatch6[c.g][0]; + output->col0.b = OMatch5[c.b][0]; + output->col1.r = OMatch5[c.r][1]; + output->col1.g = OMatch6[c.g][1]; + output->col1.b = OMatch5[c.b][1]; + output->indices = 0xaaaaaaaa; + + if (output->col0.u < output->col1.u) + { + swap(output->col0.u, output->col1.u); + output->indices ^= 0x55555555; + } +} + + +float nv::compress_dxt1_single_color_optimal(Color32 c, BlockDXT1 * output) +{ + ::compress_dxt1_single_color_optimal(c, output); + + // Multiply by 16^2, the weight associated to a single color. + // Divide by 255*255 to covert error to [0-1] range. + return (256.0f / (255*255)) * evaluate_mse(output, c, output->indices & 3); +} + + +float nv::compress_dxt1_single_color_optimal(const Vector3 & color, BlockDXT1 * output) +{ + return compress_dxt1_single_color_optimal(vector3_to_color(color), output); +} + + +// Compress block using the average color. +float nv::compress_dxt1_single_color(const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, BlockDXT1 * output) +{ + // Compute block average. + Vector3 color_sum(0); + float weight_sum = 0; + + for (int i = 0; i < count; i++) { + color_sum += colors[i] * weights[i]; + weight_sum += weights[i]; + } + + // Compress optimally. + ::compress_dxt1_single_color_optimal(vector3_to_color(color_sum / weight_sum), output); + + // Decompress block color. 
+ Color32 palette[4]; + output->evaluatePalette(palette, /*d3d9=*/false); + + Vector3 block_color = color_to_vector3(palette[output->indices & 0x3]); + + // Evaluate error. + float error = 0; + for (int i = 0; i < count; i++) { + error += weights[i] * evaluate_mse(block_color, colors[i], color_weights); + } + return error; +} + + +/* @@ Not implemented yet. +// Low quality baseline compressor. +float nv::compress_dxt1_least_squares_fit(const Vector3 * input_colors, const Vector3 * colors, const float * weights, int count, BlockDXT1 * output) +{ + // @@ Iterative best end point fit. + + return FLT_MAX; +}*/ + + +float nv::compress_dxt1_bounding_box_exhaustive(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, int max_volume, BlockDXT1 * output) +{ + // Compute bounding box. + Vector3 min_color(1.0f); + Vector3 max_color(0.0f); + + for (int i = 0; i < count; i++) { + min_color = min(min_color, colors[i]); + max_color = max(max_color, colors[i]); + } + + // Convert to 5:6:5 + int min_r = ftoi_floor(31 * min_color.x); + int min_g = ftoi_floor(63 * min_color.y); + int min_b = ftoi_floor(31 * min_color.z); + int max_r = ftoi_ceil(31 * max_color.x); + int max_g = ftoi_ceil(63 * max_color.y); + int max_b = ftoi_ceil(31 * max_color.z); + + // Expand the box. + int range_r = max_r - min_r; + int range_g = max_g - min_g; + int range_b = max_b - min_b; + + min_r = max(0, min_r - range_r / 2 - 2); + min_g = max(0, min_g - range_g / 2 - 2); + min_b = max(0, min_b - range_b / 2 - 2); + + max_r = min(31, max_r + range_r / 2 + 2); + max_g = min(63, max_g + range_g / 2 + 2); + max_b = min(31, max_b + range_b / 2 + 2); + + // Estimate size of search space. + int volume = (max_r-min_r+1) * (max_g-min_g+1) * (max_b-min_b+1); + + // if size under search_limit, then proceed. Note that search_volume is sqrt of number of evaluations. + if (volume > max_volume) { + return FLT_MAX; + } + + // @@ Convert to fixed point before building box? + Color32 colors32[16]; + for (int i = 0; i < count; i++) { + colors32[i] = toColor32(Vector4(colors[i], 1)); + } + + float best_error = FLT_MAX; + Color16 best0, best1; // @@ Record endpoints as Color16? + + Color16 c0, c1; + Color32 palette[4]; + + for(int r0 = min_r; r0 <= max_r; r0++) + for(int g0 = min_g; g0 <= max_g; g0++) + for(int b0 = min_b; b0 <= max_b; b0++) + { + c0.r = r0; c0.g = g0; c0.b = b0; + palette[0] = bitexpand_color16_to_color32(c0); + + for(int r1 = min_r; r1 <= max_r; r1++) + for(int g1 = min_g; g1 <= max_g; g1++) + for(int b1 = min_b; b1 <= max_b; b1++) + { + c1.r = r1; c1.g = g1; c1.b = b1; + palette[1] = bitexpand_color16_to_color32(c1); + + if (c0.u > c1.u) { + // Evaluate error in 4 color mode. + evaluate_palette4(palette); + } + else { + if (three_color_mode) { + // Evaluate error in 3 color mode. + evaluate_palette3(palette); + } + else { + // Skip 3 color mode. 
+ continue; + } + } + + float error = evaluate_palette_error(palette, colors32, weights, count); + + if (error < best_error) { + best_error = error; + best0 = c0; + best1 = c1; + } + } + } + + output->col0 = best0; + output->col1 = best1; + + Vector3 vector_palette[4]; + evaluate_palette(output->col0, output->col1, vector_palette); + + output->indices = compute_indices(input_colors, color_weights, vector_palette); + + return best_error / (255 * 255); +} + + +void nv::compress_dxt1_cluster_fit(const Vector4 input_colors[16], const Vector3 * colors, const float * weights, int count, const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output) +{ + ClusterFit fit; + fit.setColorWeights(Vector4(color_weights, 1)); + fit.setColorSet(colors, weights, count); + + // start & end are in [0, 1] range. + Vector3 start, end; + fit.compress4(&start, &end); + + if (three_color_mode && fit.compress3(&start, &end)) { + output_block3(input_colors, color_weights, start, end, output); + } + else { + output_block4(input_colors, color_weights, start, end, output); + } +} + + + + +float nv::compress_dxt1(const Vector4 input_colors[16], const float input_weights[16], const Vector3 & color_weights, bool three_color_mode, BlockDXT1 * output) +{ + Vector3 colors[16]; + float weights[16]; + int count = reduce_colors(input_colors, input_weights, colors, weights); + + if (count == 0) { + // Output trivial block. + output->col0.u = 0; + output->col1.u = 0; + output->indices = 0; + return 0; + } + + + float error = FLT_MAX; + + // Sometimes the single color compressor produces better results than the exhaustive. This introduces discontinuities between blocks that + // use different compressors. For this reason, this is not enabled by default. + if (1) { + error = compress_dxt1_single_color(colors, weights, count, color_weights, output); + + if (error == 0.0f || count == 1) { + // Early out. + return error; + } + } + + // This is too expensive, even with a low threshold. + // If high quality: + if (0) { + BlockDXT1 exhaustive_output; + float exhaustive_error = compress_dxt1_bounding_box_exhaustive(input_colors, colors, weights, count, color_weights, three_color_mode, 1400, &exhaustive_output); + + if (exhaustive_error != FLT_MAX) { + float exhaustive_error2 = evaluate_mse(input_colors, input_weights, color_weights, &exhaustive_output); + + // The exhaustive compressor does not use color_weights, so the results may be different. + //nvCheck(equal(exhaustive_error, exhaustive_error2)); + + if (exhaustive_error2 < error) { + *output = exhaustive_output; + error = exhaustive_error; + } + } + } + + // @@ TODO. + // This is pretty fast and in some cases can produces better quality than cluster fit. + //error = compress_dxt1_least_squares_fit(colors, weigths, error, output); + + // Cluster fit cannot handle single color blocks, so encode them optimally if we haven't encoded them already. + if (error == FLT_MAX && count == 1) { + error = compress_dxt1_single_color_optimal(colors[0], output); + } + + if (count > 1) { + BlockDXT1 cluster_fit_output; + compress_dxt1_cluster_fit(input_colors, colors, weights, count, color_weights, three_color_mode, &cluster_fit_output); + + float cluster_fit_error = evaluate_mse(input_colors, input_weights, color_weights, &cluster_fit_output); + + if (cluster_fit_error < error) { + *output = cluster_fit_output; + error = cluster_fit_error; + } + } + + return error; +} + + +// Once we have an index assignment we have colors grouped in 1-4 clusters. 
+// If 1 clusters -> Use optimal compressor. +// If 2 clusters -> Try: (0, 1), (1, 2), (0, 2), (0, 3) - [0, 1] +// If 3 clusters -> Try: (0, 1, 2), (0, 1, 3), (0, 2, 3) - [0, 1, 2] +// If 4 clusters -> Try: (0, 1, 2, 3) + +// @@ How do we do the initial index/cluster assignment? Use standard cluster fit. + + +// Least squares fitting of color end points for the given indices. @@ Take weights into account. +static bool optimize_end_points4(uint indices, const Vector3 * colors, const Vector3 * weights, int count, Vector3 * a, Vector3 * b) +{ + float alpha2_sum = 0.0f; + float beta2_sum = 0.0f; + float alphabeta_sum = 0.0f; + Vector3 alphax_sum(0.0f); + Vector3 betax_sum(0.0f); + + for (int i = 0; i < count; i++) + { + const uint bits = indices >> (2 * i); + + float beta = float(bits & 1); + if (bits & 2) beta = (1 + beta) / 3.0f; + float alpha = 1.0f - beta; + + alpha2_sum += alpha * alpha; + beta2_sum += beta * beta; + alphabeta_sum += alpha * beta; + alphax_sum += alpha * colors[i]; + betax_sum += beta * colors[i]; + } + + float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; + if (equal(denom, 0.0f)) return false; + + float factor = 1.0f / denom; + + *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor); + *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor); + + return true; +} + + +// Least squares fitting of color end points for the given indices. @@ This does not support black/transparent index. @@ Take weights into account. +static bool optimize_end_points3(uint indices, const Vector3 * colors, const Vector3 * weights, int count, Vector3 * a, Vector3 * b) +{ + float alpha2_sum = 0.0f; + float beta2_sum = 0.0f; + float alphabeta_sum = 0.0f; + Vector3 alphax_sum(0.0f); + Vector3 betax_sum(0.0f); + + for (int i = 0; i < count; i++) + { + const uint bits = indices >> (2 * i); + + float beta = float(bits & 1); + if (bits & 2) beta = 0.5f; + float alpha = 1.0f - beta; + + alpha2_sum += alpha * alpha; + beta2_sum += beta * beta; + alphabeta_sum += alpha * beta; + alphax_sum += alpha * colors[i]; + betax_sum += beta * colors[i]; + } + + float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; + if (equal(denom, 0.0f)) return false; + + float factor = 1.0f / denom; + + *a = saturate((alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor); + *b = saturate((betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor); + + return true; +} + +// @@ After optimization we need to round end points. Round in all possible directions, and pick best. 
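[Editor's note] For orientation, the data these routines emit is the standard DXT1/BC1 block: two 16-bit 565 endpoints (col0, col1) followed by 16 two-bit palette indices. The standalone sketch below is illustrative only and is not part of this patch; it decodes such a block using the same endpoint bit-expansion as bitexpand_color16_to_color32 and the same palette interpolation rules as evaluate_palette4/evaluate_palette3 above (col0 > col1 selects four-color mode, otherwise three-color mode with a black fourth entry).

// Standalone illustration, not part of the NVTT sources or this patch.
#include <cstdint>
#include <cstdio>

struct RGB8 { uint8_t r, g, b; };

static RGB8 expand565(uint16_t c) {
    // Bit-expand 5:6:5 to 8:8:8, matching bitexpand_color16_to_color32.
    RGB8 out;
    out.r = uint8_t(((c >> 11) << 3) | ((c >> 13) & 0x7));
    out.g = uint8_t((((c >> 5) & 0x3F) << 2) | ((c >> 9) & 0x3));
    out.b = uint8_t(((c & 0x1F) << 3) | ((c >> 2) & 0x7));
    return out;
}

static void decodeDXT1(uint16_t c0, uint16_t c1, uint32_t indices, RGB8 out[16]) {
    RGB8 palette[4];
    palette[0] = expand565(c0);
    palette[1] = expand565(c1);
    if (c0 > c1) {
        // Four-color mode: interpolated colors at 1/3 and 2/3 (evaluate_palette4).
        palette[2] = { uint8_t((2 * palette[0].r + palette[1].r) / 3),
                       uint8_t((2 * palette[0].g + palette[1].g) / 3),
                       uint8_t((2 * palette[0].b + palette[1].b) / 3) };
        palette[3] = { uint8_t((2 * palette[1].r + palette[0].r) / 3),
                       uint8_t((2 * palette[1].g + palette[0].g) / 3),
                       uint8_t((2 * palette[1].b + palette[0].b) / 3) };
    } else {
        // Three-color mode: one midpoint plus black (evaluate_palette3).
        palette[2] = { uint8_t((palette[0].r + palette[1].r) / 2),
                       uint8_t((palette[0].g + palette[1].g) / 2),
                       uint8_t((palette[0].b + palette[1].b) / 2) };
        palette[3] = { 0, 0, 0 };
    }
    // Each texel stores a 2-bit index into the palette, LSB-first.
    for (int i = 0; i < 16; i++)
        out[i] = palette[(indices >> (2 * i)) & 3];
}

int main() {
    RGB8 texels[16];
    decodeDXT1(0xF800 /* red */, 0x001F /* blue */, 0xAAAAAAAA /* all index 2 */, texels);
    printf("texel 0: %d %d %d\n", texels[0].r, texels[0].g, texels[0].b);
    return 0;
}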
+ + + + + + Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDXT5_RGBM.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDXT5_RGBM.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDXT5_RGBM.h @@ -0,0 +1,9 @@ + +namespace nv { + + struct BlockDXT5; + class Vector4; + + float compress_dxt5_rgbm(const Vector4 input_colors[16], const float input_weights[16], float min_m, BlockDXT5 * output); + +} Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDXT5_RGBM.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDXT5_RGBM.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorDXT5_RGBM.cpp @@ -0,0 +1,428 @@ +#include "CompressorDXT5_RGBM.h" +#include "CompressorDXT1.h" + +#include "OptimalCompressDXT.h" +#include "QuickCompressDXT.h" + +#include "nvimage/ColorBlock.h" +#include "nvimage/BlockDXT.h" + +#include "nvmath/Color.inl" +#include "nvmath/Vector.inl" +#include "nvmath/Fitting.h" +#include "nvmath/ftoi.h" + +#include "nvthread/Atomic.h" +#include + +using namespace nv; + +//static uint atomic_counter = 0; + + +float nv::compress_dxt5_rgbm(const Vector4 input_colors[16], const float input_weights[16], float min_m, BlockDXT5 * output) { + + // Convert to RGBM. + Vector4 input_colors_rgbm[16]; // @@ Write over input_colors? + float rgb_weights[16]; + + float weight_sum = 0; + + for (uint i = 0; i < 16; i++) { + const Vector4 & c = input_colors[i]; + + float R = saturate(c.x); + float G = saturate(c.y); + float B = saturate(c.z); + + float M = max(max(R, G), max(B, min_m)); + float r = R / M; + float g = G / M; + float b = B / M; + float a = (M - min_m) / (1 - min_m); + + input_colors_rgbm[i] = Vector4(r, g, b, a); + rgb_weights[i] = input_weights[i] * M; + weight_sum += input_weights[i]; + } + + if (weight_sum == 0) { + for (uint i = 0; i < 16; i++) rgb_weights[i] = 1; + } + + // Compress RGB. + compress_dxt1(input_colors_rgbm, rgb_weights, Vector3(1), /*three_color_mode=*/false, &output->color); + + // Decompress RGB/M block. + nv::ColorBlock RGB; + output->color.decodeBlock(&RGB); + + // Compute M values to compensate for RGB's error. + AlphaBlock4x4 M; + for (int i = 0; i < 16; i++) { + const Vector4 & c = input_colors[i]; + + float R = saturate(c.x); + float G = saturate(c.y); + float B = saturate(c.z); + + float rm = RGB.color(i).r / 255.0f; + float gm = RGB.color(i).g / 255.0f; + float bm = RGB.color(i).b / 255.0f; + + // compute m such that m * (r/M, g/M, b/M) == RGB + + // Three equations, one unknown: + // m * r/M == R + // m * g/M == G + // m * b/M == B + + // Solve in the least squares sense! + + // m (rm gm bm) (rm gm bm)^T == (rm gm bm) (R G B)^T + + // m == dot(rgb, RGB) / dot(rgb, rgb) + + float m = dot(Vector3(rm, gm, bm), Vector3(R, G, B)) / dot(Vector3(rm, gm, bm), Vector3(rm, gm, bm)); + + m = (m - min_m) / (1 - min_m); + +#if 0 + // IC: This does indeed happen. What does that mean? The best choice of m is above the available range. If this happened too often it would make sense to scale m in + // the pixel shader to allow for more accurate reconstruction. However, that scaling would reduce the precision over the [0-1] range. I haven't measured how much + // error is introduced by the clamping vs. how much the error would change with the increased range. 
+ if (m > 1.0f) { + uint counter = atomicIncrement(&atomic_counter); + printf("It happens %u times!", counter); + } +#endif + + M.alpha[i] = U8(ftoi_round(saturate(m) * 255.0f)); + M.weights[i] = input_weights[i]; + } + + // Compress M. + //if (compressionOptions.quality == Quality_Fastest) { + // QuickCompress::compressDXT5A(M, &output->alpha); + /*} + else {*/ + OptimalCompress::compressDXT5A(M, &output->alpha); + //} + + +#if 0 // Multiple iterations do not seem to help. + // Decompress M. + output->alpha.decodeBlock(&M); + + // Feed it back to the input RGB block. + for (uint i = 0; i < 16; i++) { + const Vector4 & c = input_colors[i]; + + float R = saturate(c.x); + float G = saturate(c.y); + float B = saturate(c.z); + + float m = float(M.alpha[i]) / 255.0f * (1 - min_m) + min_m; + + float r = R / m; + float g = G / m; + float b = B / m; + float a = float(M.alpha[i]) / 255.0f; + + input_colors_rgbm[i] = Vector4(r, g, b, a); + rgb_weights[i] = input_weights[i] * m; + } +#endif + + return 0; // @@ +} + + + + +#if 0 + + BlockDXT5 * block = new(output)BlockDXT5; + + // Decompress the color block and find the M values that reproduce the input most closely. This should compensate for some of the DXT errors. + + // Compress the resulting M values optimally. + + // Repeat this several times until compression error does not improve? + + //Vector3 rgb_block[16]; + //float m_block[16]; + + + // Init RGB/M block. +#if 0 + nvsquish::WeightedClusterFit fit; + + ColorBlock rgba; + for (int i = 0; i < 16; i++) { + const Vector4 & c = src.color(i); + float R = saturate(c.x); + float G = saturate(c.y); + float B = saturate(c.z); + + float M = max(max(R, G), max(B, min_m)); + float r = R / M; + float g = G / M; + float b = B / M; + float a = c.w; + + rgba.color(i) = toColor32(Vector4(r, g, b, a)); + } + + if (rgba.isSingleColor()) + { + OptimalCompress::compressDXT1(rgba.color(0), &block->color); + } + else + { + nvsquish::WeightedClusterFit fit; + fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z); + + int flags = 0; + if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha; + + nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); + fit.SetColourSet(&colours, 0); + fit.Compress(&block->color); + } +#endif +#if 1 + ColorSet rgb; + rgb.allocate(4, 4); + + for (uint i = 0; i < 16; i++) { + const Vector4 & c = colors[i]; + + float R = saturate(c.x); + float G = saturate(c.y); + float B = saturate(c.z); + + float M = max(max(R, G), max(B, min_m)); + float r = R / M; + float g = G / M; + float b = B / M; + float a = c.w; + + rgb.colors[i] = Vector4(r, g, b, a); + rgb.indices[i] = i; + rgb.weights[i] = max(weights[i], 0.001f);// weights[i]; // IC: For some reason 0 weights are causing problems, even if we eliminate the corresponding colors from the set. + } + + rgb.createMinimalSet(/*ignoreTransparent=*/true); + + if (rgb.isSingleColor(/*ignoreAlpha=*/true)) { + OptimalCompress::compressDXT1(toColor32(rgb.color(0)), &block->color); + } + else { + ClusterFit fit; + fit.setColorWeights(compressionOptions.colorWeight); + fit.setColorSet(&rgb); + + Vector3 start, end; + fit.compress4(&start, &end); + + QuickCompress::outputBlock4(rgb, start, end, &block->color); + } +#endif + + // Decompress RGB/M block. 
+ nv::ColorBlock RGB; + block->color.decodeBlock(&RGB); + +#if 1 + AlphaBlock4x4 M; + for (int i = 0; i < 16; i++) { + const Vector4 & c = colors[i]; + float R = saturate(c.x); + float G = saturate(c.y); + float B = saturate(c.z); + + float r = RGB.color(i).r / 255.0f; + float g = RGB.color(i).g / 255.0f; + float b = RGB.color(i).b / 255.0f; + + float m = (R / r + G / g + B / b) / 3.0f; + //float m = max((R / r + G / g + B / b) / 3.0f, min_m); + //float m = max(max(R / r, G / g), max(B / b, min_m)); + //float m = max(max(R, G), max(B, min_m)); + + m = (m - min_m) / (1 - min_m); + + M.alpha[i] = U8(ftoi_round(saturate(m) * 255.0f)); + M.weights[i] = weights[i]; + } + + // Compress M. + if (compressionOptions.quality == Quality_Fastest) { + QuickCompress::compressDXT5A(M, &block->alpha); + } + else { + OptimalCompress::compressDXT5A(M, &block->alpha); + } +#else + OptimalCompress::compressDXT5A_RGBM(src, RGB, &block->alpha); +#endif + +#if 0 + // Decompress M. + block->alpha.decodeBlock(&M); + + rgb.allocate(src.w, src.h); // @@ Handle smaller blocks. + + for (uint i = 0; i < src.colorCount; i++) { + const Vector4 & c = src.color(i); + + float R = saturate(c.x); + float G = saturate(c.y); + float B = saturate(c.z); + + //float m = max(max(R, G), max(B, min_m)); + float m = float(M.alpha[i]) / 255.0f * (1 - min_m) + min_m; + float r = R / m; + float g = G / m; + float b = B / m; + float a = c.w; + + rgb.colors[i] = Vector4(r, g, b, a); + rgb.indices[i] = i; + rgb.weights[i] = max(c.w, 0.001f);// src.weights[i]; // IC: For some reason 0 weights are causing problems, even if we eliminate the corresponding colors from the set. + } + + rgb.createMinimalSet(/*ignoreTransparent=*/true); + + if (rgb.isSingleColor(/*ignoreAlpha=*/true)) { + OptimalCompress::compressDXT1(toColor32(rgb.color(0)), &block->color); + } + else { + ClusterFit fit; + fit.setMetric(compressionOptions.colorWeight); + fit.setColourSet(&rgb); + + Vector3 start, end; + fit.compress4(&start, &end); + + QuickCompress::outputBlock4(rgb, start, end, &block->color); + } +#endif + +#if 0 + block->color.decodeBlock(&RGB); + + //AlphaBlock4x4 M; + //M.initWeights(src); + + for (int i = 0; i < 16; i++) { + const Vector4 & c = src.color(i); + float R = saturate(c.x); + float G = saturate(c.y); + float B = saturate(c.z); + + float r = RGB.color(i).r / 255.0f; + float g = RGB.color(i).g / 255.0f; + float b = RGB.color(i).b / 255.0f; + + float m = (R / r + G / g + B / b) / 3.0f; + //float m = max((R / r + G / g + B / b) / 3.0f, min_m); + //float m = max(max(R / r, G / g), max(B / b, min_m)); + //float m = max(max(R, G), max(B, min_m)); + + m = (m - min_m) / (1 - min_m); + + M.alpha[i] = U8(ftoi_round(saturate(m) * 255.0f)); + M.weights[i] = src.weights[i]; + } + + // Compress M. + if (compressionOptions.quality == Quality_Fastest) { + QuickCompress::compressDXT5A(M, &block->alpha); + } + else { + OptimalCompress::compressDXT5A(M, &block->alpha); + } +#endif + + + +#if 0 + src.fromRGBM(M, min_m); + + src.createMinimalSet(/*ignoreTransparent=*/true); + + if (src.isSingleColor(/*ignoreAlpha=*/true)) { + OptimalCompress::compressDXT1(src.color(0), &block->color); + } + else { + // @@ Use our improved compressor. 
+ ClusterFit fit; + fit.setMetric(compressionOptions.colorWeight); + fit.setColourSet(&src); + + Vector3 start, end; + fit.compress4(&start, &end); + + if (fit.compress3(&start, &end)) { + QuickCompress::outputBlock3(src, start, end, block->color); + } + else { + QuickCompress::outputBlock4(src, start, end, block->color); + } + } +#endif // 0 + + // @@ Decompress color and compute M that best approximates src with these colors? Then compress M again? + + + + // RGBM encoding. + // Maximize precision. + // - Number of possible grey levels: + // - Naive: 2^3 = 8 + // - Better: 2^3 + 2^2 = 12 + // - How to choose min_m? + // - Ideal = Adaptive per block, don't know where to store. + // - Adaptive per lightmap. How to compute optimal? + // - Fixed: 0.25 in our case. Lightmaps scaled to a fixed [0, 1] range. + + // - Optimal compressor: Interpolation artifacts. + + // - Color transform. + // - Measure error in post-tone-mapping color space. + // - Assume a simple tone mapping operator. We know minimum and maximum exposure, but don't know exact exposure in game. + // - Guess based on average lighmap color? Use fixed exposure, in scaled lightmap space. + + // - Enhanced DXT compressor. + // - Typical RGBM encoding as follows: + // rgb -> M = max(rgb), RGB=rgb/M -> RGBM + // - If we add a compression step (M' = M) and M' < M, then rgb may be greater than 1. + // - We could ensure that M' >= M during compression. + // - We could clamp RGB anyway. + // - We could add a fixed scale value to take into account compression errors and avoid clamping. + + + + + + // Compress color. + /*if (rgba.isSingleColor()) + { + OptimalCompress::compressDXT1(rgba.color(0), &block->color); + } + else + { + nvsquish::WeightedClusterFit fit; + fit.SetMetric(compressionOptions.colorWeight.x, compressionOptions.colorWeight.y, compressionOptions.colorWeight.z); + + int flags = 0; + if (alphaMode == nvtt::AlphaMode_Transparency) flags |= nvsquish::kWeightColourByAlpha; + + nvsquish::ColourSet colours((uint8 *)rgba.colors(), flags); + fit.SetColourSet(&colours, 0); + fit.Compress(&block->color); + }*/ + +#endif // 0 \ No newline at end of file Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorRGB.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorRGB.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorRGB.h @@ -0,0 +1,40 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#ifndef NVTT_COMPRESSORRGB_H +#define NVTT_COMPRESSORRGB_H + +#include "Compressor.h" + +namespace nv +{ + struct PixelFormatConverter : public CompressorInterface + { + virtual void compress(nvtt::AlphaMode alphaMode, uint w, uint h, uint d, const float * data, nvtt::TaskDispatcher * dispatcher, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); + }; + +} // nv namespace + + +#endif // NVTT_COMPRESSORRGB_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorRGB.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorRGB.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CompressorRGB.cpp @@ -0,0 +1,568 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include "CompressorRGB.h" +#include "CompressionOptions.h" +#include "OutputOptions.h" + +#include "nvimage/Image.h" +#include "nvimage/FloatImage.h" +#include "nvimage/PixelFormat.h" + +#include "nvmath/Color.h" +#include "nvmath/Half.h" +#include "nvmath/ftoi.h" +#include "nvmath/Vector.inl" + +#include "nvcore/Debug.h" + +using namespace nv; +using namespace nvtt; + +namespace +{ + /* 11 and 10 bit floating point numbers according to the OpenGL packed float extension: + http://www.opengl.org/registry/specs/EXT/packed_float.txt + + 2.1.A Unsigned 11-Bit Floating-Point Numbers + + An unsigned 11-bit floating-point number has no sign bit, a 5-bit + exponent (E), and a 6-bit mantissa (M). The value of an unsigned + 11-bit floating-point number (represented as an 11-bit unsigned + integer N) is determined by the following: + + 0.0, if E == 0 and M == 0, + 2^-14 * (M / 64), if E == 0 and M != 0, + 2^(E-15) * (1 + M/64), if 0 < E < 31, + INF, if E == 31 and M == 0, or + NaN, if E == 31 and M != 0, + + where + + E = floor(N / 64), and + M = N mod 64. 
+ + Implementations are also allowed to use any of the following + alternative encodings: + + 0.0, if E == 0 and M != 0 + 2^(E-15) * (1 + M/64) if E == 31 and M == 0 + 2^(E-15) * (1 + M/64) if E == 31 and M != 0 + + When a floating-point value is converted to an unsigned 11-bit + floating-point representation, finite values are rounded to the closet + representable finite value. While less accurate, implementations + are allowed to always round in the direction of zero. This means + negative values are converted to zero. Likewise, finite positive + values greater than 65024 (the maximum finite representable unsigned + 11-bit floating-point value) are converted to 65024. Additionally: + negative infinity is converted to zero; positive infinity is converted + to positive infinity; and both positive and negative NaN are converted + to positive NaN. + + Any representable unsigned 11-bit floating-point value is legal + as input to a GL command that accepts 11-bit floating-point data. + The result of providing a value that is not a floating-point number + (such as infinity or NaN) to such a command is unspecified, but must + not lead to GL interruption or termination. Providing a denormalized + number or negative zero to GL must yield predictable results. + + 2.1.B Unsigned 10-Bit Floating-Point Numbers + + An unsigned 10-bit floating-point number has no sign bit, a 5-bit + exponent (E), and a 5-bit mantissa (M). The value of an unsigned + 10-bit floating-point number (represented as an 10-bit unsigned + integer N) is determined by the following: + + 0.0, if E == 0 and M == 0, + 2^-14 * (M / 32), if E == 0 and M != 0, + 2^(E-15) * (1 + M/32), if 0 < E < 31, + INF, if E == 31 and M == 0, or + NaN, if E == 31 and M != 0, + + where + + E = floor(N / 32), and + M = N mod 32. + + When a floating-point value is converted to an unsigned 10-bit + floating-point representation, finite values are rounded to the closet + representable finite value. While less accurate, implementations + are allowed to always round in the direction of zero. This means + negative values are converted to zero. Likewise, finite positive + values greater than 64512 (the maximum finite representable unsigned + 10-bit floating-point value) are converted to 64512. Additionally: + negative infinity is converted to zero; positive infinity is converted + to positive infinity; and both positive and negative NaN are converted + to positive NaN. + + Any representable unsigned 10-bit floating-point value is legal + as input to a GL command that accepts 10-bit floating-point data. + The result of providing a value that is not a floating-point number + (such as infinity or NaN) to such a command is unspecified, but must + not lead to GL interruption or termination. Providing a denormalized + number or negative zero to GL must yield predictable results. + */ + + // @@ Is this correct? Not tested! + // 6 bits of mantissa, 5 bits of exponent. + static uint toFloat11(float f) { + if (f < 0) f = 0; // Flush to 0 or to epsilon? + if (f > 65024) f = 65024; // Flush to infinity or max? + + Float754 F; + F.value = f; + + uint E = F.field.biasedexponent - 127 + 15; + nvDebugCheck(E < 32); + + uint M = F.field.mantissa >> (23 - 6); + + return (E << 6) | M; + } + + // @@ Is this correct? Not tested! + // 5 bits of mantissa, 5 bits of exponent. + static uint toFloat10(float f) { + if (f < 0) f = 0; // Flush to 0 or to epsilon? + if (f > 64512) f = 64512; // Flush to infinity or max? 
+ + Float754 F; + F.value = f; + + uint E = F.field.biasedexponent - 127 + 15; + nvDebugCheck(E < 32); + + uint M = F.field.mantissa >> (23 - 5); + + return (E << 5) | M; + } + + + // IC: Inf/NaN and denormal handling based on DirectXMath. + static float fromFloat11(uint u) { + // 5 bit exponent + // 6 bit mantissa + + uint E = (u >> 6) & 0x1F; + uint M = u & 0x3F; + + Float754 F; + F.field.negative = 0; + + if (E == 0x1f) { // INF or NAN. + E = 0xFF; + } + else { + if (E != 0) { + F.field.biasedexponent = E + 127 - 15; + F.field.mantissa = M << (23 - 6); + } + else if (M != 0) { + E = 1; + do { + E--; + M <<= 1; + } while((M & 0x40) == 0); + + M &= 0x3F; + } + } + + F.field.biasedexponent = 0xFF; + F.field.mantissa = M << (23 - 6); + + return F.value; +#if 0 + // X Channel (6-bit mantissa) + Mantissa = pSource->xm; + + if ( pSource->xe == 0x1f ) // INF or NAN + { + Result[0] = 0x7f800000 | (pSource->xm << 17); + } + else + { + if ( pSource->xe != 0 ) // The value is normalized + { + Exponent = pSource->xe; + } + else if (Mantissa != 0) // The value is denormalized + { + // Normalize the value in the resulting float + Exponent = 1; + + do + { + Exponent--; + Mantissa <<= 1; + } while ((Mantissa & 0x40) == 0); + + Mantissa &= 0x3F; + } + else // The value is zero + { + Exponent = (uint32_t)-112; + } + + Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17); + } +#endif + } + + // https://www.opengl.org/registry/specs/EXT/texture_shared_exponent.txt + Float3SE toFloat3SE(float r, float g, float b) + { + const int N = 9; // Mantissa bits. + const int E = 5; // Exponent bits. + const int Emax = (1 << E) - 1; // 31 + const int B = (1 << (E-1)) - 1; // 15 + const float sharedexp_max = float((1 << N) - 1) / (1 << N) * (1 << (Emax-B)); // 65408 + + // Clamp color components. + r = max(0.0f, min(sharedexp_max, r)); + g = max(0.0f, min(sharedexp_max, g)); + b = max(0.0f, min(sharedexp_max, b)); + + // Get max component. + float max_c = max3(r, g, b); + + // Compute shared exponent. + int exp_shared_p = max(-B-1, ftoi_floor(log2f(max_c))) + 1 + B; + + int max_s = ftoi_round(max_c / (1 << (exp_shared_p - B - N))); + + int exp_shared = exp_shared_p; + if (max_s == (1 << N)) exp_shared++; + + Float3SE v; + v.e = exp_shared; + + // Compute mantissas. + v.xm = ftoi_round(r / (1 << (exp_shared - B - N))); + v.ym = ftoi_round(g / (1 << (exp_shared - B - N))); + v.zm = ftoi_round(b / (1 << (exp_shared - B - N))); + + return v; + } + + Vector3 fromFloat3SE(Float3SE v) { + Float754 f; + f.raw = 0x33800000 + (v.e << 23); + float scale = f.value; + return scale * Vector3(float(v.xm), float(v.ym), float(v.zm)); + } + + // These are based on: http://www.graphics.cornell.edu/~bjw/rgbe/rgbe.c + uint toRGBE(float r, float g, float b) + { + float v = max3(r, g, b); + + uint rgbe; + + if (v < 1e-32) { + rgbe = 0; + } + else { + int e; + float scale = frexpf(v, &e) * 256.0f / v; + //Float754 f; + //f.value = v; + //float scale = f.field.biasedexponent * 256.0f / v; + //e = f.field.biasedexponent - 127 + + rgbe |= U8(ftoi_round(r * scale)) << 0; + rgbe |= U8(ftoi_round(g * scale)) << 8; + rgbe |= U8(ftoi_round(b * scale)) << 16; + rgbe |= U8(e + 128) << 24; + } + + return rgbe; + } + + Vector3 fromRGBE(uint rgbe) { + uint r = (rgbe >> 0) & 0xFF; + uint g = (rgbe >> 8) & 0xFF; + uint b = (rgbe >> 16) & 0xFF; + uint e = (rgbe >> 24); + + if (e != 0) { + float scale = ldexpf(1.0f, e-(int)(128+8)); // +8 to divide by 256. @@ Shouldn't we divide by 255 instead? 
+ return scale * Vector3(float(r), float(g), float(b)); + } + + return Vector3(0); + } + + + struct BitStream + { + BitStream(uint8 * ptr) : ptr(ptr), buffer(0), bits(0) { + } + + void putBits(uint p, int bitCount) + { + nvDebugCheck(bits < 8); + nvDebugCheck(bitCount <= 32); + + uint64 buffer = (this->buffer << bitCount) | p; + uint bits = this->bits + bitCount; + + while (bits >= 8) + { + *ptr++ = (buffer & 0xFF); + + buffer >>= 8; + bits -= 8; + } + + this->buffer = (uint8)buffer; + this->bits = bits; + } + + void putFloat(float f) + { + nvDebugCheck(bits == 0); // @@ Do not require alignment. + *((float *)ptr) = f; + ptr += 4; + } + + void putHalf(float f) + { + nvDebugCheck(bits == 0); // @@ Do not require alignment. + *((uint16 *)ptr) = to_half(f); + ptr += 2; + } + + void putFloat11(float f) + { + putBits(toFloat11(f), 11); + } + + void putFloat10(float f) + { + putBits(toFloat10(f), 10); + } + + void flush() + { + nvDebugCheck(bits < 8); + if (bits) { + *ptr++ = buffer; + buffer = 0; + bits = 0; + } + } + + void align(int alignment) + { + nvDebugCheck(alignment >= 1); + flush(); + int remainder = (int)((uintptr_t)ptr % alignment); + if (remainder != 0) { + putBits(0, (alignment - remainder) * 8); + } + } + + uint8 * ptr; + uint8 buffer; + uint8 bits; + }; + +} // namespace + + + +void PixelFormatConverter::compress(nvtt::AlphaMode /*alphaMode*/, uint w, uint h, uint d, const float * data, nvtt::TaskDispatcher * dispatcher, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ + nvDebugCheck (compressionOptions.format == nvtt::Format_RGBA); + + uint bitCount; + uint rmask, rshift, rsize; + uint gmask, gshift, gsize; + uint bmask, bshift, bsize; + uint amask, ashift, asize; + + if (compressionOptions.pixelType == nvtt::PixelType_Float) + { + rsize = compressionOptions.rsize; + gsize = compressionOptions.gsize; + bsize = compressionOptions.bsize; + asize = compressionOptions.asize; + + // Other float sizes are not supported and will be zero-padded. 
+ nvDebugCheck(rsize == 0 || rsize == 10 || rsize == 11 || rsize == 16 || rsize == 32); + nvDebugCheck(gsize == 0 || gsize == 10 || gsize == 11 || gsize == 16 || gsize == 32); + nvDebugCheck(bsize == 0 || bsize == 10 || bsize == 11 || bsize == 16 || bsize == 32); + nvDebugCheck(asize == 0 || asize == 10 || asize == 11 || asize == 16 || asize == 32); + + bitCount = rsize + gsize + bsize + asize; + } + else + { + if (compressionOptions.bitcount != 0) + { + bitCount = compressionOptions.bitcount; + nvCheck(bitCount <= 32); + + rmask = compressionOptions.rmask; + gmask = compressionOptions.gmask; + bmask = compressionOptions.bmask; + amask = compressionOptions.amask; + + PixelFormat::maskShiftAndSize(rmask, &rshift, &rsize); + PixelFormat::maskShiftAndSize(gmask, &gshift, &gsize); + PixelFormat::maskShiftAndSize(bmask, &bshift, &bsize); + PixelFormat::maskShiftAndSize(amask, &ashift, &asize); + } + else + { + rsize = compressionOptions.rsize; + gsize = compressionOptions.gsize; + bsize = compressionOptions.bsize; + asize = compressionOptions.asize; + + bitCount = rsize + gsize + bsize + asize; + nvCheck(bitCount <= 32); + + ashift = 0; + bshift = ashift + asize; + gshift = bshift + bsize; + rshift = gshift + gsize; + + rmask = ((1 << rsize) - 1) << rshift; + gmask = ((1 << gsize) - 1) << gshift; + bmask = ((1 << bsize) - 1) << bshift; + amask = ((1 << asize) - 1) << ashift; + } + } + + const uint pitch = computeBytePitch(w, bitCount, compressionOptions.pitchAlignment); + const uint whd = w * h * d; + + // Allocate output scanline. + uint8 * const dst = malloc(pitch); + + for (uint z = 0; z < d; z++) + { + for (uint y = 0; y < h; y++) + { + const float * src = (const float *)data + (z * h + y) * w; + + BitStream stream(dst); + + for (uint x = 0; x < w; x++) + { + float r = src[x + 0 * whd]; + float g = src[x + 1 * whd]; + float b = src[x + 2 * whd]; + float a = src[x + 3 * whd]; + + if (compressionOptions.pixelType == nvtt::PixelType_Float) + { + if (rsize == 32) stream.putFloat(r); + else if (rsize == 16) stream.putHalf(r); + else if (rsize == 11) stream.putFloat11(r); + else if (rsize == 10) stream.putFloat10(r); + else stream.putBits(0, rsize); + + if (gsize == 32) stream.putFloat(g); + else if (gsize == 16) stream.putHalf(g); + else if (gsize == 11) stream.putFloat11(g); + else if (gsize == 10) stream.putFloat10(g); + else stream.putBits(0, gsize); + + if (bsize == 32) stream.putFloat(b); + else if (bsize == 16) stream.putHalf(b); + else if (bsize == 11) stream.putFloat11(b); + else if (bsize == 10) stream.putFloat10(b); + else stream.putBits(0, bsize); + + if (asize == 32) stream.putFloat(a); + else if (asize == 16) stream.putHalf(a); + else if (asize == 11) stream.putFloat11(a); + else if (asize == 10) stream.putFloat10(a); + else stream.putBits(0, asize); + } + else if (compressionOptions.pixelType == nvtt::PixelType_SharedExp) + { + if (rsize == 9 && gsize == 9 && bsize == 9 && asize == 5) { + Float3SE v = toFloat3SE(r, g, b); + stream.putBits(v.v, 32); + } + else if (rsize == 8 && gsize == 8 && bsize == 8 && asize == 8) { + // @@ + } + else { + // @@ Not supported. Filling with zeros. + stream.putBits(0, bitCount); + } + } + else + { + // We first convert to 16 bits, then to the target size. @@ If greater than 16 bits, this will truncate and bitexpand. 
+ + // @@ Add support for nvtt::PixelType_SignedInt, nvtt::PixelType_SignedNorm, nvtt::PixelType_UnsignedInt + + int ir, ig, ib, ia; + if (compressionOptions.pixelType == nvtt::PixelType_UnsignedNorm) { + ir = iround(clamp(r * 65535.0f, 0.0f, 65535.0f)); + ig = iround(clamp(g * 65535.0f, 0.0f, 65535.0f)); + ib = iround(clamp(b * 65535.0f, 0.0f, 65535.0f)); + ia = iround(clamp(a * 65535.0f, 0.0f, 65535.0f)); + } + else if (compressionOptions.pixelType == nvtt::PixelType_SignedNorm) { + // @@ + } + else if (compressionOptions.pixelType == nvtt::PixelType_UnsignedInt) { + ir = iround(clamp(r, 0.0f, 65535.0f)); + ig = iround(clamp(g, 0.0f, 65535.0f)); + ib = iround(clamp(b, 0.0f, 65535.0f)); + ia = iround(clamp(a, 0.0f, 65535.0f)); + } + else if (compressionOptions.pixelType == nvtt::PixelType_SignedInt) { + // @@ + } + + uint p = 0; + p |= PixelFormat::convert(ir, 16, rsize) << rshift; + p |= PixelFormat::convert(ig, 16, gsize) << gshift; + p |= PixelFormat::convert(ib, 16, bsize) << bshift; + p |= PixelFormat::convert(ia, 16, asize) << ashift; + + stream.putBits(p, bitCount); + } + } + + // Zero padding. + stream.align(compressionOptions.pitchAlignment); + nvDebugCheck(stream.ptr == dst + pitch); + + // Scanlines are always byte-aligned. + outputOptions.writeData(dst, pitch); + } + } + + free(dst); +} Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/Context.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/Context.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/Context.h @@ -0,0 +1,73 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef NV_TT_CONTEXT_H +#define NV_TT_CONTEXT_H + +#include "nvcore/Ptr.h" + +#include "nvtt/Compressor.h" +#include "nvtt/cuda/CudaCompressorDXT.h" +#include "nvtt.h" +#include "TaskDispatcher.h" + +namespace nv +{ + class Image; +} + +namespace nvtt +{ + struct Mipmap; + + struct Compressor::Private + { + Private() {} + + bool compress(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const; + bool compress(const Surface & tex, int face, int mipmap, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const; + bool compress(AlphaMode alphaMode, int w, int h, int d, int face, int mipmap, const float * data, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const; + + void quantize(Surface & tex, const CompressionOptions::Private & compressionOptions) const; + + bool outputHeader(nvtt::TextureType textureType, int w, int h, int d, int faceCount, int mipmapCount, bool isNormalMap, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const; + + nv::CompressorInterface * chooseCpuCompressor(const CompressionOptions::Private & compressionOptions) const; + nv::CompressorInterface * chooseGpuCompressor(const CompressionOptions::Private & compressionOptions) const; + + + bool cudaSupported; + bool cudaEnabled; + + nv::AutoPtr cuda; + + TaskDispatcher * dispatcher; + //SequentialTaskDispatcher defaultDispatcher; + ConcurrentTaskDispatcher defaultDispatcher; + }; + +} // nvtt namespace + + +#endif // NV_TT_CONTEXT_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/Context.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/Context.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/Context.cpp @@ -0,0 +1,862 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2008-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
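[Editor's note] The Context.cpp implementation that follows is the body of the public nvtt::Compressor object: process() drives the InputOptions pipeline, the Surface/CubeSurface and raw overloads of outputHeader()/compress() serve the newer API, and estimateSize() computes output sizes. For orientation, a minimal caller of that public API could look like the sketch below; it is not part of this patch, and the include path and the 4x4 dummy BGRA image are assumptions for illustration.

// Standalone usage sketch, not part of the NVTT sources or this patch.
#include <nvtt/nvtt.h>

int main() {
    unsigned char bgra[4 * 4 * 4] = { 0 }; // 4x4 texture, BGRA8 (the default input format), all black

    nvtt::InputOptions inputOptions;
    inputOptions.setTextureLayout(nvtt::TextureType_2D, 4, 4);
    inputOptions.setMipmapData(bgra, 4, 4);

    nvtt::CompressionOptions compressionOptions;
    compressionOptions.setFormat(nvtt::Format_DXT1);

    nvtt::OutputOptions outputOptions;
    outputOptions.setFileName("output.dds");

    nvtt::Compressor compressor;
    compressor.enableCudaAcceleration(false); // CPU path; see Compressor::enableCudaAcceleration below
    return compressor.process(inputOptions, compressionOptions, outputOptions) ? 0 : 1;
}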
+ +#include "Context.h" + +#include "nvtt.h" + +#include "InputOptions.h" +#include "CompressionOptions.h" +#include "OutputOptions.h" +#include "Surface.h" + +#include "CompressorDX9.h" +#include "CompressorDX10.h" +#include "CompressorDX11.h" +#include "CompressorRGB.h" +#include "cuda/CudaUtils.h" +#include "cuda/CudaCompressorDXT.h" + +#include "nvimage/DirectDrawSurface.h" +#include "nvimage/ColorBlock.h" +#include "nvimage/BlockDXT.h" +#include "nvimage/Image.h" +#include "nvimage/FloatImage.h" +#include "nvimage/Filter.h" +#include "nvimage/Quantize.h" +#include "nvimage/NormalMap.h" +#include "nvimage/PixelFormat.h" +#include "nvimage/ColorSpace.h" + +#include "nvcore/Memory.h" +#include "nvcore/Ptr.h" + +using namespace nv; +using namespace nvtt; + +Compressor::Compressor() : m(*new Compressor::Private()) +{ + // CUDA initialization. + m.cudaSupported = cuda::isHardwarePresent(); + m.cudaEnabled = false; + m.cuda = NULL; + + enableCudaAcceleration(m.cudaSupported); + + m.dispatcher = &m.defaultDispatcher; +} + +Compressor::~Compressor() +{ + delete &m; +} + + +void Compressor::enableCudaAcceleration(bool enable) +{ + if (m.cudaSupported) + { + m.cudaEnabled = enable; + } + + if (m.cudaEnabled && m.cuda == NULL) + { + m.cuda = new CudaContext(); + + if (!m.cuda->isValid()) + { + m.cudaEnabled = false; + m.cuda = NULL; + } + } +} + +bool Compressor::isCudaAccelerationEnabled() const +{ + return m.cudaEnabled; +} + +void Compressor::setTaskDispatcher(TaskDispatcher * disp) +{ + if (disp == NULL) { + m.dispatcher = &m.defaultDispatcher; + } + else { + m.dispatcher = disp; + } +} + + +// Input Options API. +bool Compressor::process(const InputOptions & inputOptions, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const +{ + return m.compress(inputOptions.m, compressionOptions.m, outputOptions.m); +} + +int Compressor::estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions) const +{ + int w = inputOptions.m.width; + int h = inputOptions.m.height; + int d = inputOptions.m.depth; + + getTargetExtent(&w, &h, &d, inputOptions.m.maxExtent, inputOptions.m.roundMode, inputOptions.m.textureType); + + int mipmapCount = 1; + if (inputOptions.m.generateMipmaps) { + mipmapCount = countMipmaps(w, h, d); + if (inputOptions.m.maxLevel > 0) mipmapCount = min(mipmapCount, inputOptions.m.maxLevel); + } + + return inputOptions.m.faceCount * estimateSize(w, h, d, mipmapCount, compressionOptions); +} + + +// Surface API. 
+bool Compressor::outputHeader(const Surface & tex, int mipmapCount, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const +{ + return m.outputHeader(tex.type(), tex.width(), tex.height(), tex.depth(), 1, mipmapCount, tex.isNormalMap(), compressionOptions.m, outputOptions.m); +} + +bool Compressor::compress(const Surface & tex, int face, int mipmap, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const +{ + return m.compress(tex, face, mipmap, compressionOptions.m, outputOptions.m); +} + +int Compressor::estimateSize(const Surface & tex, int mipmapCount, const CompressionOptions & compressionOptions) const +{ + const int w = tex.width(); + const int h = tex.height(); + const int d = tex.depth(); + + return estimateSize(w, h, d, mipmapCount, compressionOptions); +} + +bool Compressor::outputHeader(const CubeSurface & cube, int mipmapCount, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const +{ + return m.outputHeader(TextureType_Cube, cube.edgeLength(), cube.edgeLength(), 1, 1, mipmapCount, false, compressionOptions.m, outputOptions.m); +} + +bool Compressor::compress(const CubeSurface & cube, int mipmap, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const +{ + for (int i = 0; i < 6; i++) { + if(!m.compress(cube.face(i), i, mipmap, compressionOptions.m, outputOptions.m)) { + return false; + } + } + return true; +} + +int Compressor::estimateSize(const CubeSurface & cube, int mipmapCount, const CompressionOptions & compressionOptions) const +{ + return 6 * estimateSize(cube.edgeLength(), cube.edgeLength(), 1, mipmapCount, compressionOptions); +} + + +// Raw API. +bool Compressor::outputHeader(TextureType type, int w, int h, int d, int arraySize, int mipmapCount, bool isNormalMap, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const +{ + return m.outputHeader(type, w, h, d, arraySize, mipmapCount, isNormalMap, compressionOptions.m, outputOptions.m); +} + +bool Compressor::compress(int w, int h, int d, int face, int mipmap, const float * rgba, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const +{ + return m.compress(AlphaMode_None, w, h, d, face, mipmap, rgba, compressionOptions.m, outputOptions.m); +} + +int Compressor::estimateSize(int w, int h, int d, int mipmapCount, const CompressionOptions & compressionOptions) const +{ + const Format format = compressionOptions.m.format; + + const uint bitCount = compressionOptions.m.getBitCount(); + const uint pitchAlignment = compressionOptions.m.pitchAlignment; + + int size = 0; + for (int m = 0; m < mipmapCount; m++) + { + size += computeImageSize(w, h, d, bitCount, pitchAlignment, format); + + // Compute extents of next mipmap: + w = max(1, w / 2); + h = max(1, h / 2); + d = max(1, d / 2); + } + + return size; +} + + + + + +bool Compressor::Private::compress(const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const +{ + // Make sure enums match. + nvStaticCheck(FloatImage::WrapMode_Clamp == (FloatImage::WrapMode)WrapMode_Clamp); + nvStaticCheck(FloatImage::WrapMode_Mirror == (FloatImage::WrapMode)WrapMode_Mirror); + nvStaticCheck(FloatImage::WrapMode_Repeat == (FloatImage::WrapMode)WrapMode_Repeat); + + // Get output handler. 
+ if (!outputOptions.hasValidOutputHandler()) { + outputOptions.error(Error_FileOpen); + return false; + } + + nvtt::Surface img; + img.setWrapMode(inputOptions.wrapMode); + img.setAlphaMode(inputOptions.alphaMode); + img.setNormalMap(inputOptions.isNormalMap); + + const int faceCount = inputOptions.faceCount; + int width = inputOptions.width; + int height = inputOptions.height; + int depth = inputOptions.depth; + int arraySize = inputOptions.textureType == TextureType_Array ? faceCount : 1; + + nv::getTargetExtent(&width, &height, &depth, inputOptions.maxExtent, inputOptions.roundMode, inputOptions.textureType); + + // If the extents have not changed, then we can use source images for all mipmaps. + bool canUseSourceImages = (inputOptions.width == width && inputOptions.height == height && inputOptions.depth == depth); + + int mipmapCount = 1; + if (inputOptions.generateMipmaps) { + mipmapCount = countMipmaps(width, height, depth); + if (inputOptions.maxLevel > 0) mipmapCount = min(mipmapCount, inputOptions.maxLevel); + } + + if (!outputHeader(inputOptions.textureType, width, height, depth, arraySize, mipmapCount, img.isNormalMap(), compressionOptions, outputOptions)) { + return false; + } + + + // Output images. + for (int f = 0; f < faceCount; f++) + { + int w = width; + int h = height; + int d = depth; + bool canUseSourceImagesForThisFace = canUseSourceImages; + + img.setImage(inputOptions.inputFormat, inputOptions.width, inputOptions.height, inputOptions.depth, inputOptions.images[f]); + + // To normal map. + if (inputOptions.convertToNormalMap) { + img.toGreyScale(inputOptions.heightFactors.x, inputOptions.heightFactors.y, inputOptions.heightFactors.z, inputOptions.heightFactors.w); + img.toNormalMap(inputOptions.bumpFrequencyScale.x, inputOptions.bumpFrequencyScale.y, inputOptions.bumpFrequencyScale.z, inputOptions.bumpFrequencyScale.w); + img.packNormals(); + } + + // To linear space. + if (!img.isNormalMap()) { + img.toLinear(inputOptions.inputGamma); + } + + // Resize input. + img.resize(w, h, d, ResizeFilter_Box); + + nvtt::Surface tmp = img; + if (!img.isNormalMap()) { + tmp.toGamma(inputOptions.outputGamma); + } + + quantize(tmp, compressionOptions); + compress(tmp, f, 0, compressionOptions, outputOptions); + + for (int m = 1; m < mipmapCount; m++) { + w = max(1, w/2); + h = max(1, h/2); + d = max(1, d/2); + + int idx = m * faceCount + f; + + bool useSourceImages = false; + if (canUseSourceImagesForThisFace) { + if (inputOptions.images[idx] == NULL) { // One face is missing in this mipmap level. + canUseSourceImagesForThisFace = false; // If one level is missing, ignore the following source images. + } + else { + useSourceImages = true; + } + } + + if (useSourceImages) { + img.setImage(inputOptions.inputFormat, w, h, d, inputOptions.images[idx]); + + // For already generated mipmaps, we need to convert to linear. 
+ if (!img.isNormalMap()) { + img.toLinear(inputOptions.inputGamma); + } + } + else { + if (inputOptions.mipmapFilter == MipmapFilter_Kaiser) { + float params[2] = { inputOptions.kaiserAlpha, inputOptions.kaiserStretch }; + img.buildNextMipmap(MipmapFilter_Kaiser, inputOptions.kaiserWidth, params); + } + else { + img.buildNextMipmap(inputOptions.mipmapFilter); + } + } + nvDebugCheck(img.width() == w); + nvDebugCheck(img.height() == h); + nvDebugCheck(img.depth() == d); + + if (img.isNormalMap()) { + if (inputOptions.normalizeMipmaps) { + img.expandNormals(); + img.normalizeNormalMap(); + img.packNormals(); + } + tmp = img; + } + else { + tmp = img; + tmp.toGamma(inputOptions.outputGamma); + } + + quantize(tmp, compressionOptions); + compress(tmp, f, m, compressionOptions, outputOptions); + } + } + + return true; +} + +bool Compressor::Private::compress(const Surface & tex, int face, int mipmap, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const +{ + if (!compress(tex.alphaMode(), tex.width(), tex.height(), tex.depth(), face, mipmap, tex.data(), compressionOptions, outputOptions)) { + return false; + } + + return true; +} + +bool Compressor::Private::compress(AlphaMode alphaMode, int w, int h, int d, int face, int mipmap, const float * rgba, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const +{ + int size = computeImageSize(w, h, d, compressionOptions.getBitCount(), compressionOptions.pitchAlignment, compressionOptions.format); + outputOptions.beginImage(size, w, h, d, face, mipmap); + + // Decide what compressor to use. + AutoPtr compressor; +#if defined HAVE_CUDA + if (cudaEnabled && w * h >= 512) + { + compressor = chooseGpuCompressor(compressionOptions); + } +#endif + if (compressor == NULL) + { + compressor = chooseCpuCompressor(compressionOptions); + } + + if (compressor == NULL) + { + outputOptions.error(Error_UnsupportedFeature); + } + else + { + compressor->compress(alphaMode, w, h, d, rgba, dispatcher, compressionOptions, outputOptions); + } + + outputOptions.endImage(); + + return true; +} + + +void Compressor::Private::quantize(Surface & img, const CompressionOptions::Private & compressionOptions) const +{ + if (compressionOptions.enableColorDithering) { + if (compressionOptions.format >= Format_BC1 && compressionOptions.format <= Format_BC3) { + img.quantize(0, 5, true, true); + img.quantize(1, 6, true, true); + img.quantize(2, 5, true, true); + } + else if (compressionOptions.format == Format_RGB) { + img.quantize(0, compressionOptions.rsize, true, true); + img.quantize(1, compressionOptions.gsize, true, true); + img.quantize(2, compressionOptions.bsize, true, true); + } + } + if (compressionOptions.enableAlphaDithering) { + if (compressionOptions.format == Format_RGB) { + img.quantize(3, compressionOptions.asize, true, true); + } + } + else if (compressionOptions.binaryAlpha) { + img.binarize(3, float(compressionOptions.alphaThreshold)/255.0f, compressionOptions.enableAlphaDithering); + } +} + +bool Compressor::Private::outputHeader(nvtt::TextureType textureType, int w, int h, int d, int arraySize, int mipmapCount, bool isNormalMap, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const +{ + if (w <= 0 || h <= 0 || d <= 0 || arraySize <= 0 || mipmapCount <= 0) + { + outputOptions.error(Error_InvalidInput); + return false; + } + + if (!outputOptions.outputHeader) + { + return true; + } + + // Output DDS header. 
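+    // [Editor's note] Layout reminder, derived from the checks further down in this
+    // function: the legacy DDS header is 128 bytes, and choosing Container_DDS10 adds
+    // the 20-byte DX10 extension header, giving the 148-byte total asserted below.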
+ if (outputOptions.container == Container_DDS || outputOptions.container == Container_DDS10) + { + DDSHeader header; + + header.setUserVersion(outputOptions.version); + + if (textureType == TextureType_2D) { + nvCheck(arraySize == 1); + header.setTexture2D(); + } + else if (textureType == TextureType_Cube) { + nvCheck(arraySize == 1); + header.setTextureCube(); + } + else if (textureType == TextureType_3D) { + nvCheck(arraySize == 1); + header.setTexture3D(); + header.setDepth(d); + } + else if (textureType == TextureType_Array) { + header.setTextureArray(arraySize); + } + + header.setWidth(w); + header.setHeight(h); + header.setMipmapCount(mipmapCount); + + bool supported = true; + + if (outputOptions.container == Container_DDS10) + { + if (compressionOptions.format == Format_RGBA) + { + const uint bitcount = compressionOptions.getBitCount(); + + if (compressionOptions.pixelType == PixelType_Float) { + if (compressionOptions.rsize == 16 && compressionOptions.gsize == 16 && compressionOptions.bsize == 16 && compressionOptions.asize == 16) { + header.setDX10Format(DXGI_FORMAT_R16G16B16A16_FLOAT); + } + else if (compressionOptions.rsize == 11 && compressionOptions.gsize == 11 && compressionOptions.bsize == 10 && compressionOptions.asize == 0) { + header.setDX10Format(DXGI_FORMAT_R11G11B10_FLOAT); + } + else { + supported = false; + } + } + else { + if (bitcount == 16 && compressionOptions.rsize == 16) { + header.setDX10Format(DXGI_FORMAT_R16_UNORM); + } + else { + uint format = findDXGIFormat(compressionOptions.bitcount, + compressionOptions.rmask, + compressionOptions.gmask, + compressionOptions.bmask, + compressionOptions.amask); + + if (format != DXGI_FORMAT_UNKNOWN) { + header.setDX10Format(format); + } + else { + supported = false; + } + } + } + } + else + { + if (compressionOptions.format == Format_DXT1 || compressionOptions.format == Format_DXT1a || compressionOptions.format == Format_DXT1n) { + header.setDX10Format(outputOptions.srgb ? DXGI_FORMAT_BC1_UNORM_SRGB : DXGI_FORMAT_BC1_UNORM); + if (compressionOptions.format == Format_DXT1a) header.setHasAlphaFlag(true); + if (isNormalMap) header.setNormalFlag(true); + } + else if (compressionOptions.format == Format_DXT3) { + header.setDX10Format(outputOptions.srgb ? DXGI_FORMAT_BC2_UNORM_SRGB : DXGI_FORMAT_BC2_UNORM); + } + else if (compressionOptions.format == Format_DXT5 || compressionOptions.format == Format_BC3_RGBM) { + header.setDX10Format(outputOptions.srgb ? DXGI_FORMAT_BC3_UNORM_SRGB : DXGI_FORMAT_BC3_UNORM); + } + else if (compressionOptions.format == Format_DXT5n) { + header.setDX10Format(DXGI_FORMAT_BC3_UNORM); + if (isNormalMap) header.setNormalFlag(true); + } + else if (compressionOptions.format == Format_BC4) { + header.setDX10Format(DXGI_FORMAT_BC4_UNORM); // DXGI_FORMAT_BC4_SNORM ? + } + else if (compressionOptions.format == Format_BC5 /*|| compressionOptions.format == Format_BC5_Luma*/) { + header.setDX10Format(DXGI_FORMAT_BC5_UNORM); // DXGI_FORMAT_BC5_SNORM ? + if (isNormalMap) header.setNormalFlag(true); + } + else if (compressionOptions.format == Format_BC6) { + if (compressionOptions.pixelType == PixelType_Float) header.setDX10Format(DXGI_FORMAT_BC6H_SF16); + /*if (compressionOptions.pixelType == PixelType_UnsignedFloat)*/ header.setDX10Format(DXGI_FORMAT_BC6H_UF16); // By default we assume unsigned. + } + else if (compressionOptions.format == Format_BC7) { + header.setDX10Format(outputOptions.srgb ? 
DXGI_FORMAT_BC7_UNORM_SRGB : DXGI_FORMAT_BC7_UNORM); + if (isNormalMap) header.setNormalFlag(true); + } + else if (compressionOptions.format == Format_CTX1) { + supported = false; + } + else { + supported = false; + } + } + } + else + { + if (compressionOptions.format == Format_RGBA) + { + // Get output bit count. + header.setPitch(computeBytePitch(w, compressionOptions.getBitCount(), compressionOptions.pitchAlignment)); + + if (compressionOptions.pixelType == PixelType_Float) + { + if (compressionOptions.rsize == 16 && compressionOptions.gsize == 0 && compressionOptions.bsize == 0 && compressionOptions.asize == 0) + { + header.setFormatCode(111); // D3DFMT_R16F + } + else if (compressionOptions.rsize == 16 && compressionOptions.gsize == 16 && compressionOptions.bsize == 0 && compressionOptions.asize == 0) + { + header.setFormatCode(112); // D3DFMT_G16R16F + } + else if (compressionOptions.rsize == 16 && compressionOptions.gsize == 16 && compressionOptions.bsize == 16 && compressionOptions.asize == 16) + { + header.setFormatCode(113); // D3DFMT_A16B16G16R16F + } + else if (compressionOptions.rsize == 32 && compressionOptions.gsize == 0 && compressionOptions.bsize == 0 && compressionOptions.asize == 0) + { + header.setFormatCode(114); // D3DFMT_R32F + } + else if (compressionOptions.rsize == 32 && compressionOptions.gsize == 32 && compressionOptions.bsize == 0 && compressionOptions.asize == 0) + { + header.setFormatCode(115); // D3DFMT_G32R32F + } + else if (compressionOptions.rsize == 32 && compressionOptions.gsize == 32 && compressionOptions.bsize == 32 && compressionOptions.asize == 32) + { + header.setFormatCode(116); // D3DFMT_A32B32G32R32F + } + else + { + supported = false; + } + } + else // Fixed point + { + const uint bitcount = compressionOptions.getBitCount(); + + if (compressionOptions.bitcount != 0) + { + // Masks already computed. + header.setPixelFormat(compressionOptions.bitcount, compressionOptions.rmask, compressionOptions.gmask, compressionOptions.bmask, compressionOptions.amask); + } + else if (bitcount <= 32) + { + // Compute pixel format masks. 
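+                    // [Editor's note] Worked example: for a 16-bit R5G6B5 layout
+                    // (rsize=5, gsize=6, bsize=5, asize=0) the shifts come out as
+                    // ashift=0, bshift=0, gshift=5, rshift=11, giving the familiar
+                    // masks rmask=0xF800, gmask=0x07E0, bmask=0x001F, amask=0x0000.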
+ const uint ashift = 0; + const uint bshift = ashift + compressionOptions.asize; + const uint gshift = bshift + compressionOptions.bsize; + const uint rshift = gshift + compressionOptions.gsize; + + const uint rmask = ((1 << compressionOptions.rsize) - 1) << rshift; + const uint gmask = ((1 << compressionOptions.gsize) - 1) << gshift; + const uint bmask = ((1 << compressionOptions.bsize) - 1) << bshift; + const uint amask = ((1 << compressionOptions.asize) - 1) << ashift; + + header.setPixelFormat(bitcount, rmask, gmask, bmask, amask); + } + else + { + supported = false; + } + } + } + else + { + header.setLinearSize(computeImageSize(w, h, d, compressionOptions.bitcount, compressionOptions.pitchAlignment, compressionOptions.format)); + + if (compressionOptions.format == Format_DXT1 || compressionOptions.format == Format_DXT1a || compressionOptions.format == Format_DXT1n) { + header.setFourCC('D', 'X', 'T', '1'); + if (isNormalMap) header.setNormalFlag(true); + } + else if (compressionOptions.format == Format_DXT3) { + header.setFourCC('D', 'X', 'T', '3'); + } + else if (compressionOptions.format == Format_DXT5 || compressionOptions.format == Format_BC3_RGBM) { + header.setFourCC('D', 'X', 'T', '5'); + } + else if (compressionOptions.format == Format_DXT5n) { + header.setFourCC('D', 'X', 'T', '5'); + if (isNormalMap) { + header.setNormalFlag(true); + header.setSwizzleCode('A', '2', 'D', '5'); + //header.setSwizzleCode('x', 'G', 'x', 'R'); + } + } + else if (compressionOptions.format == Format_BC4) { + header.setFourCC('A', 'T', 'I', '1'); + } + else if (compressionOptions.format == Format_BC5 /*|| compressionOptions.format == Format_BC5_Luma*/) { + header.setFourCC('A', 'T', 'I', '2'); + if (isNormalMap) { + header.setNormalFlag(true); + header.setSwizzleCode('A', '2', 'X', 'Y'); + } + } + else if (compressionOptions.format == Format_BC6) { + header.setFourCC('Z', 'O', 'H', ' '); // This is not supported by D3DX. Always use DX10 header with BC6-7 formats. + supported = false; + } + else if (compressionOptions.format == Format_BC7) { + header.setFourCC('Z', 'O', 'L', 'A'); // This is not supported by D3DX. Always use DX10 header with BC6-7 formats. + if (isNormalMap) header.setNormalFlag(true); + supported = false; + } + else if (compressionOptions.format == Format_CTX1) { + header.setFourCC('C', 'T', 'X', '1'); + if (isNormalMap) header.setNormalFlag(true); + } + else { + supported = false; + } + } + + if (outputOptions.srgb) header.setSrgbFlag(true); + } + + if (!supported) + { + // This container does not support the requested format. + outputOptions.error(Error_UnsupportedOutputFormat); + return false; + } + + uint headerSize = 128; + if (header.hasDX10Header()) + { + nvStaticCheck(sizeof(DDSHeader) == 128 + 20); + headerSize = 128 + 20; + } + + // Swap bytes if necessary. 
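+    // [Editor's note] DDS headers are stored little-endian on disk, so this swap is
+    // presumably a no-op on little-endian hosts and only matters when writing from a
+    // big-endian machine.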
+ header.swapBytes(); + + bool writeSucceed = outputOptions.writeData(&header, headerSize); + if (!writeSucceed) + { + outputOptions.error(Error_FileWrite); + } + + return writeSucceed; + } + + return true; +} + + +CompressorInterface * Compressor::Private::chooseCpuCompressor(const CompressionOptions::Private & compressionOptions) const +{ + if (compressionOptions.format == Format_RGB) + { + return new PixelFormatConverter; + } + else if (compressionOptions.format == Format_DXT1) + { +#if defined(HAVE_ATITC) + if (compressionOptions.externalCompressor == "ati") return new AtiCompressorDXT1; + else +#endif + +#if defined(HAVE_SQUISH) + if (compressionOptions.externalCompressor == "squish") return new SquishCompressorDXT1; + else +#endif + +#if defined(HAVE_D3DX) + if (compressionOptions.externalCompressor == "d3dx") return new D3DXCompressorDXT1; + else +#endif + +#if defined(HAVE_D3DX) + if (compressionOptions.externalCompressor == "stb") return new StbCompressorDXT1; + else +#endif + + if (compressionOptions.quality == Quality_Fastest) + { + return new FastCompressorDXT1; + } + + return new CompressorDXT1; + } + else if (compressionOptions.format == Format_DXT1a) + { + if (compressionOptions.quality == Quality_Fastest) + { + return new FastCompressorDXT1a; + } + + return new CompressorDXT1a; + } + else if (compressionOptions.format == Format_DXT1n) + { + // Not supported. + } + else if (compressionOptions.format == Format_DXT3) + { + if (compressionOptions.quality == Quality_Fastest) + { + return new FastCompressorDXT3; + } + + return new CompressorDXT3; + } + else if (compressionOptions.format == Format_DXT5) + { +#if defined(HAVE_ATITC) + if (compressionOptions.externalCompressor == "ati") return new AtiCompressorDXT5; + else +#endif + + if (compressionOptions.quality == Quality_Fastest) + { + return new FastCompressorDXT5; + } + + return new CompressorDXT5; + } + else if (compressionOptions.format == Format_DXT5n) + { + if (compressionOptions.quality == Quality_Fastest) + { + return new FastCompressorDXT5n; + } + + return new CompressorDXT5n; + } + else if (compressionOptions.format == Format_BC4) + { + if (compressionOptions.quality == Quality_Fastest || compressionOptions.quality == Quality_Normal) + { + return new FastCompressorBC4; + } + + return new ProductionCompressorBC4; + } + else if (compressionOptions.format == Format_BC5) + { + if (compressionOptions.quality == Quality_Fastest || compressionOptions.quality == Quality_Normal) + { + return new FastCompressorBC5; + } + + return new ProductionCompressorBC5; + } + else if (compressionOptions.format == Format_CTX1) + { + // Not supported. + } + else if (compressionOptions.format == Format_BC6) + { + return new CompressorBC6; + } + else if (compressionOptions.format == Format_BC7) + { + return new CompressorBC7; + } + /*else if (compressionOptions.format == Format_BC5_Luma) + { + return new ProductionCompressorBC5_Luma; + }*/ + else if (compressionOptions.format == Format_BC3_RGBM) + { + return new CompressorBC3_RGBM; + } + + return NULL; +} + + +CompressorInterface * Compressor::Private::chooseGpuCompressor(const CompressionOptions::Private & compressionOptions) const +{ + nvDebugCheck(cudaSupported); + + if (compressionOptions.quality == Quality_Fastest) + { + // Do not use CUDA compressors in fastest quality mode. 
+ return NULL; + } + +#if defined HAVE_CUDA + if (compressionOptions.format == Format_DXT1) + { + return new CudaCompressorDXT1(*cuda); + } + else if (compressionOptions.format == Format_DXT1a) + { + //#pragma NV_MESSAGE("TODO: Implement CUDA DXT1a compressor.") + } + else if (compressionOptions.format == Format_DXT1n) + { + // Not supported. + } + else if (compressionOptions.format == Format_DXT3) + { + //return new CudaCompressorDXT3(*cuda); + } + else if (compressionOptions.format == Format_DXT5) + { + //return new CudaCompressorDXT5(*cuda); + } + else if (compressionOptions.format == Format_DXT5n) + { + // @@ Return CUDA compressor. + } + else if (compressionOptions.format == Format_BC4) + { + // Not supported. + } + else if (compressionOptions.format == Format_BC5) + { + // Not supported. + } + else if (compressionOptions.format == Format_CTX1) + { + // @@ Return CUDA compressor. + } + else if (compressionOptions.format == Format_BC6) + { + // Not supported. + } + else if (compressionOptions.format == Format_BC7) + { + // Not supported. + } +#endif // defined HAVE_CUDA + + return NULL; +} Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CubeSurface.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CubeSurface.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CubeSurface.h @@ -0,0 +1,110 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#ifndef NVTT_CUBEIMAGE_H +#define NVTT_CUBEIMAGE_H + +#include "nvtt.h" +#include "Surface.h" + +#include "nvimage/FloatImage.h" + +#include "nvmath/Vector.h" + +#include "nvcore/RefCounted.h" +#include "nvcore/Ptr.h" +#include "nvcore/Array.h" + + +namespace nvtt +{ + struct TexelTable { + TexelTable(uint edgeLength); + + float solidAngle(uint f, uint x, uint y) const; + const nv::Vector3 & direction(uint f, uint x, uint y) const; + + uint size; + nv::Array solidAngleArray; + nv::Array directionArray; + }; + + + struct CubeSurface::Private : public nv::RefCounted + { + void operator=(const Private &); + public: + Private() + { + nvDebugCheck( refCount() == 0 ); + + edgeLength = 0; + texelTable = NULL; + } + Private(const Private & p) : RefCounted() // Copy ctor. inits refcount to 0. + { + nvDebugCheck( refCount() == 0 ); + + edgeLength = p.edgeLength; + for (uint i = 0; i < 6; i++) { + face[i] = p.face[i]; + } + texelTable = NULL; // @@ Transfer tables. Needs refcounting? 
+ } + ~Private() + { + delete texelTable; + } + + void allocate(uint edgeLength) + { + this->edgeLength = edgeLength; + for (uint i = 0; i < 6; i++) { + face[i].detach(); + face[i].m->image = new nv::FloatImage; + face[i].m->image->allocate(4, edgeLength, edgeLength, 1); + } + } + + void allocateTexelTable() + { + if (texelTable == NULL) { + texelTable = new TexelTable(edgeLength); + } + } + + // Filtering helpers: + nv::Vector3 applyAngularFilter(const nv::Vector3 & dir, float coneAngle, float * filterTable, int tableSize); + nv::Vector3 applyCosinePowerFilter(const nv::Vector3 & dir, float coneAngle, float cosinePower); + + nv::Vector3 sample(const nv::Vector3 & dir); + + uint edgeLength; + Surface face[6]; + TexelTable * texelTable; + }; + +} // nvtt namespace + + +#endif // NVTT_CUBEIMAGE_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/CubeSurface.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/CubeSurface.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/CubeSurface.cpp @@ -0,0 +1,1042 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include "CubeSurface.h" +#include "Surface.h" + +#include "nvimage/DirectDrawSurface.h" + +#include "nvmath/Vector.inl" + +#include "nvcore/Array.inl" +#include "nvcore/StrLib.h" + +using namespace nv; +using namespace nvtt; + + + +// Solid angle of an axis aligned quad from (0,0,1) to (x,y,1) +// See: http://www.fizzmoll11.com/thesis/ for a derivation of this formula. +static float areaElement(float x, float y) { + return atan2(x*y, sqrtf(x*x + y*y + 1)); +} + +// Solid angle of a hemicube texel. +static float solidAngleTerm(uint x, uint y, float inverseEdgeLength) { + // Transform x,y to [-1, 1] range, offset by 0.5 to point to texel center. + float u = (float(x) + 0.5f) * (2 * inverseEdgeLength) - 1.0f; + float v = (float(y) + 0.5f) * (2 * inverseEdgeLength) - 1.0f; + nvDebugCheck(u >= -1.0f && u <= 1.0f); + nvDebugCheck(v >= -1.0f && v <= 1.0f); + +#if 1 + // Exact solid angle: + float x0 = u - inverseEdgeLength; + float y0 = v - inverseEdgeLength; + float x1 = u + inverseEdgeLength; + float y1 = v + inverseEdgeLength; + float solidAngle = areaElement(x0, y0) - areaElement(x0, y1) - areaElement(x1, y0) + areaElement(x1, y1); + nvDebugCheck(solidAngle > 0.0f); + + return solidAngle; +#else + // This formula is equivalent, but not as precise. 
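+    // [Editor's note] The approximation below treats the texel as a flat differential
+    // area dA = (2/edgeLength)^2 on the z = 1 plane. With r^2 = 1 + u^2 + v^2 and
+    // cos(theta) = 1/r, the subtended solid angle is dA * cos(theta) / r^2
+    // = dA / (1 + u^2 + v^2)^(3/2), which is exactly what this branch computes.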
+ float pixel_area = nv::square(2.0f * inverseEdgeLength); + float dist_square = 1.0f + nv::square(u) + nv::square(v); + float cos_theta = 1.0f / sqrt(dist_square); + float cos_theta_d2 = cos_theta / dist_square; // Funny this is just 1/dist^3 or cos(tetha)^3 + + return pixel_area * cos_theta_d2; +#endif +} + + +static Vector3 texelDirection(uint face, uint x, uint y, int edgeLength, EdgeFixup fixupMethod) +{ + float u, v; + if (fixupMethod == EdgeFixup_Stretch) { + // Transform x,y to [-1, 1] range, match up edges exactly. + u = float(x) * 2.0f / (edgeLength - 1) - 1.0f; + v = float(y) * 2.0f / (edgeLength - 1) - 1.0f; + } + else { + // Transform x,y to [-1, 1] range, offset by 0.5 to point to texel center. + u = (float(x) + 0.5f) * (2.0f / edgeLength) - 1.0f; + v = (float(y) + 0.5f) * (2.0f / edgeLength) - 1.0f; + } + + if (fixupMethod == EdgeFixup_Warp) { + // Warp texel centers in the proximity of the edges. + float a = powf(float(edgeLength), 2.0f) / powf(float(edgeLength - 1), 3.0f); + u = a * powf(u, 3) + u; + v = a * powf(v, 3) + v; + } + + nvDebugCheck(u >= -1.0f && u <= 1.0f); + nvDebugCheck(v >= -1.0f && v <= 1.0f); + + Vector3 n; + + if (face == 0) { + n.x = 1; + n.y = -v; + n.z = -u; + } + if (face == 1) { + n.x = -1; + n.y = -v; + n.z = u; + } + + if (face == 2) { + n.x = u; + n.y = 1; + n.z = v; + } + if (face == 3) { + n.x = u; + n.y = -1; + n.z = -v; + } + + if (face == 4) { + n.x = u; + n.y = -v; + n.z = 1; + } + if (face == 5) { + n.x = -u; + n.y = -v; + n.z = -1; + } + + return normalizeFast(n); +} + + +TexelTable::TexelTable(uint edgeLength) : size(edgeLength) { + + uint hsize = size/2; + + // Allocate a small solid angle table that takes into account cube map symmetry. + solidAngleArray.resize(hsize * hsize); + + for (uint y = 0; y < hsize; y++) { + for (uint x = 0; x < hsize; x++) { + solidAngleArray[y * hsize + x] = solidAngleTerm(hsize+x, hsize+y, 1.0f/edgeLength); + } + } + + + directionArray.resize(size*size*6); + + for (uint f = 0; f < 6; f++) { + for (uint y = 0; y < size; y++) { + for (uint x = 0; x < size; x++) { + directionArray[(f * size + y) * size + x] = texelDirection(f, x, y, edgeLength, EdgeFixup_None); + } + } + } +} + +const Vector3 & TexelTable::direction(uint f, uint x, uint y) const { + nvDebugCheck(f < 6 && x < size && y < size); + return directionArray[(f * size + y) * size + x]; +} + +float TexelTable::solidAngle(uint f, uint x, uint y) const { + uint hsize = size/2; + if (x >= hsize) x -= hsize; + else if (x < hsize) x = hsize - x - 1; + if (y >= hsize) y -= hsize; + else if (y < hsize) y = hsize - y - 1; + + return solidAngleArray[y * hsize + x]; +} + + +static const Vector3 faceNormals[6] = { + Vector3(1, 0, 0), + Vector3(-1, 0, 0), + Vector3(0, 1, 0), + Vector3(0, -1, 0), + Vector3(0, 0, 1), + Vector3(0, 0, -1), +}; + +static const Vector3 faceU[6] = { + Vector3(0, 0, -1), + Vector3(0, 0, 1), + Vector3(1, 0, 0), + Vector3(1, 0, 0), + Vector3(1, 0, 0), + Vector3(-1, 0, 0), +}; + +static const Vector3 faceV[6] = { + Vector3(0, -1, 0), + Vector3(0, -1, 0), + Vector3(0, 0, 1), + Vector3(0, 0, -1), + Vector3(0, -1, 0), + Vector3(0, -1, 0), +}; + + +static Vector2 toPolar(Vector3::Arg v) { + Vector2 p; + p.x = atan2(v.x, v.y); // theta + p.y = acosf(v.z); // phi + return p; +} + +static Vector2 toPlane(float theta, float phi) { + float x = sin(phi) * cos(theta); + float y = sin(phi) * sin(theta); + float z = cos(phi); + + Vector2 p; + p.x = x / fabs(z); + p.y = y / fabs(z); + //p.x = tan(phi) * cos(theta); + //p.y = tan(phi) * sin(theta); + + 
return p; +} + +static Vector2 toPlane(Vector3::Arg v) { + Vector2 p; + p.x = v.x / fabs(v.z); + p.y = v.y / fabs(v.z); + return p; +} + + + + + +CubeSurface::CubeSurface() : m(new CubeSurface::Private()) +{ + m->addRef(); +} + +CubeSurface::CubeSurface(const CubeSurface & cube) : m(cube.m) +{ + if (m != NULL) m->addRef(); +} + +CubeSurface::~CubeSurface() +{ + if (m != NULL) m->release(); + m = NULL; +} + +void CubeSurface::operator=(const CubeSurface & cube) +{ + if (cube.m != NULL) cube.m->addRef(); + if (m != NULL) m->release(); + m = cube.m; +} + +void CubeSurface::detach() +{ + if (m->refCount() > 1) + { + m->release(); + m = new CubeSurface::Private(*m); + m->addRef(); + nvDebugCheck(m->refCount() == 1); + } +} + + + +bool CubeSurface::isNull() const +{ + return m->edgeLength == 0; +} + +int CubeSurface::edgeLength() const +{ + return m->edgeLength; +} + +int CubeSurface::countMipmaps() const +{ + return nv::countMipmaps(m->edgeLength); +} + +Surface & CubeSurface::face(int f) +{ + nvDebugCheck(f >= 0 && f < 6); + return m->face[f]; +} + +const Surface & CubeSurface::face(int f) const +{ + nvDebugCheck(f >= 0 && f < 6); + return m->face[f]; +} + + +bool CubeSurface::load(const char * fileName, int mipmap) +{ + if (strEqual(Path::extension(fileName), ".dds")) { + nv::DirectDrawSurface dds(fileName); + + if (!dds.isValid()/* || !dds.isSupported()*/) { + return false; + } + + if (!dds.isTextureCube()) { + return false; + } + + // Make sure it's a valid cube. + if (dds.header.width != dds.header.height) return false; + //if ((dds.header.caps.caps2 & DDSCAPS2_CUBEMAP_ALL_FACES) != DDSCAPS2_CUBEMAP_ALL_FACES) return false; + + if (mipmap < 0) { + mipmap = dds.mipmapCount() - 1 - mipmap; + } + if (mipmap < 0 || mipmap > I32(dds.mipmapCount())) return false; + + + nvtt::InputFormat inputFormat = nvtt::InputFormat_RGBA_16F; + + if (dds.header.hasDX10Header()) { + if (dds.header.header10.dxgiFormat == DXGI_FORMAT_R16G16B16A16_FLOAT) inputFormat = nvtt::InputFormat_RGBA_16F; + else if (dds.header.header10.dxgiFormat == DXGI_FORMAT_R32G32B32A32_FLOAT) inputFormat = nvtt::InputFormat_RGBA_32F; + else if (dds.header.header10.dxgiFormat == DXGI_FORMAT_R32_FLOAT) inputFormat = nvtt::InputFormat_R_32F; + else return false; + } + else { + if ((dds.header.pf.flags & DDPF_FOURCC) != 0) { + if (dds.header.pf.fourcc == D3DFMT_A16B16G16R16F) inputFormat = nvtt::InputFormat_RGBA_16F; + else if (dds.header.pf.fourcc == D3DFMT_A32B32G32R32F) inputFormat = nvtt::InputFormat_RGBA_32F; + else if (dds.header.pf.fourcc == D3DFMT_R32F) inputFormat = nvtt::InputFormat_R_32F; + else return false; + } + else { + if (dds.header.pf.bitcount == 32 /*&& ...*/) inputFormat = nvtt::InputFormat_BGRA_8UB; + else return false; // @@ Do pixel format conversions! 
+ } + } + + uint edgeLength = dds.surfaceWidth(mipmap); + uint size = dds.surfaceSize(mipmap); + + void * data = malloc(size); + + for (int f = 0; f < 6; f++) { + dds.readSurface(f, mipmap, data, size); + m->face[f].setImage(inputFormat, edgeLength, edgeLength, 1, data); + } + + m->edgeLength = edgeLength; + + free(data); + + return true; + } + + return false; +} + +bool CubeSurface::save(const char * fileName) const +{ + // @@ TODO + return false; +} + +struct ivec2 { + uint x; + uint y; +}; +// posx negx posy negy posz negz +static const ivec2 foldOffsetVerticalCross[6] = { {2, 1}, {0, 1}, {1, 0}, {1, 2}, {1, 1}, {1, 3} }; +static const ivec2 foldOffsetHorizontalCross[6] = { {2, 1}, {0, 1}, {1, 0}, {1, 2}, {1, 1}, {3, 1} }; +static const ivec2 foldOffsetColumn[6] = { {0, 0}, {0, 1}, {0, 2}, {0, 3}, {0, 4}, {0, 5} }; +static const ivec2 foldOffsetRow[6] = { {0, 0}, {1, 0}, {2, 0}, {3, 0}, {4, 0}, {5, 0} }; + +void CubeSurface::fold(const Surface & tex, CubeLayout layout) +{ + ivec2 const* offsets = 0; + uint edgeLength; + + switch(layout) { + case CubeLayout_LatitudeLongitude: + case CubeLayout_VerticalCross: + edgeLength = tex.height() / 4; + offsets = foldOffsetVerticalCross; + break; + case CubeLayout_HorizontalCross: + edgeLength = tex.width() / 4; + offsets = foldOffsetHorizontalCross; + break; + case CubeLayout_Column: + edgeLength = tex.width(); + offsets = foldOffsetColumn; + break; + case CubeLayout_Row: + edgeLength = tex.height(); + offsets = foldOffsetRow; + break; + } + + m->edgeLength = edgeLength; + for(uint f = 0; f < 6; f++) { + uint x = offsets[f].x * edgeLength; + uint y = offsets[f].y * edgeLength; + m->face[f] = tex.createSubImage(x, x + edgeLength - 1, y, y + edgeLength - 1, 0, 0); + } + + if(layout == CubeLayout_VerticalCross || layout == CubeLayout_LatitudeLongitude) { + // Back face needs to be rotated 180 degrees + m->face[5].flipX(); + m->face[5].flipY(); + } +} + +Surface CubeSurface::unfold(CubeLayout layout) const +{ + ivec2 const* offsets = 0; + uint edgeLength = m->edgeLength; + uint width; + uint height; + + switch(layout) { + case CubeLayout_LatitudeLongitude: + case CubeLayout_VerticalCross: + offsets = foldOffsetVerticalCross; + width = 3 * edgeLength; + height = 4 * edgeLength; + // Back face needs to be rotated 180 degrees + m->face[5].flipX(); + m->face[5].flipY(); + break; + case CubeLayout_HorizontalCross: + offsets = foldOffsetHorizontalCross; + width = 4 * edgeLength; + height = 3 * edgeLength; + break; + case CubeLayout_Column: + offsets = foldOffsetColumn; + width = edgeLength; + height = 6 * edgeLength; + break; + case CubeLayout_Row: + offsets = foldOffsetRow; + width = 6 * edgeLength; + height = edgeLength; + break; + } + + Surface surface; + surface.setImage(width, height, 1); + for(uint f = 0; f < 6; f++) { + uint x = offsets[f].x * edgeLength; + uint y = offsets[f].y * edgeLength; + surface.copy(m->face[f], 0, 0, 0, edgeLength, edgeLength, 1, x, y, 0); + } + + if(layout == CubeLayout_VerticalCross || layout == CubeLayout_LatitudeLongitude) { + // Undo back face rotation + m->face[5].flipY(); + m->face[5].flipX(); + } + return surface; +} + +float CubeSurface::average(int channel) const +{ + const uint edgeLength = m->edgeLength; + m->allocateTexelTable(); + + float total = 0.0f; + float sum = 0.0f; + + for (int f = 0; f < 6; f++) { + float * c = m->face[f].m->image->channel(channel); + + for (uint y = 0; y < edgeLength; y++) { + for (uint x = 0; x < edgeLength; x++) { + float solidAngle = m->texelTable->solidAngle(f, x, y); + + total += 
solidAngle; + sum += c[y * edgeLength + x] * solidAngle; + } + } + } + + return sum / total; +} + +void CubeSurface::range(int channel, float * minimum_ptr, float * maximum_ptr) const +{ + const uint edgeLength = m->edgeLength; + m->allocateTexelTable(); + + float minimum = NV_FLOAT_MAX; + float maximum = 0.0f; + + for (int f = 0; f < 6; f++) { + float * c = m->face[f].m->image->channel(channel); + + for (uint y = 0; y < edgeLength; y++) { + for (uint x = 0; x < edgeLength; x++) { + + minimum = nv::min(minimum, c[y * edgeLength + x]); + maximum = nv::max(maximum, c[y * edgeLength + x]); + } + } + } + + *minimum_ptr = minimum; + *maximum_ptr = maximum; +} + +void CubeSurface::clamp(int channel, float low/*= 0.0f*/, float high/*= 1.0f*/) { + for (int f = 0; f < 6; f++) { + m->face[f].clamp(channel, low, high); + } +} + + + +#include "nvmath/SphericalHarmonic.h" + +CubeSurface CubeSurface::irradianceFilter(int size, EdgeFixup fixupMethod) const +{ + m->allocateTexelTable(); + + // Transform this cube to spherical harmonic basis + Sh2 sh; + + // For each texel of the input cube. + const uint edgeLength = m->edgeLength; + for (uint f = 0; f < 6; f++) { + for (uint y = 0; y < edgeLength; y++) { + for (uint x = 0; x < edgeLength; x++) { + + Vector3 dir = m->texelTable->direction(f, x, y); + float solidAngle = m->texelTable->solidAngle(f, x, y); + + Sh2 shDir; + shDir.eval(dir); + + sh.addScaled(sh, solidAngle); + } + } + } + + + // Evaluate spherical harmonic for each output texel. + CubeSurface output; + output.m->allocate(size); + + + + + // @@ TODO + return CubeSurface(); +} + + + + +// Convolve filter against this cube. +Vector3 CubeSurface::Private::applyAngularFilter(const Vector3 & filterDir, float coneAngle, float * filterTable, int tableSize) +{ + const float cosineConeAngle = cos(coneAngle); + nvDebugCheck(cosineConeAngle >= 0); + + Vector3 color(0); + float sum = 0; + + // Things I have tried to speed this up: + // - Compute accurate bounds assuming cone axis aligned to plane, result was too small elsewhere. + // - Compute ellipse that results in the cone/plane intersection and compute its bounds. Sometimes intersection is a parabolla, hard to handle that case. + // - Compute the 6 axis aligned planes that bound the cone, clip faces against planes. Resulting plane equations are way too complex. + + // What AMD CubeMapGen does: + // - Compute conservative bounds on the primary face, wrap around the adjacent faces. + + + // For each texel of the input cube. + for (uint f = 0; f < 6; f++) { + + // Test face cone agains filter cone. + float cosineFaceAngle = dot(filterDir, faceNormals[f]); + float faceAngle = acosf(cosineFaceAngle); + + if (faceAngle > coneAngle + atanf(sqrtf(2))) { + // Skip face. + continue; + } + + const int L = I32(edgeLength-1); + int x0 = 0, x1 = L; + int y0 = 0, y1 = L; + +#if 0 + float u0 = -1; + float u1 = 1; + float v0 = -1; + float v1 = 1; + + // @@ Compute uvs. 
+ + // Expand uv coordinates from [-1,1] to [0, edgeLength) + u0 = (u0 + 1) * edgeLength * 0.5f - 0.5f; + v0 = (v0 + 1) * edgeLength * 0.5f - 0.5f; + u1 = (u1 + 1) * edgeLength * 0.5f - 0.5f; + v1 = (v1 + 1) * edgeLength * 0.5f - 0.5f; + nvDebugCheck(u0 >= -0.5f && u0 <= edgeLength - 0.5f); + nvDebugCheck(v0 >= -0.5f && v0 <= edgeLength - 0.5f); + nvDebugCheck(u1 >= -0.5f && u1 <= edgeLength - 0.5f); + nvDebugCheck(v1 >= -0.5f && v1 <= edgeLength - 0.5f); + + x0 = clamp(ifloor(u0), 0, L); + y0 = clamp(ifloor(v0), 0, L); + x1 = clamp(iceil(u1), 0, L); + y1 = clamp(iceil(v1), 0, L); +#endif + + nvDebugCheck(x1 >= x0); + nvDebugCheck(y1 >= y0); + + if (x1 == x0 || y1 == y0) { + // Skip this face. + continue; + } + + + const Surface & inputFace = face[f]; + const FloatImage * inputImage = inputFace.m->image; + + for (int y = y0; y <= y1; y++) { + bool inside = false; + for (int x = x0; x <= x1; x++) { + + Vector3 dir = texelTable->direction(f, x, y); + float cosineAngle = dot(dir, filterDir); + + if (cosineAngle > cosineConeAngle) { + float solidAngle = texelTable->solidAngle(f, x, y); + //float scale = powf(saturate(cosineAngle), cosinePower); + + int idx = int(saturate(cosineAngle) * (tableSize - 1)); + float scale = filterTable[idx]; // @@ Do bilinear interpolation? + + float contribution = solidAngle * scale; + + sum += contribution; + color.x += contribution * inputImage->pixel(0, x, y, 0); + color.y += contribution * inputImage->pixel(1, x, y, 0); + color.z += contribution * inputImage->pixel(2, x, y, 0); + + inside = true; + } + else if (inside) { + // Filter scale is monotonic, if we have been inside once and we just exit, then we can skip the rest of the row. + // We could do the same thing for the columns and skip entire rows. + break; + } + } + } + } + + color *= (1.0f / sum); + + return color; +} + +// We want to find the alpha such that: +// cos(alpha)^cosinePower = epsilon +// That's: acos(epsilon^(1/cosinePower)) + +// We can cull texels in two different ways: +// - culling faces that do not touch the cone. +// - computing one rectangle per face, find intersection between cone and face. +// - + +// Other speedups: +// - parallelize. Done. +// - use ISPC? + + +// Convolve filter against this cube. +Vector3 CubeSurface::Private::applyCosinePowerFilter(const Vector3 & filterDir, float coneAngle, float cosinePower) +{ + const float cosineConeAngle = cos(coneAngle); + nvDebugCheck(cosineConeAngle >= 0); + + Vector3 color(0); + float sum = 0; + + // Things I have tried to speed this up: + // - Compute accurate bounds assuming cone axis aligned to plane, result was too small elsewhere. + // - Compute ellipse that results in the cone/plane intersection and compute its bounds. Sometimes intersection is a parabolla, hard to handle that case. + // - Compute the 6 axis aligned planes that bound the cone, clip faces against planes. Resulting plane equations are way too complex. + + // What AMD CubeMapGen does: + // - Compute conservative bounds on the primary face, wrap around the adjacent faces. + + + // For each texel of the input cube. + for (uint f = 0; f < 6; f++) { + + // Test face cone agains filter cone. + float cosineFaceAngle = dot(filterDir, faceNormals[f]); + float faceAngle = acosf(cosineFaceAngle); + + if (faceAngle > coneAngle + atanf(sqrtf(2))) { + // Skip face. + continue; + } + + const int L = I32(edgeLength-1); + int x0 = 0, x1 = L; + int y0 = 0, y1 = L; + +#if 0 + float u0 = -1; + float u1 = 1; + float v0 = -1; + float v1 = 1; + + // @@ Compute uvs. 
+ + // Expand uv coordinates from [-1,1] to [0, edgeLength) + u0 = (u0 + 1) * edgeLength * 0.5f - 0.5f; + v0 = (v0 + 1) * edgeLength * 0.5f - 0.5f; + u1 = (u1 + 1) * edgeLength * 0.5f - 0.5f; + v1 = (v1 + 1) * edgeLength * 0.5f - 0.5f; + nvDebugCheck(u0 >= -0.5f && u0 <= edgeLength - 0.5f); + nvDebugCheck(v0 >= -0.5f && v0 <= edgeLength - 0.5f); + nvDebugCheck(u1 >= -0.5f && u1 <= edgeLength - 0.5f); + nvDebugCheck(v1 >= -0.5f && v1 <= edgeLength - 0.5f); + + x0 = clamp(ifloor(u0), 0, L); + y0 = clamp(ifloor(v0), 0, L); + x1 = clamp(iceil(u1), 0, L); + y1 = clamp(iceil(v1), 0, L); +#endif + + nvDebugCheck(x1 >= x0); + nvDebugCheck(y1 >= y0); + + if (x1 == x0 || y1 == y0) { + // Skip this face. + continue; + } + + + const Surface & inputFace = face[f]; + const FloatImage * inputImage = inputFace.m->image; + + for (int y = y0; y <= y1; y++) { + bool inside = false; + for (int x = x0; x <= x1; x++) { + + Vector3 dir = texelTable->direction(f, x, y); + float cosineAngle = dot(dir, filterDir); + + if (cosineAngle > cosineConeAngle) { + float solidAngle = texelTable->solidAngle(f, x, y); + float scale = powf(saturate(cosineAngle), cosinePower); + float contribution = solidAngle * scale; + + sum += contribution; + color.x += contribution * inputImage->pixel(0, x, y, 0); + color.y += contribution * inputImage->pixel(1, x, y, 0); + color.z += contribution * inputImage->pixel(2, x, y, 0); + + inside = true; + } + else if (inside) { + // Filter scale is monotonic, if we have been inside once and we just exit, then we can skip the rest of the row. + // We could do the same thing for the columns and skip entire rows. + break; + } + } + } + } + + color *= (1.0f / sum); + + return color; +} + +#include "nvthread/ParallelFor.h" + +struct ApplyAngularFilterContext { + CubeSurface::Private * inputCube; + CubeSurface::Private * filteredCube; + float coneAngle; + float * filterTable; + int tableSize; + EdgeFixup fixupMethod; +}; + +void ApplyAngularFilterTask(void * context, int id) +{ + ApplyAngularFilterContext * ctx = (ApplyAngularFilterContext *)context; + + int size = ctx->filteredCube->edgeLength; + + int f = id / (size * size); + int idx = id % (size * size); + int y = idx / size; + int x = idx % size; + + nvtt::Surface & filteredFace = ctx->filteredCube->face[f]; + FloatImage * filteredImage = filteredFace.m->image; + + const Vector3 filterDir = texelDirection(f, x, y, size, ctx->fixupMethod); + + // Convolve filter against cube. + Vector3 color = ctx->inputCube->applyAngularFilter(filterDir, ctx->coneAngle, ctx->filterTable, ctx->tableSize); + + filteredImage->pixel(0, idx) = color.x; + filteredImage->pixel(1, idx) = color.y; + filteredImage->pixel(2, idx) = color.z; +} + + +CubeSurface CubeSurface::cosinePowerFilter(int size, float cosinePower, EdgeFixup fixupMethod) const +{ + // Allocate output cube. + CubeSurface filteredCube; + filteredCube.m->allocate(size); + + // Texel table is stored along with the surface so that it's compute only once. + m->allocateTexelTable(); + + const float threshold = 0.001f; + const float coneAngle = acosf(powf(threshold, 1.0f/cosinePower)); + + + // For each texel of the output cube. + /*for (uint f = 0; f < 6; f++) { + nvtt::Surface filteredFace = filteredCube.m->face[f]; + FloatImage * filteredImage = filteredFace.m->image; + + for (uint y = 0; y < uint(size); y++) { + for (uint x = 0; x < uint(size); x++) { + + const Vector3 filterDir = texelDirection(f, x, y, size, fixupMethod); + + // Convolve filter against cube. 
+ Vector3 color = m->applyCosinePowerFilter(filterDir, coneAngle, cosinePower); + + filteredImage->pixel(0, x, y, 0) = color.x; + filteredImage->pixel(1, x, y, 0) = color.y; + filteredImage->pixel(2, x, y, 0) = color.z; + } + } + }*/ + + ApplyAngularFilterContext context; + context.inputCube = m; + context.filteredCube = filteredCube.m; + context.coneAngle = coneAngle; + context.fixupMethod = fixupMethod; + + context.tableSize = 512; + context.filterTable = new float[context.tableSize]; + + // @@ Instead of looking up table between [0 - 1] we should probably use [cos(coneAngle), 1] + + for (int i = 0; i < context.tableSize; i++) { + float f = float(i) / (context.tableSize - 1); + context.filterTable[i] = powf(f, cosinePower); + } + + + nv::ParallelFor parallelFor(ApplyAngularFilterTask, &context); + parallelFor.run(6 * size * size); + + // @@ Implement edge averaging. + if (fixupMethod == EdgeFixup_Average) { + for (uint f = 0; f < 6; f++) { + nvtt::Surface filteredFace = filteredCube.m->face[f]; + FloatImage * filteredImage = filteredFace.m->image; + + // For each component. + for (uint c = 0; c < 3; c++) { + // @@ For each corner, sample the two adjacent faces. + filteredImage->pixel(c, 0, 0, 0); + filteredImage->pixel(c, size-1, 0, 0); + filteredImage->pixel(c, 0, size-1, 0); + filteredImage->pixel(c, size-1, size-1, 0); + + // @@ For each edge, sample the adjacent face. + + } + } + } + + return filteredCube; +} + + +// Sample cubemap in the given direction. +Vector3 CubeSurface::Private::sample(const Vector3 & dir) +{ + int f = -1; + if (fabs(dir.x) > fabs(dir.y) && fabs(dir.x) > fabs(dir.z)) { + if (dir.x > 0) f = 0; + else f = 1; + } + else if (fabs(dir.y) > fabs(dir.z)) { + if (dir.y > 0) f = 2; + else f = 3; + } + else { + if (dir.z > 0) f = 4; + else f = 5; + } + nvDebugCheck(f != -1); + + // uv coordinates corresponding to filterDir. + float u = dot(dir, faceU[f]); + float v = dot(dir, faceV[f]); + + FloatImage * img = face[f].m->image; + + Vector3 color; + color.x = img->sampleLinearClamp(0, u, v); + color.y = img->sampleLinearClamp(1, u, v); + color.z = img->sampleLinearClamp(2, u, v); + + return color; +} + +// @@ Not tested! +CubeSurface CubeSurface::fastResample(int size, EdgeFixup fixupMethod) const +{ + // Allocate output cube. + CubeSurface resampledCube; + resampledCube.m->allocate(size); + + // For each texel of the output cube. + for (uint f = 0; f < 6; f++) { + nvtt::Surface resampledFace = resampledCube.m->face[f]; + FloatImage * resampledImage = resampledFace.m->image; + + for (uint y = 0; y < uint(size); y++) { + for (uint x = 0; x < uint(size); x++) { + + const Vector3 filterDir = texelDirection(f, x, y, size, fixupMethod); + + Vector3 color = m->sample(filterDir); + + resampledImage->pixel(0, x, y, 0) = color.x; + resampledImage->pixel(1, x, y, 0) = color.y; + resampledImage->pixel(2, x, y, 0) = color.z; + } + } + } + + // @@ Implement edge averaging. Share this code with cosinePowerFilter + if (fixupMethod == EdgeFixup_Average) { + } + + return resampledCube; +} + + +void CubeSurface::toLinear(float gamma) +{ + if (isNull()) return; + + detach(); + + for (int i = 0; i < 6; i++) { + m->face[i].toLinear(gamma); + } +} + +void CubeSurface::toGamma(float gamma) +{ + if (isNull()) return; + + detach(); + + for (int i = 0; i < 6; i++) { + m->face[i].toGamma(gamma); + } +} + + +#if 0 +// @@ Provide solar azimuth. 
+#include "ArHoseSkyModel.h" +void CubeSurface::sky(float turbidity, float albedo[3], float solarElevation) { + + ArHosekSkyModelState * skymodel_state[3]; + + for (int i = 0; i < num_channels; i++) { + skymodel_state[i] = arhosekskymodelstate_alloc_init(turbidity, albedo[i], solarElevation); + } + + // 700 nm (red), 546.1 nm (green) and 435.8 nm (blue). + float channel_center[3] = { + 700, // Red 620–740, + 546.1, // Green 520–570, + 435.8, // Blue 450–490, + }; + + // @@ For each pixel: + // What's the channel center for the RGB model? + double skydome_result[3]; + for (unsigned int i = 0; i < num_channels; i++) { + skydome_result[i] = arhosekskymodel_radiance(skymodel_state[i], theta, gamma, channel_center[i]); + } + + for (int i = 0; i < num_channels; i++) { + arhosek_skymodelstate_free(skymodel_state[i]); + } + + /* + ArHosekXYZSkyModelState * skymodel_state[3]; + + for (int i = 0; i < num_channels; i++) { + skymodel_state[i] = arhosek_xyz_skymodelstate_alloc_init(turbidity, albedo[i], solarElevation); + } + + // @@ For each pixel. + double skydome_result[3]; + for (unsigned int i = 0; i < num_channels; i++) { + skydome_result[i] = arhosek_xyz_skymodel_radiance(skymodel_state[i], theta, gamma, i); + } + + for (int i = 0; i < num_channels; i++) { + arhosek_xyz_skymodelstate_free(skymodel_state[i]); + } + */ +} +#endif \ No newline at end of file Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/InputOptions.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/InputOptions.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/InputOptions.h @@ -1,4 +1,5 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano // // Permission is hereby granted, free of charge, to any person // obtaining a copy of this software and associated documentation @@ -21,93 +22,61 @@ // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // OTHER DEALINGS IN THE SOFTWARE. -#ifndef NV_TT_INPUTOPTIONS_H -#define NV_TT_INPUTOPTIONS_H +#ifndef NVTT_INPUTOPTIONS_H +#define NVTT_INPUTOPTIONS_H -#include -#include -#include -#include #include "nvtt.h" +#include "nvmath/Vector.h" + + namespace nvtt { - struct InputOptions::Private - { - Private() : images(NULL) {} - - WrapMode wrapMode; - TextureType textureType; - InputFormat inputFormat; - AlphaMode alphaMode; - - uint faceCount; - uint mipmapCount; - uint imageCount; - - struct InputImage; - InputImage * images; - - // Gamma conversion. - float inputGamma; - float outputGamma; - - // Color transform. - ColorTransform colorTransform; - nv::Matrix linearTransform; - - // Mipmap generation options. - bool generateMipmaps; - int maxLevel; - MipmapFilter mipmapFilter; - - // Kaiser filter parameters. - float kaiserWidth; - float kaiserAlpha; - float kaiserStretch; - - // Normal map options. - bool isNormalMap; - bool normalizeMipmaps; - bool convertToNormalMap; - nv::Vector4 heightFactors; - nv::Vector4 bumpFrequencyScale; - - // Adjust extents. - uint maxExtent; - RoundMode roundMode; - - // @@ These are computed in nvtt::compress, so they should be mutable or stored elsewhere... 
- mutable uint targetWidth; - mutable uint targetHeight; - mutable uint targetDepth; - mutable uint targetMipmapCount; - - void computeTargetExtents() const; - - int realMipmapCount() const; - - const nv::Image * image(uint face, uint mipmap) const; - const nv::Image * image(uint idx) const; - - }; - - // Internal image structure. - struct InputOptions::Private::InputImage - { - InputImage() {} - - int mipLevel; - int face; - - int width; - int height; - int depth; - - nv::AutoPtr data; - }; + struct InputOptions::Private + { + Private() : images(NULL) {} + + WrapMode wrapMode; + TextureType textureType; + InputFormat inputFormat; + AlphaMode alphaMode; + + uint width; + uint height; + uint depth; + uint faceCount; + uint mipmapCount; + uint imageCount; + + void ** images; + + // Gamma conversion. + float inputGamma; + float outputGamma; + + // Mipmap generation options. + bool generateMipmaps; + int maxLevel; + MipmapFilter mipmapFilter; + + // Kaiser filter parameters. + float kaiserWidth; + float kaiserAlpha; + float kaiserStretch; + + // Normal map options. + bool isNormalMap; + bool normalizeMipmaps; + bool convertToNormalMap; + nv::Vector4 heightFactors; + nv::Vector4 bumpFrequencyScale; + + // Adjust extents. + uint maxExtent; + RoundMode roundMode; + }; } // nvtt namespace -#endif // NV_TT_INPUTOPTIONS_H +#endif // NVTT_INPUTOPTIONS_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/InputOptions.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/InputOptions.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/InputOptions.cpp @@ -1,408 +1,342 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include // memcpy - -#include - -#include "nvtt.h" -#include "InputOptions.h" - -using namespace nv; -using namespace nvtt; - -namespace -{ - - static uint countMipmaps(int w, int h, int d) - { - uint mipmap = 0; - - while (w != 1 || h != 1 || d != 1) { - w = max(1, w / 2); - h = max(1, h / 2); - d = max(1, d / 2); - mipmap++; - } - - return mipmap + 1; - } - - // 1 -> 1, 2 -> 2, 3 -> 2, 4 -> 4, 5 -> 4, ... 
- static uint previousPowerOfTwo(const uint v) - { - return nextPowerOfTwo(v + 1) / 2; - } - - static uint nearestPowerOfTwo(const uint v) - { - const uint np2 = nextPowerOfTwo(v); - const uint pp2 = previousPowerOfTwo(v); - - if (np2 - v <= v - pp2) - { - return np2; - } - else - { - return pp2; - } - } - -} // namespace - - -/// Constructor. -InputOptions::InputOptions() : m(*new InputOptions::Private()) -{ - reset(); -} - -// Delete images. -InputOptions::~InputOptions() -{ - resetTextureLayout(); - - delete &m; -} - - -// Reset input options. -void InputOptions::reset() -{ - m.wrapMode = WrapMode_Mirror; - m.textureType = TextureType_2D; - m.inputFormat = InputFormat_BGRA_8UB; - - m.alphaMode = AlphaMode_None; - - m.inputGamma = 2.2f; - m.outputGamma = 2.2f; - - m.colorTransform = ColorTransform_None; - m.linearTransform = Matrix(identity); - - m.generateMipmaps = true; - m.maxLevel = -1; - m.mipmapFilter = MipmapFilter_Box; - - m.kaiserWidth = 3; - m.kaiserAlpha = 4.0f; - m.kaiserStretch = 1.0f; - - m.isNormalMap = false; - m.normalizeMipmaps = true; - m.convertToNormalMap = false; - m.heightFactors.set(0.0f, 0.0f, 0.0f, 1.0f); - m.bumpFrequencyScale = Vector4(1.0f, 0.5f, 0.25f, 0.125f) / (1.0f + 0.5f + 0.25f + 0.125f); - - m.maxExtent = 0; - m.roundMode = RoundMode_None; -} - - -// Setup the input image. -void InputOptions::setTextureLayout(TextureType type, int width, int height, int depth /*= 1*/) -{ - // Validate arguments. - nvCheck(width >= 0); - nvCheck(height >= 0); - nvCheck(depth >= 0); - - // Correct arguments. - if (width == 0) width = 1; - if (height == 0) height = 1; - if (depth == 0) depth = 1; - - // Delete previous images. - resetTextureLayout(); - - m.textureType = type; - - // Allocate images. - m.mipmapCount = countMipmaps(width, height, depth); - m.faceCount = (type == TextureType_Cube) ? 6 : 1; - m.imageCount = m.mipmapCount * m.faceCount; - - m.images = new Private::InputImage[m.imageCount]; - - for(uint f = 0; f < m.faceCount; f++) - { - uint w = width; - uint h = height; - uint d = depth; - - for (uint mipLevel = 0; mipLevel < m.mipmapCount; mipLevel++) - { - Private::InputImage & img = m.images[f * m.mipmapCount + mipLevel]; - img.width = w; - img.height = h; - img.depth = d; - img.mipLevel = mipLevel; - img.face = f; - - img.data = NULL; - - w = max(1U, w / 2); - h = max(1U, h / 2); - d = max(1U, d / 2); - } - } -} - - -void InputOptions::resetTextureLayout() -{ - if (m.images != NULL) - { - // Delete image array. - delete [] m.images; - m.images = NULL; - - m.faceCount = 0; - m.mipmapCount = 0; - m.imageCount = 0; - } -} - - -// Copies the data to our internal structures. -bool InputOptions::setMipmapData(const void * data, int width, int height, int depth /*= 1*/, int face /*= 0*/, int mipLevel /*= 0*/) -{ - nvCheck(depth == 1); - - const int idx = face * m.mipmapCount + mipLevel; - - if (m.images[idx].width != width || m.images[idx].height != height || m.images[idx].depth != depth || m.images[idx].mipLevel != mipLevel || m.images[idx].face != face) - { - // Invalid dimension or index. - return false; - } - - m.images[idx].data = new nv::Image(); - m.images[idx].data->allocate(width, height); - memcpy(m.images[idx].data->pixels(), data, width * height * 4); - - return true; -} - - -/// Describe the format of the input. -void InputOptions::setFormat(InputFormat format) -{ - m.inputFormat = format; -} - - -/// Set the way the input alpha channel is interpreted. 
-void InputOptions::setAlphaMode(AlphaMode alphaMode) -{ - m.alphaMode = alphaMode; -} - - -/// Set gamma settings. -void InputOptions::setGamma(float inputGamma, float outputGamma) -{ - m.inputGamma = inputGamma; - m.outputGamma = outputGamma; -} - - -/// Set texture wrappign mode. -void InputOptions::setWrapMode(WrapMode mode) -{ - m.wrapMode = mode; -} - - -/// Set mipmap filter. -void InputOptions::setMipmapFilter(MipmapFilter filter) -{ - m.mipmapFilter = filter; -} - -/// Set mipmap generation. -void InputOptions::setMipmapGeneration(bool enabled, int maxLevel/*= -1*/) -{ - m.generateMipmaps = enabled; - m.maxLevel = maxLevel; -} - -/// Set Kaiser filter parameters. -void InputOptions::setKaiserParameters(float width, float alpha, float stretch) -{ - m.kaiserWidth = width; - m.kaiserAlpha = alpha; - m.kaiserStretch = stretch; -} - -/// Indicate whether input is a normal map or not. -void InputOptions::setNormalMap(bool b) -{ - m.isNormalMap = b; -} - -/// Enable normal map conversion. -void InputOptions::setConvertToNormalMap(bool convert) -{ - m.convertToNormalMap = convert; -} - -/// Set height evaluation factors. -void InputOptions::setHeightEvaluation(float redScale, float greenScale, float blueScale, float alphaScale) -{ - // Do not normalize height factors. -// float total = redScale + greenScale + blueScale + alphaScale; - m.heightFactors = Vector4(redScale, greenScale, blueScale, alphaScale); -} - -/// Set normal map conversion filter. -void InputOptions::setNormalFilter(float small, float medium, float big, float large) -{ - float total = small + medium + big + large; - m.bumpFrequencyScale = Vector4(small, medium, big, large) / total; -} - -/// Enable mipmap normalization. -void InputOptions::setNormalizeMipmaps(bool normalize) -{ - m.normalizeMipmaps = normalize; -} - -/// Set color transform. -void InputOptions::setColorTransform(ColorTransform t) -{ - m.colorTransform = t; -} - -// Set linear transform for the given channel. -void InputOptions::setLinearTransform(int channel, float w0, float w1, float w2, float w3) -{ - nvCheck(channel >= 0 && channel < 4); - - Vector4 w(w0, w1, w2, w3); - //m.linearTransform.setRow(channel, w); -} - -void InputOptions::setMaxExtents(int e) -{ - nvDebugCheck(e > 0); - m.maxExtent = e; -} - -void InputOptions::setRoundMode(RoundMode mode) -{ - m.roundMode = mode; -} - - -void InputOptions::Private::computeTargetExtents() const -{ - nvCheck(images != NULL); - - uint maxExtent = this->maxExtent; - if (roundMode != RoundMode_None) - { - // rounded max extent should never be higher than original max extent. - maxExtent = previousPowerOfTwo(maxExtent); - } - - uint w = images->width; - uint h = images->height; - uint d = images->depth; - - nvDebugCheck(w > 0); - nvDebugCheck(h > 0); - nvDebugCheck(d > 0); - - // Scale extents without changing aspect ratio. - uint maxwhd = max(max(w, h), d); - if (maxExtent != 0 && maxwhd > maxExtent) - { - w = max((w * maxExtent) / maxwhd, 1U); - h = max((h * maxExtent) / maxwhd, 1U); - d = max((d * maxExtent) / maxwhd, 1U); - } - - // Round to power of two. 
- if (roundMode == RoundMode_ToNextPowerOfTwo) - { - w = nextPowerOfTwo(w); - h = nextPowerOfTwo(h); - d = nextPowerOfTwo(d); - } - else if (roundMode == RoundMode_ToNearestPowerOfTwo) - { - w = nearestPowerOfTwo(w); - h = nearestPowerOfTwo(h); - d = nearestPowerOfTwo(d); - } - else if (roundMode == RoundMode_ToPreviousPowerOfTwo) - { - w = previousPowerOfTwo(w); - h = previousPowerOfTwo(h); - d = previousPowerOfTwo(d); - } - - this->targetWidth = w; - this->targetHeight = h; - this->targetDepth = d; - - this->targetMipmapCount = countMipmaps(w, h, d); -} - - -// Return real number of mipmaps, including first level. -// computeTargetExtents should have been called before. -int InputOptions::Private::realMipmapCount() const -{ - int mipmapCount = targetMipmapCount; - - if (!generateMipmaps) mipmapCount = 1; - else if (maxLevel != -1 && maxLevel < mipmapCount - 1) mipmapCount = maxLevel + 1; - - return mipmapCount; -} - - -const Image * InputOptions::Private::image(uint face, uint mipmap) const -{ - nvDebugCheck(face < faceCount); - nvDebugCheck(mipmap < mipmapCount); - - const InputImage & image = this->images[face * mipmapCount + mipmap]; - nvDebugCheck(image.face == face); - nvDebugCheck(image.mipLevel == mipmap); - - return image.data.ptr(); -} - -const Image * InputOptions::Private::image(uint idx) const -{ - nvDebugCheck(idx < faceCount * mipmapCount); - - const InputImage & image = this->images[idx]; - - return image.data.ptr(); -} +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include "InputOptions.h" + +#include "nvmath/Vector.inl" + +#include "nvcore/Utils.h" // nextPowerOfTwo +#include "nvcore/Memory.h" + +#include // memcpy, memset + + + +using namespace nv; +using namespace nvtt; + +namespace +{ + + static uint countMipmaps(int w, int h, int d) + { + uint mipmap = 0; + + while (w != 1 || h != 1 || d != 1) { + w = max(1, w / 2); + h = max(1, h / 2); + d = max(1, d / 2); + mipmap++; + } + + return mipmap + 1; + } + + // 1 -> 1, 2 -> 2, 3 -> 2, 4 -> 4, 5 -> 4, ... + static uint previousPowerOfTwo(const uint v) + { + return nextPowerOfTwo(v + 1) / 2; + } + + static uint nearestPowerOfTwo(const uint v) + { + const uint np2 = nextPowerOfTwo(v); + const uint pp2 = previousPowerOfTwo(v); + + if (np2 - v <= v - pp2) + { + return np2; + } + else + { + return pp2; + } + } + +} // namespace + + +/// Constructor. 
+InputOptions::InputOptions() : m(*new InputOptions::Private()) +{ + reset(); +} + +// Delete images. +InputOptions::~InputOptions() +{ + resetTextureLayout(); + + delete &m; +} + + +// Reset input options. +void InputOptions::reset() +{ + m.wrapMode = WrapMode_Mirror; + m.textureType = TextureType_2D; + m.inputFormat = InputFormat_BGRA_8UB; + + m.alphaMode = AlphaMode_None; + + m.inputGamma = 2.2f; + m.outputGamma = 2.2f; + + m.generateMipmaps = true; + m.maxLevel = -1; + m.mipmapFilter = MipmapFilter_Box; + + m.kaiserWidth = 3; + m.kaiserAlpha = 4.0f; + m.kaiserStretch = 1.0f; + + m.isNormalMap = false; + m.normalizeMipmaps = true; + m.convertToNormalMap = false; + m.heightFactors.set(0.0f, 0.0f, 0.0f, 1.0f); + m.bumpFrequencyScale = Vector4(1.0f, 0.5f, 0.25f, 0.125f) / (1.0f + 0.5f + 0.25f + 0.125f); + + m.maxExtent = 0; + m.roundMode = RoundMode_None; +} + + +// Setup the input image. +void InputOptions::setTextureLayout(TextureType type, int width, int height, int depth /*= 1*/, int arraySize /*= 1*/) +{ + // Validate arguments. + nvCheck(width >= 0); + nvCheck(height >= 0); + nvCheck(depth >= 0); + nvCheck(arraySize >= 0); + + // Correct arguments. + if (width == 0) width = 1; + if (height == 0) height = 1; + if (depth == 0) depth = 1; + if (arraySize == 0) arraySize = 1; + + // Delete previous images. + resetTextureLayout(); + + m.textureType = type; + m.width = width; + m.height = height; + m.depth = depth; + + // Allocate images. + if (type == TextureType_Cube) { + nvCheck(arraySize == 1); + m.faceCount = 6; + } + else if (type == TextureType_Array) { + m.faceCount = arraySize; + } else { + nvCheck(arraySize == 1); + m.faceCount = 1; + } + m.mipmapCount = countMipmaps(width, height, depth); + m.imageCount = m.mipmapCount * m.faceCount; + m.images = new void *[m.imageCount]; + + memset(m.images, 0, sizeof(void *) * m.imageCount); +} + + +void InputOptions::resetTextureLayout() +{ + if (m.images != NULL) + { + // Delete images. + for (uint i = 0; i < m.imageCount; i++) { + free(m.images[i]); + } + + // Delete image array. + delete [] m.images; + m.images = NULL; + + m.faceCount = 0; + m.mipmapCount = 0; + m.imageCount = 0; + } +} + + +// Copies the data to our internal structures. +bool InputOptions::setMipmapData(const void * data, int width, int height, int depth /*= 1*/, int face /*= 0*/, int mipLevel /*= 0*/) +{ + if (uint(face) >= m.faceCount) { + return false; + } + if (uint(mipLevel) >= m.mipmapCount) { + return false; + } + + const uint idx = mipLevel * m.faceCount + face; + if (idx >= m.imageCount) { + return false; + } + + // Compute expected width, height and depth for this mipLevel. Return false if it doesn't match. + int w = m.width; + int h = m.height; + int d = m.depth; + for (int i = 0; i < mipLevel; i++) { + w = max(1, w/2); + h = max(1, h/2); + d = max(1, d/2); + } + if (w != width || h != height || d != depth) { + return false; + } + + int imageSize = width * height * depth; + if (m.inputFormat == InputFormat_BGRA_8UB) + { + imageSize *= 4 * sizeof(uint8); + } + else if (m.inputFormat == InputFormat_RGBA_16F) + { + imageSize *= 4 * sizeof(uint16); + } + else if (m.inputFormat == InputFormat_RGBA_32F) + { + imageSize *= 4 * sizeof(float); + } + else if (m.inputFormat == InputFormat_R_32F) + { + imageSize *= 1 * sizeof(float); + } + else + { + return false; + } + + m.images[idx] = realloc(m.images[idx], imageSize); + if (m.images[idx] == NULL) { + // Out of memory. 
+ return false; + } + + memcpy(m.images[idx], data, imageSize); + + return true; +} + + +/// Describe the format of the input. +void InputOptions::setFormat(InputFormat format) +{ + m.inputFormat = format; +} + + +/// Set the way the input alpha channel is interpreted. +void InputOptions::setAlphaMode(AlphaMode alphaMode) +{ + m.alphaMode = alphaMode; +} + + +/// Set gamma settings. +void InputOptions::setGamma(float inputGamma, float outputGamma) +{ + m.inputGamma = inputGamma; + m.outputGamma = outputGamma; +} + + +/// Set texture wrappign mode. +void InputOptions::setWrapMode(WrapMode mode) +{ + m.wrapMode = mode; +} + + +/// Set mipmap filter. +void InputOptions::setMipmapFilter(MipmapFilter filter) +{ + m.mipmapFilter = filter; +} + +/// Set mipmap generation. +void InputOptions::setMipmapGeneration(bool enabled, int maxLevel/*= -1*/) +{ + m.generateMipmaps = enabled; + m.maxLevel = maxLevel; +} + +/// Set Kaiser filter parameters. +void InputOptions::setKaiserParameters(float width, float alpha, float stretch) +{ + m.kaiserWidth = width; + m.kaiserAlpha = alpha; + m.kaiserStretch = stretch; +} + +/// Indicate whether input is a normal map or not. +void InputOptions::setNormalMap(bool b) +{ + m.isNormalMap = b; +} + +/// Enable normal map conversion. +void InputOptions::setConvertToNormalMap(bool convert) +{ + m.convertToNormalMap = convert; +} + +/// Set height evaluation factors. +void InputOptions::setHeightEvaluation(float redScale, float greenScale, float blueScale, float alphaScale) +{ + // Do not normalize height factors. +// float total = redScale + greenScale + blueScale + alphaScale; + m.heightFactors = Vector4(redScale, greenScale, blueScale, alphaScale); +} + +/// Set normal map conversion filter. +void InputOptions::setNormalFilter(float small, float medium, float big, float large) +{ + float total = small + medium + big + large; + m.bumpFrequencyScale = Vector4(small, medium, big, large) / total; +} + +/// Enable mipmap normalization. +void InputOptions::setNormalizeMipmaps(bool normalize) +{ + m.normalizeMipmaps = normalize; +} + +void InputOptions::setMaxExtents(int e) +{ + nvDebugCheck(e > 0); + m.maxExtent = e; +} + +void InputOptions::setRoundMode(RoundMode mode) +{ + m.roundMode = mode; +} Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/OptimalCompressDXT.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/OptimalCompressDXT.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/OptimalCompressDXT.h @@ -1,49 +1,63 @@ -// Copyright NVIDIA Corporation 2008 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#ifndef NV_TT_OPTIMALCOMPRESSDXT_H -#define NV_TT_OPTIMALCOMPRESSDXT_H - -#include - -namespace nv -{ - struct ColorBlock; - struct BlockDXT1; - struct BlockDXT3; - struct BlockDXT5; - struct AlphaBlockDXT3; - struct AlphaBlockDXT5; - - namespace OptimalCompress - { - void compressDXT1(Color32 rgba, BlockDXT1 * dxtBlock); - void compressDXT1a(Color32 rgba, BlockDXT1 * dxtBlock); - - void compressDXT1G(const ColorBlock & rgba, BlockDXT1 * block); - void compressDXT3A(const ColorBlock & rgba, AlphaBlockDXT3 * dxtBlock); - void compressDXT5A(const ColorBlock & rgba, AlphaBlockDXT5 * dxtBlock); - } -} // nv namespace - -#endif // NV_TT_OPTIMALCOMPRESSDXT_H +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
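// Illustration (a hedged sketch, not part of the upstream patch): the NVTT 2.1
// declarations below change compressDXT1a to take an explicit per-texel
// alphaMask and add AlphaBlock4x4 overloads for the DXT3/DXT5 alpha paths.
// Minimal usage sketch for the single-colour path; the "nvimage/BlockDXT.h"
// location of nv::BlockDXT1 and the helper name encodeSolidBlock are
// assumptions, everything else follows the declarations in this header.

#include "nvimage/BlockDXT.h"      // nv::BlockDXT1 (assumed location)
#include "OptimalCompressDXT.h"

static void encodeSolidBlock(nv::BlockDXT1 * block)
{
    nv::Color32 c;                 // nv::Color32 comes from nvmath/Color.h, included by this header
    c.r = 255; c.g = 64; c.b = 0; c.a = 255;

    // Table-driven optimal endpoints for a uniform 4x4 block.
    nv::OptimalCompress::compressDXT1(c, block);

    // For blocks containing transparent texels, the DXT1a variant additionally
    // takes the transparency index mask:
    // nv::OptimalCompress::compressDXT1a(c, alphaMask, block);
}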
+ +#ifndef NV_TT_OPTIMALCOMPRESSDXT_H +#define NV_TT_OPTIMALCOMPRESSDXT_H + +//#include "nvimage/nvimage.h" + +#include "nvmath/Color.h" + +namespace nv +{ + struct ColorSet; + struct ColorBlock; + struct BlockDXT1; + struct BlockDXT3; + struct BlockDXT5; + struct AlphaBlockDXT3; + struct AlphaBlockDXT5; + struct AlphaBlock4x4; + + namespace OptimalCompress + { + // Single color compressors: + void compressDXT1(Color32 rgba, BlockDXT1 * dxtBlock); + void compressDXT1a(Color32 rgba, uint alphaMask, BlockDXT1 * dxtBlock); + void compressDXT1G(uint8 g, BlockDXT1 * dxtBlock); + + void compressDXT3A(const AlphaBlock4x4 & src, AlphaBlockDXT3 * dst); + void compressDXT5A(const AlphaBlock4x4 & src, AlphaBlockDXT5 * dst); + + void compressDXT1G(const ColorBlock & src, BlockDXT1 * dst); + void compressDXT3A(const ColorBlock & src, AlphaBlockDXT3 * dst); + void compressDXT5A(const ColorBlock & src, AlphaBlockDXT5 * dst); + + void compressDXT1_Luma(const ColorBlock & src, BlockDXT1 * dst); + + void compressDXT5A_RGBM(const ColorSet & src, const ColorBlock & RGB, AlphaBlockDXT5 * dst); + } +} // nv namespace + +#endif // NV_TT_OPTIMALCOMPRESSDXT_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/OptimalCompressDXT.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/OptimalCompressDXT.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/OptimalCompressDXT.cpp @@ -1,368 +1,812 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
- -#include // swap - -#include - -#include -#include - -#include "OptimalCompressDXT.h" -#include "SingleColorLookup.h" - - -using namespace nv; -using namespace OptimalCompress; - - - -namespace -{ - static int computeGreenError(const ColorBlock & rgba, const BlockDXT1 * block) - { - nvDebugCheck(block != NULL); - - int palette[4]; - palette[0] = (block->col0.g << 2) | (block->col0.g >> 4); - palette[1] = (block->col1.g << 2) | (block->col1.g >> 4); - palette[2] = (2 * palette[0] + palette[1]) / 3; - palette[3] = (2 * palette[1] + palette[0]) / 3; - - int totalError = 0; - - for (int i = 0; i < 16; i++) - { - const int green = rgba.color(i).g; - - int error = abs(green - palette[0]); - error = min(error, abs(green - palette[1])); - error = min(error, abs(green - palette[2])); - error = min(error, abs(green - palette[3])); - - totalError += error; - } - - return totalError; - } - - static uint computeGreenIndices(const ColorBlock & rgba, const Color32 palette[4]) - { - const int color0 = palette[0].g; - const int color1 = palette[1].g; - const int color2 = palette[2].g; - const int color3 = palette[3].g; - - uint indices = 0; - for (int i = 0; i < 16; i++) - { - const int color = rgba.color(i).g; - - uint d0 = abs(color0 - color); - uint d1 = abs(color1 - color); - uint d2 = abs(color2 - color); - uint d3 = abs(color3 - color); - - uint b0 = d0 > d3; - uint b1 = d1 > d2; - uint b2 = d0 > d2; - uint b3 = d1 > d3; - uint b4 = d2 > d3; - - uint x0 = b1 & b2; - uint x1 = b0 & b3; - uint x2 = b0 & b4; - - indices |= (x2 | ((x0 | x1) << 1)) << (2 * i); - } - - return indices; - } - - // Choose quantized color that produces less error. Used by DXT3 compressor. - inline static uint quantize4(uint8 a) - { - int q0 = (a >> 4) - 1; - int q1 = (a >> 4); - int q2 = (a >> 4) + 1; - - q0 = (q0 << 4) | q0; - q1 = (q1 << 4) | q1; - q2 = (q2 << 4) | q2; - - int d0 = abs(q0 - a); - int d1 = abs(q1 - a); - int d2 = abs(q2 - a); - - if (d0 < d1 && d0 < d2) return q0 >> 4; - if (d1 < d2) return q1 >> 4; - return q2 >> 4; - } - - static uint computeAlphaError(const ColorBlock & rgba, const AlphaBlockDXT5 * block) - { - uint8 alphas[8]; - block->evaluatePalette(alphas); - - uint totalError = 0; - - for (uint i = 0; i < 16; i++) - { - uint8 alpha = rgba.color(i).a; - - uint besterror = 256*256; - uint best; - for (uint p = 0; p < 8; p++) - { - int d = alphas[p] - alpha; - uint error = d * d; - - if (error < besterror) - { - besterror = error; - best = p; - } - } - - totalError += besterror; - } - - return totalError; - } - - static void computeAlphaIndices(const ColorBlock & rgba, AlphaBlockDXT5 * block) - { - uint8 alphas[8]; - block->evaluatePalette(alphas); - - for (uint i = 0; i < 16; i++) - { - uint8 alpha = rgba.color(i).a; - - uint besterror = 256*256; - uint best = 8; - for(uint p = 0; p < 8; p++) - { - int d = alphas[p] - alpha; - uint error = d * d; - - if (error < besterror) - { - besterror = error; - best = p; - } - } - nvDebugCheck(best < 8); - - block->setIndex(i, best); - } - } - -} // namespace - - - - - -// Single color compressor, based on: -// https://mollyrocket.com/forums/viewtopic.php?t=392 -void OptimalCompress::compressDXT1(Color32 c, BlockDXT1 * dxtBlock) -{ - dxtBlock->col0.r = OMatch5[c.r][0]; - dxtBlock->col0.g = OMatch6[c.g][0]; - dxtBlock->col0.b = OMatch5[c.b][0]; - dxtBlock->col1.r = OMatch5[c.r][1]; - dxtBlock->col1.g = OMatch6[c.g][1]; - dxtBlock->col1.b = OMatch5[c.b][1]; - dxtBlock->indices = 0xaaaaaaaa; - - if (dxtBlock->col0.u < dxtBlock->col1.u) - { - swap(dxtBlock->col0.u, 
dxtBlock->col1.u); - dxtBlock->indices ^= 0x55555555; - } -} - -void OptimalCompress::compressDXT1a(Color32 rgba, BlockDXT1 * dxtBlock) -{ - if (rgba.a < 128) - { - dxtBlock->col0.u = 0; - dxtBlock->col1.u = 0; - dxtBlock->indices = 0xFFFFFFFF; - } - else - { - compressDXT1(rgba, dxtBlock); - } -} - - -// Brute force green channel compressor -void OptimalCompress::compressDXT1G(const ColorBlock & rgba, BlockDXT1 * block) -{ - nvDebugCheck(block != NULL); - - uint8 ming = 63; - uint8 maxg = 0; - - // Get min/max green. - for (uint i = 0; i < 16; i++) - { - uint8 green = rgba.color(i).g >> 2; - ming = min(ming, green); - maxg = max(maxg, green); - } - - block->col0.r = 31; - block->col1.r = 31; - block->col0.g = maxg; - block->col1.g = ming; - block->col0.b = 0; - block->col1.b = 0; - - if (maxg - ming > 4) - { - int besterror = computeGreenError(rgba, block); - int bestg0 = maxg; - int bestg1 = ming; - - for (int g0 = ming+5; g0 < maxg; g0++) - { - for (int g1 = ming; g1 < g0-4; g1++) - { - if ((maxg-g0) + (g1-ming) > besterror) - continue; - - block->col0.g = g0; - block->col1.g = g1; - int error = computeGreenError(rgba, block); - - if (error < besterror) - { - besterror = error; - bestg0 = g0; - bestg1 = g1; - } - } - } - - block->col0.g = bestg0; - block->col1.g = bestg1; - } - - Color32 palette[4]; - block->evaluatePalette(palette); - block->indices = computeGreenIndices(rgba, palette); -} - -void OptimalCompress::compressDXT3A(const ColorBlock & rgba, AlphaBlockDXT3 * dxtBlock) -{ - dxtBlock->alpha0 = quantize4(rgba.color(0).a); - dxtBlock->alpha1 = quantize4(rgba.color(1).a); - dxtBlock->alpha2 = quantize4(rgba.color(2).a); - dxtBlock->alpha3 = quantize4(rgba.color(3).a); - dxtBlock->alpha4 = quantize4(rgba.color(4).a); - dxtBlock->alpha5 = quantize4(rgba.color(5).a); - dxtBlock->alpha6 = quantize4(rgba.color(6).a); - dxtBlock->alpha7 = quantize4(rgba.color(7).a); - dxtBlock->alpha8 = quantize4(rgba.color(8).a); - dxtBlock->alpha9 = quantize4(rgba.color(9).a); - dxtBlock->alphaA = quantize4(rgba.color(10).a); - dxtBlock->alphaB = quantize4(rgba.color(11).a); - dxtBlock->alphaC = quantize4(rgba.color(12).a); - dxtBlock->alphaD = quantize4(rgba.color(13).a); - dxtBlock->alphaE = quantize4(rgba.color(14).a); - dxtBlock->alphaF = quantize4(rgba.color(15).a); -} - - -void OptimalCompress::compressDXT5A(const ColorBlock & rgba, AlphaBlockDXT5 * dxtBlock) -{ - uint8 mina = 255; - uint8 maxa = 0; - - // Get min/max alpha. - for (uint i = 0; i < 16; i++) - { - uint8 alpha = rgba.color(i).a; - mina = min(mina, alpha); - maxa = max(maxa, alpha); - } - - dxtBlock->alpha0 = maxa; - dxtBlock->alpha1 = mina; - - /*int centroidDist = 256; - int centroid; - - // Get the closest to the centroid. 
- for (uint i = 0; i < 16; i++) - { - uint8 alpha = rgba.color(i).a; - int dist = abs(alpha - (maxa + mina) / 2); - if (dist < centroidDist) - { - centroidDist = dist; - centroid = alpha; - } - }*/ - - if (maxa - mina > 8) - { - int besterror = computeAlphaError(rgba, dxtBlock); - int besta0 = maxa; - int besta1 = mina; - - for (int a0 = mina+9; a0 < maxa; a0++) - { - for (int a1 = mina; a1 < a0-8; a1++) - //for (int a1 = mina; a1 < maxa; a1++) - { - //nvCheck(abs(a1-a0) > 8); - - //if (abs(a0 - a1) < 8) continue; - //if ((maxa-a0) + (a1-mina) + min(abs(centroid-a0), abs(centroid-a1)) > besterror) - if ((maxa-a0) + (a1-mina) > besterror) - continue; - - dxtBlock->alpha0 = a0; - dxtBlock->alpha1 = a1; - int error = computeAlphaError(rgba, dxtBlock); - - if (error < besterror) - { - besterror = error; - besta0 = a0; - besta1 = a1; - } - } - } - - dxtBlock->alpha0 = besta0; - dxtBlock->alpha1 = besta1; - } - - computeAlphaIndices(rgba, dxtBlock); -} - +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
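// Illustration (a hedged sketch, not the library code below): compressDXT1G and
// compressDXT5A in this file share one pattern — take the channel's min/max,
// widen that window slightly, then brute-force every endpoint pair, scoring each
// candidate with a squared-distance error that exits early once it exceeds the
// best error found so far. The helper names paletteError4 and searchEndpointPair
// are hypothetical; the real code uses computeGreenError/computeAlphaError.

#include <climits>   // INT_MAX

static int paletteError4(const unsigned char values[16], int hi, int lo, int bestError)
{
    // Two endpoints plus two interpolants, as in the DXT1 green ramp.
    const int palette[4] = { hi, lo, (2 * hi + lo) / 3, (hi + 2 * lo) / 3 };

    int total = 0;
    for (int i = 0; i < 16; i++)
    {
        int best = INT_MAX;
        for (int p = 0; p < 4; p++)
        {
            const int d = values[i] - palette[p];
            if (d * d < best) best = d * d;
        }
        total += best;
        if (total > bestError) return total;   // early out, as in computeGreenError
    }
    return total;
}

static void searchEndpointPair(const unsigned char values[16], int & bestHi, int & bestLo)
{
    int minv = 255, maxv = 0;
    for (int i = 0; i < 16; i++)
    {
        if (values[i] < minv) minv = values[i];
        if (values[i] > maxv) maxv = values[i];
    }

    // Widen the search window slightly, clamped to the representable range.
    const int expand = 4;
    minv = (minv <= expand) ? 0 : minv - expand;
    maxv = (maxv >= 255 - expand) ? 255 : maxv + expand;

    int bestError = INT_MAX;
    bestHi = maxv;
    bestLo = minv;

    for (int hi = minv + 1; hi <= maxv; hi++)
    {
        for (int lo = minv; lo < hi; lo++)
        {
            const int error = paletteError4(values, hi, lo, bestError);
            if (error < bestError)
            {
                bestError = error;
                bestHi = hi;
                bestLo = lo;
            }
        }
    }
}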
+ +#include "OptimalCompressDXT.h" +#include "SingleColorLookup.h" + +#include +#include + +#include + +#include // swap + +#include // INT_MAX +#include // FLT_MAX + +using namespace nv; +using namespace OptimalCompress; + + + +namespace +{ + static int greenDistance(int g0, int g1) + { + //return abs(g0 - g1); + int d = g0 - g1; + return d * d; + } + + static int alphaDistance(int a0, int a1) + { + //return abs(a0 - a1); + int d = a0 - a1; + return d * d; + } + + /*static uint nearestGreen4(uint green, uint maxGreen, uint minGreen) + { + uint bias = maxGreen + (maxGreen - minGreen) / 6; + + uint index = 0; + if (maxGreen - minGreen != 0) index = clamp(3 * (bias - green) / (maxGreen - minGreen), 0U, 3U); + + return (index * minGreen + (3 - index) * maxGreen) / 3; + }*/ + + static int computeGreenError(const ColorBlock & rgba, const BlockDXT1 * block, int bestError = INT_MAX) + { + nvDebugCheck(block != NULL); + + // uint g0 = (block->col0.g << 2) | (block->col0.g >> 4); + // uint g1 = (block->col1.g << 2) | (block->col1.g >> 4); + + int palette[4]; + palette[0] = (block->col0.g << 2) | (block->col0.g >> 4); + palette[1] = (block->col1.g << 2) | (block->col1.g >> 4); + palette[2] = (2 * palette[0] + palette[1]) / 3; + palette[3] = (2 * palette[1] + palette[0]) / 3; + + int totalError = 0; + for (int i = 0; i < 16; i++) + { + const int green = rgba.color(i).g; + + int error = greenDistance(green, palette[0]); + error = min(error, greenDistance(green, palette[1])); + error = min(error, greenDistance(green, palette[2])); + error = min(error, greenDistance(green, palette[3])); + + totalError += error; + + // totalError += nearestGreen4(green, g0, g1); + + if (totalError > bestError) + { + // early out + return totalError; + } + } + + return totalError; + } + + static uint computeGreenIndices(const ColorBlock & rgba, const Color32 palette[4]) + { + const int color0 = palette[0].g; + const int color1 = palette[1].g; + const int color2 = palette[2].g; + const int color3 = palette[3].g; + + uint indices = 0; + for (int i = 0; i < 16; i++) + { + const int color = rgba.color(i).g; + + uint d0 = greenDistance(color0, color); + uint d1 = greenDistance(color1, color); + uint d2 = greenDistance(color2, color); + uint d3 = greenDistance(color3, color); + + uint b0 = d0 > d3; + uint b1 = d1 > d2; + uint b2 = d0 > d2; + uint b3 = d1 > d3; + uint b4 = d2 > d3; + + uint x0 = b1 & b2; + uint x1 = b0 & b3; + uint x2 = b0 & b4; + + indices |= (x2 | ((x0 | x1) << 1)) << (2 * i); + } + + return indices; + } + + // Choose quantized color that produces less error. Used by DXT3 compressor. 
+ inline static uint quantize4(uint8 a) + { + int q0 = max(int(a >> 4) - 1, 0); + int q1 = (a >> 4); + int q2 = min(int(a >> 4) + 1, 0xF); + + q0 = (q0 << 4) | q0; + q1 = (q1 << 4) | q1; + q2 = (q2 << 4) | q2; + + int d0 = alphaDistance(q0, a); + int d1 = alphaDistance(q1, a); + int d2 = alphaDistance(q2, a); + + if (d0 < d1 && d0 < d2) return q0 >> 4; + if (d1 < d2) return q1 >> 4; + return q2 >> 4; + } + + static uint nearestAlpha8(uint alpha, uint maxAlpha, uint minAlpha) + { + float bias = maxAlpha + float(maxAlpha - minAlpha) / (2.0f * 7.0f); + float scale = 7.0f / float(maxAlpha - minAlpha); + + uint index = (uint)clamp((bias - float(alpha)) * scale, 0.0f, 7.0f); + + return (index * minAlpha + (7 - index) * maxAlpha) / 7; + } + + /*static uint computeAlphaError8(const ColorBlock & rgba, const AlphaBlockDXT5 * block, int bestError = INT_MAX) + { + int totalError = 0; + + for (uint i = 0; i < 16; i++) + { + uint8 alpha = rgba.color(i).a; + + totalError += alphaDistance(alpha, nearestAlpha8(alpha, block->alpha0, block->alpha1)); + + if (totalError > bestError) + { + // early out + return totalError; + } + } + + return totalError; + }*/ + + static float computeAlphaError(const AlphaBlock4x4 & src, const AlphaBlockDXT5 * dst, float bestError = FLT_MAX) + { + uint8 alphas[8]; + dst->evaluatePalette(alphas, false); // @@ Use target decoder. + + float totalError = 0; + + for (uint i = 0; i < 16; i++) + { + uint8 alpha = src.alpha[i]; + + int minDist = INT_MAX; + for (uint p = 0; p < 8; p++) + { + int dist = alphaDistance(alpha, alphas[p]); + minDist = min(dist, minDist); + } + + totalError += minDist * src.weights[i]; + + if (totalError > bestError) + { + // early out + return totalError; + } + } + + return totalError; + } + + static void computeAlphaIndices(const AlphaBlock4x4 & src, AlphaBlockDXT5 * dst) + { + uint8 alphas[8]; + dst->evaluatePalette(alphas, /*d3d9=*/false); // @@ Use target decoder. 
+ + for (uint i = 0; i < 16; i++) + { + uint8 alpha = src.alpha[i]; + + int minDist = INT_MAX; + int bestIndex = 8; + for (uint p = 0; p < 8; p++) + { + int dist = alphaDistance(alpha, alphas[p]); + + if (dist < minDist) + { + minDist = dist; + bestIndex = p; + } + } + nvDebugCheck(bestIndex < 8); + + dst->setIndex(i, bestIndex); + } + } + +} // namespace + + + + + +// Single color compressor, based on: +// https://mollyrocket.com/forums/viewtopic.php?t=392 +void OptimalCompress::compressDXT1(Color32 c, BlockDXT1 * dxtBlock) +{ + dxtBlock->col0.r = OMatch5[c.r][0]; + dxtBlock->col0.g = OMatch6[c.g][0]; + dxtBlock->col0.b = OMatch5[c.b][0]; + dxtBlock->col1.r = OMatch5[c.r][1]; + dxtBlock->col1.g = OMatch6[c.g][1]; + dxtBlock->col1.b = OMatch5[c.b][1]; + dxtBlock->indices = 0xaaaaaaaa; + + if (dxtBlock->col0.u < dxtBlock->col1.u) + { + swap(dxtBlock->col0.u, dxtBlock->col1.u); + dxtBlock->indices ^= 0x55555555; + } +} + +void OptimalCompress::compressDXT1a(Color32 c, uint alphaMask, BlockDXT1 * dxtBlock) +{ + if (alphaMask == 0) { + compressDXT1(c, dxtBlock); + } + else { + dxtBlock->col0.r = OMatchAlpha5[c.r][0]; + dxtBlock->col0.g = OMatchAlpha6[c.g][0]; + dxtBlock->col0.b = OMatchAlpha5[c.b][0]; + dxtBlock->col1.r = OMatchAlpha5[c.r][1]; + dxtBlock->col1.g = OMatchAlpha6[c.g][1]; + dxtBlock->col1.b = OMatchAlpha5[c.b][1]; + dxtBlock->indices = 0xaaaaaaaa; // 0b1010..1010 + + if (dxtBlock->col0.u > dxtBlock->col1.u) + { + swap(dxtBlock->col0.u, dxtBlock->col1.u); + } + + dxtBlock->indices |= alphaMask; + } +} + +void OptimalCompress::compressDXT1G(uint8 g, BlockDXT1 * dxtBlock) +{ + dxtBlock->col0.r = 31; + dxtBlock->col0.g = OMatch6[g][0]; + dxtBlock->col0.b = 0; + dxtBlock->col1.r = 31; + dxtBlock->col1.g = OMatch6[g][1]; + dxtBlock->col1.b = 0; + dxtBlock->indices = 0xaaaaaaaa; + + if (dxtBlock->col0.u < dxtBlock->col1.u) + { + swap(dxtBlock->col0.u, dxtBlock->col1.u); + dxtBlock->indices ^= 0x55555555; + } +} + + +// Brute force green channel compressor +void OptimalCompress::compressDXT1G(const ColorBlock & rgba, BlockDXT1 * block) +{ + nvDebugCheck(block != NULL); + + uint8 ming = 63; + uint8 maxg = 0; + + bool isSingleColor = true; + uint8 singleColor = rgba.color(0).g; + + // Get min/max green. + for (uint i = 0; i < 16; i++) + { + uint8 green = (rgba.color(i).g + 1) >> 2; + ming = min(ming, green); + maxg = max(maxg, green); + + if (rgba.color(i).g != singleColor) isSingleColor = false; + } + + if (isSingleColor) + { + compressDXT1G(singleColor, block); + return; + } + + block->col0.r = 31; + block->col1.r = 31; + block->col0.g = maxg; + block->col1.g = ming; + block->col0.b = 0; + block->col1.b = 0; + + int bestError = computeGreenError(rgba, block); + int bestg0 = maxg; + int bestg1 = ming; + + // Expand search space a bit. + const int greenExpand = 4; + ming = (ming <= greenExpand) ? 0 : ming - greenExpand; + maxg = (maxg >= 63-greenExpand) ? 63 : maxg + greenExpand; + + for (int g0 = ming+1; g0 <= maxg; g0++) + { + for (int g1 = ming; g1 < g0; g1++) + { + block->col0.g = g0; + block->col1.g = g1; + int error = computeGreenError(rgba, block, bestError); + + if (error < bestError) + { + bestError = error; + bestg0 = g0; + bestg1 = g1; + } + } + } + + block->col0.g = bestg0; + block->col1.g = bestg1; + + nvDebugCheck(bestg0 == bestg1 || block->isFourColorMode()); + + + Color32 palette[4]; + block->evaluatePalette(palette, false); // @@ Use target decoder. 
+ block->indices = computeGreenIndices(rgba, palette); +} + + +/*void OptimalCompress::initLumaTables() { + + // For all possible color pairs: + for (int c0 = 0; c0 < 65536; c0++) { + for (int c1 = 0; c1 < 65536; c1++) { + + // Compute + + } + } + + + for (int r = 0; r < 1<<5; r++) { + for (int g = 0; g < 1<<6; g++) { + for (int b = 0; b < 1<<5; b++) { + + + } + } + } +}*/ + + +// Brute force Luma compressor +void OptimalCompress::compressDXT1_Luma(const ColorBlock & rgba, BlockDXT1 * block) +{ + nvDebugCheck(block != NULL); + + // F_YR = 19595/65536.0f, F_YG = 38470/65536.0f, F_YB = 7471/65536.0f; + // 195841 + //if ( + + + /* + uint8 ming = 63; + uint8 maxg = 0; + + bool isSingleColor = true; + uint8 singleColor = rgba.color(0).g; + + // Get min/max green. + for (uint i = 0; i < 16; i++) + { + uint8 green = (rgba.color(i).g + 1) >> 2; + ming = min(ming, green); + maxg = max(maxg, green); + + if (rgba.color(i).g != singleColor) isSingleColor = false; + } + + if (isSingleColor) + { + compressDXT1G(singleColor, block); + return; + } + + block->col0.r = 31; + block->col1.r = 31; + block->col0.g = maxg; + block->col1.g = ming; + block->col0.b = 0; + block->col1.b = 0; + + int bestError = computeGreenError(rgba, block); + int bestg0 = maxg; + int bestg1 = ming; + + // Expand search space a bit. + const int greenExpand = 4; + ming = (ming <= greenExpand) ? 0 : ming - greenExpand; + maxg = (maxg >= 63-greenExpand) ? 63 : maxg + greenExpand; + + for (int g0 = ming+1; g0 <= maxg; g0++) + { + for (int g1 = ming; g1 < g0; g1++) + { + block->col0.g = g0; + block->col1.g = g1; + int error = computeGreenError(rgba, block, bestError); + + if (error < bestError) + { + bestError = error; + bestg0 = g0; + bestg1 = g1; + } + } + } + + block->col0.g = bestg0; + block->col1.g = bestg1; + + nvDebugCheck(bestg0 == bestg1 || block->isFourColorMode()); + */ + + Color32 palette[4]; + block->evaluatePalette(palette, false); // @@ Use target decoder. + block->indices = computeGreenIndices(rgba, palette); +} + + +void OptimalCompress::compressDXT3A(const AlphaBlock4x4 & src, AlphaBlockDXT3 * dst) +{ + dst->alpha0 = quantize4(src.alpha[0]); + dst->alpha1 = quantize4(src.alpha[1]); + dst->alpha2 = quantize4(src.alpha[2]); + dst->alpha3 = quantize4(src.alpha[3]); + dst->alpha4 = quantize4(src.alpha[4]); + dst->alpha5 = quantize4(src.alpha[5]); + dst->alpha6 = quantize4(src.alpha[6]); + dst->alpha7 = quantize4(src.alpha[7]); + dst->alpha8 = quantize4(src.alpha[8]); + dst->alpha9 = quantize4(src.alpha[9]); + dst->alphaA = quantize4(src.alpha[10]); + dst->alphaB = quantize4(src.alpha[11]); + dst->alphaC = quantize4(src.alpha[12]); + dst->alphaD = quantize4(src.alpha[13]); + dst->alphaE = quantize4(src.alpha[14]); + dst->alphaF = quantize4(src.alpha[15]); +} + +void OptimalCompress::compressDXT3A(const ColorBlock & src, AlphaBlockDXT3 * dst) +{ + AlphaBlock4x4 tmp; + tmp.init(src, 3); + compressDXT3A(tmp, dst); +} + +void OptimalCompress::compressDXT5A(const AlphaBlock4x4 & src, AlphaBlockDXT5 * dst) +{ + uint8 mina = 255; + uint8 maxa = 0; + + uint8 mina_no01 = 255; + uint8 maxa_no01 = 0; + + // Get min/max alpha. 
+ for (uint i = 0; i < 16; i++) + { + uint8 alpha = src.alpha[i]; + mina = min(mina, alpha); + maxa = max(maxa, alpha); + + if (alpha != 0 && alpha != 255) { + mina_no01 = min(mina_no01, alpha); + maxa_no01 = max(maxa_no01, alpha); + } + } + + if (maxa - mina < 8) { + dst->alpha0 = maxa; + dst->alpha1 = mina; + + nvDebugCheck(computeAlphaError(src, dst) == 0); + } + else if (maxa_no01 - mina_no01 < 6) { + dst->alpha0 = mina_no01; + dst->alpha1 = maxa_no01; + + nvDebugCheck(computeAlphaError(src, dst) == 0); + } + else { + float besterror = computeAlphaError(src, dst); + int besta0 = maxa; + int besta1 = mina; + + // Expand search space a bit. + const int alphaExpand = 8; + mina = (mina <= alphaExpand) ? 0 : mina - alphaExpand; + maxa = (maxa >= 255-alphaExpand) ? 255 : maxa + alphaExpand; + + for (int a0 = mina+9; a0 < maxa; a0++) + { + for (int a1 = mina; a1 < a0-8; a1++) + { + nvDebugCheck(a0 - a1 > 8); + + dst->alpha0 = a0; + dst->alpha1 = a1; + float error = computeAlphaError(src, dst, besterror); + + if (error < besterror) + { + besterror = error; + besta0 = a0; + besta1 = a1; + } + } + } + + // Try using the 6 step encoding. + /*if (mina == 0 || maxa == 255)*/ { + + // Expand search space a bit. + const int alphaExpand = 6; + mina_no01 = (mina_no01 <= alphaExpand) ? 0 : mina_no01 - alphaExpand; + maxa_no01 = (maxa_no01 >= 255 - alphaExpand) ? 255 : maxa_no01 + alphaExpand; + + for (int a0 = mina_no01 + 9; a0 < maxa_no01; a0++) + { + for (int a1 = mina_no01; a1 < a0 - 8; a1++) + { + nvDebugCheck(a0 - a1 > 8); + + dst->alpha0 = a1; + dst->alpha1 = a0; + float error = computeAlphaError(src, dst, besterror); + + if (error < besterror) + { + besterror = error; + besta0 = a1; + besta1 = a0; + } + } + } + } + + dst->alpha0 = besta0; + dst->alpha1 = besta1; + } + + computeAlphaIndices(src, dst); +} + + +void OptimalCompress::compressDXT5A(const ColorBlock & src, AlphaBlockDXT5 * dst) +{ + AlphaBlock4x4 tmp; + tmp.init(src, 3); + compressDXT5A(tmp, dst); +} + +#if 0 +#include "nvmath/Vector.inl" +#include "nvmath/ftoi.h" +const float threshold = 0.15f; + +static float computeAlphaError_RGBM(const ColorSet & src, const ColorBlock & RGB, const AlphaBlockDXT5 * dst, float bestError = FLT_MAX) +{ + uint8 alphas[8]; + dst->evaluatePalette(alphas, /*d3d9=*/false); // @@ Use target decoder. + + float totalError = 0; + + for (uint i = 0; i < 16; i++) + { + float R = src.color(i).x; + float G = src.color(i).y; + float B = src.color(i).z; + + float r = float(RGB.color(i).r) / 255.0f; + float g = float(RGB.color(i).g) / 255.0f; + float b = float(RGB.color(i).b) / 255.0f; + + float minDist = FLT_MAX; + for (uint p = 0; p < 8; p++) + { + // Compute M. + float M = float(alphas[p]) / 255.0f * (1 - threshold) + threshold; + + // Decode color. + float fr = r * M; + float fg = g * M; + float fb = b * M; + + // Measure error. + float error = square(R - fr) + square(G - fg) + square(B - fb); + + minDist = min(error, minDist); + } + + totalError += minDist * src.weights[i]; + + if (totalError > bestError) + { + // early out + return totalError; + } + } + + return totalError; +} + +static void computeAlphaIndices_RGBM(const ColorSet & src, const ColorBlock & RGB, AlphaBlockDXT5 * dst) +{ + uint8 alphas[8]; + dst->evaluatePalette(alphas, /*d3d9=*/false); // @@ Use target decoder. 
+ + for (uint i = 0; i < 16; i++) + { + float R = src.color(i).x; + float G = src.color(i).y; + float B = src.color(i).z; + + float r = float(RGB.color(i).r) / 255.0f; + float g = float(RGB.color(i).g) / 255.0f; + float b = float(RGB.color(i).b) / 255.0f; + + float minDist = FLT_MAX; + int bestIndex = 8; + for (uint p = 0; p < 8; p++) + { + // Compute M. + float M = float(alphas[p]) / 255.0f * (1 - threshold) + threshold; + + // Decode color. + float fr = r * M; + float fg = g * M; + float fb = b * M; + + // Measure error. + float error = square(R - fr) + square(G - fg) + square(B - fb); + + if (error < minDist) + { + minDist = error; + bestIndex = p; + } + } + nvDebugCheck(bestIndex < 8); + + dst->setIndex(i, bestIndex); + } +} + + +void OptimalCompress::compressDXT5A_RGBM(const ColorSet & src, const ColorBlock & RGB, AlphaBlockDXT5 * dst) +{ + uint8 mina = 255; + uint8 maxa = 0; + + uint8 mina_no01 = 255; + uint8 maxa_no01 = 0; + + // Get min/max alpha. + /*for (uint i = 0; i < 16; i++) + { + uint8 alpha = src.alpha[i]; + mina = min(mina, alpha); + maxa = max(maxa, alpha); + + if (alpha != 0 && alpha != 255) { + mina_no01 = min(mina_no01, alpha); + maxa_no01 = max(maxa_no01, alpha); + } + }*/ + mina = 0; + maxa = 255; + mina_no01 = 0; + maxa_no01 = 255; + + /*if (maxa - mina < 8) { + dst->alpha0 = maxa; + dst->alpha1 = mina; + + nvDebugCheck(computeAlphaError(src, dst) == 0); + } + else if (maxa_no01 - mina_no01 < 6) { + dst->alpha0 = mina_no01; + dst->alpha1 = maxa_no01; + + nvDebugCheck(computeAlphaError(src, dst) == 0); + } + else*/ + { + float besterror = computeAlphaError_RGBM(src, RGB, dst); + int besta0 = maxa; + int besta1 = mina; + + // Expand search space a bit. + const int alphaExpand = 8; + mina = (mina <= alphaExpand) ? 0 : mina - alphaExpand; + maxa = (maxa >= 255 - alphaExpand) ? 255 : maxa + alphaExpand; + + for (int a0 = mina + 9; a0 < maxa; a0++) + { + for (int a1 = mina; a1 < a0 - 8; a1++) + { + nvDebugCheck(a0 - a1 > 8); + + dst->alpha0 = a0; + dst->alpha1 = a1; + float error = computeAlphaError_RGBM(src, RGB, dst, besterror); + + if (error < besterror) + { + besterror = error; + besta0 = a0; + besta1 = a1; + } + } + } + + // Try using the 6 step encoding. + /*if (mina == 0 || maxa == 255)*/ { + + // Expand search space a bit. + const int alphaExpand = 6; + mina_no01 = (mina_no01 <= alphaExpand) ? 0 : mina_no01 - alphaExpand; + maxa_no01 = (maxa_no01 >= 255 - alphaExpand) ? 
255 : maxa_no01 + alphaExpand; + + for (int a0 = mina_no01 + 9; a0 < maxa_no01; a0++) + { + for (int a1 = mina_no01; a1 < a0 - 8; a1++) + { + nvDebugCheck(a0 - a1 > 8); + + dst->alpha0 = a1; + dst->alpha1 = a0; + float error = computeAlphaError_RGBM(src, RGB, dst, besterror); + + if (error < besterror) + { + besterror = error; + besta0 = a1; + besta1 = a0; + } + } + } + } + + dst->alpha0 = besta0; + dst->alpha1 = besta1; + } + + computeAlphaIndices_RGBM(src, RGB, dst); +} +#endif // 0 \ No newline at end of file Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/OutputOptions.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/OutputOptions.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/OutputOptions.h @@ -1,76 +1,95 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#ifndef NV_TT_OUTPUTOPTIONS_H -#define NV_TT_OUTPUTOPTIONS_H - -#include -#include -#include "nvtt.h" - -namespace nvtt -{ - - struct DefaultOutputHandler : public nvtt::OutputHandler - { - DefaultOutputHandler(const char * fileName) : stream(fileName) {} - - virtual ~DefaultOutputHandler() - { - } - - virtual void beginImage(int size, int width, int height, int depth, int face, int miplevel) - { - // ignore. - } - - // Output data. 
- virtual bool writeData(const void * data, int size) - { - stream.serialize(const_cast(data), size); - - //return !stream.isError(); - return true; - } - - nv::StdOutputStream stream; - }; - - - struct OutputOptions::Private - { - nv::Path fileName; - - mutable OutputHandler * outputHandler; - ErrorHandler * errorHandler; - bool outputHeader; - - bool openFile() const; - void closeFile() const; - }; - - -} // nvtt namespace - - -#endif // NV_TT_OUTPUTOPTIONS_H +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#ifndef NV_TT_OUTPUTOPTIONS_H +#define NV_TT_OUTPUTOPTIONS_H + +#include "nvtt.h" + +#include "nvcore/StrLib.h" // Path +#include "nvcore/StdStream.h" + + +namespace nvtt +{ + + struct DefaultOutputHandler : public nvtt::OutputHandler + { + DefaultOutputHandler(const char * fileName) : stream(fileName) {} + DefaultOutputHandler(FILE * fp) : stream(fp, false) {} + + virtual ~DefaultOutputHandler() {} + + virtual void beginImage(int size, int width, int height, int depth, int face, int miplevel) + { + // ignore. + } + + // Output data. + virtual bool writeData(const void * data, int size) + { + stream.serialize(const_cast(data), size); + + //return !stream.isError(); + return true; + } + + virtual void endImage() + { + // ignore. + } + + nv::StdOutputStream stream; + }; + + + struct OutputOptions::Private + { + nv::Path fileName; + FILE * fileHandle; + + OutputHandler * outputHandler; + ErrorHandler * errorHandler; + + bool outputHeader; + Container container; + int version; + bool srgb; + bool deleteOutputHandler; + + void * wrapperProxy; // For the C/C# wrapper. 
+ + bool hasValidOutputHandler() const; + + void beginImage(int size, int width, int height, int depth, int face, int miplevel) const; + bool writeData(const void * data, int size) const; + void endImage() const; + void error(Error e) const; + }; + + +} // nvtt namespace + + +#endif // NV_TT_OUTPUTOPTIONS_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/OutputOptions.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/OutputOptions.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/OutputOptions.cpp @@ -1,102 +1,177 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include "OutputOptions.h" - -using namespace nvtt; - - -OutputOptions::OutputOptions() : m(*new OutputOptions::Private()) -{ - reset(); -} - -OutputOptions::~OutputOptions() -{ - delete &m; -} - -/// Set default output options. -void OutputOptions::reset() -{ - m.fileName.reset(); - m.outputHandler = NULL; - m.errorHandler = NULL; - m.outputHeader = true; -} - - -/// Set output file name. -void OutputOptions::setFileName(const char * fileName) -{ - m.fileName = fileName; - m.outputHandler = NULL; -} - -/// Set output handler. -void OutputOptions::setOutputHandler(OutputHandler * outputHandler) -{ - m.fileName.reset(); - m.outputHandler = outputHandler; -} - -/// Set error handler. -void OutputOptions::setErrorHandler(ErrorHandler * errorHandler) -{ - m.errorHandler = errorHandler; -} - -/// Set output header. 
-void OutputOptions::setOutputHeader(bool outputHeader) -{ - m.outputHeader = outputHeader; -} - - -bool OutputOptions::Private::openFile() const -{ - if (!fileName.isNull()) - { - nvCheck(outputHandler == NULL); - - DefaultOutputHandler * oh = new DefaultOutputHandler(fileName.str()); - if (oh->stream.isError()) - { - return false; - } - - outputHandler = oh; - } - - return true; -} - -void OutputOptions::Private::closeFile() const -{ - if (!fileName.isNull()) - { - delete outputHandler; - outputHandler = NULL; - } -} - +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include "OutputOptions.h" + +using namespace nvtt; + + +OutputOptions::OutputOptions() : m(*new OutputOptions::Private()) +{ + reset(); +} + +OutputOptions::~OutputOptions() +{ + // Cleanup output handler. + setOutputHandler(NULL); + + delete &m; +} + +/// Set default output options. +void OutputOptions::reset() +{ + m.fileName.reset(); + m.fileHandle = NULL; + + m.outputHandler = NULL; + m.errorHandler = NULL; + + m.outputHeader = true; + m.container = Container_DDS; + m.version = 0; + m.srgb = false; + m.deleteOutputHandler = false; +} + + +/// Set output file name. +void OutputOptions::setFileName(const char * fileName) +{ + if (m.deleteOutputHandler) + { + delete m.outputHandler; + } + + m.fileName = fileName; + m.fileHandle = NULL; + m.outputHandler = NULL; + m.deleteOutputHandler = false; + + DefaultOutputHandler * oh = new DefaultOutputHandler(fileName); + if (oh->stream.isError()) { + delete oh; + } + else { + m.deleteOutputHandler = true; + m.outputHandler = oh; + } +} + +/// Set output file handle. +void OutputOptions::setFileHandle(void * fp) +{ + if (m.deleteOutputHandler) { + delete m.outputHandler; + } + + m.fileName.reset(); + m.fileHandle = (FILE *)fp; + m.outputHandler = NULL; + m.deleteOutputHandler = false; + + DefaultOutputHandler * oh = new DefaultOutputHandler(m.fileHandle); + if (oh->stream.isError()) { + delete oh; + } + else { + m.deleteOutputHandler = true; + m.outputHandler = oh; + } +} + + +/// Set output handler. +void OutputOptions::setOutputHandler(OutputHandler * outputHandler) +{ + if (m.deleteOutputHandler) { + delete m.outputHandler; + } + + m.fileName.reset(); + m.fileHandle = NULL; + m.outputHandler = outputHandler; + m.deleteOutputHandler = false; +} + +/// Set error handler. 
+void OutputOptions::setErrorHandler(ErrorHandler * errorHandler) +{ + m.errorHandler = errorHandler; +} + +/// Set output header. +void OutputOptions::setOutputHeader(bool outputHeader) +{ + m.outputHeader = outputHeader; +} + +/// Set container. +void OutputOptions::setContainer(Container container) +{ + m.container = container; +} + +/// Set user version. +void OutputOptions::setUserVersion(int version) +{ + m.version = version; +} + +/// Set SRGB flag. +void OutputOptions::setSrgbFlag(bool b) +{ + m.srgb = b; +} + +bool OutputOptions::Private::hasValidOutputHandler() const +{ + if (!fileName.isNull() || fileHandle != NULL) + { + return outputHandler != NULL; + } + + return true; +} + +void OutputOptions::Private::beginImage(int size, int width, int height, int depth, int face, int miplevel) const +{ + if (outputHandler != NULL) outputHandler->beginImage(size, width, height, depth, face, miplevel); +} + +bool OutputOptions::Private::writeData(const void * data, int size) const +{ + return outputHandler == NULL || outputHandler->writeData(data, size); +} + +void OutputOptions::Private::endImage() const +{ + if (outputHandler != NULL) outputHandler->endImage(); +} + +void OutputOptions::Private::error(Error e) const +{ + if (errorHandler != NULL) errorHandler->error(e); +} Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/QuickCompressDXT.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/QuickCompressDXT.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/QuickCompressDXT.h @@ -1,50 +1,59 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
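[Editor's note on the OutputOptions changes above: the new Private helpers (hasValidOutputHandler, beginImage, writeData, endImage, error) funnel all compressor output through a single nvtt::OutputHandler, whether it was created internally by setFileName/setFileHandle or supplied by the caller. As a reference for callers, a minimal sketch of a user-provided handler that collects the compressed stream in memory; the class name and the buffer are illustrative, only the nvtt::OutputHandler interface itself comes from the bundled header.]

#include <nvtt/nvtt.h>
#include <vector>

// Collects everything NVTT writes (header plus compressed blocks) into RAM.
struct MemoryOutputHandler : public nvtt::OutputHandler
{
    std::vector<unsigned char> buffer;

    virtual void beginImage(int /*size*/, int /*width*/, int /*height*/,
                            int /*depth*/, int /*face*/, int /*miplevel*/) {}
    virtual void endImage() {}

    virtual bool writeData(const void * data, int size)
    {
        const unsigned char * bytes = static_cast<const unsigned char *>(data);
        buffer.insert(buffer.end(), bytes, bytes + size);
        return true; // returning false reports a write error and stops compression
    }
};

[Handlers passed in through setOutputHandler() are never deleted by OutputOptions; only the DefaultOutputHandler instances it creates itself are owned, tracked by the new deleteOutputHandler flag.]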
- -#ifndef NV_TT_QUICKCOMPRESSDXT_H -#define NV_TT_QUICKCOMPRESSDXT_H - -#include - -namespace nv -{ - struct ColorBlock; - struct BlockDXT1; - struct BlockDXT3; - struct BlockDXT5; - struct AlphaBlockDXT3; - struct AlphaBlockDXT5; - - namespace QuickCompress - { - void compressDXT1(const ColorBlock & rgba, BlockDXT1 * dxtBlock); - void compressDXT1a(const ColorBlock & rgba, BlockDXT1 * dxtBlock); - - void compressDXT3(const ColorBlock & rgba, BlockDXT3 * dxtBlock); - - void compressDXT5A(const ColorBlock & rgba, AlphaBlockDXT5 * dxtBlock, int iterationCount=8); - void compressDXT5(const ColorBlock & rgba, BlockDXT5 * dxtBlock, int iterationCount=8); - } -} // nv namespace - -#endif // NV_TT_QUICKCOMPRESSDXT_H +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
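[Editor's note: the compressDXT5A entry points declared just below target the standard BC3/DXT5 alpha block. As background, this is standard format knowledge rather than anything introduced by the patch; the struct and helper are a sketch, not code from the library.]

#include <stdint.h>

// Standard BC3/DXT5 alpha block: two 8-bit endpoints followed by
// sixteen 3-bit palette indices packed into 48 bits.
struct BC3AlphaBlock
{
    uint8_t alpha0;
    uint8_t alpha1;
    uint8_t bits[6]; // 16 * 3-bit indices
};

// Palette reconstruction: 8 interpolated alphas when alpha0 > alpha1,
// otherwise 6 interpolated alphas plus explicit 0 and 255.
static void evaluateAlphaPalette(uint8_t alpha0, uint8_t alpha1, uint8_t palette[8])
{
    palette[0] = alpha0;
    palette[1] = alpha1;
    if (alpha0 > alpha1) {
        for (int i = 1; i < 7; i++)
            palette[1 + i] = (uint8_t)(((7 - i) * alpha0 + i * alpha1) / 7);
    } else {
        for (int i = 1; i < 5; i++)
            palette[1 + i] = (uint8_t)(((5 - i) * alpha0 + i * alpha1) / 5);
        palette[6] = 0;
        palette[7] = 255;
    }
}

[computeAlphaIndices() and optimizeAlpha8() in the rewritten QuickCompressDXT.cpp further down search this palette per texel and then refit alpha0/alpha1 by least squares.]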
+ +#ifndef NV_TT_QUICKCOMPRESSDXT_H +#define NV_TT_QUICKCOMPRESSDXT_H + +#include + +namespace nv +{ + struct ColorBlock; + struct ColorSet; + struct AlphaBlock4x4; + struct BlockDXT1; + struct BlockDXT3; + struct BlockDXT5; + struct AlphaBlockDXT3; + struct AlphaBlockDXT5; + class Vector3; + + namespace QuickCompress + { + void compressDXT1(const ColorBlock & src, BlockDXT1 * dst); + void compressDXT1a(const ColorBlock & src, BlockDXT1 * dst); + + void compressDXT3(const ColorBlock & src, BlockDXT3 * dst); + + void compressDXT5A(const ColorBlock & src, AlphaBlockDXT5 * dst, int iterationCount=8); + void compressDXT5A(const AlphaBlock4x4 & src, AlphaBlockDXT5 * dst, int iterationCount=8); + + void compressDXT5(const ColorBlock & src, BlockDXT5 * dst, int iterationCount=8); + + void outputBlock4(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block); + void outputBlock3(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block); + } +} // nv namespace + +#endif // NV_TT_QUICKCOMPRESSDXT_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/QuickCompressDXT.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/QuickCompressDXT.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/QuickCompressDXT.cpp @@ -1,585 +1,870 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
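[Editor's note: both the implementation being removed below and its 2.1.1 replacement emit the same BC1/DXT1 block layout. For orientation, again standard format knowledge rather than anything added by this patch; the struct and packing helper are illustrative.]

#include <stdint.h>

// Standard BC1/DXT1 block: two RGB565 endpoints plus sixteen 2-bit indices.
// col0 > col1 selects 4-color mode; col0 <= col1 selects 3-color mode with a
// transparent fourth palette entry (this is what compressDXT1a relies on when
// it stores the endpoints swapped).
struct BC1Block
{
    uint16_t col0;
    uint16_t col1;
    uint32_t indices; // texel i uses bits 2*i and 2*i+1
};

// Truncating RGB565 packing; roundAndExpand() in this file instead picks the
// 565 value whose expanded 888 form is nearest to the input endpoint.
static inline uint16_t packRGB565(unsigned r, unsigned g, unsigned b)
{
    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}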
- -#include - -#include -#include - -#include "QuickCompressDXT.h" -#include "OptimalCompressDXT.h" - - -using namespace nv; -using namespace QuickCompress; - - - -inline static void extractColorBlockRGB(const ColorBlock & rgba, Vector3 block[16]) -{ - for (int i = 0; i < 16; i++) - { - const Color32 c = rgba.color(i); - block[i] = Vector3(c.r, c.g, c.b); - } -} - -inline static uint extractColorBlockRGBA(const ColorBlock & rgba, Vector3 block[16]) -{ - int num = 0; - - for (int i = 0; i < 16; i++) - { - const Color32 c = rgba.color(i); - if (c.a > 127) - { - block[num++] = Vector3(c.r, c.g, c.b); - } - } - - return num; -} - - -// find minimum and maximum colors based on bounding box in color space -inline static void findMinMaxColorsBox(const Vector3 * block, uint num, Vector3 * restrict maxColor, Vector3 * restrict minColor) -{ - *maxColor = Vector3(0, 0, 0); - *minColor = Vector3(255, 255, 255); - - for (uint i = 0; i < num; i++) - { - *maxColor = max(*maxColor, block[i]); - *minColor = min(*minColor, block[i]); - } -} - - -inline static void selectDiagonal(const Vector3 * block, uint num, Vector3 * restrict maxColor, Vector3 * restrict minColor) -{ - Vector3 center = (*maxColor + *minColor) * 0.5; - - Vector2 covariance = Vector2(zero); - for (uint i = 0; i < num; i++) - { - Vector3 t = block[i] - center; - covariance += t.xy() * t.z(); - } - - float x0 = maxColor->x(); - float y0 = maxColor->y(); - float x1 = minColor->x(); - float y1 = minColor->y(); - - if (covariance.x() < 0) { - swap(x0, x1); - } - if (covariance.y() < 0) { - swap(y0, y1); - } - - maxColor->set(x0, y0, maxColor->z()); - minColor->set(x1, y1, minColor->z()); -} - -inline static void insetBBox(Vector3 * restrict maxColor, Vector3 * restrict minColor) -{ - Vector3 inset = (*maxColor - *minColor) / 16.0f - (8.0f / 255.0f) / 16.0f; - *maxColor = clamp(*maxColor - inset, 0.0f, 255.0f); - *minColor = clamp(*minColor + inset, 0.0f, 255.0f); -} - -inline static uint16 roundAndExpand(Vector3 * restrict v) -{ - uint r = uint(clamp(v->x() * (31.0f / 255.0f), 0.0f, 31.0f) + 0.5f); - uint g = uint(clamp(v->y() * (63.0f / 255.0f), 0.0f, 63.0f) + 0.5f); - uint b = uint(clamp(v->z() * (31.0f / 255.0f), 0.0f, 31.0f) + 0.5f); - - uint16 w = (r << 11) | (g << 5) | b; - - r = (r << 3) | (r >> 2); - g = (g << 2) | (g >> 4); - b = (b << 3) | (b >> 2); - *v = Vector3(float(r), float(g), float(b)); - - return w; -} - -inline static float colorDistance(Vector3::Arg c0, Vector3::Arg c1) -{ - return dot(c0-c1, c0-c1); -} - -inline static uint computeIndices4(Vector3 block[16], Vector3::Arg maxColor, Vector3::Arg minColor) -{ - Vector3 palette[4]; - palette[0] = maxColor; - palette[1] = minColor; - palette[2] = lerp(palette[0], palette[1], 1.0f / 3.0f); - palette[3] = lerp(palette[0], palette[1], 2.0f / 3.0f); - - uint indices = 0; - for(int i = 0; i < 16; i++) - { - float d0 = colorDistance(palette[0], block[i]); - float d1 = colorDistance(palette[1], block[i]); - float d2 = colorDistance(palette[2], block[i]); - float d3 = colorDistance(palette[3], block[i]); - - uint b0 = d0 > d3; - uint b1 = d1 > d2; - uint b2 = d0 > d2; - uint b3 = d1 > d3; - uint b4 = d2 > d3; - - uint x0 = b1 & b2; - uint x1 = b0 & b3; - uint x2 = b0 & b4; - - indices |= (x2 | ((x0 | x1) << 1)) << (2 * i); - } - - return indices; -} - -inline static uint computeIndices3(const ColorBlock & rgba, Vector3::Arg maxColor, Vector3::Arg minColor) -{ - Vector3 palette[4]; - palette[0] = minColor; - palette[1] = maxColor; - palette[2] = (palette[0] + palette[1]) * 0.5f; - - 
uint indices = 0; - for(int i = 0; i < 16; i++) - { - Color32 c = rgba.color(i); - Vector3 color = Vector3(c.r, c.g, c.b); - - float d0 = colorDistance(palette[0], color); - float d1 = colorDistance(palette[1], color); - float d2 = colorDistance(palette[2], color); - - uint index; - if (c.a < 128) index = 3; - else if (d0 < d1 && d0 < d2) index = 0; - else if (d1 < d2) index = 1; - else index = 2; - - indices |= index << (2 * i); - } - - return indices; -} - - -static void optimizeEndPoints4(Vector3 block[16], BlockDXT1 * dxtBlock) -{ - float alpha2_sum = 0.0f; - float beta2_sum = 0.0f; - float alphabeta_sum = 0.0f; - Vector3 alphax_sum(zero); - Vector3 betax_sum(zero); - - for( int i = 0; i < 16; ++i ) - { - const uint bits = dxtBlock->indices >> (2 * i); - - float beta = float(bits & 1); - if (bits & 2) beta = (1 + beta) / 3.0f; - float alpha = 1.0f - beta; - - alpha2_sum += alpha * alpha; - beta2_sum += beta * beta; - alphabeta_sum += alpha * beta; - alphax_sum += alpha * block[i]; - betax_sum += beta * block[i]; - } - - float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; - if (equal(denom, 0.0f)) return; - - float factor = 1.0f / denom; - - Vector3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; - Vector3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; - - a = clamp(a, 0, 255); - b = clamp(b, 0, 255); - - uint16 color0 = roundAndExpand(&a); - uint16 color1 = roundAndExpand(&b); - - if (color0 < color1) - { - swap(a, b); - swap(color0, color1); - } - - dxtBlock->col0 = Color16(color0); - dxtBlock->col1 = Color16(color1); - dxtBlock->indices = computeIndices4(block, a, b); -} - -/*static void optimizeEndPoints3(Vector3 block[16], BlockDXT1 * dxtBlock) -{ - float alpha2_sum = 0.0f; - float beta2_sum = 0.0f; - float alphabeta_sum = 0.0f; - Vector3 alphax_sum(zero); - Vector3 betax_sum(zero); - - for( int i = 0; i < 16; ++i ) - { - const uint bits = dxtBlock->indices >> (2 * i); - - float beta = (bits & 1); - if (bits & 2) beta = 0.5f; - float alpha = 1.0f - beta; - - alpha2_sum += alpha * alpha; - beta2_sum += beta * beta; - alphabeta_sum += alpha * beta; - alphax_sum += alpha * block[i]; - betax_sum += beta * block[i]; - } - - float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; - if (equal(denom, 0.0f)) return; - - float factor = 1.0f / denom; - - Vector3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; - Vector3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; - - a = clamp(a, 0, 255); - b = clamp(b, 0, 255); - - uint16 color0 = roundAndExpand(&a); - uint16 color1 = roundAndExpand(&b); - - if (color0 < color1) - { - swap(a, b); - swap(color0, color1); - } - - dxtBlock->col0 = Color16(color1); - dxtBlock->col1 = Color16(color0); - dxtBlock->indices = computeIndices3(block, a, b); -}*/ - -namespace -{ - - static uint computeAlphaIndices(const ColorBlock & rgba, AlphaBlockDXT5 * block) - { - uint8 alphas[8]; - block->evaluatePalette(alphas); - - uint totalError = 0; - - for (uint i = 0; i < 16; i++) - { - uint8 alpha = rgba.color(i).a; - - uint besterror = 256*256; - uint best = 8; - for(uint p = 0; p < 8; p++) - { - int d = alphas[p] - alpha; - uint error = d * d; - - if (error < besterror) - { - besterror = error; - best = p; - } - } - nvDebugCheck(best < 8); - - totalError += besterror; - block->setIndex(i, best); - } - - return totalError; - } - - static void optimizeAlpha8(const ColorBlock & rgba, AlphaBlockDXT5 * block) - { - float alpha2_sum = 0; - float beta2_sum = 0; - 
float alphabeta_sum = 0; - float alphax_sum = 0; - float betax_sum = 0; - - for (int i = 0; i < 16; i++) - { - uint idx = block->index(i); - float alpha; - if (idx < 2) alpha = 1.0f - idx; - else alpha = (8.0f - idx) / 7.0f; - - float beta = 1 - alpha; - - alpha2_sum += alpha * alpha; - beta2_sum += beta * beta; - alphabeta_sum += alpha * beta; - alphax_sum += alpha * rgba.color(i).a; - betax_sum += beta * rgba.color(i).a; - } - - const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - - float a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; - float b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; - - uint alpha0 = uint(min(max(a, 0.0f), 255.0f)); - uint alpha1 = uint(min(max(b, 0.0f), 255.0f)); - - if (alpha0 < alpha1) - { - swap(alpha0, alpha1); - - // Flip indices: - for (int i = 0; i < 16; i++) - { - uint idx = block->index(i); - if (idx < 2) block->setIndex(i, 1 - idx); - else block->setIndex(i, 9 - idx); - } - } - else if (alpha0 == alpha1) - { - for (int i = 0; i < 16; i++) - { - block->setIndex(i, 0); - } - } - - block->alpha0 = alpha0; - block->alpha1 = alpha1; - } - - /* - static void optimizeAlpha6(const ColorBlock & rgba, AlphaBlockDXT5 * block) - { - float alpha2_sum = 0; - float beta2_sum = 0; - float alphabeta_sum = 0; - float alphax_sum = 0; - float betax_sum = 0; - - for (int i = 0; i < 16; i++) - { - uint8 x = rgba.color(i).a; - if (x == 0 || x == 255) continue; - - uint bits = block->index(i); - if (bits == 6 || bits == 7) continue; - - float alpha; - if (bits == 0) alpha = 1.0f; - else if (bits == 1) alpha = 0.0f; - else alpha = (6.0f - block->index(i)) / 5.0f; - - float beta = 1 - alpha; - - alpha2_sum += alpha * alpha; - beta2_sum += beta * beta; - alphabeta_sum += alpha * beta; - alphax_sum += alpha * x; - betax_sum += beta * x; - } - - const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - - float a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; - float b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; - - uint alpha0 = uint(min(max(a, 0.0f), 255.0f)); - uint alpha1 = uint(min(max(b, 0.0f), 255.0f)); - - if (alpha0 > alpha1) - { - swap(alpha0, alpha1); - } - - block->alpha0 = alpha0; - block->alpha1 = alpha1; - } - */ - - static bool sameIndices(const AlphaBlockDXT5 & block0, const AlphaBlockDXT5 & block1) - { - const uint64 mask = ~uint64(0xFFFF); - return (block0.u | mask) == (block1.u | mask); - } - -} // namespace - - - -void QuickCompress::compressDXT1(const ColorBlock & rgba, BlockDXT1 * dxtBlock) -{ - if (rgba.isSingleColor()) - { - OptimalCompress::compressDXT1(rgba.color(0), dxtBlock); - } - else - { - // read block - Vector3 block[16]; - extractColorBlockRGB(rgba, block); - - // find min and max colors - Vector3 maxColor, minColor; - findMinMaxColorsBox(block, 16, &maxColor, &minColor); - - selectDiagonal(block, 16, &maxColor, &minColor); - - insetBBox(&maxColor, &minColor); - - uint16 color0 = roundAndExpand(&maxColor); - uint16 color1 = roundAndExpand(&minColor); - - if (color0 < color1) - { - swap(maxColor, minColor); - swap(color0, color1); - } - - dxtBlock->col0 = Color16(color0); - dxtBlock->col1 = Color16(color1); - dxtBlock->indices = computeIndices4(block, maxColor, minColor); - - optimizeEndPoints4(block, dxtBlock); - } -} - - -void QuickCompress::compressDXT1a(const ColorBlock & rgba, BlockDXT1 * dxtBlock) -{ - bool hasAlpha = false; - - for (uint i = 0; i < 16; i++) - { - if (rgba.color(i).a < 128) { - 
hasAlpha = true; - break; - } - } - - if (!hasAlpha) - { - compressDXT1(rgba, dxtBlock); - } - // @@ Handle single RGB, with varying alpha? We need tables for single color compressor in 3 color mode. - //else if (rgba.isSingleColorNoAlpha()) { ... } - else - { - // read block - Vector3 block[16]; - uint num = extractColorBlockRGBA(rgba, block); - - // find min and max colors - Vector3 maxColor, minColor; - findMinMaxColorsBox(block, num, &maxColor, &minColor); - - selectDiagonal(block, num, &maxColor, &minColor); - - insetBBox(&maxColor, &minColor); - - uint16 color0 = roundAndExpand(&maxColor); - uint16 color1 = roundAndExpand(&minColor); - - if (color0 < color1) - { - swap(maxColor, minColor); - swap(color0, color1); - } - - dxtBlock->col0 = Color16(color1); - dxtBlock->col1 = Color16(color0); - dxtBlock->indices = computeIndices3(rgba, maxColor, minColor); - - // optimizeEndPoints(block, dxtBlock); - } -} - - -void QuickCompress::compressDXT3(const ColorBlock & rgba, BlockDXT3 * dxtBlock) -{ - compressDXT1(rgba, &dxtBlock->color); - OptimalCompress::compressDXT3A(rgba, &dxtBlock->alpha); -} - - -void QuickCompress::compressDXT5A(const ColorBlock & rgba, AlphaBlockDXT5 * dxtBlock, int iterationCount/*=8*/) -{ - uint8 alpha0 = 0; - uint8 alpha1 = 255; - - // Get min/max alpha. - for (uint i = 0; i < 16; i++) - { - uint8 alpha = rgba.color(i).a; - alpha0 = max(alpha0, alpha); - alpha1 = min(alpha1, alpha); - } - - AlphaBlockDXT5 block; - block.alpha0 = alpha0 - (alpha0 - alpha1) / 34; - block.alpha1 = alpha1 + (alpha0 - alpha1) / 34; - uint besterror = computeAlphaIndices(rgba, &block); - - AlphaBlockDXT5 bestblock = block; - - for (int i = 0; i < iterationCount; i++) - { - optimizeAlpha8(rgba, &block); - uint error = computeAlphaIndices(rgba, &block); - - if (error >= besterror) - { - // No improvement, stop. - break; - } - if (sameIndices(block, bestblock)) - { - bestblock = block; - break; - } - - besterror = error; - bestblock = block; - }; - - // Copy best block to result; - *dxtBlock = bestblock; -} - -void QuickCompress::compressDXT5(const ColorBlock & rgba, BlockDXT5 * dxtBlock, int iterationCount/*=8*/) -{ - compressDXT1(rgba, &dxtBlock->color); - compressDXT5A(rgba, &dxtBlock->alpha, iterationCount); -} +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
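[Editor's note: the rewritten QuickCompressDXT.cpp that follows keeps the same overall strategy as the code removed above: extract the 16 texels, take a per-channel bounding box, flip its diagonal along axes with negative covariance (selectDiagonal), shrink it slightly (insetBBox), round the endpoints to RGB565 (roundAndExpand), assign each texel the nearest of the four palette entries (computeIndices4), then refine the endpoints by least squares (optimizeEndPoints4). A simplified, self-contained sketch of the bounding-box step, using plain arrays instead of nv::Vector3.]

#include <algorithm>

// Endpoint selection by color-space bounding box, as in findMinMaxColorsBox(),
// plus the shrink applied by insetBBox() (the real code also subtracts a small
// constant bias and clamps to [0, 255]).
static void boundingBoxEndpoints(const float block[16][3], float minColor[3], float maxColor[3])
{
    for (int c = 0; c < 3; c++) { minColor[c] = 255.0f; maxColor[c] = 0.0f; }

    for (int i = 0; i < 16; i++)
        for (int c = 0; c < 3; c++)
        {
            minColor[c] = std::min(minColor[c], block[i][c]);
            maxColor[c] = std::max(maxColor[c], block[i][c]);
        }

    // Shrink the box by 1/16th of its extent so the interpolated palette
    // sits closer to the bulk of the texels.
    for (int c = 0; c < 3; c++)
    {
        float inset = (maxColor[c] - minColor[c]) / 16.0f;
        maxColor[c] -= inset;
        minColor[c] += inset;
    }
}

[Single-color blocks bypass all of this: compressDXT1() detects them via rgba.isSingleColor() and hands them to OptimalCompress::compressDXT1(), which uses the lookup tables discussed further down.]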
+ +#include "QuickCompressDXT.h" +#include "OptimalCompressDXT.h" + +#include "nvimage/ColorBlock.h" +#include "nvimage/BlockDXT.h" + +#include "nvmath/Color.inl" +#include "nvmath/Vector.inl" +#include "nvmath/Fitting.h" + +#include "nvcore/Utils.h" // swap + +#include // memset +#include // FLT_MAX + +using namespace nv; +using namespace QuickCompress; + + + +inline static void extractColorBlockRGB(const ColorBlock & rgba, Vector3 block[16]) +{ + for (int i = 0; i < 16; i++) + { + const Color32 c = rgba.color(i); + block[i] = Vector3(c.r, c.g, c.b); + } +} + +inline static uint extractColorBlockRGBA(const ColorBlock & rgba, Vector3 block[16]) +{ + int num = 0; + + for (int i = 0; i < 16; i++) + { + const Color32 c = rgba.color(i); + if (c.a > 127) + { + block[num++] = Vector3(c.r, c.g, c.b); + } + } + + return num; +} + + +// find minimum and maximum colors based on bounding box in color space +inline static void findMinMaxColorsBox(const Vector3 * block, uint num, Vector3 * restrict maxColor, Vector3 * restrict minColor) +{ + *maxColor = Vector3(0, 0, 0); + *minColor = Vector3(255, 255, 255); + + for (uint i = 0; i < num; i++) + { + *maxColor = max(*maxColor, block[i]); + *minColor = min(*minColor, block[i]); + } +} + + +inline static void selectDiagonal(const Vector3 * block, uint num, Vector3 * restrict maxColor, Vector3 * restrict minColor) +{ + Vector3 center = (*maxColor + *minColor) * 0.5f; + + Vector2 covariance = Vector2(0.0f); + for (uint i = 0; i < num; i++) + { + Vector3 t = block[i] - center; + covariance += t.xy() * t.z; + } + + float x0 = maxColor->x; + float y0 = maxColor->y; + float x1 = minColor->x; + float y1 = minColor->y; + + if (covariance.x < 0) { + swap(x0, x1); + } + if (covariance.y < 0) { + swap(y0, y1); + } + + maxColor->set(x0, y0, maxColor->z); + minColor->set(x1, y1, minColor->z); +} + +inline static void insetBBox(Vector3 * restrict maxColor, Vector3 * restrict minColor) +{ + Vector3 inset = (*maxColor - *minColor) / 16.0f - (8.0f / 255.0f) / 16.0f; + *maxColor = clamp(*maxColor - inset, 0.0f, 255.0f); + *minColor = clamp(*minColor + inset, 0.0f, 255.0f); +} + +#include "nvmath/ftoi.h" + +// Takes a normalized color in [0, 255] range and returns +inline static uint16 roundAndExpand(Vector3 * restrict v) +{ + uint r = ftoi_floor(clamp(v->x * (31.0f / 255.0f), 0.0f, 31.0f)); + uint g = ftoi_floor(clamp(v->y * (63.0f / 255.0f), 0.0f, 63.0f)); + uint b = ftoi_floor(clamp(v->z * (31.0f / 255.0f), 0.0f, 31.0f)); + + float r0 = float(((r+0) << 3) | ((r+0) >> 2)); + float r1 = float(((r+1) << 3) | ((r+1) >> 2)); + if (fabs(v->x - r1) < fabs(v->x - r0)) r = min(r+1, 31U); + + float g0 = float(((g+0) << 2) | ((g+0) >> 4)); + float g1 = float(((g+1) << 2) | ((g+1) >> 4)); + if (fabs(v->y - g1) < fabs(v->y - g0)) g = min(g+1, 63U); + + float b0 = float(((b+0) << 3) | ((b+0) >> 2)); + float b1 = float(((b+1) << 3) | ((b+1) >> 2)); + if (fabs(v->z - b1) < fabs(v->z - b0)) b = min(b+1, 31U); + + + uint16 w = (r << 11) | (g << 5) | b; + + r = (r << 3) | (r >> 2); + g = (g << 2) | (g >> 4); + b = (b << 3) | (b >> 2); + *v = Vector3(float(r), float(g), float(b)); + + return w; +} + +// Takes a normalized color in [0, 255] range and returns +inline static uint16 roundAndExpand01(Vector3 * restrict v) +{ + uint r = ftoi_floor(clamp(v->x * 31.0f, 0.0f, 31.0f)); + uint g = ftoi_floor(clamp(v->y * 63.0f, 0.0f, 63.0f)); + uint b = ftoi_floor(clamp(v->z * 31.0f, 0.0f, 31.0f)); + + float r0 = float(((r+0) << 3) | ((r+0) >> 2)); + float r1 = float(((r+1) << 3) | ((r+1) >> 2)); + if 
(fabs(v->x - r1) < fabs(v->x - r0)) r = min(r+1, 31U); + + float g0 = float(((g+0) << 2) | ((g+0) >> 4)); + float g1 = float(((g+1) << 2) | ((g+1) >> 4)); + if (fabs(v->y - g1) < fabs(v->y - g0)) g = min(g+1, 63U); + + float b0 = float(((b+0) << 3) | ((b+0) >> 2)); + float b1 = float(((b+1) << 3) | ((b+1) >> 2)); + if (fabs(v->z - b1) < fabs(v->z - b0)) b = min(b+1, 31U); + + + uint16 w = (r << 11) | (g << 5) | b; + + r = (r << 3) | (r >> 2); + g = (g << 2) | (g >> 4); + b = (b << 3) | (b >> 2); + *v = Vector3(float(r) / 255.0f, float(g) / 255.0f, float(b) / 255.0f); + + return w; +} + + + +inline static float colorDistance(Vector3::Arg c0, Vector3::Arg c1) +{ + return dot(c0-c1, c0-c1); +} + +Vector3 round255(const Vector3 & v) { + //return Vector3(ftoi_round(255 * v.x), ftoi_round(255 * v.y), ftoi_round(255 * v.z)) * (1.0f / 255); + //return Vector3(floorf(v.x + 0.5f), floorf(v.y + 0.5f), floorf(v.z + 0.5f)); + return v; +} + + +inline static uint computeIndices4(const Vector3 block[16], Vector3::Arg maxColor, Vector3::Arg minColor) +{ + Vector3 palette[4]; + palette[0] = maxColor; + palette[1] = minColor; + //palette[2] = round255((2 * palette[0] + palette[1]) / 3.0f); + //palette[3] = round255((2 * palette[1] + palette[0]) / 3.0f); + palette[2] = lerp(palette[0], palette[1], 1.0f / 3.0f); + palette[3] = lerp(palette[0], palette[1], 2.0f / 3.0f); + + uint indices = 0; + for(int i = 0; i < 16; i++) + { + float d0 = colorDistance(palette[0], block[i]); + float d1 = colorDistance(palette[1], block[i]); + float d2 = colorDistance(palette[2], block[i]); + float d3 = colorDistance(palette[3], block[i]); + + uint b0 = d0 > d3; + uint b1 = d1 > d2; + uint b2 = d0 > d2; + uint b3 = d1 > d3; + uint b4 = d2 > d3; + + uint x0 = b1 & b2; + uint x1 = b0 & b3; + uint x2 = b0 & b4; + + indices |= (x2 | ((x0 | x1) << 1)) << (2 * i); + } + + return indices; +} + +// maxColor and minColor are expected to be in the same range as the color set. +/* +inline static uint computeIndices4(const ColorSet & set, Vector3::Arg maxColor, Vector3::Arg minColor) +{ + Vector3 palette[4]; + palette[0] = maxColor; + palette[1] = minColor; + palette[2] = lerp(palette[0], palette[1], 1.0f / 3.0f); + palette[3] = lerp(palette[0], palette[1], 2.0f / 3.0f); + + Vector3 mem[(4+2)*2]; + memset(mem, 0, sizeof(mem)); + + Vector3 * row0 = mem; + Vector3 * row1 = mem + (4+2); + + uint indices = 0; + //for(int i = 0; i < 16; i++) + for (uint y = 0; y < 4; y++) { + for (uint x = 0; x < 4; x++) { + int i = y*4+x; + + if (!set.isValidIndex(i)) { + // Skip masked pixels and out of bounds. + continue; + } + + Vector3 color = set.color(i).xyz(); + + // Add error. + color += row0[1+x]; + + float d0 = colorDistance(palette[0], color); + float d1 = colorDistance(palette[1], color); + float d2 = colorDistance(palette[2], color); + float d3 = colorDistance(palette[3], color); + + uint b0 = d0 > d3; + uint b1 = d1 > d2; + uint b2 = d0 > d2; + uint b3 = d1 > d3; + uint b4 = d2 > d3; + + uint x0 = b1 & b2; + uint x1 = b0 & b3; + uint x2 = b0 & b4; + + int index = x2 | ((x0 | x1) << 1); + indices |= index << (2 * i); + + // Compute new error. + Vector3 diff = color - palette[index]; + + // Propagate new error. 
+ //row0[1+x+1] += 7.0f / 16.0f * diff; + //row1[1+x-1] += 3.0f / 16.0f * diff; + //row1[1+x+0] += 5.0f / 16.0f * diff; + //row1[1+x+1] += 1.0f / 16.0f * diff; + } + + swap(row0, row1); + memset(row1, 0, sizeof(Vector3) * (4+2)); + } + + return indices; +}*/ + +inline static float evaluatePaletteError4(const Vector3 block[16], Vector3::Arg maxColor, Vector3::Arg minColor) +{ + Vector3 palette[4]; + palette[0] = maxColor; + palette[1] = minColor; + //palette[2] = round255((2 * palette[0] + palette[1]) / 3.0f); + //palette[3] = round255((2 * palette[1] + palette[0]) / 3.0f); + palette[2] = lerp(palette[0], palette[1], 1.0f / 3.0f); + palette[3] = lerp(palette[0], palette[1], 2.0f / 3.0f); + + float total = 0.0f; + for (int i = 0; i < 16; i++) + { + float d0 = colorDistance(palette[0], block[i]); + float d1 = colorDistance(palette[1], block[i]); + float d2 = colorDistance(palette[2], block[i]); + float d3 = colorDistance(palette[3], block[i]); + + total += min(min(d0, d1), min(d2, d3)); + } + + return total; +} + +inline static float evaluatePaletteError3(const Vector3 block[16], Vector3::Arg maxColor, Vector3::Arg minColor) +{ + Vector3 palette[4]; + palette[0] = minColor; + palette[1] = maxColor; + palette[2] = (palette[0] + palette[1]) * 0.5f; + palette[3] = Vector3(0); + + float total = 0.0f; + for (int i = 0; i < 16; i++) + { + float d0 = colorDistance(palette[0], block[i]); + float d1 = colorDistance(palette[1], block[i]); + float d2 = colorDistance(palette[2], block[i]); + //float d3 = colorDistance(palette[3], block[i]); + + //total += min(min(d0, d1), min(d2, d3)); + total += min(min(d0, d1), d2); + } + + return total; +} + + +// maxColor and minColor are expected to be in the same range as the color set. +/*inline static uint computeIndices3(const ColorSet & set, Vector3::Arg maxColor, Vector3::Arg minColor) +{ + Vector3 palette[4]; + palette[0] = minColor; + palette[1] = maxColor; + palette[2] = (palette[0] + palette[1]) * 0.5f; + + uint indices = 0; + for(int i = 0; i < 16; i++) + { + if (!set.isValidIndex(i)) { + // Skip masked pixels and out of bounds. 
+ indices |= 3 << (2 * i); + continue; + } + + Vector3 color = set.color(i).xyz(); + + float d0 = colorDistance(palette[0], color); + float d1 = colorDistance(palette[1], color); + float d2 = colorDistance(palette[2], color); + + uint index; + if (d0 < d1 && d0 < d2) index = 0; + else if (d1 < d2) index = 1; + else index = 2; + + indices |= index << (2 * i); + } + + return indices; +}*/ + +inline static uint computeIndices3(const Vector3 block[16], Vector3::Arg maxColor, Vector3::Arg minColor) +{ + Vector3 palette[4]; + palette[0] = minColor; + palette[1] = maxColor; + palette[2] = (palette[0] + palette[1]) * 0.5f; + + uint indices = 0; + for(int i = 0; i < 16; i++) + { + float d0 = colorDistance(palette[0], block[i]); + float d1 = colorDistance(palette[1], block[i]); + float d2 = colorDistance(palette[2], block[i]); + + uint index; + if (d0 < d1 && d0 < d2) index = 0; + else if (d1 < d2) index = 1; + else index = 2; + + indices |= index << (2 * i); + } + + return indices; +} + + + + +static void optimizeEndPoints4(Vector3 block[16], BlockDXT1 * dxtBlock) +{ + float alpha2_sum = 0.0f; + float beta2_sum = 0.0f; + float alphabeta_sum = 0.0f; + Vector3 alphax_sum(0.0f); + Vector3 betax_sum(0.0f); + + for( int i = 0; i < 16; ++i ) + { + const uint bits = dxtBlock->indices >> (2 * i); + + float beta = float(bits & 1); + if (bits & 2) beta = (1 + beta) / 3.0f; + float alpha = 1.0f - beta; + + alpha2_sum += alpha * alpha; + beta2_sum += beta * beta; + alphabeta_sum += alpha * beta; + alphax_sum += alpha * block[i]; + betax_sum += beta * block[i]; + } + + float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; + if (equal(denom, 0.0f)) return; + + float factor = 1.0f / denom; + + Vector3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + Vector3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + a = clamp(a, 0, 255); + b = clamp(b, 0, 255); + + uint16 color0 = roundAndExpand(&a); + uint16 color1 = roundAndExpand(&b); + + if (color0 < color1) + { + swap(a, b); + swap(color0, color1); + } + + dxtBlock->col0 = Color16(color0); + dxtBlock->col1 = Color16(color1); + dxtBlock->indices = computeIndices4(block, a, b); +} + +static void optimizeEndPoints3(Vector3 block[16], BlockDXT1 * dxtBlock) +{ + float alpha2_sum = 0.0f; + float beta2_sum = 0.0f; + float alphabeta_sum = 0.0f; + Vector3 alphax_sum(0.0f); + Vector3 betax_sum(0.0f); + + for( int i = 0; i < 16; ++i ) + { + const uint bits = dxtBlock->indices >> (2 * i); + + float beta = float(bits & 1); + if (bits & 2) beta = 0.5f; + float alpha = 1.0f - beta; + + alpha2_sum += alpha * alpha; + beta2_sum += beta * beta; + alphabeta_sum += alpha * beta; + alphax_sum += alpha * block[i]; + betax_sum += beta * block[i]; + } + + float denom = alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum; + if (equal(denom, 0.0f)) return; + + float factor = 1.0f / denom; + + Vector3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + Vector3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + a = clamp(a, 0, 255); + b = clamp(b, 0, 255); + + uint16 color0 = roundAndExpand(&a); + uint16 color1 = roundAndExpand(&b); + + if (color0 < color1) + { + swap(a, b); + swap(color0, color1); + } + + dxtBlock->col0 = Color16(color1); + dxtBlock->col1 = Color16(color0); + dxtBlock->indices = computeIndices3(block, a, b); +} + +namespace +{ + + static uint computeAlphaIndices(const AlphaBlock4x4 & src, AlphaBlockDXT5 * block) + { + uint8 alphas[8]; + block->evaluatePalette(alphas, false); // @@ 
Use target decoder. + + uint totalError = 0; + + for (uint i = 0; i < 16; i++) + { + uint8 alpha = src.alpha[i]; + + uint besterror = 256*256; + uint best = 8; + for(uint p = 0; p < 8; p++) + { + int d = alphas[p] - alpha; + uint error = d * d; + + if (error < besterror) + { + besterror = error; + best = p; + } + } + nvDebugCheck(best < 8); + + totalError += besterror; + block->setIndex(i, best); + } + + return totalError; + } + + static void optimizeAlpha8(const AlphaBlock4x4 & src, AlphaBlockDXT5 * block) + { + float alpha2_sum = 0; + float beta2_sum = 0; + float alphabeta_sum = 0; + float alphax_sum = 0; + float betax_sum = 0; + + for (int i = 0; i < 16; i++) + { + uint idx = block->index(i); + float alpha; + if (idx < 2) alpha = 1.0f - idx; + else alpha = (8.0f - idx) / 7.0f; + + float beta = 1 - alpha; + + alpha2_sum += alpha * alpha; + beta2_sum += beta * beta; + alphabeta_sum += alpha * beta; + alphax_sum += alpha * src.alpha[i]; + betax_sum += beta * src.alpha[i]; + } + + const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + float a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + float b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + uint alpha0 = uint(min(max(a, 0.0f), 255.0f)); + uint alpha1 = uint(min(max(b, 0.0f), 255.0f)); + + if (alpha0 < alpha1) + { + swap(alpha0, alpha1); + + // Flip indices: + for (int i = 0; i < 16; i++) + { + uint idx = block->index(i); + if (idx < 2) block->setIndex(i, 1 - idx); + else block->setIndex(i, 9 - idx); + } + } + else if (alpha0 == alpha1) + { + for (int i = 0; i < 16; i++) + { + block->setIndex(i, 0); + } + } + + block->alpha0 = alpha0; + block->alpha1 = alpha1; + } + + /* + static void optimizeAlpha6(const ColorBlock & rgba, AlphaBlockDXT5 * block) + { + float alpha2_sum = 0; + float beta2_sum = 0; + float alphabeta_sum = 0; + float alphax_sum = 0; + float betax_sum = 0; + + for (int i = 0; i < 16; i++) + { + uint8 x = rgba.color(i).a; + if (x == 0 || x == 255) continue; + + uint bits = block->index(i); + if (bits == 6 || bits == 7) continue; + + float alpha; + if (bits == 0) alpha = 1.0f; + else if (bits == 1) alpha = 0.0f; + else alpha = (6.0f - block->index(i)) / 5.0f; + + float beta = 1 - alpha; + + alpha2_sum += alpha * alpha; + beta2_sum += beta * beta; + alphabeta_sum += alpha * beta; + alphax_sum += alpha * x; + betax_sum += beta * x; + } + + const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + float a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + float b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + uint alpha0 = uint(min(max(a, 0.0f), 255.0f)); + uint alpha1 = uint(min(max(b, 0.0f), 255.0f)); + + if (alpha0 > alpha1) + { + swap(alpha0, alpha1); + } + + block->alpha0 = alpha0; + block->alpha1 = alpha1; + } + */ + + static bool sameIndices(const AlphaBlockDXT5 & block0, const AlphaBlockDXT5 & block1) + { + const uint64 mask = ~uint64(0xFFFF); + return (block0.u | mask) == (block1.u | mask); + } + +} // namespace + + + +void QuickCompress::compressDXT1(const ColorBlock & rgba, BlockDXT1 * dxtBlock) +{ + if (rgba.isSingleColor()) + { + OptimalCompress::compressDXT1(rgba.color(0), dxtBlock); + } + else + { + // read block + Vector3 block[16]; + extractColorBlockRGB(rgba, block); + +#if 1 + // find min and max colors + Vector3 maxColor, minColor; + findMinMaxColorsBox(block, 16, &maxColor, &minColor); + + selectDiagonal(block, 16, &maxColor, &minColor); + + insetBBox(&maxColor, 
&minColor); +#else + float weights[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + Vector3 cluster[4]; + int count = Compute4Means(16, block, weights, Vector3(1, 1, 1), cluster); + + Vector3 maxColor, minColor; + float bestError = FLT_MAX; + + for (int i = 1; i < 4; i++) + { + for (int j = 0; j < i; j++) + { + uint16 color0 = roundAndExpand(&cluster[i]); + uint16 color1 = roundAndExpand(&cluster[j]); + + float error = evaluatePaletteError4(block, cluster[i], cluster[j]); + if (error < bestError) { + bestError = error; + maxColor = cluster[i]; + minColor = cluster[j]; + } + } + } +#endif + + uint16 color0 = roundAndExpand(&maxColor); + uint16 color1 = roundAndExpand(&minColor); + + if (color0 < color1) + { + swap(maxColor, minColor); + swap(color0, color1); + } + + dxtBlock->col0 = Color16(color0); + dxtBlock->col1 = Color16(color1); + dxtBlock->indices = computeIndices4(block, maxColor, minColor); + + optimizeEndPoints4(block, dxtBlock); + } +} + + +void QuickCompress::compressDXT1a(const ColorBlock & rgba, BlockDXT1 * dxtBlock) +{ + bool hasAlpha = false; + + for (uint i = 0; i < 16; i++) + { + if (rgba.color(i).a == 0) { + hasAlpha = true; + break; + } + } + + if (!hasAlpha) + { + compressDXT1(rgba, dxtBlock); + } + // @@ Handle single RGB, with varying alpha? We need tables for single color compressor in 3 color mode. + //else if (rgba.isSingleColorNoAlpha()) { ... } + else + { + // read block + Vector3 block[16]; + uint num = extractColorBlockRGBA(rgba, block); + + // find min and max colors + Vector3 maxColor, minColor; + findMinMaxColorsBox(block, num, &maxColor, &minColor); + + selectDiagonal(block, num, &maxColor, &minColor); + + insetBBox(&maxColor, &minColor); + + uint16 color0 = roundAndExpand(&maxColor); + uint16 color1 = roundAndExpand(&minColor); + + if (color0 < color1) + { + swap(maxColor, minColor); + swap(color0, color1); + } + + dxtBlock->col0 = Color16(color1); + dxtBlock->col1 = Color16(color0); + dxtBlock->indices = computeIndices3(block, maxColor, minColor); + + // optimizeEndPoints(block, dxtBlock); + } +} + + +void QuickCompress::compressDXT3(const ColorBlock & src, BlockDXT3 * dxtBlock) +{ + compressDXT1(src, &dxtBlock->color); + OptimalCompress::compressDXT3A(src, &dxtBlock->alpha); +} + +void QuickCompress::compressDXT5A(const ColorBlock & src, AlphaBlockDXT5 * dst, int iterationCount/*=8*/) +{ + AlphaBlock4x4 tmp; + tmp.init(src, 3); + compressDXT5A(tmp, dst, iterationCount); +} + +void QuickCompress::compressDXT5A(const AlphaBlock4x4 & src, AlphaBlockDXT5 * dst, int iterationCount/*=8*/) +{ + uint8 alpha0 = 0; + uint8 alpha1 = 255; + + // Get min/max alpha. + for (uint i = 0; i < 16; i++) + { + uint8 alpha = src.alpha[i]; + alpha0 = max(alpha0, alpha); + alpha1 = min(alpha1, alpha); + } + + AlphaBlockDXT5 block; + block.alpha0 = alpha0 - (alpha0 - alpha1) / 34; + block.alpha1 = alpha1 + (alpha0 - alpha1) / 34; + uint besterror = computeAlphaIndices(src, &block); + + AlphaBlockDXT5 bestblock = block; + + for (int i = 0; i < iterationCount; i++) + { + optimizeAlpha8(src, &block); + uint error = computeAlphaIndices(src, &block); + + if (error >= besterror) + { + // No improvement, stop. 
+ break; + } + if (sameIndices(block, bestblock)) + { + bestblock = block; + break; + } + + besterror = error; + bestblock = block; + }; + + // Copy best block to result; + *dst = bestblock; +} + +void QuickCompress::compressDXT5(const ColorBlock & rgba, BlockDXT5 * dxtBlock, int iterationCount/*=8*/) +{ + compressDXT1(rgba, &dxtBlock->color); + compressDXT5A(rgba, &dxtBlock->alpha, iterationCount); +} + + + +/*void QuickCompress::outputBlock4(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block) +{ + Vector3 minColor = start * 255.0f; + Vector3 maxColor = end * 255.0f; + uint16 color0 = roundAndExpand(&maxColor); + uint16 color1 = roundAndExpand(&minColor); + + if (color0 < color1) + { + swap(maxColor, minColor); + swap(color0, color1); + } + + block->col0 = Color16(color0); + block->col1 = Color16(color1); + block->indices = computeIndices4(set, maxColor / 255.0f, minColor / 255.0f); + + //optimizeEndPoints4(set, block); +} + +void QuickCompress::outputBlock3(const ColorSet & set, const Vector3 & start, const Vector3 & end, BlockDXT1 * block) +{ + Vector3 minColor = start * 255.0f; + Vector3 maxColor = end * 255.0f; + uint16 color0 = roundAndExpand(&minColor); + uint16 color1 = roundAndExpand(&maxColor); + + if (color0 > color1) + { + swap(maxColor, minColor); + swap(color0, color1); + } + + block->col0 = Color16(color0); + block->col1 = Color16(color1); + block->indices = computeIndices3(set, maxColor / 255.0f, minColor / 255.0f); + + //optimizeEndPoints3(set, block); +} +*/ Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/SingleColorLookup.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/SingleColorLookup.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/SingleColorLookup.h @@ -1,588 +1,9 @@ -/* -typedef unsigned char uint8; +#include "nvcore/nvcore.h" // uint8 -static int Mul8Bit(int a, int b) -{ - int t = a * b + 128; - return (t + (t >> 8)) >> 8; -} - -static inline int Lerp13(int fm, int to) -{ - return (fm * 2 + to) / 3; -} - -static void PrepareOptTable(uint8 * Table, const uint8 * expand, int size) -{ - for (int i = 0; i < 256; i++) - { - float bestErr = 256; - - for (int min = 0; min < size; min++) - { - for (int max = 0; max < size; max++) - { - int mine = expand[min]; - int maxe = expand[max]; - float err = abs(maxe + Mul8Bit(mine-maxe, 0x55) - i); - err += 0.03f * abs(max - min); - - if (err < bestErr) - { - Table[i*2+0] = max; - Table[i*2+1] = min; - bestErr = err; - } - } - } - } -} - - -void initTables() -{ - uint8 Expand5[32]; - uint8 Expand6[64]; - - for(sInt i=0;i<32;i++) - Expand5[i] = (i<<3)|(i>>2); - - for(sInt i=0;i<64;i++) - Expand6[i] = (i<<2)|(i>>4); - - PrepareOptTable(OMatch5, Expand5, 32) - PrepareOptTable(OMatch6, Expand6, 64) -}; -*/ - -#if __CUDACC__ -__constant__ unsigned short -#else -const static uint8 -#endif -OMatch5[256][2] = -{ - {0x00, 0x00}, - {0x00, 0x00}, - {0x00, 0x01}, - {0x00, 0x01}, - {0x01, 0x00}, - {0x01, 0x00}, - {0x01, 0x00}, - {0x01, 0x01}, - {0x01, 0x01}, - {0x01, 0x01}, - {0x01, 0x02}, - {0x00, 0x04}, - {0x02, 0x01}, - {0x02, 0x01}, - {0x02, 0x01}, - {0x02, 0x02}, - {0x02, 0x02}, - {0x02, 0x02}, - {0x02, 0x03}, - {0x01, 0x05}, - {0x03, 0x02}, - {0x03, 0x02}, - {0x04, 0x00}, - {0x03, 0x03}, - {0x03, 0x03}, - {0x03, 0x03}, - {0x03, 0x04}, - {0x03, 0x04}, - {0x03, 0x04}, - {0x03, 0x05}, - {0x04, 0x03}, - {0x04, 0x03}, - {0x05, 0x02}, - {0x04, 0x04}, - {0x04, 0x04}, - {0x04, 0x05}, - {0x04, 0x05}, - {0x05, 0x04}, - {0x05, 0x04}, - {0x05, 
0x04}, - {0x06, 0x03}, - {0x05, 0x05}, - {0x05, 0x05}, - {0x05, 0x06}, - {0x04, 0x08}, - {0x06, 0x05}, - {0x06, 0x05}, - {0x06, 0x05}, - {0x06, 0x06}, - {0x06, 0x06}, - {0x06, 0x06}, - {0x06, 0x07}, - {0x05, 0x09}, - {0x07, 0x06}, - {0x07, 0x06}, - {0x08, 0x04}, - {0x07, 0x07}, - {0x07, 0x07}, - {0x07, 0x07}, - {0x07, 0x08}, - {0x07, 0x08}, - {0x07, 0x08}, - {0x07, 0x09}, - {0x08, 0x07}, - {0x08, 0x07}, - {0x09, 0x06}, - {0x08, 0x08}, - {0x08, 0x08}, - {0x08, 0x09}, - {0x08, 0x09}, - {0x09, 0x08}, - {0x09, 0x08}, - {0x09, 0x08}, - {0x0A, 0x07}, - {0x09, 0x09}, - {0x09, 0x09}, - {0x09, 0x0A}, - {0x08, 0x0C}, - {0x0A, 0x09}, - {0x0A, 0x09}, - {0x0A, 0x09}, - {0x0A, 0x0A}, - {0x0A, 0x0A}, - {0x0A, 0x0A}, - {0x0A, 0x0B}, - {0x09, 0x0D}, - {0x0B, 0x0A}, - {0x0B, 0x0A}, - {0x0C, 0x08}, - {0x0B, 0x0B}, - {0x0B, 0x0B}, - {0x0B, 0x0B}, - {0x0B, 0x0C}, - {0x0B, 0x0C}, - {0x0B, 0x0C}, - {0x0B, 0x0D}, - {0x0C, 0x0B}, - {0x0C, 0x0B}, - {0x0D, 0x0A}, - {0x0C, 0x0C}, - {0x0C, 0x0C}, - {0x0C, 0x0D}, - {0x0C, 0x0D}, - {0x0D, 0x0C}, - {0x0D, 0x0C}, - {0x0D, 0x0C}, - {0x0E, 0x0B}, - {0x0D, 0x0D}, - {0x0D, 0x0D}, - {0x0D, 0x0E}, - {0x0C, 0x10}, - {0x0E, 0x0D}, - {0x0E, 0x0D}, - {0x0E, 0x0D}, - {0x0E, 0x0E}, - {0x0E, 0x0E}, - {0x0E, 0x0E}, - {0x0E, 0x0F}, - {0x0D, 0x11}, - {0x0F, 0x0E}, - {0x0F, 0x0E}, - {0x10, 0x0C}, - {0x0F, 0x0F}, - {0x0F, 0x0F}, - {0x0F, 0x0F}, - {0x0F, 0x10}, - {0x0F, 0x10}, - {0x0F, 0x10}, - {0x0F, 0x11}, - {0x10, 0x0F}, - {0x10, 0x0F}, - {0x11, 0x0E}, - {0x10, 0x10}, - {0x10, 0x10}, - {0x10, 0x11}, - {0x10, 0x11}, - {0x11, 0x10}, - {0x11, 0x10}, - {0x11, 0x10}, - {0x12, 0x0F}, - {0x11, 0x11}, - {0x11, 0x11}, - {0x11, 0x12}, - {0x10, 0x14}, - {0x12, 0x11}, - {0x12, 0x11}, - {0x12, 0x11}, - {0x12, 0x12}, - {0x12, 0x12}, - {0x12, 0x12}, - {0x12, 0x13}, - {0x11, 0x15}, - {0x13, 0x12}, - {0x13, 0x12}, - {0x14, 0x10}, - {0x13, 0x13}, - {0x13, 0x13}, - {0x13, 0x13}, - {0x13, 0x14}, - {0x13, 0x14}, - {0x13, 0x14}, - {0x13, 0x15}, - {0x14, 0x13}, - {0x14, 0x13}, - {0x15, 0x12}, - {0x14, 0x14}, - {0x14, 0x14}, - {0x14, 0x15}, - {0x14, 0x15}, - {0x15, 0x14}, - {0x15, 0x14}, - {0x15, 0x14}, - {0x16, 0x13}, - {0x15, 0x15}, - {0x15, 0x15}, - {0x15, 0x16}, - {0x14, 0x18}, - {0x16, 0x15}, - {0x16, 0x15}, - {0x16, 0x15}, - {0x16, 0x16}, - {0x16, 0x16}, - {0x16, 0x16}, - {0x16, 0x17}, - {0x15, 0x19}, - {0x17, 0x16}, - {0x17, 0x16}, - {0x18, 0x14}, - {0x17, 0x17}, - {0x17, 0x17}, - {0x17, 0x17}, - {0x17, 0x18}, - {0x17, 0x18}, - {0x17, 0x18}, - {0x17, 0x19}, - {0x18, 0x17}, - {0x18, 0x17}, - {0x19, 0x16}, - {0x18, 0x18}, - {0x18, 0x18}, - {0x18, 0x19}, - {0x18, 0x19}, - {0x19, 0x18}, - {0x19, 0x18}, - {0x19, 0x18}, - {0x1A, 0x17}, - {0x19, 0x19}, - {0x19, 0x19}, - {0x19, 0x1A}, - {0x18, 0x1C}, - {0x1A, 0x19}, - {0x1A, 0x19}, - {0x1A, 0x19}, - {0x1A, 0x1A}, - {0x1A, 0x1A}, - {0x1A, 0x1A}, - {0x1A, 0x1B}, - {0x19, 0x1D}, - {0x1B, 0x1A}, - {0x1B, 0x1A}, - {0x1C, 0x18}, - {0x1B, 0x1B}, - {0x1B, 0x1B}, - {0x1B, 0x1B}, - {0x1B, 0x1C}, - {0x1B, 0x1C}, - {0x1B, 0x1C}, - {0x1B, 0x1D}, - {0x1C, 0x1B}, - {0x1C, 0x1B}, - {0x1D, 0x1A}, - {0x1C, 0x1C}, - {0x1C, 0x1C}, - {0x1C, 0x1D}, - {0x1C, 0x1D}, - {0x1D, 0x1C}, - {0x1D, 0x1C}, - {0x1D, 0x1C}, - {0x1E, 0x1B}, - {0x1D, 0x1D}, - {0x1D, 0x1D}, - {0x1D, 0x1E}, - {0x1D, 0x1E}, - {0x1E, 0x1D}, - {0x1E, 0x1D}, - {0x1E, 0x1D}, - {0x1E, 0x1E}, - {0x1E, 0x1E}, - {0x1E, 0x1E}, - {0x1E, 0x1F}, - {0x1E, 0x1F}, - {0x1F, 0x1E}, - {0x1F, 0x1E}, - {0x1F, 0x1E}, - {0x1F, 0x1F}, - {0x1F, 0x1F}, -}; - -#if __CUDACC__ -__constant__ unsigned short -#else -const static uint8 -#endif 
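[Editor's note: the hard-coded OMatch5/OMatch6 tables being removed here, and rebuilt at startup by the new SingleColorLookup.cpp further down, map each 8-bit channel value to the pair of 5- or 6-bit endpoints whose 2/3 : 1/3 interpolation reproduces it best. A sketch of how a single-color BC1 encoder consumes them, along the lines of what the OptimalCompress single-color path does; the function below is illustrative and not part of the patch.]

#include "SingleColorLookup.h" // OMatch5 / OMatch6, assumed already initialized
                                // via NV_AT_STARTUP(initSingleColorLookup())

static void encodeSingleColorBC1(unsigned char r, unsigned char g, unsigned char b,
                                 unsigned short * col0, unsigned short * col1,
                                 unsigned int * indices)
{
    *col0 = (unsigned short)((OMatch5[r][0] << 11) | (OMatch6[g][0] << 5) | OMatch5[b][0]);
    *col1 = (unsigned short)((OMatch5[r][1] << 11) | (OMatch6[g][1] << 5) | OMatch5[b][1]);
    *indices = 0xAAAAAAAA; // every texel uses palette entry 2 = (2*col0 + col1) / 3
}

// Note: a complete encoder must still ensure col0 > col1, otherwise the block
// decodes in 3-color mode and entry 2 becomes (col0 + col1) / 2.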
-OMatch6[256][2] = -{ - {0x00, 0x00}, - {0x00, 0x01}, - {0x01, 0x00}, - {0x01, 0x01}, - {0x01, 0x01}, - {0x01, 0x02}, - {0x02, 0x01}, - {0x02, 0x02}, - {0x02, 0x02}, - {0x02, 0x03}, - {0x03, 0x02}, - {0x03, 0x03}, - {0x03, 0x03}, - {0x03, 0x04}, - {0x04, 0x03}, - {0x04, 0x04}, - {0x04, 0x04}, - {0x04, 0x05}, - {0x05, 0x04}, - {0x05, 0x05}, - {0x05, 0x05}, - {0x05, 0x06}, - {0x06, 0x05}, - {0x00, 0x11}, - {0x06, 0x06}, - {0x06, 0x07}, - {0x07, 0x06}, - {0x02, 0x10}, - {0x07, 0x07}, - {0x07, 0x08}, - {0x08, 0x07}, - {0x03, 0x11}, - {0x08, 0x08}, - {0x08, 0x09}, - {0x09, 0x08}, - {0x05, 0x10}, - {0x09, 0x09}, - {0x09, 0x0A}, - {0x0A, 0x09}, - {0x06, 0x11}, - {0x0A, 0x0A}, - {0x0A, 0x0B}, - {0x0B, 0x0A}, - {0x08, 0x10}, - {0x0B, 0x0B}, - {0x0B, 0x0C}, - {0x0C, 0x0B}, - {0x09, 0x11}, - {0x0C, 0x0C}, - {0x0C, 0x0D}, - {0x0D, 0x0C}, - {0x0B, 0x10}, - {0x0D, 0x0D}, - {0x0D, 0x0E}, - {0x0E, 0x0D}, - {0x0C, 0x11}, - {0x0E, 0x0E}, - {0x0E, 0x0F}, - {0x0F, 0x0E}, - {0x0E, 0x10}, - {0x0F, 0x0F}, - {0x0F, 0x10}, - {0x10, 0x0E}, - {0x10, 0x0F}, - {0x11, 0x0E}, - {0x10, 0x10}, - {0x10, 0x11}, - {0x11, 0x10}, - {0x12, 0x0F}, - {0x11, 0x11}, - {0x11, 0x12}, - {0x12, 0x11}, - {0x14, 0x0E}, - {0x12, 0x12}, - {0x12, 0x13}, - {0x13, 0x12}, - {0x15, 0x0F}, - {0x13, 0x13}, - {0x13, 0x14}, - {0x14, 0x13}, - {0x17, 0x0E}, - {0x14, 0x14}, - {0x14, 0x15}, - {0x15, 0x14}, - {0x18, 0x0F}, - {0x15, 0x15}, - {0x15, 0x16}, - {0x16, 0x15}, - {0x1A, 0x0E}, - {0x16, 0x16}, - {0x16, 0x17}, - {0x17, 0x16}, - {0x1B, 0x0F}, - {0x17, 0x17}, - {0x17, 0x18}, - {0x18, 0x17}, - {0x13, 0x21}, - {0x18, 0x18}, - {0x18, 0x19}, - {0x19, 0x18}, - {0x15, 0x20}, - {0x19, 0x19}, - {0x19, 0x1A}, - {0x1A, 0x19}, - {0x16, 0x21}, - {0x1A, 0x1A}, - {0x1A, 0x1B}, - {0x1B, 0x1A}, - {0x18, 0x20}, - {0x1B, 0x1B}, - {0x1B, 0x1C}, - {0x1C, 0x1B}, - {0x19, 0x21}, - {0x1C, 0x1C}, - {0x1C, 0x1D}, - {0x1D, 0x1C}, - {0x1B, 0x20}, - {0x1D, 0x1D}, - {0x1D, 0x1E}, - {0x1E, 0x1D}, - {0x1C, 0x21}, - {0x1E, 0x1E}, - {0x1E, 0x1F}, - {0x1F, 0x1E}, - {0x1E, 0x20}, - {0x1F, 0x1F}, - {0x1F, 0x20}, - {0x20, 0x1E}, - {0x20, 0x1F}, - {0x21, 0x1E}, - {0x20, 0x20}, - {0x20, 0x21}, - {0x21, 0x20}, - {0x22, 0x1F}, - {0x21, 0x21}, - {0x21, 0x22}, - {0x22, 0x21}, - {0x24, 0x1E}, - {0x22, 0x22}, - {0x22, 0x23}, - {0x23, 0x22}, - {0x25, 0x1F}, - {0x23, 0x23}, - {0x23, 0x24}, - {0x24, 0x23}, - {0x27, 0x1E}, - {0x24, 0x24}, - {0x24, 0x25}, - {0x25, 0x24}, - {0x28, 0x1F}, - {0x25, 0x25}, - {0x25, 0x26}, - {0x26, 0x25}, - {0x2A, 0x1E}, - {0x26, 0x26}, - {0x26, 0x27}, - {0x27, 0x26}, - {0x2B, 0x1F}, - {0x27, 0x27}, - {0x27, 0x28}, - {0x28, 0x27}, - {0x23, 0x31}, - {0x28, 0x28}, - {0x28, 0x29}, - {0x29, 0x28}, - {0x25, 0x30}, - {0x29, 0x29}, - {0x29, 0x2A}, - {0x2A, 0x29}, - {0x26, 0x31}, - {0x2A, 0x2A}, - {0x2A, 0x2B}, - {0x2B, 0x2A}, - {0x28, 0x30}, - {0x2B, 0x2B}, - {0x2B, 0x2C}, - {0x2C, 0x2B}, - {0x29, 0x31}, - {0x2C, 0x2C}, - {0x2C, 0x2D}, - {0x2D, 0x2C}, - {0x2B, 0x30}, - {0x2D, 0x2D}, - {0x2D, 0x2E}, - {0x2E, 0x2D}, - {0x2C, 0x31}, - {0x2E, 0x2E}, - {0x2E, 0x2F}, - {0x2F, 0x2E}, - {0x2E, 0x30}, - {0x2F, 0x2F}, - {0x2F, 0x30}, - {0x30, 0x2E}, - {0x30, 0x2F}, - {0x31, 0x2E}, - {0x30, 0x30}, - {0x30, 0x31}, - {0x31, 0x30}, - {0x32, 0x2F}, - {0x31, 0x31}, - {0x31, 0x32}, - {0x32, 0x31}, - {0x34, 0x2E}, - {0x32, 0x32}, - {0x32, 0x33}, - {0x33, 0x32}, - {0x35, 0x2F}, - {0x33, 0x33}, - {0x33, 0x34}, - {0x34, 0x33}, - {0x37, 0x2E}, - {0x34, 0x34}, - {0x34, 0x35}, - {0x35, 0x34}, - {0x38, 0x2F}, - {0x35, 0x35}, - {0x35, 0x36}, - {0x36, 0x35}, - {0x3A, 0x2E}, - {0x36, 0x36}, - {0x36, 
0x37}, - {0x37, 0x36}, - {0x3B, 0x2F}, - {0x37, 0x37}, - {0x37, 0x38}, - {0x38, 0x37}, - {0x3D, 0x2E}, - {0x38, 0x38}, - {0x38, 0x39}, - {0x39, 0x38}, - {0x3E, 0x2F}, - {0x39, 0x39}, - {0x39, 0x3A}, - {0x3A, 0x39}, - {0x3A, 0x3A}, - {0x3A, 0x3A}, - {0x3A, 0x3B}, - {0x3B, 0x3A}, - {0x3B, 0x3B}, - {0x3B, 0x3B}, - {0x3B, 0x3C}, - {0x3C, 0x3B}, - {0x3C, 0x3C}, - {0x3C, 0x3C}, - {0x3C, 0x3D}, - {0x3D, 0x3C}, - {0x3D, 0x3D}, - {0x3D, 0x3D}, - {0x3D, 0x3E}, - {0x3E, 0x3D}, - {0x3E, 0x3E}, - {0x3E, 0x3E}, - {0x3E, 0x3F}, - {0x3F, 0x3E}, - {0x3F, 0x3F}, - {0x3F, 0x3F}, -}; +extern uint8 OMatch5[256][2]; +extern uint8 OMatch6[256][2]; +extern uint8 OMatchAlpha5[256][2]; +extern uint8 OMatchAlpha6[256][2]; +void initSingleColorLookup(); \ No newline at end of file Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/SingleColorLookup.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/SingleColorLookup.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/SingleColorLookup.cpp @@ -0,0 +1,90 @@ + +#include "SingleColorLookup.h" + +#include "nvcore/Debug.h" + +#include // abs + +// Globals +uint8 OMatch5[256][2]; +uint8 OMatch6[256][2]; +uint8 OMatchAlpha5[256][2]; +uint8 OMatchAlpha6[256][2]; + + + +static int Mul8Bit(int a, int b) +{ + int t = a * b + 128; + return (t + (t >> 8)) >> 8; +} + +static inline int Lerp13(int a, int b) +{ +#ifdef DXT_USE_ROUNDING_BIAS + // with rounding bias + return a + Mul8Bit(b-a, 0x55); +#else + // without rounding bias + // replace "/ 3" by "* 0xaaab) >> 17" if your compiler sucks or you really need every ounce of speed. + return (a * 2 + b) / 3; +#endif +} + +static void PrepareOptTable(uint8 * table, const uint8 * expand, int size, bool alpha_mode) +{ + for (int i = 0; i < 256; i++) + { + int bestErr = 256 * 100; + + for (int min = 0; min < size; min++) + { + for (int max = 0; max < size; max++) + { + int mine = expand[min]; + int maxe = expand[max]; + + int err; + if (alpha_mode) err = abs((maxe + mine)/2 - i); + else err = abs(Lerp13(maxe, mine) - i); + err *= 100; + + // DX10 spec says that interpolation must be within 3% of "correct" result, + // add this as error term. (normally we'd expect a random distribution of + // +-1.5% error, but nowhere in the spec does it say that the error has to be + // unbiased - better safe than sorry). 
+ err += abs(max - min) * 3; + + if (err < bestErr) + { + table[i*2+0] = max; + table[i*2+1] = min; + bestErr = err; + } + } + } + } +} + + +NV_AT_STARTUP(initSingleColorLookup()); + +void initSingleColorLookup() +{ + uint8 expand5[32]; + uint8 expand6[64]; + + for (int i = 0; i < 32; i++) { + expand5[i] = (i<<3) | (i>>2); + } + + for (int i = 0; i < 64; i++) { + expand6[i] = (i<<2) | (i>>4); + } + + PrepareOptTable(&OMatch5[0][0], expand5, 32, false); + PrepareOptTable(&OMatch6[0][0], expand6, 64, false); + PrepareOptTable(&OMatchAlpha5[0][0], expand5, 32, true); + PrepareOptTable(&OMatchAlpha6[0][0], expand6, 64, true); +} + Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/Surface.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/Surface.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/Surface.h @@ -0,0 +1,90 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#ifndef NVTT_TEXIMAGE_H +#define NVTT_TEXIMAGE_H + +#include "nvtt.h" + +#include "nvcore/RefCounted.h" +#include "nvcore/Ptr.h" + +#include "nvimage/Image.h" +#include "nvimage/FloatImage.h" + +namespace nvtt +{ + + struct Surface::Private : public nv::RefCounted + { + void operator=(const Private &); + public: + Private() + { + nvDebugCheck( refCount() == 0 ); + + type = TextureType_2D; + wrapMode = WrapMode_Mirror; + alphaMode = AlphaMode_None; + isNormalMap = false; + + image = NULL; + } + Private(const Private & p) : RefCounted() // Copy ctor. inits refcount to 0. 
+ { + nvDebugCheck( refCount() == 0 ); + + type = p.type; + wrapMode = p.wrapMode; + alphaMode = p.alphaMode; + isNormalMap = p.isNormalMap; + + image = p.image->clone(); + } + ~Private() + { + delete image; + } + + TextureType type; + WrapMode wrapMode; + AlphaMode alphaMode; + bool isNormalMap; + + nv::FloatImage * image; + }; + +} // nvtt namespace + +namespace nv { + bool canMakeNextMipmap(uint w, uint h, uint d, uint min_size); + uint countMipmaps(uint w); + uint countMipmaps(uint w, uint h, uint d); + uint countMipmapsWithMinSize(uint w, uint h, uint d, uint min_size); + uint computeImageSize(uint w, uint h, uint d, uint bitCount, uint alignmentInBytes, nvtt::Format format); + void getTargetExtent(int * w, int * h, int * d, int maxExtent, nvtt::RoundMode roundMode, nvtt::TextureType textureType); +} + + +#endif // NVTT_TEXIMAGE_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/Surface.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/Surface.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/Surface.cpp @@ -0,0 +1,3255 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include "Surface.h" + +#include "nvmath/Vector.inl" +#include "nvmath/Matrix.inl" +#include "nvmath/Color.h" +#include "nvmath/Half.h" +#include "nvmath/ftoi.h" + +#include "nvimage/Filter.h" +#include "nvimage/ImageIO.h" +#include "nvimage/NormalMap.h" +#include "nvimage/BlockDXT.h" +#include "nvimage/ColorBlock.h" +#include "nvimage/PixelFormat.h" +#include "nvimage/ErrorMetric.h" +#include "nvimage/DirectDrawSurface.h" + +#include <float.h> +#include <string.h> // memset, memcpy + +#if NV_CC_GNUC +#include <math.h> // exp2f and log2f +#endif + +using namespace nv; +using namespace nvtt; + +namespace +{ + // 1 -> 1, 2 -> 2, 3 -> 2, 4 -> 4, 5 -> 4, ... 
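// A quick check of the identity used just below (illustrative only; it assumes
// nextPowerOfTwo(x) returns the smallest power of two that is >= x):
//   previousPowerOfTwo(4) = nextPowerOfTwo(5) / 2 = 8 / 2 = 4
//   previousPowerOfTwo(5) = nextPowerOfTwo(6) / 2 = 8 / 2 = 4
//   previousPowerOfTwo(7) = nextPowerOfTwo(8) / 2 = 8 / 2 = 4
// i.e. the helper returns the largest power of two that does not exceed v,
// which is exactly the "1 -> 1, 2 -> 2, 3 -> 2, ..." mapping noted above.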
+ static inline uint previousPowerOfTwo(uint v) + { + return nextPowerOfTwo(v + 1) / 2; + } + + static inline uint nearestPowerOfTwo(uint v) + { + const uint np2 = nextPowerOfTwo(v); + const uint pp2 = previousPowerOfTwo(v); + + if (np2 - v <= v - pp2) + { + return np2; + } + else + { + return pp2; + } + } + + static inline uint nextMultipleOfFour(uint v) + { + return (v + 3) & ~3; + } + static inline uint previousMultipleOfFour(uint v) + { + return v & ~3; + } + + static inline uint nearestMultipleOfFour(uint v) + { + const uint nm4 = nextMultipleOfFour(v); + const uint pm4 = previousMultipleOfFour(v); + + if (nm4 - v <= v - pm4) + { + return nm4; + } + else + { + return pm4; + } + } + + + static int blockSize(Format format) + { + if (format == Format_DXT1 || format == Format_DXT1a || format == Format_DXT1n) { + return 8; + } + else if (format == Format_DXT3) { + return 16; + } + else if (format == Format_DXT5 || format == Format_DXT5n || format == Format_BC3_RGBM) { + return 16; + } + else if (format == Format_BC4) { + return 8; + } + else if (format == Format_BC5 /*|| format == Format_BC5_Luma*/) { + return 16; + } + else if (format == Format_CTX1) { + return 8; + } + else if (format == Format_BC6) { + return 16; + } + else if (format == Format_BC7) { + return 16; + } + return 0; + } + + /*static int translateMask(int input) { + if (input > 0) return 1 << input; + return ~input; + }*/ +} + +bool nv::canMakeNextMipmap(uint w, uint h, uint d, uint min_size) +{ + if (min_size==1u) { + if(w==1u && h==1u && d==1u) { + return false; + } + } + else if (((w <= min_size || h <= min_size) && d == 1u)) { + return false; + } + + return true; +} + +uint nv::countMipmaps(uint w) +{ + uint mipmap = 0; + + while (w != 1) { + w = max(1U, w / 2); + mipmap++; + } + + return mipmap + 1; +} + +uint nv::countMipmaps(uint w, uint h, uint d) +{ + uint mipmap = 0; + + while (w != 1 || h != 1 || d != 1) { + w = max(1U, w / 2); + h = max(1U, h / 2); + d = max(1U, d / 2); + mipmap++; + } + + return mipmap + 1; +} + +uint nv::countMipmapsWithMinSize(uint w, uint h, uint d, uint min_size) +{ + uint mipmap = 0; + + while (canMakeNextMipmap(w, h, d, min_size)) { + w = max(1U, w / 2); + h = max(1U, h / 2); + d = max(1U, d / 2); + mipmap++; + } + + return mipmap + 1; +} + + +uint nv::computeImageSize(uint w, uint h, uint d, uint bitCount, uint pitchAlignmentInBytes, Format format) +{ + if (format == Format_RGBA) { + return d * h * computeBytePitch(w, bitCount, pitchAlignmentInBytes); + } + else { + return ((w + 3) / 4) * ((h + 3) / 4) * blockSize(format) * d; + } +} + +void nv::getTargetExtent(int * width, int * height, int * depth, int maxExtent, RoundMode roundMode, TextureType textureType) { + nvDebugCheck(width != NULL && *width > 0); + nvDebugCheck(height != NULL && *height > 0); + nvDebugCheck(depth != NULL && *depth > 0); + + int w = *width; + int h = *height; + int d = *depth; + + if (roundMode != RoundMode_None && maxExtent > 0) + { + // rounded max extent should never be higher than original max extent. + maxExtent = previousPowerOfTwo(maxExtent); + } + + // Scale extents without changing aspect ratio. + int m = max(max(w, h), d); + if (maxExtent > 0 && m > maxExtent) + { + w = max((w * maxExtent) / m, 1); + h = max((h * maxExtent) / m, 1); + d = max((d * maxExtent) / m, 1); + } + + if (textureType == TextureType_2D) + { + d = 1; + } + else if (textureType == TextureType_Cube) + { + w = h = (w + h) / 2; + d = 1; + } + + // Round to power of two. 
+ if (roundMode == RoundMode_ToNextPowerOfTwo) + { + w = nextPowerOfTwo(w); + h = nextPowerOfTwo(h); + d = nextPowerOfTwo(d); + } + else if (roundMode == RoundMode_ToNearestPowerOfTwo) + { + w = nearestPowerOfTwo(w); + h = nearestPowerOfTwo(h); + d = nearestPowerOfTwo(d); + } + else if (roundMode == RoundMode_ToPreviousPowerOfTwo) + { + w = previousPowerOfTwo(w); + h = previousPowerOfTwo(h); + d = previousPowerOfTwo(d); + } + else if (roundMode == RoundMode_ToNextMultipleOfFour) + { + w = nextMultipleOfFour(w); + h = nextMultipleOfFour(h); + d = nextMultipleOfFour(d); + } + else if (roundMode == RoundMode_ToNextMultipleOfFour) + { + w = nearestMultipleOfFour(w); + h = nearestMultipleOfFour(h); + d = nearestMultipleOfFour(d); + } + else if (roundMode == RoundMode_ToPreviousMultipleOfFour) + { + w = previousMultipleOfFour(w); + h = previousMultipleOfFour(h); + d = previousMultipleOfFour(d); + } + + *width = w; + *height = h; + *depth = d; +} + + + +Surface::Surface() : m(new Surface::Private()) +{ + m->addRef(); +} + +Surface::Surface(const Surface & tex) : m(tex.m) +{ + if (m != NULL) m->addRef(); +} + +Surface::~Surface() +{ + if (m != NULL) m->release(); + m = NULL; +} + +void Surface::operator=(const Surface & tex) +{ + if (tex.m != NULL) tex.m->addRef(); + if (m != NULL) m->release(); + m = tex.m; +} + +void Surface::detach() +{ + if (m->refCount() > 1) + { + m->release(); + m = new Surface::Private(*m); + m->addRef(); + nvDebugCheck(m->refCount() == 1); + } +} + +void Surface::setWrapMode(WrapMode wrapMode) +{ + if (m->wrapMode != wrapMode) + { + detach(); + m->wrapMode = wrapMode; + } +} + +void Surface::setAlphaMode(AlphaMode alphaMode) +{ + if (m->alphaMode != alphaMode) + { + detach(); + m->alphaMode = alphaMode; + } +} + +void Surface::setNormalMap(bool isNormalMap) +{ + if (m->isNormalMap != isNormalMap) + { + detach(); + m->isNormalMap = isNormalMap; + } +} + +bool Surface::isNull() const +{ + return m->image == NULL; +} + +int Surface::width() const +{ + if (m->image != NULL) return m->image->width(); + return 0; +} + +int Surface::height() const +{ + if (m->image != NULL) return m->image->height(); + return 0; +} + +int Surface::depth() const +{ + if (m->image != NULL) return m->image->depth(); + return 0; +} + +WrapMode Surface::wrapMode() const +{ + return m->wrapMode; +} + +AlphaMode Surface::alphaMode() const +{ + return m->alphaMode; +} + +bool Surface::isNormalMap() const +{ + return m->isNormalMap; +} + +TextureType Surface::type() const +{ + return m->type; +} + +int Surface::countMipmaps() const +{ + if (m->image == NULL) return 0; + return ::countMipmaps(m->image->width(), m->image->height(), 1); +} + +int Surface::countMipmaps(int min_size) const +{ + if (m->image == NULL) return 0; + return ::countMipmapsWithMinSize(m->image->width(), m->image->height(), 1, min_size); +} + +float Surface::alphaTestCoverage(float alphaRef/*= 0.5*/, int alpha_channel/*=3*/) const +{ + if (m->image == NULL) return 0.0f; + + alphaRef = nv::clamp(alphaRef, 1.0f/256, 255.0f/256); + + return m->image->alphaTestCoverage(alphaRef, alpha_channel); +} + +float Surface::average(int channel, int alpha_channel/*= -1*/, float gamma /*= 2.2f*/) const +{ + if (m->image == NULL) return 0.0f; + + const uint count = m->image->width() * m->image->height(); + + float sum = 0.0f; + const float * c = m->image->channel(channel); + + float denom; + + if (alpha_channel == -1) { + for (uint i = 0; i < count; i++) { + sum += powf(c[i], gamma); + } + + denom = float(count); + } + else { + float alpha_sum = 
0.0f; + const float * a = m->image->channel(alpha_channel); + + for (uint i = 0; i < count; i++) { + sum += powf(c[i], gamma) * a[i]; + alpha_sum += a[i]; + } + + denom = alpha_sum; + } + + // Avoid division by zero. + if (denom == 0.0f) return 0.0f; + + return powf(sum / denom, 1.0f/gamma); +} + +const float * Surface::data() const +{ + return m->image->channel(0); +} + +const float * Surface::channel(int i) const +{ + if (i < 0 || i > 3) return NULL; + return m->image->channel(i); +} + + +void Surface::histogram(int channel, float rangeMin, float rangeMax, int binCount, int * binPtr) const +{ + // We assume it's clear in case we want to accumulate multiple histograms. + //memset(bins, 0, sizeof(int)*count); + + if (m->image == NULL) return; + + const float * c = m->image->channel(channel); + + float scale = float(binCount) / rangeMax; + float bias = - scale * rangeMin; + + const uint count = m->image->pixelCount(); + for (uint i = 0; i < count; i++) { + float f = c[i] * scale + bias; + int idx = ftoi_floor(f); + if (idx < 0) idx = 0; + if (idx > binCount-1) idx = binCount-1; + binPtr[idx]++; + } +} + +void Surface::range(int channel, float * rangeMin, float * rangeMax, int alpha_channel/*= -1*/, float alpha_ref/*= 0.f*/) const +{ + Vector2 range(FLT_MAX, -FLT_MAX); + + FloatImage * img = m->image; + + if (alpha_channel == -1) { // no alpha channel; just like the original range function + + if (m->image != NULL) { + float * c = img->channel(channel); + + const uint count = img->pixelCount(); + for (uint p = 0; p < count; p++) { + float f = c[p]; + if (f < range.x) range.x = f; + if (f > range.y) range.y = f; + } + } + } + else { // use alpha test to ignore some pixels + //note, it's quite possible to get FLT_MAX,-FLT_MAX back if all pixels fail the test + + if (m->image != NULL) + { + const float * c = img->channel(channel); + const float * a = img->channel(alpha_channel); + + const uint count = img->pixelCount(); + for (uint p = 0; p < count; p++) { + if(a[p]>alpha_ref) { + float f = c[p]; + if (f < range.x) range.x = f; + if (f > range.y) range.y = f; + } + } + } + } + + *rangeMin = range.x; + *rangeMax = range.y; +} + +bool Surface::load(const char * fileName, bool * hasAlpha/*= NULL*/) +{ + AutoPtr img(ImageIO::loadFloat(fileName)); + if (img == NULL) { + // Try loading as DDS. + if (nv::strEqual(nv::Path::extension(fileName), ".dds")) { + nv::DirectDrawSurface dds; + if (dds.load(fileName)) { + if (dds.header.isBlockFormat()) { + int w = dds.surfaceWidth(0); + int h = dds.surfaceHeight(0); + uint size = dds.surfaceSize(0); + + void * data = malloc(size); + dds.readSurface(0, 0, data, size); + + // @@ Handle all formats! @@ Get nvtt format from dds.surfaceFormat() ? + + if (dds.header.hasDX10Header()) { + if (dds.header.header10.dxgiFormat == DXGI_FORMAT_BC6H_UF16) { + this->setImage2D(nvtt::Format_BC6, nvtt::Decoder_D3D10, w, h, data); + } + else { + // @@ + nvCheck(false); + } + } + else { + uint fourcc = dds.header.pf.fourcc; + if (fourcc == FOURCC_DXT1) { + this->setImage2D(nvtt::Format_BC1, nvtt::Decoder_D3D10, w, h, data); + } + else if (fourcc == FOURCC_DXT5) { + this->setImage2D(nvtt::Format_BC3, nvtt::Decoder_D3D10, w, h, data); + } + else { + // @@ + nvCheck(false); + } + } + + free(data); + } + else { + Image img; + dds.mipmap(&img, /*face=*/0, /*mipmap=*/0); + + int w = img.width(); + int h = img.height(); + int d = img.depth(); + + // @@ Add support for all pixel formats. 
+ + this->setImage(nvtt::InputFormat_BGRA_8UB, w, h, d, img.pixels()); + } + + return true; + } + } + + return false; + } + + detach(); + + if (hasAlpha != NULL) { + *hasAlpha = (img->componentCount() == 4); + } + + // @@ Have loadFloat allocate the image with the desired number of channels. + img->resizeChannelCount(4); + + delete m->image; + m->image = img.release(); + + return true; +} + +bool Surface::save(const char * fileName, bool hasAlpha/*=0*/, bool hdr/*=0*/) const +{ + if (m->image == NULL) { + return false; + } + + if (hdr) { + return ImageIO::saveFloat(fileName, m->image, 0, 4); + } + else { + AutoPtr image(m->image->createImage(0, 4)); + nvCheck(image != NULL); + + if (hasAlpha) { + image->setFormat(Image::Format_ARGB); + } + + return ImageIO::save(fileName, image.ptr()); + } +} + + +bool Surface::setImage(int w, int h, int d) +{ + detach(); + + if (m->image == NULL) { + m->image = new FloatImage(); + } + m->image->allocate(4, w, h, d); + m->type = (d == 1) ? TextureType_2D : TextureType_3D; + + m->image->clear(); + + return true; +} + + +#if 0 //NV_OS_WIN32 + +#include +#undef min +#undef max + +static int filter(unsigned int code, struct _EXCEPTION_POINTERS *ep) { + if (code == EXCEPTION_ACCESS_VIOLATION) { + return EXCEPTION_EXECUTE_HANDLER; + } + else { + return EXCEPTION_CONTINUE_SEARCH; + }; +} + +#define TRY __try +#define CATCH __except (filter(GetExceptionCode(), GetExceptionInformation())) + +#else // 0 + +#define TRY if (true) +#define CATCH else + +#endif + +bool Surface::setImage(nvtt::InputFormat format, int w, int h, int d, const void * data) +{ + detach(); + + if (m->image == NULL) { + m->image = new FloatImage(); + } + m->image->allocate(4, w, h, d); + m->type = (d == 1) ? TextureType_2D : TextureType_3D; + + const int count = m->image->pixelCount(); + + float * rdst = m->image->channel(0); + float * gdst = m->image->channel(1); + float * bdst = m->image->channel(2); + float * adst = m->image->channel(3); + + if (format == InputFormat_BGRA_8UB) + { + const Color32 * src = (const Color32 *)data; + + TRY { + for (int i = 0; i < count; i++) + { + rdst[i] = float(src[i].r) / 255.0f; + gdst[i] = float(src[i].g) / 255.0f; + bdst[i] = float(src[i].b) / 255.0f; + adst[i] = float(src[i].a) / 255.0f; + } + } + CATCH { + return false; + } + } + else if (format == InputFormat_RGBA_16F) + { + const uint16 * src = (const uint16 *)data; + + TRY { + for (int i = 0; i < count; i++) + { + ((uint32 *)rdst)[i] = half_to_float(src[4*i+0]); + ((uint32 *)gdst)[i] = half_to_float(src[4*i+1]); + ((uint32 *)bdst)[i] = half_to_float(src[4*i+2]); + ((uint32 *)adst)[i] = half_to_float(src[4*i+3]); + } + } + CATCH { + return false; + } + } + else if (format == InputFormat_RGBA_32F) + { + const float * src = (const float *)data; + + TRY { + for (int i = 0; i < count; i++) + { + rdst[i] = src[4 * i + 0]; + gdst[i] = src[4 * i + 1]; + bdst[i] = src[4 * i + 2]; + adst[i] = src[4 * i + 3]; + } + } + CATCH { + return false; + } + } + else if (format == InputFormat_R_32F) + { + const float * src = (const float *)data; + + TRY { + for (int i = 0; i < count; i++) + { + rdst[i] = src[i]; + gdst[i] = 0; + bdst[i] = 0; + adst[i] = 0; + } + } + CATCH { + return false; + } + } + + return true; +} + +bool Surface::setImage(InputFormat format, int w, int h, int d, const void * r, const void * g, const void * b, const void * a) +{ + detach(); + + if (m->image == NULL) { + m->image = new FloatImage(); + } + m->image->allocate(4, w, h, d); + m->type = (d == 1) ? 
TextureType_2D : TextureType_3D; + + const int count = m->image->pixelCount(); + + float * rdst = m->image->channel(0); + float * gdst = m->image->channel(1); + float * bdst = m->image->channel(2); + float * adst = m->image->channel(3); + + if (format == InputFormat_BGRA_8UB) + { + const uint8 * rsrc = (const uint8 *)r; + const uint8 * gsrc = (const uint8 *)g; + const uint8 * bsrc = (const uint8 *)b; + const uint8 * asrc = (const uint8 *)a; + + TRY { + for (int i = 0; i < count; i++) rdst[i] = float(rsrc[i]) / 255.0f; + for (int i = 0; i < count; i++) gdst[i] = float(gsrc[i]) / 255.0f; + for (int i = 0; i < count; i++) bdst[i] = float(bsrc[i]) / 255.0f; + for (int i = 0; i < count; i++) adst[i] = float(asrc[i]) / 255.0f; + } + CATCH { + return false; + } + } + else if (format == InputFormat_RGBA_16F) + { + const uint16 * rsrc = (const uint16 *)r; + const uint16 * gsrc = (const uint16 *)g; + const uint16 * bsrc = (const uint16 *)b; + const uint16 * asrc = (const uint16 *)a; + + TRY { + for (int i = 0; i < count; i++) ((uint32 *)rdst)[i] = half_to_float(rsrc[i]); + for (int i = 0; i < count; i++) ((uint32 *)gdst)[i] = half_to_float(gsrc[i]); + for (int i = 0; i < count; i++) ((uint32 *)bdst)[i] = half_to_float(bsrc[i]); + for (int i = 0; i < count; i++) ((uint32 *)adst)[i] = half_to_float(asrc[i]); + } + CATCH { + return false; + } + } + else if (format == InputFormat_RGBA_32F) + { + const float * rsrc = (const float *)r; + const float * gsrc = (const float *)g; + const float * bsrc = (const float *)b; + const float * asrc = (const float *)a; + + TRY { + memcpy(rdst, rsrc, count * sizeof(float)); + memcpy(gdst, gsrc, count * sizeof(float)); + memcpy(bdst, bsrc, count * sizeof(float)); + memcpy(adst, asrc, count * sizeof(float)); + } + CATCH { + return false; + } + } + else if (format == InputFormat_R_32F) + { + const float * rsrc = (const float *)r; + + TRY { + memcpy(rdst, rsrc, count * sizeof(float)); + memset(gdst, 0, count * sizeof(float)); + memset(bdst, 0, count * sizeof(float)); + memset(adst, 0, count * sizeof(float)); + } + CATCH { + return false; + } + } + + return true; +} + +// @@ Add support for compressed 3D textures. 
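// Rough usage sketch for the uncompressed setImage() overloads above
// (hypothetical caller code, not part of this file; the buffer, width and
// height names are made up for illustration):
//
//   nvtt::Surface s;
//   std::vector<uint8_t> bgra(width * height * 4); // interleaved B,G,R,A bytes
//   // ... fill bgra ...
//   s.setImage(nvtt::InputFormat_BGRA_8UB, width, height, /*d=*/1, bgra.data());
//
// The four-pointer overload takes separate R, G, B and A planes instead of one
// interleaved buffer; both paths end up in the same 4-channel FloatImage, so
// later operations (mipmaps, color transforms) behave identically.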
+bool Surface::setImage2D(Format format, Decoder decoder, int w, int h, const void * data) +{ + if (format != nvtt::Format_BC1 && + format != nvtt::Format_BC2 && + format != nvtt::Format_BC3 && + format != nvtt::Format_BC4 && + format != nvtt::Format_BC5 && + format != nvtt::Format_BC6 && + format != nvtt::Format_BC7) + { + return false; + } + + detach(); + + if (m->image == NULL) { + m->image = new FloatImage(); + } + m->image->allocate(4, w, h, 1); + m->type = TextureType_2D; + + const int bw = (w + 3) / 4; + const int bh = (h + 3) / 4; + + const uint bs = blockSize(format); + + const uint8 * ptr = (const uint8 *)data; + + TRY { + if (format == nvtt::Format_BC6) + { + // BC6 format - decode directly to float + + for (int y = 0; y < bh; y++) + { + for (int x = 0; x < bw; x++) + { + Vector3 colors[16]; + const BlockBC6 * block = (const BlockBC6 *)ptr; + block->decodeBlock(colors); + + for (int yy = 0; yy < 4; yy++) + { + for (int xx = 0; xx < 4; xx++) + { + Vector3 rgb = colors[yy*4 + xx]; + + if (x * 4 + xx < w && y * 4 + yy < h) + { + m->image->pixel(0, x*4 + xx, y*4 + yy, 0) = rgb.x; + m->image->pixel(1, x*4 + xx, y*4 + yy, 0) = rgb.y; + m->image->pixel(2, x*4 + xx, y*4 + yy, 0) = rgb.z; + m->image->pixel(3, x*4 + xx, y*4 + yy, 0) = 1.0f; + } + } + } + + ptr += bs; + } + } + } + else + { + // Non-BC6 - decode to 8-bit, then convert to float + + for (int y = 0; y < bh; y++) + { + for (int x = 0; x < bw; x++) + { + ColorBlock colors; + + if (format == nvtt::Format_BC1) + { + const BlockDXT1 * block = (const BlockDXT1 *)ptr; + + if (decoder == Decoder_D3D10) { + block->decodeBlock(&colors, false); + } + else if (decoder == Decoder_D3D9) { + block->decodeBlock(&colors, false); + } + else if (decoder == Decoder_NV5x) { + block->decodeBlockNV5x(&colors); + } + } + else if (format == nvtt::Format_BC2) + { + const BlockDXT3 * block = (const BlockDXT3 *)ptr; + + if (decoder == Decoder_D3D10) { + block->decodeBlock(&colors, false); + } + else if (decoder == Decoder_D3D9) { + block->decodeBlock(&colors, false); + } + else if (decoder == Decoder_NV5x) { + block->decodeBlockNV5x(&colors); + } + } + else if (format == nvtt::Format_BC3) + { + const BlockDXT5 * block = (const BlockDXT5 *)ptr; + + if (decoder == Decoder_D3D10) { + block->decodeBlock(&colors, false); + } + else if (decoder == Decoder_D3D9) { + block->decodeBlock(&colors, false); + } + else if (decoder == Decoder_NV5x) { + block->decodeBlockNV5x(&colors); + } + } + else if (format == nvtt::Format_BC4) + { + const BlockATI1 * block = (const BlockATI1 *)ptr; + block->decodeBlock(&colors, decoder == Decoder_D3D9); + } + else if (format == nvtt::Format_BC5) + { + const BlockATI2 * block = (const BlockATI2 *)ptr; + block->decodeBlock(&colors, decoder == Decoder_D3D9); + } + else if (format == nvtt::Format_BC7) + { + const BlockBC7 * block = (const BlockBC7 *)ptr; + block->decodeBlock(&colors); + } + else + { + nvDebugCheck(false); + } + + for (int yy = 0; yy < 4; yy++) + { + for (int xx = 0; xx < 4; xx++) + { + Color32 c = colors.color(xx, yy); + + if (x * 4 + xx < w && y * 4 + yy < h) + { + m->image->pixel(0, x*4 + xx, y*4 + yy, 0) = float(c.r) * 1.0f/255.0f; + m->image->pixel(1, x*4 + xx, y*4 + yy, 0) = float(c.g) * 1.0f/255.0f; + m->image->pixel(2, x*4 + xx, y*4 + yy, 0) = float(c.b) * 1.0f/255.0f; + m->image->pixel(3, x*4 + xx, y*4 + yy, 0) = float(c.a) * 1.0f/255.0f; + } + } + } + + ptr += bs; + } + } + } + } + CATCH { + return false; + } + + return true; +} + + +static void getDefaultFilterWidthAndParams(int filter, float * filterWidth, 
float params[2]) +{ + if (filter == ResizeFilter_Box) { + *filterWidth = 0.5f; + } + else if (filter == ResizeFilter_Triangle) { + *filterWidth = 1.0f; + } + else if (filter == ResizeFilter_Kaiser) + { + *filterWidth = 3.0f; + params[0] = 4.0f; + params[1] = 1.0f; + } + else //if (filter == ResizeFilter_Mitchell) + { + *filterWidth = 2.0f; + params[0] = 1.0f / 3.0f; + params[1] = 1.0f / 3.0f; + } +} + +void Surface::resize(int w, int h, int d, ResizeFilter filter) +{ + float filterWidth; + float params[2]; + getDefaultFilterWidthAndParams(filter, &filterWidth, params); + + resize(w, h, d, filter, filterWidth, params); +} + +void Surface::resize(int w, int h, int d, ResizeFilter filter, float filterWidth, const float * params) +{ + if (isNull() || (w == width() && h == height() && d == depth())) { + return; + } + + detach(); + + FloatImage * img = m->image; + + FloatImage::WrapMode wrapMode = (FloatImage::WrapMode)m->wrapMode; + + if (m->alphaMode == AlphaMode_Transparency) + { + if (filter == ResizeFilter_Box) + { + BoxFilter filter(filterWidth); + img = img->resize(filter, w, h, d, wrapMode, 3); + } + else if (filter == ResizeFilter_Triangle) + { + TriangleFilter filter(filterWidth); + img = img->resize(filter, w, h, d, wrapMode, 3); + } + else if (filter == ResizeFilter_Kaiser) + { + KaiserFilter filter(filterWidth); + if (params != NULL) filter.setParameters(params[0], params[1]); + img = img->resize(filter, w, h, d, wrapMode, 3); + } + else //if (filter == ResizeFilter_Mitchell) + { + nvDebugCheck(filter == ResizeFilter_Mitchell); + MitchellFilter filter; + if (params != NULL) filter.setParameters(params[0], params[1]); + img = img->resize(filter, w, h, d, wrapMode, 3); + } + } + else + { + if (filter == ResizeFilter_Box) + { + BoxFilter filter(filterWidth); + img = img->resize(filter, w, h, d, wrapMode); + } + else if (filter == ResizeFilter_Triangle) + { + TriangleFilter filter(filterWidth); + img = img->resize(filter, w, h, d, wrapMode); + } + else if (filter == ResizeFilter_Kaiser) + { + KaiserFilter filter(filterWidth); + if (params != NULL) filter.setParameters(params[0], params[1]); + img = img->resize(filter, w, h, d, wrapMode); + } + else //if (filter == ResizeFilter_Mitchell) + { + nvDebugCheck(filter == ResizeFilter_Mitchell); + MitchellFilter filter; + if (params != NULL) filter.setParameters(params[0], params[1]); + img = img->resize(filter, w, h, d, wrapMode); + } + } + + delete m->image; + m->image = img; +} + +void Surface::resize_make_square(int maxExtent, RoundMode roundMode, ResizeFilter filter) +{ + if (isNull()) return; + + float filterWidth; + float params[2]; + getDefaultFilterWidthAndParams(filter, &filterWidth, params); + + int w = m->image->width(); + int h = m->image->height(); + int d = m->image->depth(); + + getTargetExtent(&w, &h, &d, maxExtent, roundMode, m->type); + + if (m->type == TextureType_2D) + { + nvDebugCheck(d==1); + int md = nv::min(w,h); + w = md; + h = md; + } + else if (m->type == TextureType_Cube) + { + nvDebugCheck(d==1); + nvDebugCheck(w==h); + } + else if (m->type == TextureType_3D) + { + int md = nv::min(nv::min(w,h),d); + w = md; + h = md; + d = md; + } + + resize(w, h, d, filter, filterWidth, params); +} + +void Surface::resize(int maxExtent, RoundMode roundMode, ResizeFilter filter) +{ + float filterWidth; + float params[2]; + getDefaultFilterWidthAndParams(filter, &filterWidth, params); + + resize(maxExtent, roundMode, filter, filterWidth, params); +} + +void Surface::resize(int maxExtent, RoundMode roundMode, ResizeFilter filter, 
float filterWidth, const float * params) +{ + if (isNull()) return; + + int w = m->image->width(); + int h = m->image->height(); + int d = m->image->depth(); + + getTargetExtent(&w, &h, &d, maxExtent, roundMode, m->type); + + resize(w, h, d, filter, filterWidth, params); +} + +bool Surface::canMakeNextMipmap(int min_size /*= 1*/) +{ + if (isNull()) return false; + + return nv::canMakeNextMipmap(width(), height(), depth(), min_size); +} + + +bool Surface::buildNextMipmap(MipmapFilter filter, int min_size /*= 1*/) +{ + float filterWidth; + float params[2]; + getDefaultFilterWidthAndParams(filter, &filterWidth, params); + + return buildNextMipmap(filter, filterWidth, params, min_size); +} + +bool Surface::buildNextMipmap(MipmapFilter filter, float filterWidth, const float * params, int min_size /*= 1*/) +{ + if (!canMakeNextMipmap(min_size)) { + return false; + } + + detach(); + + FloatImage * img = m->image; + + FloatImage::WrapMode wrapMode = (FloatImage::WrapMode)m->wrapMode; + + if (m->alphaMode == AlphaMode_Transparency) + { + if (filter == MipmapFilter_Box) + { + BoxFilter filter(filterWidth); + img = img->downSample(filter, wrapMode, 3); + } + else if (filter == MipmapFilter_Triangle) + { + TriangleFilter filter(filterWidth); + img = img->downSample(filter, wrapMode, 3); + } + else if (filter == MipmapFilter_Kaiser) + { + nvDebugCheck(filter == MipmapFilter_Kaiser); + KaiserFilter filter(filterWidth); + if (params != NULL) filter.setParameters(params[0], params[1]); + img = img->downSample(filter, wrapMode, 3); + } + } + else + { + if (filter == MipmapFilter_Box) + { + if (filterWidth == 0.5f && img->depth() == 1) { + img = img->fastDownSample(); + } + else { + BoxFilter filter(filterWidth); + img = img->downSample(filter, wrapMode); + } + } + else if (filter == MipmapFilter_Triangle) + { + TriangleFilter filter(filterWidth); + img = img->downSample(filter, wrapMode); + } + else //if (filter == MipmapFilter_Kaiser) + { + nvDebugCheck(filter == MipmapFilter_Kaiser); + KaiserFilter filter(filterWidth); + if (params != NULL) filter.setParameters(params[0], params[1]); + img = img->downSample(filter, wrapMode); + } + } + + delete m->image; + m->image = img; + + return true; +} + +bool Surface::buildNextMipmapSolidColor(const float * const color_components) +{ + if (isNull() || (width() == 1 && height() == 1 && depth() == 1)) { + return false; + } + + detach(); + + FloatImage * img = new FloatImage(); + const uint w = max(1, m->image->m_width / 2); + const uint h = max(1, m->image->m_height / 2); + img->allocate(m->image->m_componentCount, w, h); + + for(uint c = 0; c < img->m_componentCount; c++) + { + img->clear(c, color_components[c]); + } + + delete m->image; + m->image = img; + + return true; +} + +void Surface::canvasSize(int w, int h, int d) +{ + nvDebugCheck(w > 0 && h > 0 && d > 0); + + if (isNull() || (w == width() && h == height() && d == depth())) { + return; + } + + detach(); + + FloatImage * img = m->image; + + FloatImage * new_img = new FloatImage; + new_img->allocate(4, w, h, d); + new_img->clear(); + + w = min(uint(w), img->width()); + h = min(uint(h), img->height()); + d = min(uint(d), img->depth()); + + for (int z = 0; z < d; z++) { + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + new_img->pixel(0, x, y, z) = img->pixel(0, x, y, z); + new_img->pixel(1, x, y, z) = img->pixel(1, x, y, z); + new_img->pixel(2, x, y, z) = img->pixel(2, x, y, z); + new_img->pixel(3, x, y, z) = img->pixel(3, x, y, z); + } + } + } + + delete m->image; + m->image = new_img; + 
m->type = (d == 1) ? TextureType_2D : TextureType_3D; +} + + +// Color transforms. +void Surface::toLinear(float gamma) +{ + if (isNull()) return; + if (equal(gamma, 1.0f)) return; + + detach(); + + m->image->toLinear(0, 3, gamma); +} + +void Surface::toGamma(float gamma) +{ + if (isNull()) return; + if (equal(gamma, 1.0f)) return; + + detach(); + + m->image->toGamma(0, 3, gamma); +} + +void Surface::toLinear(int channel, float gamma) +{ + if (isNull()) return; + if (equal(gamma, 1.0f)) return; + + detach(); + + m->image->toLinear(channel, 1, gamma); +} + +void Surface::toGamma(int channel, float gamma) +{ + if (isNull()) return; + if (equal(gamma, 1.0f)) return; + + detach(); + + m->image->toGamma(channel, 1, gamma); +} + + + +static float toSrgb(float f) { + if (isNan(f)) f = 0.0f; + else if (f <= 0.0f) f = 0.0f; + else if (f <= 0.0031308f) f = 12.92f * f; + else if (f <= 1.0f) f = (powf(f, 0.41666f) * 1.055f) - 0.055f; + else f = 1.0f; + return f; +} + +void Surface::toSrgb() +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + + const uint count = img->pixelCount(); + for (uint c = 0; c < 3; c++) { + float * channel = img->channel(c); + for (uint i = 0; i < count; i++) { + channel[i] = ::toSrgb(channel[i]); + } + } +} + +static float fromSrgb(float f) { + if (f < 0.0f) f = 0.0f; + else if (f < 0.04045f) f = f / 12.92f; + else if (f <= 1.0f) f = powf((f + 0.055f) / 1.055f, 2.4f); + else f = 1.0f; + return f; +} + +void Surface::toLinearFromSrgb() +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + + const uint count = img->pixelCount(); + for (uint c = 0; c < 3; c++) { + float * channel = img->channel(c); + for (uint i = 0; i < count; i++) { + channel[i] = ::fromSrgb(channel[i]); + } + } +} + +static float toXenonSrgb(float f) { + if (f < 0) f = 0; + else if (f < (1.0f/16.0f)) f = 4.0f * f; + else if (f < (1.0f/8.0f)) f = 0.25f + 2.0f * (f - 0.0625f); + else if (f < 0.5f) f = 0.375f + 1.0f * (f - 0.125f); + else if (f < 1.0f) f = 0.75f + 0.5f * (f - 0.50f); + else f = 1.0f; + return f; +} + +void Surface::toXenonSrgb() +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + + const uint count = img->pixelCount(); + for (uint c = 0; c < 3; c++) { + float * channel = img->channel(c); + for (uint i = 0; i < count; i++) { + channel[i] = ::toXenonSrgb(channel[i]); + } + } +} + + +void Surface::transform(const float w0[4], const float w1[4], const float w2[4], const float w3[4], const float offset[4]) +{ + if (isNull()) return; + + detach(); + + Matrix xform( + Vector4(w0[0], w0[1], w0[2], w0[3]), + Vector4(w1[0], w1[1], w1[2], w1[3]), + Vector4(w2[0], w2[1], w2[2], w2[3]), + Vector4(w3[0], w3[1], w3[2], w3[3])); + + Vector4 voffset(offset[0], offset[1], offset[2], offset[3]); + + m->image->transform(0, xform, voffset); +} + +// R, G, B, A, 1, 0, -1 +void Surface::swizzle(int r, int g, int b, int a) +{ + if (isNull()) return; + if (r == 0 && g == 1 && b == 2 && a == 3) return; + + detach(); + + m->image->swizzle(0, r, g, b, a); +} + +// color * scale + bias +void Surface::scaleBias(int channel, float scale, float bias) +{ + if (isNull()) return; + if (equal(scale, 1.0f) && equal(bias, 0.0f)) return; + + detach(); + + m->image->scaleBias(channel, 1, scale, bias); +} + +void Surface::clamp(int channel, float low, float high) +{ + if (isNull()) return; + + detach(); + + m->image->clamp(channel, 1, low, high); +} + +void Surface::blend(float red, float green, float blue, float alpha, float t) +{ + if (isNull()) return; + + 
detach(); + + FloatImage * img = m->image; + float * r = img->channel(0); + float * g = img->channel(1); + float * b = img->channel(2); + float * a = img->channel(3); + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) + { + r[i] = lerp(r[i], red, t); + g[i] = lerp(g[i], green, t); + b[i] = lerp(b[i], blue, t); + a[i] = lerp(a[i], alpha, t); + } +} + +void Surface::premultiplyAlpha() +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + float * r = img->channel(0); + float * g = img->channel(1); + float * b = img->channel(2); + float * a = img->channel(3); + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) + { + r[i] *= a[i]; + g[i] *= a[i]; + b[i] *= a[i]; + } +} + + +void Surface::toGreyScale(float redScale, float greenScale, float blueScale, float alphaScale) +{ + if (isNull()) return; + + detach(); + + float sum = redScale + greenScale + blueScale + alphaScale; + redScale /= sum; + greenScale /= sum; + blueScale /= sum; + alphaScale /= sum; + + FloatImage * img = m->image; + float * r = img->channel(0); + float * g = img->channel(1); + float * b = img->channel(2); + float * a = img->channel(3); + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) + { + float grey = r[i] * redScale + g[i] * greenScale + b[i] * blueScale + a[i] * alphaScale; + a[i] = b[i] = g[i] = r[i] = grey; + } +} + +// Draw colored border. +void Surface::setBorder(float r, float g, float b, float a) +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + const uint w = img->width(); + const uint h = img->height(); + const uint d = img->depth(); + + for (uint z = 0; z < d; z++) + { + for (uint i = 0; i < w; i++) + { + img->pixel(0, i, 0, z) = r; + img->pixel(1, i, 0, z) = g; + img->pixel(2, i, 0, z) = b; + img->pixel(3, i, 0, z) = a; + + img->pixel(0, i, h-1, z) = r; + img->pixel(1, i, h-1, z) = g; + img->pixel(2, i, h-1, z) = b; + img->pixel(3, i, h-1, z) = a; + } + + for (uint i = 0; i < h; i++) + { + img->pixel(0, 0, i, z) = r; + img->pixel(1, 0, i, z) = g; + img->pixel(2, 0, i, z) = b; + img->pixel(3, 0, i, z) = a; + + img->pixel(0, w-1, i, z) = r; + img->pixel(1, w-1, i, z) = g; + img->pixel(2, w-1, i, z) = b; + img->pixel(3, w-1, i, z) = a; + } + } +} + +// Fill image with the given color. +void Surface::fill(float red, float green, float blue, float alpha) +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + float * r = img->channel(0); + float * g = img->channel(1); + float * b = img->channel(2); + float * a = img->channel(3); + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) r[i] = red; + for (uint i = 0; i < count; i++) g[i] = green; + for (uint i = 0; i < count; i++) b[i] = blue; + for (uint i = 0; i < count; i++) a[i] = alpha; +} + + +void Surface::scaleAlphaToCoverage(float coverage, float alphaRef/*= 0.5f*/, int alpha_channel/*= 3*/) +{ + if (isNull()) return; + + detach(); + + alphaRef = nv::clamp(alphaRef, 1.0f/256, 255.0f/256); + + m->image->scaleAlphaToCoverage(coverage, alphaRef, alpha_channel); +} + +/*bool Surface::normalizeRange(float * rangeMin, float * rangeMax) +{ + if (m->image == NULL) return false; + + range(0, rangeMin, rangeMax); + + if (*rangeMin == *rangeMax) { + // Single color image. + return false; + } + + const float scale = 1.0f / (*rangeMax - *rangeMin); + const float bias = *rangeMin * scale; + + if (range.x == 0.0f && range.y == 1.0f) { + // Already normalized. 
+ return true; + } + + detach(); + + // Scale to range. + img->scaleBias(0, 4, scale, bias); + //img->clamp(0, 4, 0.0f, 1.0f); + + return true; +}*/ + +// Ideally you should compress/quantize the RGB and M portions independently. +// Once you have M quantized, you would compute the corresponding RGB and quantize that. +void Surface::toRGBM(float range/*= 1*/, float threshold/*= 0.25*/) +{ + if (isNull()) return; + + detach(); + + threshold = ::clamp(threshold, 1e-6f, 1.0f); + + FloatImage * img = m->image; + float * r = img->channel(0); + float * g = img->channel(1); + float * b = img->channel(2); + float * a = img->channel(3); + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) { + float R = nv::clamp(r[i], 0.0f, 1.0f); + float G = nv::clamp(g[i], 0.0f, 1.0f); + float B = nv::clamp(b[i], 0.0f, 1.0f); + +#if 0 + // Baseline, no compression: + r[i] = R; + g[i] = G; + b[i] = B; + a[i] = 1; + +#elif 0 + float M = max(max(R, G), max(B, threshold)); + + r[i] = R / M; + g[i] = G / M; + b[i] = B / M; + + a[i] = (M - threshold) / (1 - threshold); + +#else + // The optimal compressor produces the best results, but can introduce interpolation errors! + float bestM; + float bestError = FLT_MAX; + + //float range = 15; // 4 bit quantization. + //int irange = 16; + float range = 255; // 8 bit quantization. + int irange = 256; + + + float M = max(max(R, G), max(B, threshold)); + int iM = ftoi_ceil((M - threshold) / (1 - threshold) * range); + + //for (int m = 0; m < 256; m++) { // If we use the entire search space, interpolation errors are very likely to occur. + for (int m = max(iM-16, 0); m < min(iM+16, irange); m++) { // If we constrain the search space, these errors disappear. + //for (int m = max(iM-4, 0); m < min(iM+4, irange); m++) { // If we constrain the search space, these errors disappear. + float fm = float(m) / range; + + // Decode M + float M = fm * (1 - threshold) + threshold; + + // Encode. + int ir = ftoi_round(range * nv::saturate(R / M)); + int ig = ftoi_round(range * nv::saturate(G / M)); + int ib = ftoi_round(range * nv::saturate(B / M)); + + // Decode. + float fr = (float(ir) / range) * M; + float fg = (float(ig) / range) * M; + float fb = (float(ib) / range) * M; + + // Measure error. + float error = square(R-fr) + square(G-fg) + square(B-fb); + + if (error < bestError) { + bestError = error; + bestM = M; + } + } + + M = bestM; + r[i] = nv::saturate(R / M); + g[i] = nv::saturate(G / M); + b[i] = nv::saturate(B / M); + a[i] = (M - threshold) / (1 - threshold); +#endif + } +} + +// @@ IC: Dubious merge. Review! +void Surface::fromRGBM(float range/*= 1*/, float threshold/*= 0.25*/) +{ + if (isNull()) return; + + detach(); + + threshold = ::clamp(threshold, 1e-6f, 1.0f); + + FloatImage * img = m->image; + float * r = img->channel(0); + float * g = img->channel(1); + float * b = img->channel(2); + float * a = img->channel(3); + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) { + float M = a[i] * (range - threshold) + threshold; + + r[i] *= M; + g[i] *= M; + b[i] *= M; + a[i] = 1.0f; + } +} + +// This is dumb way to encode luminance only values. 
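// Worked example of the shared-multiplier encoding used by toRGBM() above and
// toLM() below (illustrative numbers only, skipping the 8-bit search in the
// optimal-compressor branch; assumes the defaults range = 1, threshold = 0.25):
//   input  (R, G, B)  = (0.5, 0.1, 0.05)
//   M                 = max(R, G, B, threshold)   = 0.5
//   stored (r, g, b)  = (R/M, G/M, B/M)           = (1.0, 0.2, 0.1)
//   stored a          = (M - 0.25) / (1 - 0.25)   = 1/3
// fromRGBM() inverts this: M = a * (range - threshold) + threshold = 0.5, and
// multiplying the stored channels by M recovers the original (0.5, 0.1, 0.05).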
+void Surface::toLM(float range/*= 1*/, float threshold/*= 0.25*/) +{ + if (isNull()) return; + + detach(); + + threshold = ::clamp(threshold, 1e-6f, 1.0f); + + FloatImage * img = m->image; + float * r = img->channel(0); + float * g = img->channel(1); + float * b = img->channel(2); + float * a = img->channel(3); + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) { + float R = nv::clamp(r[i], 0.0f, 1.0f); + float G = nv::clamp(g[i], 0.0f, 1.0f); + float B = nv::clamp(b[i], 0.0f, 1.0f); + + float M = max(max(R, G), max(B, threshold)); + + float L = (R + G + B) / 3; + r[i] = L / M; + b[i] = L / M; + g[i] = L / M; + a[i] = (M - threshold) / (1 - threshold); + } +} + + +static Color32 toRgbe8(float r, float g, float b) +{ + Color32 c; + float v = max(max(r, g), b); + if (v < 1e-32) { + c.r = c.g = c.b = c.a = 0; + } + else { + int e; + v = frexp(v, &e) * 256.0f / v; + c.r = uint8(clamp(r * v, 0.0f, 255.0f)); + c.g = uint8(clamp(g * v, 0.0f, 255.0f)); + c.b = uint8(clamp(b * v, 0.0f, 255.0f)); + c.a = e + 128; + } + + return c; +} + + +/* + Alen Ladavac @ GDAlgorithms-list on Feb 7, 2007: + One trick that we use to alleviate such problems is to use RGBE5.3 - + i.e. have a fixed point exponent. Note that it is not enough to just + shift the exponent up for 3 bits, but you actually have to convert + each pixel in the RGBE8 texture by unpacking it to floats and then + repacking it with a non-integer exponent, which gives different + mantissas as well. Now your jumps in exponent are much smaller, thus + the bands are not that noticeable. It is still not as good as FP16, + but it is much better than RGBE8. I hope this explanation is + understandable, if not I can fill in more details. + + Though there still are some bands, you can get an even better + precision if you upload that same texture as RGBA16, because you'll + get even more interpolation then, and it works good as a scalable + option for people with more GPU RAM). Alternatively, when some of the + future cards (hopefully, because I'm trying to lobby for that + everywhere :) ), start returning more than 8 bits, your scenes will + automatically look better even without using RGBA16. + + Jon Watte: + The interpolation of 5.3 is the same as that of 8 bits, because it's a + fixed point format. + + The reason using 5.3 helps, is that each bit of quantization in the + interpolation only means 1/8th of a fully significant bit. The + quantization still happens, it's just less visible. The trade-off is + that you get less dynamic range. + + Alen Ladavac: + True, but it is just a small part of the improvement. The greater part + is that RGB values have to be calculated according to the fractional + exponent. With integer exponent, the RGB values jump by a factor of 2 + when each bit changes in exponent, and 5.3 with correct adjustment of + RGB lowers this jump to be about 1.09, which is much better. I may not + be entirely correct on the numbers, which I'm pulling out from my + memory now, but it's a rough estimate. 
+*/ +/* Ward's version: +static Color32 toRgbe8(float r, float g, float b) +{ + Color32 c; + float v = max(max(r, g), b); + if (v < 1e-32) { + c.r = c.g = c.b = c.a = 0; + } + else { + int e; + v = frexp(v, &e) * 256.0f / v; + c.r = uint8(clamp(r * v, 0.0f, 255.0f)); + c.g = uint8(clamp(g * v, 0.0f, 255.0f)); + c.b = uint8(clamp(b * v, 0.0f, 255.0f)); + c.a = e + 128; + } + + return c; +} +*/ + +// For R9G9B9E5, use toRGBE(9, 5), for Ward's RGBE, use toRGBE(8, 8) +// @@ Note that most Radiance HDR loaders use an exponent bias of 128 instead of 127! This implementation +// matches the OpenGL extension. +void Surface::toRGBE(int mantissaBits, int exponentBits) +{ + // According to the OpenGL extension: + // http://www.opengl.org/registry/specs/EXT/texture_shared_exponent.txt + // + // Components red, green, and blue are first clamped (in the process, + // mapping NaN to zero) so: + // + // red_c = max(0, min(sharedexp_max, red)) + // green_c = max(0, min(sharedexp_max, green)) + // blue_c = max(0, min(sharedexp_max, blue)) + // + // where sharedexp_max is (2^N-1)/2^N * 2^(Emax-B), N is the number + // of mantissa bits per component, Emax is the maximum allowed biased + // exponent value (careful: not necessarily 2^E-1 when E is the number of + // exponent bits), bits, and B is the exponent bias. For the RGB9_E5_EXT + // format, N=9, Emax=31, and B=15. + // + // The largest clamped component, max_c, is determined: + // + // max_c = max(red_c, green_c, blue_c) + // + // A preliminary shared exponent is computed: + // + // exp_shared_p = max(-B-1, floor(log2(max_c))) + 1 + B + // + // A refined shared exponent is then computed as: + // + // max_s = floor(max_c / 2^(exp_shared_p - B - N) + 0.5) + // + // { exp_shared_p, 0 <= max_s < 2^N + // exp_shared = { + // { exp_shared_p+1, max_s == 2^N + // + // These integers values in the range 0 to 2^N-1 are then computed: + // + // red_s = floor(red_c / 2^(exp_shared - B - N) + 0.5) + // green_s = floor(green_c / 2^(exp_shared - B - N) + 0.5) + // blue_s = floor(blue_c / 2^(exp_shared - B - N) + 0.5) + + if (isNull()) return; + + detach(); + + // mantissaBits = N + // exponentBits = E + // exponentMax = Emax + // exponentBias = B + // maxValue = sharedexp_max + + // max exponent: 5 -> 31, 8 -> 255 + const int exponentMax = (1 << exponentBits) - 1; + + // exponent bias: 5 -> 15, 8 -> 127 + const int exponentBias = (1 << (exponentBits - 1)) - 1; + + // Maximum representable value: 5 -> 63488, 8 -> HUGE + const float maxValue = float(exponentMax) / float(exponentMax + 1) * float(1 << (exponentMax - exponentBias)); + + + FloatImage * img = m->image; + float * r = img->channel(0); + float * g = img->channel(1); + float * b = img->channel(2); + float * a = img->channel(3); + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) { + // Clamp components: + float R = ::clamp(r[i], 0.0f, maxValue); + float G = ::clamp(g[i], 0.0f, maxValue); + float B = ::clamp(b[i], 0.0f, maxValue); + + // Compute max: + float M = max3(R, G, B); + + // Preliminary exponent: + int E = max(- exponentBias - 1, floatExponent(M)) + 1 + exponentBias; + nvDebugCheck(E >= 0 && E < (1 << exponentBits)); + + double denom = pow(2.0, double(E - exponentBias - mantissaBits)); + + // Refine exponent: + int m = ftoi_round(float(M / denom)); + nvDebugCheck(m <= (1 << mantissaBits)); + + if (m == (1 << mantissaBits)) { + denom *= 2; + E += 1; + nvDebugCheck(E < (1 << exponentBits)); + } + + R = floatRound(float(R / denom)); + G = floatRound(float(G / denom)); + B = 
floatRound(float(B / denom)); + + nvDebugCheck(R >= 0 && R < (1 << mantissaBits)); + nvDebugCheck(G >= 0 && G < (1 << mantissaBits)); + nvDebugCheck(B >= 0 && B < (1 << mantissaBits)); + + // Store as normalized float. + r[i] = R / ((1 << mantissaBits) - 1); + g[i] = G / ((1 << mantissaBits) - 1); + b[i] = B / ((1 << mantissaBits) - 1); + a[i] = float(E) / ((1 << exponentBits) - 1); + } +} + +void Surface::fromRGBE(int mantissaBits, int exponentBits) +{ + // According to the OpenGL extension: + // http://www.opengl.org/registry/specs/EXT/texture_shared_exponent.txt + // + // The 1st, 2nd, 3rd, and 4th components are called + // p_red, p_green, p_blue, and p_exp respectively and are treated as + // unsigned integers. These are then used to compute floating-point + // RGB components (ignoring the "Conversion to floating-point" section + // below in this case) as follows: + // + // red = p_red * 2^(p_exp - B - N) + // green = p_green * 2^(p_exp - B - N) + // blue = p_blue * 2^(p_exp - B - N) + // + // where B is 15 (the exponent bias) and N is 9 (the number of mantissa + // bits)." + + + // int exponent = v.field.biasedexponent - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS; + // float scale = (float) pow(2, exponent); + // + // retval[0] = v.field.r * scale; + // retval[1] = v.field.g * scale; + // retval[2] = v.field.b * scale; + + + if (isNull()) return; + + detach(); + + // exponent bias: 5 -> 15, 8 -> 127 + const int exponentBias = (1 << (exponentBits - 1)) - 1; + + FloatImage * img = m->image; + float * r = img->channel(0); + float * g = img->channel(1); + float * b = img->channel(2); + float * a = img->channel(3); + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) { + // Expand normalized float to to 9995 + int R = ftoi_round(r[i] * ((1 << mantissaBits) - 1)); + int G = ftoi_round(g[i] * ((1 << mantissaBits) - 1)); + int B = ftoi_round(b[i] * ((1 << mantissaBits) - 1)); + int E = ftoi_round(a[i] * ((1 << exponentBits) - 1)); + + //float scale = ldexpf(1.0f, E - exponentBias - mantissaBits); + float scale = powf(2, float(E - exponentBias - mantissaBits)); + + r[i] = R * scale; + g[i] = G * scale; + b[i] = B * scale; + a[i] = 1; + } +} + +// Y is in the [0, 1] range, while CoCg are in the [-1, 1] range. +void Surface::toYCoCg() +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + float * r = img->channel(0); + float * g = img->channel(1); + float * b = img->channel(2); + float * a = img->channel(3); + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) { + float R = r[i]; + float G = g[i]; + float B = b[i]; + + float Y = (2*G + R + B) * 0.25f; + float Co = (R - B); + float Cg = (2*G - R - B) * 0.5f; + + r[i] = Co; + g[i] = Cg; + b[i] = 1.0f; + a[i] = Y; + } +} + +// img.toYCoCg(); +// img.blockScaleCoCg(); +// img.scaleBias(0, 0.5, 0.5); +// img.scaleBias(1, 0.5, 0.5); + +// @@ Add support for threshold. +// We could do something to prevent scale values from adjacent blocks from being too different to each other +// and minimize bilinear interpolation artifacts. +void Surface::blockScaleCoCg(int bits/*= 5*/, float threshold/*= 0.0*/) +{ + if (isNull() || depth() != 1) return; + + detach(); + + FloatImage * img = m->image; + const uint w = img->width(); + const uint h = img->height(); + const uint bw = max(1U, w/4); + const uint bh = max(1U, h/4); + + for (uint bj = 0; bj < bh; bj++) { + for (uint bi = 0; bi < bw; bi++) { + + // Compute per block scale. 
+ float m = 1.0f / 255.0f; + for (uint j = 0; j < 4; j++) { + const uint y = bj*4 + j; + if (y >= h) continue; + + for (uint i = 0; i < 4; i++) { + const uint x = bi*4 + i; + if (x >= w) continue; + + float Co = img->pixel(0, x, y, 0); + float Cg = img->pixel(1, x, y, 0); + + m = max(m, fabsf(Co)); + m = max(m, fabsf(Cg)); + } + } + + float scale = PixelFormat::quantizeCeil(m, bits, 8); + nvDebugCheck(scale >= m); + + // Store block scale in blue channel and scale CoCg. + for (uint j = 0; j < 4; j++) { + for (uint i = 0; i < 4; i++) { + uint x = min(bi*4 + i, w); + uint y = min(bj*4 + j, h); + + float & Co = img->pixel(0, x, y, 0); + float & Cg = img->pixel(1, x, y, 0); + + Co /= scale; + nvDebugCheck(fabsf(Co) <= 1.0f); + + Cg /= scale; + nvDebugCheck(fabsf(Cg) <= 1.0f); + + img->pixel(2, x, y, 0) = scale; + } + } + } + } +} + +void Surface::fromYCoCg() +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + float * r = img->channel(0); + float * g = img->channel(1); + float * b = img->channel(2); + float * a = img->channel(3); + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) { + float Co = r[i]; + float Cg = g[i]; + float scale = b[i] * 0.5f; + float Y = a[i]; + + Co *= scale; + Cg *= scale; + + float R = Y + Co - Cg; + float G = Y + Cg; + float B = Y - Co - Cg; + + r[i] = R; + g[i] = G; + b[i] = B; + a[i] = 1.0f; + } +} + +void Surface::toLUVW(float range/*= 1.0f*/) +{ + if (isNull()) return; + + detach(); + + float irange = 1.0f / range; + + FloatImage * img = m->image; + float * r = img->channel(0); + float * g = img->channel(1); + float * b = img->channel(2); + float * a = img->channel(3); + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) { + float R = nv::clamp(r[i] * irange, 0.0f, 1.0f); + float G = nv::clamp(g[i] * irange, 0.0f, 1.0f); + float B = nv::clamp(b[i] * irange, 0.0f, 1.0f); + + float L = max(sqrtf(R*R + G*G + B*B), 1e-6f); // Avoid division by zero. + + r[i] = R / L; + g[i] = G / L; + b[i] = B / L; + a[i] = L / sqrtf(3); + } +} + +void Surface::fromLUVW(float range/*= 1.0f*/) +{ + // Decompression is the same as in RGBM. + fromRGBM(range * sqrtf(3)); +} + +void Surface::abs(int channel) +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + float * c = img->channel(channel); + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) { + c[i] = fabsf(c[i]); + } +} + +void Surface::convolve(int channel, int kernelSize, float * kernelData) +{ + if (isNull()) return; + + detach(); + + Kernel2 k(kernelSize, kernelData); + m->image->convolve(k, channel, (FloatImage::WrapMode)m->wrapMode); +} + +// Assumes input has already been scaled by exposure. +void Surface::toneMap(ToneMapper tm, float * parameters) +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + float * r = img->channel(0); + float * g = img->channel(1); + float * b = img->channel(2); + const uint count = img->pixelCount(); + + if (tm == ToneMapper_Linear) { + // Clamp preserving the hue. 
+ for (uint i = 0; i < count; i++) { + float m = max3(r[i], g[i], b[i]); + if (m > 1.0f) { + r[i] *= 1.0f / m; + g[i] *= 1.0f / m; + b[i] *= 1.0f / m; + } + } + } + else if (tm == ToneMapper_Reindhart) { + for (uint i = 0; i < count; i++) { + r[i] /= r[i] + 1; + g[i] /= g[i] + 1; + b[i] /= b[i] + 1; + } + } + else if (tm == ToneMapper_Halo) { + for (uint i = 0; i < count; i++) { + r[i] = 1 - exp2f(-r[i]); + g[i] = 1 - exp2f(-g[i]); + b[i] = 1 - exp2f(-b[i]); + } + } + else if (tm == ToneMapper_Lightmap) { + // @@ Goals: + // Preserve hue. + // Avoid clamping abrubtly. + // Minimize color difference along most of the color range. [0, alpha) + for (uint i = 0; i < count; i++) { + float m = max3(r[i], g[i], b[i]); + if (m > 1.0f) { + r[i] *= 1.0f / m; + g[i] *= 1.0f / m; + b[i] *= 1.0f / m; + } + } + } +} + +void Surface::toLogScale(int channel, float base) { + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + float * c = img->channel(channel); + + float scale = 1.0f / log2f(base); + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) { + c[i] = log2f(c[i]) * scale; + } +} + +void Surface::fromLogScale(int channel, float base) { + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + float * c = img->channel(channel); + + float scale = log2f(base); + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) { + c[i] = exp2f(c[i] * scale); + } +} + + + +/* +void Surface::blockLuminanceScale(float scale) +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + + //float * r = img->channel(0); + //float * g = img->channel(1); + //float * b = img->channel(2); + //float * a = img->channel(3); + + const uint w = img->width(); + const uint h = img->height(); + const uint bw = max(1U, w/4); + const uint bh = max(1U, h/4); + + Vector3 L = normalize(Vector3(1, 1, 1)); + + for (uint bj = 0; bj < bh; bj++) { + for (uint bi = 0; bi < bw; bi++) { + + // Compute block centroid. + Vector3 centroid(0.0f); + int count = 0; + for (uint j = 0; j < 4; j++) { + const uint y = bj*4 + j; + if (y >= h) continue; + + for (uint i = 0; i < 4; i++) { + const uint x = bi*4 + i; + if (x >= w) continue; + + float r = img->pixel(x, y, 0); + float g = img->pixel(x, y, 1); + float b = img->pixel(x, y, 2); + Vector3 rgb(r, g, b); + + centroid += rgb; + count++; + } + } + + centroid /= float(count); + + // Project to luminance plane. 
+ for (uint j = 0; j < 4; j++) { + const uint y = bj*4 + j; + if (y >= h) continue; + + for (uint i = 0; i < 4; i++) { + const uint x = bi*4 + i; + if (x >= w) continue; + + float & r = img->pixel(x, y, 0); + float & g = img->pixel(x, y, 1); + float & b = img->pixel(x, y, 2); + Vector3 rgb(r, g, b); + + Vector3 delta = rgb - centroid; + + delta -= scale * dot(delta, L) * L; + + r = centroid.x + delta.x; + g = centroid.y + delta.y; + b = centroid.z + delta.z; + } + } + } + } +} +*/ + +/* +void Surface::toJPEGLS() +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + float * r = img->channel(0); + float * g = img->channel(1); + float * b = img->channel(2); + + const uint count = img->width() * img->height(); + for (uint i = 0; i < count; i++) { + float R = nv::clamp(r[i], 0.0f, 1.0f); + float G = nv::clamp(g[i], 0.0f, 1.0f); + float B = nv::clamp(b[i], 0.0f, 1.0f); + + r[i] = R-G; + g[i] = G; + b[i] = B-G; + } +} + +void Surface::fromJPEGLS() +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + float * r = img->channel(0); + float * g = img->channel(1); + float * b = img->channel(2); + + const uint count = img->width() * img->height(); + for (uint i = 0; i < count; i++) { + float R = nv::clamp(r[i], -1.0f, 1.0f); + float G = nv::clamp(g[i], 0.0f, 1.0f); + float B = nv::clamp(b[i], -1.0f, 1.0f); + + r[i] = R+G; + g[i] = G; + b[i] = B+G; + } +} +*/ + + +// If dither is true, this uses Floyd-Steinberg dithering method. +void Surface::binarize(int channel, float threshold, bool dither) +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + + if (!dither) { + float * c = img->channel(channel); + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) { + c[i] = float(c[i] > threshold); + } + } + else { + const uint w = img->width(); + const uint h = img->height(); + const uint d = img->depth(); + + float * row0 = new float[(w+2)]; + float * row1 = new float[(w+2)]; + + // @@ Extend Floyd-Steinberg dithering to 3D properly. + for (uint z = 0; z < d; z++) { + memset(row0, 0, sizeof(float)*(w+2)); + memset(row1, 0, sizeof(float)*(w+2)); + + for (uint y = 0; y < h; y++) { + for (uint x = 0; x < w; x++) { + + float & f = img->pixel(channel, x, y, 0); + + // Add error and quantize. + float qf = float(f + row0[1+x] > threshold); + + // Compute new error: + float diff = f - qf; + + // Store color. + f = qf; + + // Propagate new error. + row0[1+x+1] += (7.0f / 16.0f) * diff; + row1[1+x-1] += (3.0f / 16.0f) * diff; + row1[1+x+0] += (5.0f / 16.0f) * diff; + row1[1+x+1] += (1.0f / 16.0f) * diff; + } + + swap(row0, row1); + memset(row1, 0, sizeof(float)*(w+2)); + } + } + + delete [] row0; + delete [] row1; + } +} + +// Uniform quantizer. +// Assumes input is in [0, 1] range. Output is in the [0, 1] range, but rounded to the middle of each bin. +// If exactEndPoints is true, [0, 1] are represented exactly, and the correponding bins are half the size, so quantization is not truly uniform. +// When dither is true, this uses Floyd-Steinberg dithering. 
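As a rough standalone sketch of the two rounding rules just described (the helper names below are illustrative only and not NVTT API; `Surface::quantize` in the hunk that follows is the actual implementation), with bits = 2 the exact-endpoint rule snaps to {0, 1/3, 2/3, 1} with half-size bins at the ends, while the uniform rule snaps to the bin centers {1/8, 3/8, 5/8, 7/8}:

// Editor's sketch, assuming bits = 2; quantizeExact / quantizeUniform are hypothetical names.
#include <algorithm>
#include <cmath>
#include <cstdio>

static float quantizeExact(float x, int bits)   // endpoints 0 and 1 representable, end bins half size
{
    const float range = float((1 << bits) - 1); // bits = 2 -> values 0, 1/3, 2/3, 1
    return std::floor(x * range + 0.5f) / range;
}

static float quantizeUniform(float x, int bits) // truly uniform bins, endpoints not exact
{
    const float range = float(1 << bits);       // bits = 2 -> values 1/8, 3/8, 5/8, 7/8
    const float q = (std::floor(x * range) + 0.5f) / range;
    return std::min(q, 1.0f);                   // saturate, as the library code below does
}

int main()
{
    for (float x = 0.0f; x <= 1.001f; x += 0.25f)
        std::printf("%.2f -> exact %.4f, uniform %.4f\n", x, quantizeExact(x, 2), quantizeUniform(x, 2));
}

The exactEndPoints flag therefore trades slightly non-uniform bins for the ability to represent 0 and 1 exactly.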
+void Surface::quantize(int channel, int bits, bool exactEndPoints, bool dither) +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + + float scale, offset0, offset1; + if (exactEndPoints) { + // floor(x*(range-1) + 0.5) / (range-1) + scale = float((1 << bits) - 1); + offset0 = 0.5f; + offset1 = 0.0f; + } + else { + // (floor(x*range) + 0.5) / range + scale = float(1 << bits); + offset0 = 0.0f; + offset1 = 0.5f; + } + + if (!dither) { + float * c = img->channel(channel); + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) { + c[i] = saturate((floorf(c[i] * scale + offset0) + offset1) / scale); + } + } + else { + const uint w = img->width(); + const uint h = img->height(); + const uint d = img->depth(); + + float * row0 = new float[(w+2)]; + float * row1 = new float[(w+2)]; + + for (uint z = 0; z < d; z++) { + memset(row0, 0, sizeof(float)*(w+2)); + memset(row1, 0, sizeof(float)*(w+2)); + + for (uint y = 0; y < h; y++) { + for (uint x = 0; x < w; x++) { + + float & f = img->pixel(channel, x, y, 0); + + // Add error and quantize. + float qf = saturate((floorf((f + row0[1+x]) * scale + offset0) + offset1) / scale); + + // Compute new error: + float diff = f - qf; + + // Store color. + f = qf; + + // Propagate new error. + row0[1+x+1] += (7.0f / 16.0f) * diff; + row1[1+x-1] += (3.0f / 16.0f) * diff; + row1[1+x+0] += (5.0f / 16.0f) * diff; + row1[1+x+1] += (1.0f / 16.0f) * diff; + } + + swap(row0, row1); + memset(row1, 0, sizeof(float)*(w+2)); + } + } + + delete [] row0; + delete [] row1; + } +} + + + +// Set normal map options. +void Surface::toNormalMap(float sm, float medium, float big, float large) +{ + if (isNull()) return; + + detach(); + + const Vector4 filterWeights(sm, medium, big, large); + + const FloatImage * img = m->image; + m->image = nv::createNormalMap(img, (FloatImage::WrapMode)m->wrapMode, filterWeights); + + delete img; + + m->isNormalMap = true; +} + +void Surface::normalizeNormalMap() +{ + if (isNull()) return; + if (!m->isNormalMap) return; + + detach(); + + nv::normalizeNormalMap(m->image); +} + +void Surface::transformNormals(NormalTransform xform) +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) { + float & x = img->pixel(0, i); + float & y = img->pixel(1, i); + float & z = img->pixel(2, i); + Vector3 n(x, y, z); + + n = normalizeSafe(n, Vector3(0.0f), 0.0f); + + if (xform == NormalTransform_Orthographic) { + n.z = 0.0f; + } + else if (xform == NormalTransform_Stereographic) { + n.x = n.x / (1 + n.z); + n.y = n.y / (1 + n.z); + n.z = 0.0f; + } + else if (xform == NormalTransform_Paraboloid) { + float a = (n.x * n.x) + (n.y * n.y); + float b = n.z; + float c = -1.0f; + float discriminant = b * b - 4.0f * a * c; + float t = (-b + sqrtf(discriminant)) / (2.0f * a); + n.x = n.x * t; + n.y = n.y * t; + n.z = 0.0f; + } + else if (xform == NormalTransform_Quartic) { + // Use Newton's method to solve equation: + // f(t) = 1 - zt - (x^2+y^2)t^2 + x^2y^2t^4 = 0 + // f'(t) = - z - 2(x^2+y^2)t + 4x^2y^2t^3 + + // Initial approximation: + float a = (n.x * n.x) + (n.y * n.y); + float b = n.z; + float c = -1.0f; + float discriminant = b * b - 4.0f * a * c; + float t = (-b + sqrtf(discriminant)) / (2.0f * a); + + float d = fabs(n.z * t - (1 - n.x*n.x*t*t) * (1 - n.y*n.y*t*t)); + + while (d > 0.0001) { + float ft = 1 - n.z * t - (n.x*n.x + n.y*n.y)*t*t + n.x*n.x*n.y*n.y*t*t*t*t; + float fit = - n.z - 2*(n.x*n.x + n.y*n.y)*t + 
4*n.x*n.x*n.y*n.y*t*t*t; + t -= ft / fit; + d = fabs(n.z * t - (1 - n.x*n.x*t*t) * (1 - n.y*n.y*t*t)); + }; + + n.x = n.x * t; + n.y = n.y * t; + n.z = 0.0f; + } + /*else if (xform == NormalTransform_DualParaboloid) { + + }*/ + + x = n.x; + y = n.y; + z = n.z; + } +} + +void Surface::reconstructNormals(NormalTransform xform) +{ + if (isNull()) return; + + detach(); + + FloatImage * img = m->image; + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) { + float & x = img->pixel(0, i); + float & y = img->pixel(1, i); + float & z = img->pixel(2, i); + Vector3 n(x, y, z); + + if (xform == NormalTransform_Orthographic) { + n.z = sqrtf(1 - nv::clamp(n.x * n.x + n.y * n.y, 0.0f, 1.0f)); + } + else if (xform == NormalTransform_Stereographic) { + float denom = 2.0f / (1 + nv::clamp(n.x * n.x + n.y * n.y, 0.0f, 1.0f)); + n.x *= denom; + n.y *= denom; + n.z = denom - 1; + } + else if (xform == NormalTransform_Paraboloid) { + n.x = n.x; + n.y = n.y; + n.z = 1.0f - nv::clamp(n.x * n.x + n.y * n.y, 0.0f, 1.0f); + n = normalizeSafe(n, Vector3(0.0f), 0.0f); + } + else if (xform == NormalTransform_Quartic) { + n.x = n.x; + n.y = n.y; + n.z = nv::clamp((1 - n.x * n.x) * (1 - n.y * n.y), 0.0f, 1.0f); + n = normalizeSafe(n, Vector3(0.0f), 0.0f); + } + /*else if (xform == NormalTransform_DualParaboloid) { + + }*/ + + x = n.x; + y = n.y; + z = n.z; + } +} + +void Surface::toCleanNormalMap() +{ + if (isNull()) return; + + detach(); + + const uint count = m->image->pixelCount(); + for (uint i = 0; i < count; i++) { + float x = m->image->pixel(0, i); + float y = m->image->pixel(1, i); + + m->image->pixel(2, i) = x*x + y*y; + } +} + +// [-1,1] -> [ 0,1] +void Surface::packNormals(float scale/*= 0.5f*/, float bias/*= 0.5f*/) { + if (isNull()) return; + detach(); + m->image->scaleBias(0, 3, scale, bias); +} + +// [ 0,1] -> [-1,1] +void Surface::expandNormals(float scale/*= 2.0f*/, float bias/*= - 2.0f * 127.0f / 255.0f*/) { + if (isNull()) return; + detach(); + m->image->scaleBias(0, 3, scale, bias); +} + + +// Create a Toksvig map for this normal map. +// http://blog.selfshadow.com/2011/07/22/specular-showdown/ +// @@ Assumes this is a normal map expanded in the [-1, 1] range. +Surface Surface::createToksvigMap(float power) const +{ + if (isNull()) return Surface(); + + // @@ TODO + + return Surface(); +} + +// @@ Should I add support for LEAN maps? That requires 5 terms, which would have to be encoded in two textures. +// There's nothing stopping us from having 5 channels in a surface, and then, let the user swizzle them as they wish. +// CLEAN maps are probably more practical, though. 
+// http://www.cs.umbc.edu/~olano/papers/lean/ +// http://gaim.umbc.edu/2011/07/24/shiny-and-clean/ +// http://gaim.umbc.edu/2011/07/26/on-error/ +NVTT_API Surface Surface::createCleanMap() const +{ + if (isNull()) return Surface(); + + // @@ TODO + + return Surface(); +} + + +void Surface::flipX() +{ + if (isNull()) return; + + detach(); + + m->image->flipX(); +} + +void Surface::flipY() +{ + if (isNull()) return; + + detach(); + + m->image->flipY(); +} + +void Surface::flipZ() +{ + if (isNull()) return; + + detach(); + + m->image->flipZ(); +} + +Surface Surface::createSubImage(int x0, int x1, int y0, int y1, int z0, int z1) const +{ + Surface s; + + if (isNull()) return s; + if (x0 < 0 || x1 > width() || x0 > x1) return s; + if (y0 < 0 || y1 > height() || y0 > y1) return s; + if (z0 < 0 || z1 > depth() || z0 > z1) return s; + if (x1 >= width() || y1 >= height() || z1 >= depth()) return s; + + FloatImage * img = s.m->image = new FloatImage; + + int w = x1 - x0 + 1; + int h = y1 - y0 + 1; + int d = z1 - z0 + 1; + + img->allocate(4, w, h, d); + + for (int c = 0; c < 4; c++) { + for (int z = 0; z < d; z++) { + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + img->pixel(c, x, y, z) = m->image->pixel(c, x0+x, y0+y, z0+z); + } + } + } + } + + return s; +} + +bool Surface::copyChannel(const Surface & srcImage, int srcChannel) +{ + return copyChannel(srcImage, srcChannel, srcChannel); +} + +bool Surface::copyChannel(const Surface & srcImage, int srcChannel, int dstChannel) +{ + if (srcChannel < 0 || srcChannel > 3 || dstChannel < 0 || dstChannel > 3) return false; + + FloatImage * dst = m->image; + const FloatImage * src = srcImage.m->image; + + if (!sameLayout(dst, src)) { + return false; + } + nvDebugCheck(dst->componentCount() == 4 && src->componentCount() == 4); + + detach(); + + dst = m->image; + + memcpy(dst->channel(dstChannel), src->channel(srcChannel), dst->pixelCount()*sizeof(float)); + + return true; +} + +bool Surface::addChannel(const Surface & srcImage, int srcChannel, int dstChannel, float scale) +{ + if (srcChannel < 0 || srcChannel > 3 || dstChannel < 0 || dstChannel > 3) return false; + + FloatImage * dst = m->image; + const FloatImage * src = srcImage.m->image; + + if (!sameLayout(dst, src)) { + return false; + } + nvDebugCheck(dst->componentCount() == 4 && src->componentCount() == 4); + + detach(); + + dst = m->image; + + float * d = dst->channel(dstChannel); + const float * s = src->channel(srcChannel); + + const uint count = src->pixelCount(); + for (uint i = 0; i < count; i++) { + d[i] += s[i] * scale; + } + + return true; +} + + +bool Surface::copy(const Surface & srcImage, int xsrc, int ysrc, int zsrc, int xsize, int ysize, int zsize, int xdst, int ydst, int zdst) +{ + if (xsrc < 0 || ysrc < 0 || zsrc < 0) return false; + if (xdst < 0 || ydst < 0 || zdst < 0) return false; + + FloatImage * dst = m->image; + const FloatImage * src = srcImage.m->image; + + if (U32(xsrc + xsize) > src->width() || U32(ysrc + ysize) > src->height() || U32(zsrc + zsize) > src->depth()) return false; + if (U32(xdst + xsize) > dst->width() || U32(ydst + ysize) > dst->height() || U32(zdst + zsize) > dst->depth()) return false; + + detach(); + + // For each channel. + for(int i = 0; i < 4; i++) { + float * d = dst->channel(i); + const float * s = src->channel(i); + + // Copy region from src to dst. 
+ for (int z = 0; z < zsize; z++) { + for (int y = 0; y < ysize; y++) { + for (int x = 0; x < xsize; x++) { + d[dst->index(xdst + x, ydst + y, zdst + z)] = s[src->index(xsrc + x, ysrc + y, zsrc + z)]; + } + } + } + } + + return true; +} + + +// Draw colored border around atlas elements. +void Surface::setAtlasBorder(int aw, int ah, float r, float g, float b, float a) +{ + if (isNull()) return; + if (aw <= 0) return; + if (ah <= 0) return; + + detach(); + + FloatImage * img = m->image; + const uint w = img->width(); + const uint h = img->height(); + const uint d = img->depth(); + + // @@ Ideally the reminder of these divisions should be 0. + uint tile_height = h / ah; + uint tile_width = w / aw; + + // Note that this renders two consecutive lines between tiles. In theory we could just have one, but this way I think we have better rotation invariance. + + for (uint z = 0; z < d; z++) + { + // Horizontal lines: + for (uint i = 0, y = 0; i < uint(ah); i++, y += tile_height) + { + for (uint x = 0; x < w; x++) + { + img->pixel(0, x, y, z) = r; + img->pixel(1, x, y, z) = g; + img->pixel(2, x, y, z) = b; + img->pixel(3, x, y, z) = a; + + img->pixel(0, x, y + tile_height - 1, z) = r; + img->pixel(1, x, y + tile_height - 1, z) = g; + img->pixel(2, x, y + tile_height - 1, z) = b; + img->pixel(3, x, y + tile_height - 1, z) = a; + } + } + + // Vertical lines: + for (uint i = 0, x = 0; i < uint(ah); i++, x += tile_width) + { + for (uint y = 0; y < h; y++) + { + img->pixel(0, x, y, z) = r; + img->pixel(1, x, y, z) = g; + img->pixel(2, x, y, z) = b; + img->pixel(3, x, y, z) = a; + + img->pixel(0, x + tile_width - 1, y, z) = r; + img->pixel(1, x + tile_width - 1, y, z) = g; + img->pixel(2, x + tile_width - 1, y, z) = b; + img->pixel(3, x + tile_width - 1, y, z) = a; + } + } + } +} + + + +float nvtt::rmsError(const Surface & reference, const Surface & image) +{ + return nv::rmsColorError(reference.m->image, image.m->image, reference.alphaMode() == nvtt::AlphaMode_Transparency); +} + + +float nvtt::rmsAlphaError(const Surface & reference, const Surface & image) +{ + return nv::rmsAlphaError(reference.m->image, image.m->image); +} + + +float nvtt::cieLabError(const Surface & reference, const Surface & image) +{ + return nv::cieLabError(reference.m->image, image.m->image); +} + +float nvtt::angularError(const Surface & reference, const Surface & image) +{ + //return nv::averageAngularError(reference.m->image, image.m->image); + return nv::rmsAngularError(reference.m->image, image.m->image); +} + + +Surface nvtt::diff(const Surface & reference, const Surface & image, float scale) +{ + const FloatImage * ref = reference.m->image; + const FloatImage * img = image.m->image; + + if (!sameLayout(img, ref)) { + return Surface(); + } + + nvDebugCheck(img->componentCount() == 4); + nvDebugCheck(ref->componentCount() == 4); + + nvtt::Surface diffImage; + FloatImage * diff = diffImage.m->image = new FloatImage; + diff->allocate(4, img->width(), img->height(), img->depth()); + + const uint count = img->pixelCount(); + for (uint i = 0; i < count; i++) + { + float r0 = img->pixel(0, i); + float g0 = img->pixel(1, i); + float b0 = img->pixel(2, i); + //float a0 = img->pixel(3, i); + float r1 = ref->pixel(0, i); + float g1 = ref->pixel(1, i); + float b1 = ref->pixel(2, i); + float a1 = ref->pixel(3, i); + + float dr = r0 - r1; + float dg = g0 - g1; + float db = b0 - b1; + //float da = a0 - a1; + + if (reference.alphaMode() == nvtt::AlphaMode_Transparency) + { + dr *= a1; + dg *= a1; + db *= a1; + } + + diff->pixel(0, i) = dr 
* scale; + diff->pixel(1, i) = dg * scale; + diff->pixel(2, i) = db * scale; + diff->pixel(3, i) = a1; + } + + return diffImage; +} + +float nvtt::rmsToneMappedError(const Surface & reference, const Surface & img, float exposure) +{ + // @@ We could do this in the rms function without having to create image copies. + Surface r = reference; + Surface i = img; + + // @@ Ideally we should use our Reindhart operator. Add Reindhart_L & Reindhart_M ? + + float scale = 1.0f / exposure; + + r.scaleBias(0, scale, 0); r.scaleBias(1, scale, 0); r.scaleBias(2, scale, 0); + r.toneMap(ToneMapper_Reindhart, NULL); + r.toSrgb(); + + i.scaleBias(0, scale, 0); i.scaleBias(1, scale, 0); i.scaleBias(2, scale, 0); + i.toneMap(ToneMapper_Reindhart, NULL); + i.toSrgb(); + + return nv::rmsColorError(r.m->image, i.m->image, reference.alphaMode() == nvtt::AlphaMode_Transparency); +} + + +Surface nvtt::histogram(const Surface & img, int width, int height) +{ + float min_color[3], max_color[3]; + img.range(0, &min_color[0], &max_color[0]); + img.range(1, &min_color[1], &max_color[1]); + img.range(2, &min_color[2], &max_color[2]); + + float minRange = nv::min3(min_color[0], min_color[1], min_color[2]); + float maxRange = nv::max3(max_color[0], max_color[1], max_color[2]); + + if (maxRange > 16) maxRange = 16; + + return histogram(img, /*minRange*/0, maxRange, width, height); +} + +#include "nvcore/Array.inl" +#include "nvmath/PackedFloat.h" +#include <stdio.h> + +nvtt::Surface nvtt::histogram(const Surface & img, float minRange, float maxRange, int width, int height) +{ + nv::Array<Vector3> buckets; + buckets.resize(width, Vector3(0)); + + int w = img.width(); + int h = img.height(); + int d = img.depth(); + + const float * r = img.channel(0); + const float * g = img.channel(1); + const float * b = img.channel(2); + const float * a = img.channel(3); + +#if 0 + for (int z = 0; z < d; z++) + for (int y = 0; y < h; y++) + for (int x = 0; x < w; x++) + { + int i = x + y * w + z * w * d; + + float fr = (r[i] - minRange) / (maxRange - minRange); + float fg = (g[i] - minRange) / (maxRange - minRange); + float fb = (b[i] - minRange) / (maxRange - minRange); + + int R = ftoi_round(fr * (width - 1)); + int G = ftoi_round(fg * (width - 1)); + int B = ftoi_round(fb * (width - 1)); + + R = nv::clamp(R, 0, width-1); + G = nv::clamp(G, 0, width-1); + B = nv::clamp(B, 0, width-1); + + // Alpha weighted histogram? + float A = nv::saturate(a[i]); + + buckets[R].x += A; + buckets[G].y += A; + buckets[B].z += A; + } + +#elif 1 + + float exposure = 0.22f; + + //int E = 8, M = 23; // float + int E = 5, M = 10; // half + //int E = 5, M = 9; // rgb9e5 + //int E = 5, M = 6; // r11g11b10 + + for (int e = 0; e < (1 << E); e++) + { + /*if (e == 0x1f) { // Skip NaN and inf. + continue; + }*/ + if (e == 0) { // Skip denormals. + continue; + } + + for (int m = 0; m < (1 << M); m++) + { + Float754 F; + F.field.negative = 0; + F.field.biasedexponent = e + 128 - (1 << (E - 1)) - 1; // E=5 -> 128 - 15 + F.field.mantissa = m << (23 - M); + + // value = (1 + mantissa) * 2^(e-15) + + // @@ Handle denormals. + + float fc = F.value; + + // Tone mapping: + fc /= exposure; + //fc /= (fc + 1); // Reindhart tone mapping. + fc = 1 - exp2f(-fc); // Halo2 tone mapping.
+ + // Gamma space conversion: + //fc = sqrtf(fc); + fc = powf(fc, 1.0f/2.2f); + //fc = toSrgb(fc); + + //fc = (fc - 0.5f) * 8; // zoom in + //if (fc < 0 || fc > 1) continue; + + //printf("%f\n", fc); + + int c = ftoi_round(fc * (width - 1) / 1); + c = clamp(c, 0, width - 1); + + buckets[c] += Vector3(1); + } + } + +#else + + float exposure = 0.22f; + + int R = 8, M = 8; + //int R = 6, M = 8; + //int R = 9, M = 5; + + float threshold = 1.0f / (1 << M); + //float threshold = 0.25f; + + for (int r = 0; r < (1 << R); r++) + { + float fr = float(r) / ((1 << R) - 1); + + for (int m = 0; m < (1 << M); m++) + { + float fm = float(m) / ((1 << M) - 1); + float M = fm * (1 - threshold) + threshold; + + float fc = fr * M; + + fc /= exposure; + + //fc /= (fc + 1); // Reindhart tone mapping. + fc = 1 - exp2f(-fc); // Halo2 tone mapping. + + // Gamma space conversion: + //fc = sqrtf(fc); + fc = powf(fc, 1.0f/2.2f); + //fc = toSrgb(fc); + + //fc = (fc - 0.5f) * 8; // zoom in + //if (fc < 0 || fc > 1) continue; + + int c = ftoi_round(fc * (width - 1)); + c = clamp(c, 0, width - 1); + + buckets[c] += Vector3(1); + } + } + + //buckets[0] = Vector3(1); // Hack, for prettier histograms. + +#endif + + + // Compute largest height. + float maxh = 0; + for (int i = 0; i < width; i++) { + maxh = nv::max(maxh, nv::max3(buckets[i].x, buckets[i].y, buckets[i].z)); + } + + printf("maxh = %f\n", maxh); + //maxh = 80; + maxh = 256; + + // Draw histogram. + nvtt::Surface hist; + hist.setImage(width, height, 1); + + for (int y = 0; y < height; y++) { + float fy = 1.0f - float(y) / (height - 1); + for (int x = 0; x < width; x++) { + hist.m->image->pixel(0, x, y, /*z=*/0) = fy < (buckets[x].x / maxh); + hist.m->image->pixel(1, x, y, /*z=*/0) = fy < (buckets[x].y / maxh); + hist.m->image->pixel(2, x, y, /*z=*/0) = fy < (buckets[x].z / maxh); + } + } + + return hist; +} Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/TaskDispatcher.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/TaskDispatcher.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/TaskDispatcher.h @@ -0,0 +1,149 @@ + +#include "nvtt.h" + +// OpenMP +// http://en.wikipedia.org/wiki/OpenMP +#if defined(HAVE_OPENMP) +#include <omp.h> +#endif + +// Grand Central Dispatch (GCD/libdispatch) +// http://developer.apple.com/mac/library/documentation/Performance/Reference/GCD_libdispatch_Ref/Reference/reference.html +#if NV_OS_DARWIN && defined(HAVE_DISPATCH_H) +#define HAVE_GCD 1 +#include <dispatch/dispatch.h> +#endif + +// Parallel Patterns Library (PPL) is part of Microsoft's concurrency runtime: +// http://msdn.microsoft.com/en-us/library/dd504870.aspx +#if NV_OS_WIN32 && _MSC_VER >= 1600 +#define HAVE_PPL 1 +#include <ppl.h> +#endif + +// Intel Thread Building Blocks (TBB). +// http://www.threadingbuildingblocks.org/ +#if defined(HAVE_TBB) +#include <tbb/tbb.h> +#endif + +#include "nvthread/ParallelFor.h" + + +namespace nvtt { + + struct SequentialTaskDispatcher : public TaskDispatcher + { + virtual void dispatch(Task * task, void * context, int count) { + for (int i = 0; i < count; i++) { + task(context, i); + } + } + }; + + struct ParallelTaskDispatcher : public TaskDispatcher + { + virtual void dispatch(Task * task, void * context, int count) { + nv::ParallelFor parallelFor(task, context); + parallelFor.run(count); // @@ Add support for custom grain.
+ } + }; + + +#if defined(HAVE_OPENMP) + + struct OpenMPTaskDispatcher : public TaskDispatcher + { + virtual void dispatch(Task * task, void * context, int count) { + #pragma omp parallel for + for (int i = 0; i < count; i++) { + task(context, i); + } + } + }; + +#endif + +#if NV_OS_DARWIN && defined(HAVE_DISPATCH_H) + + // Task dispatcher using Apple's Grand Central Dispatch. + struct AppleTaskDispatcher : public TaskDispatcher + { + // @@ This is really lame, but I refuse to use size_t in the public API. + struct BlockContext { + Task * task; + void * context; + }; + + static void block(void * context, size_t id) { + BlockContext * ctx = (BlockContext *)context; + ctx->task(ctx->context, int(id)); + } + + virtual void dispatch(Task * task, void * context, int count) { + dispatch_queue_t q = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0); + BlockContext blockCtx = { task, context }; + dispatch_apply_f(count, q, &blockCtx, block); + } + }; + +#endif + +#if defined(HAVE_PPL) + + struct TaskFunctor { + TaskFunctor(Task * task, void * context) : task(task), context(context) {} + void operator()(int n) const { + task(context, n); + } + Task * task; + void * context; + }; + + // Task dispatcher using Microsoft's concurrency runtime. + struct MicrosoftTaskDispatcher : public TaskDispatcher + { + virtual void dispatch(Task * task, void * context, int count) + { + TaskFunctor func(task, context); + Concurrency::parallel_for(0, count, func); + } + }; + +#endif + +#if defined(HAVE_TBB) + + struct TaskFunctor { + TaskFunctor(Task * task, void * context) : task(task), context(context) {} + void operator()(int & n) const { + task(context, n); + } + Task * task; + void * context; + }; + + // Task dispatcher using Intel's Thread Building Blocks. + struct IntelTaskDispatcher : public TaskDispatcher + { + virtual void dispatch(Task * task, void * context, int count) { + parallel_for(blocked_range<int>(0, count, 1), TaskFunctor(task, context)); + } + }; + +#endif + +#if defined(HAVE_OPENMP) + typedef OpenMPTaskDispatcher ConcurrentTaskDispatcher; +#elif defined(HAVE_TBB) + typedef IntelTaskDispatcher ConcurrentTaskDispatcher; +#elif defined(HAVE_PPL) + typedef MicrosoftTaskDispatcher ConcurrentTaskDispatcher; +#elif defined(HAVE_GCD) + typedef AppleTaskDispatcher ConcurrentTaskDispatcher; +#else + //typedef SequentialTaskDispatcher ConcurrentTaskDispatcher; + typedef ParallelTaskDispatcher ConcurrentTaskDispatcher; +#endif + +} // namespace nvtt Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/BitmapTable.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/BitmapTable.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/BitmapTable.h @@ -0,0 +1,1886 @@ + + +/* +static void doPrecomputation() +{ + uint bitmaps[1024]; + + int indices[16]; + int num = 0; + + // Compute bitmaps with 3 clusters: + + // first cluster [0,i) is at the start + for( int m = 0; m < 16; ++m ) + { + indices[m] = 0; + } + const int imax = 15; + for( int i = imax; i >= 0; --i ) + { + // second cluster [i,j) is half along + for( int m = i; m < 16; ++m ) + { + indices[m] = 2; + } + const int jmax = ( i == 0 ) ? 15 : 16; + for( int j = jmax; j >= i; --j ) + { + // last cluster [j,k) is at the end + if( j < 16 ) + { + indices[j] = 1; + } + + uint bitmap = 0; + + for(int p = 0; p < 16; p++) { + bitmap |= indices[p] << (p * 2); + } + + bitmaps[num] = bitmap; + + num++; + } + } + nvDebugCheck(num == 151); + + // Align to 160.
+ for(int i = 0; i < 9; i++) + { + bitmaps[num] = 0x555AA000; + num++; + } + nvDebugCheck(num == 160); + + // Append bitmaps with 4 clusters: + + // first cluster [0,i) is at the start + for( int m = 0; m < 16; ++m ) + { + indices[m] = 0; + } + for( int i = imax; i >= 0; --i ) + { + // second cluster [i,j) is one third along + for( int m = i; m < 16; ++m ) + { + indices[m] = 2; + } + const int jmax = ( i == 0 ) ? 15 : 16; + for( int j = jmax; j >= i; --j ) + { + // third cluster [j,k) is two thirds along + for( int m = j; m < 16; ++m ) + { + indices[m] = 3; + } + + int kmax = ( j == 0 ) ? 15 : 16; + for( int k = kmax; k >= j; --k ) + { + // last cluster [k,n) is at the end + if( k < 16 ) + { + indices[k] = 1; + } + + uint bitmap = 0; + + bool hasThree = false; + for(int p = 0; p < 16; p++) { + bitmap |= indices[p] << (p * 2); + + if (indices[p] == 3) hasThree = true; + } + + if (hasThree) { + bitmaps[num] = bitmap; + num++; + } + } + } + } + nvDebugCheck(num == 975); + + // Align to 1024. + for(int i = 0; i < 49; i++) + { + bitmaps[num] = 0x555AA000; + num++; + } + + nvDebugCheck(num == 1024); + + printf("uint bitmaps[992] =\n{\n"); + for (int i = 0; i < 992; i++) + { + printf("\t0x%.8X,\n", bitmaps[i]); + } + printf("};\n"); +} +*/ + + +const static uint s_bitmapTable[992] = +{ + 0x80000000, + 0x40000000, + 0xA0000000, + 0x60000000, + 0x50000000, + 0xA8000000, + 0x68000000, + 0x58000000, + 0x54000000, + 0xAA000000, + 0x6A000000, + 0x5A000000, + 0x56000000, + 0x55000000, + 0xAA800000, + 0x6A800000, + 0x5A800000, + 0x56800000, + 0x55800000, + 0x55400000, + 0xAAA00000, + 0x6AA00000, + 0x5AA00000, + 0x56A00000, + 0x55A00000, + 0x55600000, + 0x55500000, + 0xAAA80000, + 0x6AA80000, + 0x5AA80000, + 0x56A80000, + 0x55A80000, + 0x55680000, + 0x55580000, + 0x55540000, + 0xAAAA0000, + 0x6AAA0000, + 0x5AAA0000, + 0x56AA0000, + 0x55AA0000, + 0x556A0000, + 0x555A0000, + 0x55560000, + 0x55550000, + 0xAAAA8000, + 0x6AAA8000, + 0x5AAA8000, + 0x56AA8000, + 0x55AA8000, + 0x556A8000, + 0x555A8000, + 0x55568000, + 0x55558000, + 0x55554000, + 0xAAAAA000, + 0x6AAAA000, + 0x5AAAA000, + 0x56AAA000, + 0x55AAA000, + 0x556AA000, + 0x555AA000, + 0x5556A000, + 0x5555A000, + 0x55556000, + 0x55555000, + 0xAAAAA800, + 0x6AAAA800, + 0x5AAAA800, + 0x56AAA800, + 0x55AAA800, + 0x556AA800, + 0x555AA800, + 0x5556A800, + 0x5555A800, + 0x55556800, + 0x55555800, + 0x55555400, + 0xAAAAAA00, + 0x6AAAAA00, + 0x5AAAAA00, + 0x56AAAA00, + 0x55AAAA00, + 0x556AAA00, + 0x555AAA00, + 0x5556AA00, + 0x5555AA00, + 0x55556A00, + 0x55555A00, + 0x55555600, + 0x55555500, + 0xAAAAAA80, + 0x6AAAAA80, + 0x5AAAAA80, + 0x56AAAA80, + 0x55AAAA80, + 0x556AAA80, + 0x555AAA80, + 0x5556AA80, + 0x5555AA80, + 0x55556A80, + 0x55555A80, + 0x55555680, + 0x55555580, + 0x55555540, + 0xAAAAAAA0, + 0x6AAAAAA0, + 0x5AAAAAA0, + 0x56AAAAA0, + 0x55AAAAA0, + 0x556AAAA0, + 0x555AAAA0, + 0x5556AAA0, + 0x5555AAA0, + 0x55556AA0, + 0x55555AA0, + 0x555556A0, + 0x555555A0, + 0x55555560, + 0x55555550, + 0xAAAAAAA8, + 0x6AAAAAA8, + 0x5AAAAAA8, + 0x56AAAAA8, + 0x55AAAAA8, + 0x556AAAA8, + 0x555AAAA8, + 0x5556AAA8, + 0x5555AAA8, + 0x55556AA8, + 0x55555AA8, + 0x555556A8, + 0x555555A8, + 0x55555568, + 0x55555558, + 0x55555554, + 0x6AAAAAAA, + 0x5AAAAAAA, + 0x56AAAAAA, + 0x55AAAAAA, + 0x556AAAAA, + 0x555AAAAA, + 0x5556AAAA, + 0x5555AAAA, + 0x55556AAA, + 0x55555AAA, + 0x555556AA, + 0x555555AA, + 0x5555556A, + 0x5555555A, + 0x55555556, + 0x55555555, + 0x55555555, + 0x55555555, + 0x55555555, + 0x55555555, + 0x55555555, + 0x55555555, + 0x55555555, + 0x55555555, + 0x55555555, + 0xC0000000, 
+ 0xE0000000, + 0xF0000000, + 0x70000000, + 0xE8000000, + 0xF8000000, + 0x78000000, + 0xFC000000, + 0x7C000000, + 0x5C000000, + 0xEA000000, + 0xFA000000, + 0x7A000000, + 0xFE000000, + 0x7E000000, + 0x5E000000, + 0xFF000000, + 0x7F000000, + 0x5F000000, + 0x57000000, + 0xEA800000, + 0xFA800000, + 0x7A800000, + 0xFE800000, + 0x7E800000, + 0x5E800000, + 0xFF800000, + 0x7F800000, + 0x5F800000, + 0x57800000, + 0xFFC00000, + 0x7FC00000, + 0x5FC00000, + 0x57C00000, + 0x55C00000, + 0xEAA00000, + 0xFAA00000, + 0x7AA00000, + 0xFEA00000, + 0x7EA00000, + 0x5EA00000, + 0xFFA00000, + 0x7FA00000, + 0x5FA00000, + 0x57A00000, + 0xFFE00000, + 0x7FE00000, + 0x5FE00000, + 0x57E00000, + 0x55E00000, + 0xFFF00000, + 0x7FF00000, + 0x5FF00000, + 0x57F00000, + 0x55F00000, + 0x55700000, + 0xEAA80000, + 0xFAA80000, + 0x7AA80000, + 0xFEA80000, + 0x7EA80000, + 0x5EA80000, + 0xFFA80000, + 0x7FA80000, + 0x5FA80000, + 0x57A80000, + 0xFFE80000, + 0x7FE80000, + 0x5FE80000, + 0x57E80000, + 0x55E80000, + 0xFFF80000, + 0x7FF80000, + 0x5FF80000, + 0x57F80000, + 0x55F80000, + 0x55780000, + 0xFFFC0000, + 0x7FFC0000, + 0x5FFC0000, + 0x57FC0000, + 0x55FC0000, + 0x557C0000, + 0x555C0000, + 0xEAAA0000, + 0xFAAA0000, + 0x7AAA0000, + 0xFEAA0000, + 0x7EAA0000, + 0x5EAA0000, + 0xFFAA0000, + 0x7FAA0000, + 0x5FAA0000, + 0x57AA0000, + 0xFFEA0000, + 0x7FEA0000, + 0x5FEA0000, + 0x57EA0000, + 0x55EA0000, + 0xFFFA0000, + 0x7FFA0000, + 0x5FFA0000, + 0x57FA0000, + 0x55FA0000, + 0x557A0000, + 0xFFFE0000, + 0x7FFE0000, + 0x5FFE0000, + 0x57FE0000, + 0x55FE0000, + 0x557E0000, + 0x555E0000, + 0xFFFF0000, + 0x7FFF0000, + 0x5FFF0000, + 0x57FF0000, + 0x55FF0000, + 0x557F0000, + 0x555F0000, + 0x55570000, + 0xEAAA8000, + 0xFAAA8000, + 0x7AAA8000, + 0xFEAA8000, + 0x7EAA8000, + 0x5EAA8000, + 0xFFAA8000, + 0x7FAA8000, + 0x5FAA8000, + 0x57AA8000, + 0xFFEA8000, + 0x7FEA8000, + 0x5FEA8000, + 0x57EA8000, + 0x55EA8000, + 0xFFFA8000, + 0x7FFA8000, + 0x5FFA8000, + 0x57FA8000, + 0x55FA8000, + 0x557A8000, + 0xFFFE8000, + 0x7FFE8000, + 0x5FFE8000, + 0x57FE8000, + 0x55FE8000, + 0x557E8000, + 0x555E8000, + 0xFFFF8000, + 0x7FFF8000, + 0x5FFF8000, + 0x57FF8000, + 0x55FF8000, + 0x557F8000, + 0x555F8000, + 0x55578000, + 0xFFFFC000, + 0x7FFFC000, + 0x5FFFC000, + 0x57FFC000, + 0x55FFC000, + 0x557FC000, + 0x555FC000, + 0x5557C000, + 0x5555C000, + 0xEAAAA000, + 0xFAAAA000, + 0x7AAAA000, + 0xFEAAA000, + 0x7EAAA000, + 0x5EAAA000, + 0xFFAAA000, + 0x7FAAA000, + 0x5FAAA000, + 0x57AAA000, + 0xFFEAA000, + 0x7FEAA000, + 0x5FEAA000, + 0x57EAA000, + 0x55EAA000, + 0xFFFAA000, + 0x7FFAA000, + 0x5FFAA000, + 0x57FAA000, + 0x55FAA000, + 0x557AA000, + 0xFFFEA000, + 0x7FFEA000, + 0x5FFEA000, + 0x57FEA000, + 0x55FEA000, + 0x557EA000, + 0x555EA000, + 0xFFFFA000, + 0x7FFFA000, + 0x5FFFA000, + 0x57FFA000, + 0x55FFA000, + 0x557FA000, + 0x555FA000, + 0x5557A000, + 0xFFFFE000, + 0x7FFFE000, + 0x5FFFE000, + 0x57FFE000, + 0x55FFE000, + 0x557FE000, + 0x555FE000, + 0x5557E000, + 0x5555E000, + 0xFFFFF000, + 0x7FFFF000, + 0x5FFFF000, + 0x57FFF000, + 0x55FFF000, + 0x557FF000, + 0x555FF000, + 0x5557F000, + 0x5555F000, + 0x55557000, + 0xEAAAA800, + 0xFAAAA800, + 0x7AAAA800, + 0xFEAAA800, + 0x7EAAA800, + 0x5EAAA800, + 0xFFAAA800, + 0x7FAAA800, + 0x5FAAA800, + 0x57AAA800, + 0xFFEAA800, + 0x7FEAA800, + 0x5FEAA800, + 0x57EAA800, + 0x55EAA800, + 0xFFFAA800, + 0x7FFAA800, + 0x5FFAA800, + 0x57FAA800, + 0x55FAA800, + 0x557AA800, + 0xFFFEA800, + 0x7FFEA800, + 0x5FFEA800, + 0x57FEA800, + 0x55FEA800, + 0x557EA800, + 0x555EA800, + 0xFFFFA800, + 0x7FFFA800, + 0x5FFFA800, + 0x57FFA800, + 0x55FFA800, + 0x557FA800, + 
0x555FA800, + 0x5557A800, + 0xFFFFE800, + 0x7FFFE800, + 0x5FFFE800, + 0x57FFE800, + 0x55FFE800, + 0x557FE800, + 0x555FE800, + 0x5557E800, + 0x5555E800, + 0xFFFFF800, + 0x7FFFF800, + 0x5FFFF800, + 0x57FFF800, + 0x55FFF800, + 0x557FF800, + 0x555FF800, + 0x5557F800, + 0x5555F800, + 0x55557800, + 0xFFFFFC00, + 0x7FFFFC00, + 0x5FFFFC00, + 0x57FFFC00, + 0x55FFFC00, + 0x557FFC00, + 0x555FFC00, + 0x5557FC00, + 0x5555FC00, + 0x55557C00, + 0x55555C00, + 0xEAAAAA00, + 0xFAAAAA00, + 0x7AAAAA00, + 0xFEAAAA00, + 0x7EAAAA00, + 0x5EAAAA00, + 0xFFAAAA00, + 0x7FAAAA00, + 0x5FAAAA00, + 0x57AAAA00, + 0xFFEAAA00, + 0x7FEAAA00, + 0x5FEAAA00, + 0x57EAAA00, + 0x55EAAA00, + 0xFFFAAA00, + 0x7FFAAA00, + 0x5FFAAA00, + 0x57FAAA00, + 0x55FAAA00, + 0x557AAA00, + 0xFFFEAA00, + 0x7FFEAA00, + 0x5FFEAA00, + 0x57FEAA00, + 0x55FEAA00, + 0x557EAA00, + 0x555EAA00, + 0xFFFFAA00, + 0x7FFFAA00, + 0x5FFFAA00, + 0x57FFAA00, + 0x55FFAA00, + 0x557FAA00, + 0x555FAA00, + 0x5557AA00, + 0xFFFFEA00, + 0x7FFFEA00, + 0x5FFFEA00, + 0x57FFEA00, + 0x55FFEA00, + 0x557FEA00, + 0x555FEA00, + 0x5557EA00, + 0x5555EA00, + 0xFFFFFA00, + 0x7FFFFA00, + 0x5FFFFA00, + 0x57FFFA00, + 0x55FFFA00, + 0x557FFA00, + 0x555FFA00, + 0x5557FA00, + 0x5555FA00, + 0x55557A00, + 0xFFFFFE00, + 0x7FFFFE00, + 0x5FFFFE00, + 0x57FFFE00, + 0x55FFFE00, + 0x557FFE00, + 0x555FFE00, + 0x5557FE00, + 0x5555FE00, + 0x55557E00, + 0x55555E00, + 0xFFFFFF00, + 0x7FFFFF00, + 0x5FFFFF00, + 0x57FFFF00, + 0x55FFFF00, + 0x557FFF00, + 0x555FFF00, + 0x5557FF00, + 0x5555FF00, + 0x55557F00, + 0x55555F00, + 0x55555700, + 0xEAAAAA80, + 0xFAAAAA80, + 0x7AAAAA80, + 0xFEAAAA80, + 0x7EAAAA80, + 0x5EAAAA80, + 0xFFAAAA80, + 0x7FAAAA80, + 0x5FAAAA80, + 0x57AAAA80, + 0xFFEAAA80, + 0x7FEAAA80, + 0x5FEAAA80, + 0x57EAAA80, + 0x55EAAA80, + 0xFFFAAA80, + 0x7FFAAA80, + 0x5FFAAA80, + 0x57FAAA80, + 0x55FAAA80, + 0x557AAA80, + 0xFFFEAA80, + 0x7FFEAA80, + 0x5FFEAA80, + 0x57FEAA80, + 0x55FEAA80, + 0x557EAA80, + 0x555EAA80, + 0xFFFFAA80, + 0x7FFFAA80, + 0x5FFFAA80, + 0x57FFAA80, + 0x55FFAA80, + 0x557FAA80, + 0x555FAA80, + 0x5557AA80, + 0xFFFFEA80, + 0x7FFFEA80, + 0x5FFFEA80, + 0x57FFEA80, + 0x55FFEA80, + 0x557FEA80, + 0x555FEA80, + 0x5557EA80, + 0x5555EA80, + 0xFFFFFA80, + 0x7FFFFA80, + 0x5FFFFA80, + 0x57FFFA80, + 0x55FFFA80, + 0x557FFA80, + 0x555FFA80, + 0x5557FA80, + 0x5555FA80, + 0x55557A80, + 0xFFFFFE80, + 0x7FFFFE80, + 0x5FFFFE80, + 0x57FFFE80, + 0x55FFFE80, + 0x557FFE80, + 0x555FFE80, + 0x5557FE80, + 0x5555FE80, + 0x55557E80, + 0x55555E80, + 0xFFFFFF80, + 0x7FFFFF80, + 0x5FFFFF80, + 0x57FFFF80, + 0x55FFFF80, + 0x557FFF80, + 0x555FFF80, + 0x5557FF80, + 0x5555FF80, + 0x55557F80, + 0x55555F80, + 0x55555780, + 0xFFFFFFC0, + 0x7FFFFFC0, + 0x5FFFFFC0, + 0x57FFFFC0, + 0x55FFFFC0, + 0x557FFFC0, + 0x555FFFC0, + 0x5557FFC0, + 0x5555FFC0, + 0x55557FC0, + 0x55555FC0, + 0x555557C0, + 0x555555C0, + 0xEAAAAAA0, + 0xFAAAAAA0, + 0x7AAAAAA0, + 0xFEAAAAA0, + 0x7EAAAAA0, + 0x5EAAAAA0, + 0xFFAAAAA0, + 0x7FAAAAA0, + 0x5FAAAAA0, + 0x57AAAAA0, + 0xFFEAAAA0, + 0x7FEAAAA0, + 0x5FEAAAA0, + 0x57EAAAA0, + 0x55EAAAA0, + 0xFFFAAAA0, + 0x7FFAAAA0, + 0x5FFAAAA0, + 0x57FAAAA0, + 0x55FAAAA0, + 0x557AAAA0, + 0xFFFEAAA0, + 0x7FFEAAA0, + 0x5FFEAAA0, + 0x57FEAAA0, + 0x55FEAAA0, + 0x557EAAA0, + 0x555EAAA0, + 0xFFFFAAA0, + 0x7FFFAAA0, + 0x5FFFAAA0, + 0x57FFAAA0, + 0x55FFAAA0, + 0x557FAAA0, + 0x555FAAA0, + 0x5557AAA0, + 0xFFFFEAA0, + 0x7FFFEAA0, + 0x5FFFEAA0, + 0x57FFEAA0, + 0x55FFEAA0, + 0x557FEAA0, + 0x555FEAA0, + 0x5557EAA0, + 0x5555EAA0, + 0xFFFFFAA0, + 0x7FFFFAA0, + 0x5FFFFAA0, + 0x57FFFAA0, + 0x55FFFAA0, + 0x557FFAA0, + 0x555FFAA0, + 0x5557FAA0, 
+ 0x5555FAA0, + 0x55557AA0, + 0xFFFFFEA0, + 0x7FFFFEA0, + 0x5FFFFEA0, + 0x57FFFEA0, + 0x55FFFEA0, + 0x557FFEA0, + 0x555FFEA0, + 0x5557FEA0, + 0x5555FEA0, + 0x55557EA0, + 0x55555EA0, + 0xFFFFFFA0, + 0x7FFFFFA0, + 0x5FFFFFA0, + 0x57FFFFA0, + 0x55FFFFA0, + 0x557FFFA0, + 0x555FFFA0, + 0x5557FFA0, + 0x5555FFA0, + 0x55557FA0, + 0x55555FA0, + 0x555557A0, + 0xFFFFFFE0, + 0x7FFFFFE0, + 0x5FFFFFE0, + 0x57FFFFE0, + 0x55FFFFE0, + 0x557FFFE0, + 0x555FFFE0, + 0x5557FFE0, + 0x5555FFE0, + 0x55557FE0, + 0x55555FE0, + 0x555557E0, + 0x555555E0, + 0xFFFFFFF0, + 0x7FFFFFF0, + 0x5FFFFFF0, + 0x57FFFFF0, + 0x55FFFFF0, + 0x557FFFF0, + 0x555FFFF0, + 0x5557FFF0, + 0x5555FFF0, + 0x55557FF0, + 0x55555FF0, + 0x555557F0, + 0x555555F0, + 0x55555570, + 0xEAAAAAA8, + 0xFAAAAAA8, + 0x7AAAAAA8, + 0xFEAAAAA8, + 0x7EAAAAA8, + 0x5EAAAAA8, + 0xFFAAAAA8, + 0x7FAAAAA8, + 0x5FAAAAA8, + 0x57AAAAA8, + 0xFFEAAAA8, + 0x7FEAAAA8, + 0x5FEAAAA8, + 0x57EAAAA8, + 0x55EAAAA8, + 0xFFFAAAA8, + 0x7FFAAAA8, + 0x5FFAAAA8, + 0x57FAAAA8, + 0x55FAAAA8, + 0x557AAAA8, + 0xFFFEAAA8, + 0x7FFEAAA8, + 0x5FFEAAA8, + 0x57FEAAA8, + 0x55FEAAA8, + 0x557EAAA8, + 0x555EAAA8, + 0xFFFFAAA8, + 0x7FFFAAA8, + 0x5FFFAAA8, + 0x57FFAAA8, + 0x55FFAAA8, + 0x557FAAA8, + 0x555FAAA8, + 0x5557AAA8, + 0xFFFFEAA8, + 0x7FFFEAA8, + 0x5FFFEAA8, + 0x57FFEAA8, + 0x55FFEAA8, + 0x557FEAA8, + 0x555FEAA8, + 0x5557EAA8, + 0x5555EAA8, + 0xFFFFFAA8, + 0x7FFFFAA8, + 0x5FFFFAA8, + 0x57FFFAA8, + 0x55FFFAA8, + 0x557FFAA8, + 0x555FFAA8, + 0x5557FAA8, + 0x5555FAA8, + 0x55557AA8, + 0xFFFFFEA8, + 0x7FFFFEA8, + 0x5FFFFEA8, + 0x57FFFEA8, + 0x55FFFEA8, + 0x557FFEA8, + 0x555FFEA8, + 0x5557FEA8, + 0x5555FEA8, + 0x55557EA8, + 0x55555EA8, + 0xFFFFFFA8, + 0x7FFFFFA8, + 0x5FFFFFA8, + 0x57FFFFA8, + 0x55FFFFA8, + 0x557FFFA8, + 0x555FFFA8, + 0x5557FFA8, + 0x5555FFA8, + 0x55557FA8, + 0x55555FA8, + 0x555557A8, + 0xFFFFFFE8, + 0x7FFFFFE8, + 0x5FFFFFE8, + 0x57FFFFE8, + 0x55FFFFE8, + 0x557FFFE8, + 0x555FFFE8, + 0x5557FFE8, + 0x5555FFE8, + 0x55557FE8, + 0x55555FE8, + 0x555557E8, + 0x555555E8, + 0xFFFFFFF8, + 0x7FFFFFF8, + 0x5FFFFFF8, + 0x57FFFFF8, + 0x55FFFFF8, + 0x557FFFF8, + 0x555FFFF8, + 0x5557FFF8, + 0x5555FFF8, + 0x55557FF8, + 0x55555FF8, + 0x555557F8, + 0x555555F8, + 0x55555578, + 0xFFFFFFFC, + 0x7FFFFFFC, + 0x5FFFFFFC, + 0x57FFFFFC, + 0x55FFFFFC, + 0x557FFFFC, + 0x555FFFFC, + 0x5557FFFC, + 0x5555FFFC, + 0x55557FFC, + 0x55555FFC, + 0x555557FC, + 0x555555FC, + 0x5555557C, + 0x5555555C, + 0xEAAAAAAA, + 0xFAAAAAAA, + 0x7AAAAAAA, + 0xFEAAAAAA, + 0x7EAAAAAA, + 0x5EAAAAAA, + 0xFFAAAAAA, + 0x7FAAAAAA, + 0x5FAAAAAA, + 0x57AAAAAA, + 0xFFEAAAAA, + 0x7FEAAAAA, + 0x5FEAAAAA, + 0x57EAAAAA, + 0x55EAAAAA, + 0xFFFAAAAA, + 0x7FFAAAAA, + 0x5FFAAAAA, + 0x57FAAAAA, + 0x55FAAAAA, + 0x557AAAAA, + 0xFFFEAAAA, + 0x7FFEAAAA, + 0x5FFEAAAA, + 0x57FEAAAA, + 0x55FEAAAA, + 0x557EAAAA, + 0x555EAAAA, + 0xFFFFAAAA, + 0x7FFFAAAA, + 0x5FFFAAAA, + 0x57FFAAAA, + 0x55FFAAAA, + 0x557FAAAA, + 0x555FAAAA, + 0x5557AAAA, + 0xFFFFEAAA, + 0x7FFFEAAA, + 0x5FFFEAAA, + 0x57FFEAAA, + 0x55FFEAAA, + 0x557FEAAA, + 0x555FEAAA, + 0x5557EAAA, + 0x5555EAAA, + 0xFFFFFAAA, + 0x7FFFFAAA, + 0x5FFFFAAA, + 0x57FFFAAA, + 0x55FFFAAA, + 0x557FFAAA, + 0x555FFAAA, + 0x5557FAAA, + 0x5555FAAA, + 0x55557AAA, + 0xFFFFFEAA, + 0x7FFFFEAA, + 0x5FFFFEAA, + 0x57FFFEAA, + 0x55FFFEAA, + 0x557FFEAA, + 0x555FFEAA, + 0x5557FEAA, + 0x5555FEAA, + 0x55557EAA, + 0x55555EAA, + 0xFFFFFFAA, + 0x7FFFFFAA, + 0x5FFFFFAA, + 0x57FFFFAA, + 0x55FFFFAA, + 0x557FFFAA, + 0x555FFFAA, + 0x5557FFAA, + 0x5555FFAA, + 0x55557FAA, + 0x55555FAA, + 0x555557AA, + 0xFFFFFFEA, + 0x7FFFFFEA, + 0x5FFFFFEA, + 
0x57FFFFEA, + 0x55FFFFEA, + 0x557FFFEA, + 0x555FFFEA, + 0x5557FFEA, + 0x5555FFEA, + 0x55557FEA, + 0x55555FEA, + 0x555557EA, + 0x555555EA, + 0xFFFFFFFA, + 0x7FFFFFFA, + 0x5FFFFFFA, + 0x57FFFFFA, + 0x55FFFFFA, + 0x557FFFFA, + 0x555FFFFA, + 0x5557FFFA, + 0x5555FFFA, + 0x55557FFA, + 0x55555FFA, + 0x555557FA, + 0x555555FA, + 0x5555557A, + 0xFFFFFFFE, + 0x7FFFFFFE, + 0x5FFFFFFE, + 0x57FFFFFE, + 0x55FFFFFE, + 0x557FFFFE, + 0x555FFFFE, + 0x5557FFFE, + 0x5555FFFE, + 0x55557FFE, + 0x55555FFE, + 0x555557FE, + 0x555555FE, + 0x5555557E, + 0x5555555E, + 0x7FFFFFFF, + 0x5FFFFFFF, + 0x57FFFFFF, + 0x55FFFFFF, + 0x557FFFFF, + 0x555FFFFF, + 0x5557FFFF, + 0x5555FFFF, + 0x55557FFF, + 0x55555FFF, + 0x555557FF, + 0x555555FF, + 0x5555557F, + 0x5555555F, + 0x55555557, + 0x55555557, + 0x55555557, + 0x55555557, + 0x55555557, + 0x55555557, + 0x55555557, + 0x55555557, + 0x55555557, + 0x55555557, + 0x55555557, + 0x55555557, + 0x55555557, + 0x55555557, + 0x55555557, + 0x55555557, + 0x55555557, + 0x55555557, +}; + + +/* +void precomp() +{ + unsigned int bitmaps[1024]; + + int num = 0; + + printf("const static uint s_bitmapTableCTX[704] =\n{\n"); + + for (int a = 1; a <= 15; a++) + { + for (int b = a; b <= 15; b++) + { + for (int c = b; c <= 15; c++) + { + int indices[16]; + + int i = 0; + for(; i < a; i++) { + indices[i] = 0; + } + for(; i < a+b; i++) { + indices[i] = 2; + } + for(; i < a+b+c; i++) { + indices[i] = 3; + } + for(; i < 16; i++) { + indices[i] = 1; + } + + unsigned int bm = 0; + for(i = 0; i < 16; i++) { + bm |= indices[i] << (i * 2); + } + + printf("\t0x%8X, // %d %d %d %d\n", bm, a-0, b-a, c-b, 16-c); + + bitmaps[num] = bm; + num++; + } + } + } + + // Align to 32: 680 -> 704 + while (num < 704) + { + printf("\t0x80000000,\n"); + + bitmaps[num] = 0x80000000; // 15 0 0 1; + num++; + } + + printf("}; // num = %d\n", num); +} +*/ + +const static uint s_bitmapTableCTX[704] = +{ + 0x55555578, // 1 0 0 15 + 0x555555F8, // 1 0 1 14 + 0x555557F8, // 1 0 2 13 + 0x55555FF8, // 1 0 3 12 + 0x55557FF8, // 1 0 4 11 + 0x5555FFF8, // 1 0 5 10 + 0x5557FFF8, // 1 0 6 9 + 0x555FFFF8, // 1 0 7 8 + 0x557FFFF8, // 1 0 8 7 + 0x55FFFFF8, // 1 0 9 6 + 0x57FFFFF8, // 1 0 10 5 + 0x5FFFFFF8, // 1 0 11 4 + 0x7FFFFFF8, // 1 0 12 3 + 0xFFFFFFF8, // 1 0 13 2 + 0xFFFFFFF8, // 1 0 14 1 + 0x555557E8, // 1 1 0 14 + 0x55555FE8, // 1 1 1 13 + 0x55557FE8, // 1 1 2 12 + 0x5555FFE8, // 1 1 3 11 + 0x5557FFE8, // 1 1 4 10 + 0x555FFFE8, // 1 1 5 9 + 0x557FFFE8, // 1 1 6 8 + 0x55FFFFE8, // 1 1 7 7 + 0x57FFFFE8, // 1 1 8 6 + 0x5FFFFFE8, // 1 1 9 5 + 0x7FFFFFE8, // 1 1 10 4 + 0xFFFFFFE8, // 1 1 11 3 + 0xFFFFFFE8, // 1 1 12 2 + 0xFFFFFFE8, // 1 1 13 1 + 0x55557FA8, // 1 2 0 13 + 0x5555FFA8, // 1 2 1 12 + 0x5557FFA8, // 1 2 2 11 + 0x555FFFA8, // 1 2 3 10 + 0x557FFFA8, // 1 2 4 9 + 0x55FFFFA8, // 1 2 5 8 + 0x57FFFFA8, // 1 2 6 7 + 0x5FFFFFA8, // 1 2 7 6 + 0x7FFFFFA8, // 1 2 8 5 + 0xFFFFFFA8, // 1 2 9 4 + 0xFFFFFFA8, // 1 2 10 3 + 0xFFFFFFA8, // 1 2 11 2 + 0xFFFFFFA8, // 1 2 12 1 + 0x5557FEA8, // 1 3 0 12 + 0x555FFEA8, // 1 3 1 11 + 0x557FFEA8, // 1 3 2 10 + 0x55FFFEA8, // 1 3 3 9 + 0x57FFFEA8, // 1 3 4 8 + 0x5FFFFEA8, // 1 3 5 7 + 0x7FFFFEA8, // 1 3 6 6 + 0xFFFFFEA8, // 1 3 7 5 + 0xFFFFFEA8, // 1 3 8 4 + 0xFFFFFEA8, // 1 3 9 3 + 0xFFFFFEA8, // 1 3 10 2 + 0xFFFFFEA8, // 1 3 11 1 + 0x557FFAA8, // 1 4 0 11 + 0x55FFFAA8, // 1 4 1 10 + 0x57FFFAA8, // 1 4 2 9 + 0x5FFFFAA8, // 1 4 3 8 + 0x7FFFFAA8, // 1 4 4 7 + 0xFFFFFAA8, // 1 4 5 6 + 0xFFFFFAA8, // 1 4 6 5 + 0xFFFFFAA8, // 1 4 7 4 + 0xFFFFFAA8, // 1 4 8 3 + 0xFFFFFAA8, // 1 4 9 2 + 0xFFFFFAA8, // 1 4 10 1 + 
0x57FFEAA8, // 1 5 0 10 + 0x5FFFEAA8, // 1 5 1 9 + 0x7FFFEAA8, // 1 5 2 8 + 0xFFFFEAA8, // 1 5 3 7 + 0xFFFFEAA8, // 1 5 4 6 + 0xFFFFEAA8, // 1 5 5 5 + 0xFFFFEAA8, // 1 5 6 4 + 0xFFFFEAA8, // 1 5 7 3 + 0xFFFFEAA8, // 1 5 8 2 + 0xFFFFEAA8, // 1 5 9 1 + 0x7FFFAAA8, // 1 6 0 9 + 0xFFFFAAA8, // 1 6 1 8 + 0xFFFFAAA8, // 1 6 2 7 + 0xFFFFAAA8, // 1 6 3 6 + 0xFFFFAAA8, // 1 6 4 5 + 0xFFFFAAA8, // 1 6 5 4 + 0xFFFFAAA8, // 1 6 6 3 + 0xFFFFAAA8, // 1 6 7 2 + 0xFFFFAAA8, // 1 6 8 1 + 0xFFFEAAA8, // 1 7 0 8 + 0xFFFEAAA8, // 1 7 1 7 + 0xFFFEAAA8, // 1 7 2 6 + 0xFFFEAAA8, // 1 7 3 5 + 0xFFFEAAA8, // 1 7 4 4 + 0xFFFEAAA8, // 1 7 5 3 + 0xFFFEAAA8, // 1 7 6 2 + 0xFFFEAAA8, // 1 7 7 1 + 0xFFFAAAA8, // 1 8 0 7 + 0xFFFAAAA8, // 1 8 1 6 + 0xFFFAAAA8, // 1 8 2 5 + 0xFFFAAAA8, // 1 8 3 4 + 0xFFFAAAA8, // 1 8 4 3 + 0xFFFAAAA8, // 1 8 5 2 + 0xFFFAAAA8, // 1 8 6 1 + 0xFFEAAAA8, // 1 9 0 6 + 0xFFEAAAA8, // 1 9 1 5 + 0xFFEAAAA8, // 1 9 2 4 + 0xFFEAAAA8, // 1 9 3 3 + 0xFFEAAAA8, // 1 9 4 2 + 0xFFEAAAA8, // 1 9 5 1 + 0xFFAAAAA8, // 1 10 0 5 + 0xFFAAAAA8, // 1 10 1 4 + 0xFFAAAAA8, // 1 10 2 3 + 0xFFAAAAA8, // 1 10 3 2 + 0xFFAAAAA8, // 1 10 4 1 + 0xFEAAAAA8, // 1 11 0 4 + 0xFEAAAAA8, // 1 11 1 3 + 0xFEAAAAA8, // 1 11 2 2 + 0xFEAAAAA8, // 1 11 3 1 + 0xFAAAAAA8, // 1 12 0 3 + 0xFAAAAAA8, // 1 12 1 2 + 0xFAAAAAA8, // 1 12 2 1 + 0xEAAAAAA8, // 1 13 0 2 + 0xEAAAAAA8, // 1 13 1 1 + 0xAAAAAAA8, // 1 14 0 1 + 0x55555FA0, // 2 0 0 14 + 0x55557FA0, // 2 0 1 13 + 0x5555FFA0, // 2 0 2 12 + 0x5557FFA0, // 2 0 3 11 + 0x555FFFA0, // 2 0 4 10 + 0x557FFFA0, // 2 0 5 9 + 0x55FFFFA0, // 2 0 6 8 + 0x57FFFFA0, // 2 0 7 7 + 0x5FFFFFA0, // 2 0 8 6 + 0x7FFFFFA0, // 2 0 9 5 + 0xFFFFFFA0, // 2 0 10 4 + 0xFFFFFFA0, // 2 0 11 3 + 0xFFFFFFA0, // 2 0 12 2 + 0xFFFFFFA0, // 2 0 13 1 + 0x5555FEA0, // 2 1 0 13 + 0x5557FEA0, // 2 1 1 12 + 0x555FFEA0, // 2 1 2 11 + 0x557FFEA0, // 2 1 3 10 + 0x55FFFEA0, // 2 1 4 9 + 0x57FFFEA0, // 2 1 5 8 + 0x5FFFFEA0, // 2 1 6 7 + 0x7FFFFEA0, // 2 1 7 6 + 0xFFFFFEA0, // 2 1 8 5 + 0xFFFFFEA0, // 2 1 9 4 + 0xFFFFFEA0, // 2 1 10 3 + 0xFFFFFEA0, // 2 1 11 2 + 0xFFFFFEA0, // 2 1 12 1 + 0x555FFAA0, // 2 2 0 12 + 0x557FFAA0, // 2 2 1 11 + 0x55FFFAA0, // 2 2 2 10 + 0x57FFFAA0, // 2 2 3 9 + 0x5FFFFAA0, // 2 2 4 8 + 0x7FFFFAA0, // 2 2 5 7 + 0xFFFFFAA0, // 2 2 6 6 + 0xFFFFFAA0, // 2 2 7 5 + 0xFFFFFAA0, // 2 2 8 4 + 0xFFFFFAA0, // 2 2 9 3 + 0xFFFFFAA0, // 2 2 10 2 + 0xFFFFFAA0, // 2 2 11 1 + 0x55FFEAA0, // 2 3 0 11 + 0x57FFEAA0, // 2 3 1 10 + 0x5FFFEAA0, // 2 3 2 9 + 0x7FFFEAA0, // 2 3 3 8 + 0xFFFFEAA0, // 2 3 4 7 + 0xFFFFEAA0, // 2 3 5 6 + 0xFFFFEAA0, // 2 3 6 5 + 0xFFFFEAA0, // 2 3 7 4 + 0xFFFFEAA0, // 2 3 8 3 + 0xFFFFEAA0, // 2 3 9 2 + 0xFFFFEAA0, // 2 3 10 1 + 0x5FFFAAA0, // 2 4 0 10 + 0x7FFFAAA0, // 2 4 1 9 + 0xFFFFAAA0, // 2 4 2 8 + 0xFFFFAAA0, // 2 4 3 7 + 0xFFFFAAA0, // 2 4 4 6 + 0xFFFFAAA0, // 2 4 5 5 + 0xFFFFAAA0, // 2 4 6 4 + 0xFFFFAAA0, // 2 4 7 3 + 0xFFFFAAA0, // 2 4 8 2 + 0xFFFFAAA0, // 2 4 9 1 + 0xFFFEAAA0, // 2 5 0 9 + 0xFFFEAAA0, // 2 5 1 8 + 0xFFFEAAA0, // 2 5 2 7 + 0xFFFEAAA0, // 2 5 3 6 + 0xFFFEAAA0, // 2 5 4 5 + 0xFFFEAAA0, // 2 5 5 4 + 0xFFFEAAA0, // 2 5 6 3 + 0xFFFEAAA0, // 2 5 7 2 + 0xFFFEAAA0, // 2 5 8 1 + 0xFFFAAAA0, // 2 6 0 8 + 0xFFFAAAA0, // 2 6 1 7 + 0xFFFAAAA0, // 2 6 2 6 + 0xFFFAAAA0, // 2 6 3 5 + 0xFFFAAAA0, // 2 6 4 4 + 0xFFFAAAA0, // 2 6 5 3 + 0xFFFAAAA0, // 2 6 6 2 + 0xFFFAAAA0, // 2 6 7 1 + 0xFFEAAAA0, // 2 7 0 7 + 0xFFEAAAA0, // 2 7 1 6 + 0xFFEAAAA0, // 2 7 2 5 + 0xFFEAAAA0, // 2 7 3 4 + 0xFFEAAAA0, // 2 7 4 3 + 0xFFEAAAA0, // 2 7 5 2 + 0xFFEAAAA0, // 2 7 6 1 + 0xFFAAAAA0, // 2 8 0 6 + 0xFFAAAAA0, 
// 2 8 1 5 + 0xFFAAAAA0, // 2 8 2 4 + 0xFFAAAAA0, // 2 8 3 3 + 0xFFAAAAA0, // 2 8 4 2 + 0xFFAAAAA0, // 2 8 5 1 + 0xFEAAAAA0, // 2 9 0 5 + 0xFEAAAAA0, // 2 9 1 4 + 0xFEAAAAA0, // 2 9 2 3 + 0xFEAAAAA0, // 2 9 3 2 + 0xFEAAAAA0, // 2 9 4 1 + 0xFAAAAAA0, // 2 10 0 4 + 0xFAAAAAA0, // 2 10 1 3 + 0xFAAAAAA0, // 2 10 2 2 + 0xFAAAAAA0, // 2 10 3 1 + 0xEAAAAAA0, // 2 11 0 3 + 0xEAAAAAA0, // 2 11 1 2 + 0xEAAAAAA0, // 2 11 2 1 + 0xAAAAAAA0, // 2 12 0 2 + 0xAAAAAAA0, // 2 12 1 1 + 0xAAAAAAA0, // 2 13 0 1 + 0x5557FA80, // 3 0 0 13 + 0x555FFA80, // 3 0 1 12 + 0x557FFA80, // 3 0 2 11 + 0x55FFFA80, // 3 0 3 10 + 0x57FFFA80, // 3 0 4 9 + 0x5FFFFA80, // 3 0 5 8 + 0x7FFFFA80, // 3 0 6 7 + 0xFFFFFA80, // 3 0 7 6 + 0xFFFFFA80, // 3 0 8 5 + 0xFFFFFA80, // 3 0 9 4 + 0xFFFFFA80, // 3 0 10 3 + 0xFFFFFA80, // 3 0 11 2 + 0xFFFFFA80, // 3 0 12 1 + 0x557FEA80, // 3 1 0 12 + 0x55FFEA80, // 3 1 1 11 + 0x57FFEA80, // 3 1 2 10 + 0x5FFFEA80, // 3 1 3 9 + 0x7FFFEA80, // 3 1 4 8 + 0xFFFFEA80, // 3 1 5 7 + 0xFFFFEA80, // 3 1 6 6 + 0xFFFFEA80, // 3 1 7 5 + 0xFFFFEA80, // 3 1 8 4 + 0xFFFFEA80, // 3 1 9 3 + 0xFFFFEA80, // 3 1 10 2 + 0xFFFFEA80, // 3 1 11 1 + 0x57FFAA80, // 3 2 0 11 + 0x5FFFAA80, // 3 2 1 10 + 0x7FFFAA80, // 3 2 2 9 + 0xFFFFAA80, // 3 2 3 8 + 0xFFFFAA80, // 3 2 4 7 + 0xFFFFAA80, // 3 2 5 6 + 0xFFFFAA80, // 3 2 6 5 + 0xFFFFAA80, // 3 2 7 4 + 0xFFFFAA80, // 3 2 8 3 + 0xFFFFAA80, // 3 2 9 2 + 0xFFFFAA80, // 3 2 10 1 + 0x7FFEAA80, // 3 3 0 10 + 0xFFFEAA80, // 3 3 1 9 + 0xFFFEAA80, // 3 3 2 8 + 0xFFFEAA80, // 3 3 3 7 + 0xFFFEAA80, // 3 3 4 6 + 0xFFFEAA80, // 3 3 5 5 + 0xFFFEAA80, // 3 3 6 4 + 0xFFFEAA80, // 3 3 7 3 + 0xFFFEAA80, // 3 3 8 2 + 0xFFFEAA80, // 3 3 9 1 + 0xFFFAAA80, // 3 4 0 9 + 0xFFFAAA80, // 3 4 1 8 + 0xFFFAAA80, // 3 4 2 7 + 0xFFFAAA80, // 3 4 3 6 + 0xFFFAAA80, // 3 4 4 5 + 0xFFFAAA80, // 3 4 5 4 + 0xFFFAAA80, // 3 4 6 3 + 0xFFFAAA80, // 3 4 7 2 + 0xFFFAAA80, // 3 4 8 1 + 0xFFEAAA80, // 3 5 0 8 + 0xFFEAAA80, // 3 5 1 7 + 0xFFEAAA80, // 3 5 2 6 + 0xFFEAAA80, // 3 5 3 5 + 0xFFEAAA80, // 3 5 4 4 + 0xFFEAAA80, // 3 5 5 3 + 0xFFEAAA80, // 3 5 6 2 + 0xFFEAAA80, // 3 5 7 1 + 0xFFAAAA80, // 3 6 0 7 + 0xFFAAAA80, // 3 6 1 6 + 0xFFAAAA80, // 3 6 2 5 + 0xFFAAAA80, // 3 6 3 4 + 0xFFAAAA80, // 3 6 4 3 + 0xFFAAAA80, // 3 6 5 2 + 0xFFAAAA80, // 3 6 6 1 + 0xFEAAAA80, // 3 7 0 6 + 0xFEAAAA80, // 3 7 1 5 + 0xFEAAAA80, // 3 7 2 4 + 0xFEAAAA80, // 3 7 3 3 + 0xFEAAAA80, // 3 7 4 2 + 0xFEAAAA80, // 3 7 5 1 + 0xFAAAAA80, // 3 8 0 5 + 0xFAAAAA80, // 3 8 1 4 + 0xFAAAAA80, // 3 8 2 3 + 0xFAAAAA80, // 3 8 3 2 + 0xFAAAAA80, // 3 8 4 1 + 0xEAAAAA80, // 3 9 0 4 + 0xEAAAAA80, // 3 9 1 3 + 0xEAAAAA80, // 3 9 2 2 + 0xEAAAAA80, // 3 9 3 1 + 0xAAAAAA80, // 3 10 0 3 + 0xAAAAAA80, // 3 10 1 2 + 0xAAAAAA80, // 3 10 2 1 + 0xAAAAAA80, // 3 11 0 2 + 0xAAAAAA80, // 3 11 1 1 + 0xAAAAAA80, // 3 12 0 1 + 0x55FFAA00, // 4 0 0 12 + 0x57FFAA00, // 4 0 1 11 + 0x5FFFAA00, // 4 0 2 10 + 0x7FFFAA00, // 4 0 3 9 + 0xFFFFAA00, // 4 0 4 8 + 0xFFFFAA00, // 4 0 5 7 + 0xFFFFAA00, // 4 0 6 6 + 0xFFFFAA00, // 4 0 7 5 + 0xFFFFAA00, // 4 0 8 4 + 0xFFFFAA00, // 4 0 9 3 + 0xFFFFAA00, // 4 0 10 2 + 0xFFFFAA00, // 4 0 11 1 + 0x5FFEAA00, // 4 1 0 11 + 0x7FFEAA00, // 4 1 1 10 + 0xFFFEAA00, // 4 1 2 9 + 0xFFFEAA00, // 4 1 3 8 + 0xFFFEAA00, // 4 1 4 7 + 0xFFFEAA00, // 4 1 5 6 + 0xFFFEAA00, // 4 1 6 5 + 0xFFFEAA00, // 4 1 7 4 + 0xFFFEAA00, // 4 1 8 3 + 0xFFFEAA00, // 4 1 9 2 + 0xFFFEAA00, // 4 1 10 1 + 0xFFFAAA00, // 4 2 0 10 + 0xFFFAAA00, // 4 2 1 9 + 0xFFFAAA00, // 4 2 2 8 + 0xFFFAAA00, // 4 2 3 7 + 0xFFFAAA00, // 4 2 4 6 + 0xFFFAAA00, // 4 2 5 5 + 0xFFFAAA00, // 4 2 6 4 + 
0xFFFAAA00, // 4 2 7 3 + 0xFFFAAA00, // 4 2 8 2 + 0xFFFAAA00, // 4 2 9 1 + 0xFFEAAA00, // 4 3 0 9 + 0xFFEAAA00, // 4 3 1 8 + 0xFFEAAA00, // 4 3 2 7 + 0xFFEAAA00, // 4 3 3 6 + 0xFFEAAA00, // 4 3 4 5 + 0xFFEAAA00, // 4 3 5 4 + 0xFFEAAA00, // 4 3 6 3 + 0xFFEAAA00, // 4 3 7 2 + 0xFFEAAA00, // 4 3 8 1 + 0xFFAAAA00, // 4 4 0 8 + 0xFFAAAA00, // 4 4 1 7 + 0xFFAAAA00, // 4 4 2 6 + 0xFFAAAA00, // 4 4 3 5 + 0xFFAAAA00, // 4 4 4 4 + 0xFFAAAA00, // 4 4 5 3 + 0xFFAAAA00, // 4 4 6 2 + 0xFFAAAA00, // 4 4 7 1 + 0xFEAAAA00, // 4 5 0 7 + 0xFEAAAA00, // 4 5 1 6 + 0xFEAAAA00, // 4 5 2 5 + 0xFEAAAA00, // 4 5 3 4 + 0xFEAAAA00, // 4 5 4 3 + 0xFEAAAA00, // 4 5 5 2 + 0xFEAAAA00, // 4 5 6 1 + 0xFAAAAA00, // 4 6 0 6 + 0xFAAAAA00, // 4 6 1 5 + 0xFAAAAA00, // 4 6 2 4 + 0xFAAAAA00, // 4 6 3 3 + 0xFAAAAA00, // 4 6 4 2 + 0xFAAAAA00, // 4 6 5 1 + 0xEAAAAA00, // 4 7 0 5 + 0xEAAAAA00, // 4 7 1 4 + 0xEAAAAA00, // 4 7 2 3 + 0xEAAAAA00, // 4 7 3 2 + 0xEAAAAA00, // 4 7 4 1 + 0xAAAAAA00, // 4 8 0 4 + 0xAAAAAA00, // 4 8 1 3 + 0xAAAAAA00, // 4 8 2 2 + 0xAAAAAA00, // 4 8 3 1 + 0xAAAAAA00, // 4 9 0 3 + 0xAAAAAA00, // 4 9 1 2 + 0xAAAAAA00, // 4 9 2 1 + 0xAAAAAA00, // 4 10 0 2 + 0xAAAAAA00, // 4 10 1 1 + 0xAAAAAA00, // 4 11 0 1 + 0x7FFAA800, // 5 0 0 11 + 0xFFFAA800, // 5 0 1 10 + 0xFFFAA800, // 5 0 2 9 + 0xFFFAA800, // 5 0 3 8 + 0xFFFAA800, // 5 0 4 7 + 0xFFFAA800, // 5 0 5 6 + 0xFFFAA800, // 5 0 6 5 + 0xFFFAA800, // 5 0 7 4 + 0xFFFAA800, // 5 0 8 3 + 0xFFFAA800, // 5 0 9 2 + 0xFFFAA800, // 5 0 10 1 + 0xFFEAA800, // 5 1 0 10 + 0xFFEAA800, // 5 1 1 9 + 0xFFEAA800, // 5 1 2 8 + 0xFFEAA800, // 5 1 3 7 + 0xFFEAA800, // 5 1 4 6 + 0xFFEAA800, // 5 1 5 5 + 0xFFEAA800, // 5 1 6 4 + 0xFFEAA800, // 5 1 7 3 + 0xFFEAA800, // 5 1 8 2 + 0xFFEAA800, // 5 1 9 1 + 0xFFAAA800, // 5 2 0 9 + 0xFFAAA800, // 5 2 1 8 + 0xFFAAA800, // 5 2 2 7 + 0xFFAAA800, // 5 2 3 6 + 0xFFAAA800, // 5 2 4 5 + 0xFFAAA800, // 5 2 5 4 + 0xFFAAA800, // 5 2 6 3 + 0xFFAAA800, // 5 2 7 2 + 0xFFAAA800, // 5 2 8 1 + 0xFEAAA800, // 5 3 0 8 + 0xFEAAA800, // 5 3 1 7 + 0xFEAAA800, // 5 3 2 6 + 0xFEAAA800, // 5 3 3 5 + 0xFEAAA800, // 5 3 4 4 + 0xFEAAA800, // 5 3 5 3 + 0xFEAAA800, // 5 3 6 2 + 0xFEAAA800, // 5 3 7 1 + 0xFAAAA800, // 5 4 0 7 + 0xFAAAA800, // 5 4 1 6 + 0xFAAAA800, // 5 4 2 5 + 0xFAAAA800, // 5 4 3 4 + 0xFAAAA800, // 5 4 4 3 + 0xFAAAA800, // 5 4 5 2 + 0xFAAAA800, // 5 4 6 1 + 0xEAAAA800, // 5 5 0 6 + 0xEAAAA800, // 5 5 1 5 + 0xEAAAA800, // 5 5 2 4 + 0xEAAAA800, // 5 5 3 3 + 0xEAAAA800, // 5 5 4 2 + 0xEAAAA800, // 5 5 5 1 + 0xAAAAA800, // 5 6 0 5 + 0xAAAAA800, // 5 6 1 4 + 0xAAAAA800, // 5 6 2 3 + 0xAAAAA800, // 5 6 3 2 + 0xAAAAA800, // 5 6 4 1 + 0xAAAAA800, // 5 7 0 4 + 0xAAAAA800, // 5 7 1 3 + 0xAAAAA800, // 5 7 2 2 + 0xAAAAA800, // 5 7 3 1 + 0xAAAAA800, // 5 8 0 3 + 0xAAAAA800, // 5 8 1 2 + 0xAAAAA800, // 5 8 2 1 + 0xAAAAA800, // 5 9 0 2 + 0xAAAAA800, // 5 9 1 1 + 0xAAAAA800, // 5 10 0 1 + 0xFFAAA000, // 6 0 0 10 + 0xFFAAA000, // 6 0 1 9 + 0xFFAAA000, // 6 0 2 8 + 0xFFAAA000, // 6 0 3 7 + 0xFFAAA000, // 6 0 4 6 + 0xFFAAA000, // 6 0 5 5 + 0xFFAAA000, // 6 0 6 4 + 0xFFAAA000, // 6 0 7 3 + 0xFFAAA000, // 6 0 8 2 + 0xFFAAA000, // 6 0 9 1 + 0xFEAAA000, // 6 1 0 9 + 0xFEAAA000, // 6 1 1 8 + 0xFEAAA000, // 6 1 2 7 + 0xFEAAA000, // 6 1 3 6 + 0xFEAAA000, // 6 1 4 5 + 0xFEAAA000, // 6 1 5 4 + 0xFEAAA000, // 6 1 6 3 + 0xFEAAA000, // 6 1 7 2 + 0xFEAAA000, // 6 1 8 1 + 0xFAAAA000, // 6 2 0 8 + 0xFAAAA000, // 6 2 1 7 + 0xFAAAA000, // 6 2 2 6 + 0xFAAAA000, // 6 2 3 5 + 0xFAAAA000, // 6 2 4 4 + 0xFAAAA000, // 6 2 5 3 + 0xFAAAA000, // 6 2 6 2 + 0xFAAAA000, // 6 2 7 1 + 0xEAAAA000, // 6 3 0 
7 + 0xEAAAA000, // 6 3 1 6 + 0xEAAAA000, // 6 3 2 5 + 0xEAAAA000, // 6 3 3 4 + 0xEAAAA000, // 6 3 4 3 + 0xEAAAA000, // 6 3 5 2 + 0xEAAAA000, // 6 3 6 1 + 0xAAAAA000, // 6 4 0 6 + 0xAAAAA000, // 6 4 1 5 + 0xAAAAA000, // 6 4 2 4 + 0xAAAAA000, // 6 4 3 3 + 0xAAAAA000, // 6 4 4 2 + 0xAAAAA000, // 6 4 5 1 + 0xAAAAA000, // 6 5 0 5 + 0xAAAAA000, // 6 5 1 4 + 0xAAAAA000, // 6 5 2 3 + 0xAAAAA000, // 6 5 3 2 + 0xAAAAA000, // 6 5 4 1 + 0xAAAAA000, // 6 6 0 4 + 0xAAAAA000, // 6 6 1 3 + 0xAAAAA000, // 6 6 2 2 + 0xAAAAA000, // 6 6 3 1 + 0xAAAAA000, // 6 7 0 3 + 0xAAAAA000, // 6 7 1 2 + 0xAAAAA000, // 6 7 2 1 + 0xAAAAA000, // 6 8 0 2 + 0xAAAAA000, // 6 8 1 1 + 0xAAAAA000, // 6 9 0 1 + 0xFAAA8000, // 7 0 0 9 + 0xFAAA8000, // 7 0 1 8 + 0xFAAA8000, // 7 0 2 7 + 0xFAAA8000, // 7 0 3 6 + 0xFAAA8000, // 7 0 4 5 + 0xFAAA8000, // 7 0 5 4 + 0xFAAA8000, // 7 0 6 3 + 0xFAAA8000, // 7 0 7 2 + 0xFAAA8000, // 7 0 8 1 + 0xEAAA8000, // 7 1 0 8 + 0xEAAA8000, // 7 1 1 7 + 0xEAAA8000, // 7 1 2 6 + 0xEAAA8000, // 7 1 3 5 + 0xEAAA8000, // 7 1 4 4 + 0xEAAA8000, // 7 1 5 3 + 0xEAAA8000, // 7 1 6 2 + 0xEAAA8000, // 7 1 7 1 + 0xAAAA8000, // 7 2 0 7 + 0xAAAA8000, // 7 2 1 6 + 0xAAAA8000, // 7 2 2 5 + 0xAAAA8000, // 7 2 3 4 + 0xAAAA8000, // 7 2 4 3 + 0xAAAA8000, // 7 2 5 2 + 0xAAAA8000, // 7 2 6 1 + 0xAAAA8000, // 7 3 0 6 + 0xAAAA8000, // 7 3 1 5 + 0xAAAA8000, // 7 3 2 4 + 0xAAAA8000, // 7 3 3 3 + 0xAAAA8000, // 7 3 4 2 + 0xAAAA8000, // 7 3 5 1 + 0xAAAA8000, // 7 4 0 5 + 0xAAAA8000, // 7 4 1 4 + 0xAAAA8000, // 7 4 2 3 + 0xAAAA8000, // 7 4 3 2 + 0xAAAA8000, // 7 4 4 1 + 0xAAAA8000, // 7 5 0 4 + 0xAAAA8000, // 7 5 1 3 + 0xAAAA8000, // 7 5 2 2 + 0xAAAA8000, // 7 5 3 1 + 0xAAAA8000, // 7 6 0 3 + 0xAAAA8000, // 7 6 1 2 + 0xAAAA8000, // 7 6 2 1 + 0xAAAA8000, // 7 7 0 2 + 0xAAAA8000, // 7 7 1 1 + 0xAAAA8000, // 7 8 0 1 + 0xAAAA0000, // 8 0 0 8 + 0xAAAA0000, // 8 0 1 7 + 0xAAAA0000, // 8 0 2 6 + 0xAAAA0000, // 8 0 3 5 + 0xAAAA0000, // 8 0 4 4 + 0xAAAA0000, // 8 0 5 3 + 0xAAAA0000, // 8 0 6 2 + 0xAAAA0000, // 8 0 7 1 + 0xAAAA0000, // 8 1 0 7 + 0xAAAA0000, // 8 1 1 6 + 0xAAAA0000, // 8 1 2 5 + 0xAAAA0000, // 8 1 3 4 + 0xAAAA0000, // 8 1 4 3 + 0xAAAA0000, // 8 1 5 2 + 0xAAAA0000, // 8 1 6 1 + 0xAAAA0000, // 8 2 0 6 + 0xAAAA0000, // 8 2 1 5 + 0xAAAA0000, // 8 2 2 4 + 0xAAAA0000, // 8 2 3 3 + 0xAAAA0000, // 8 2 4 2 + 0xAAAA0000, // 8 2 5 1 + 0xAAAA0000, // 8 3 0 5 + 0xAAAA0000, // 8 3 1 4 + 0xAAAA0000, // 8 3 2 3 + 0xAAAA0000, // 8 3 3 2 + 0xAAAA0000, // 8 3 4 1 + 0xAAAA0000, // 8 4 0 4 + 0xAAAA0000, // 8 4 1 3 + 0xAAAA0000, // 8 4 2 2 + 0xAAAA0000, // 8 4 3 1 + 0xAAAA0000, // 8 5 0 3 + 0xAAAA0000, // 8 5 1 2 + 0xAAAA0000, // 8 5 2 1 + 0xAAAA0000, // 8 6 0 2 + 0xAAAA0000, // 8 6 1 1 + 0xAAAA0000, // 8 7 0 1 + 0xAAA80000, // 9 0 0 7 + 0xAAA80000, // 9 0 1 6 + 0xAAA80000, // 9 0 2 5 + 0xAAA80000, // 9 0 3 4 + 0xAAA80000, // 9 0 4 3 + 0xAAA80000, // 9 0 5 2 + 0xAAA80000, // 9 0 6 1 + 0xAAA80000, // 9 1 0 6 + 0xAAA80000, // 9 1 1 5 + 0xAAA80000, // 9 1 2 4 + 0xAAA80000, // 9 1 3 3 + 0xAAA80000, // 9 1 4 2 + 0xAAA80000, // 9 1 5 1 + 0xAAA80000, // 9 2 0 5 + 0xAAA80000, // 9 2 1 4 + 0xAAA80000, // 9 2 2 3 + 0xAAA80000, // 9 2 3 2 + 0xAAA80000, // 9 2 4 1 + 0xAAA80000, // 9 3 0 4 + 0xAAA80000, // 9 3 1 3 + 0xAAA80000, // 9 3 2 2 + 0xAAA80000, // 9 3 3 1 + 0xAAA80000, // 9 4 0 3 + 0xAAA80000, // 9 4 1 2 + 0xAAA80000, // 9 4 2 1 + 0xAAA80000, // 9 5 0 2 + 0xAAA80000, // 9 5 1 1 + 0xAAA80000, // 9 6 0 1 + 0xAAA00000, // 10 0 0 6 + 0xAAA00000, // 10 0 1 5 + 0xAAA00000, // 10 0 2 4 + 0xAAA00000, // 10 0 3 3 + 0xAAA00000, // 10 0 4 2 + 0xAAA00000, // 10 0 
5 1 + 0xAAA00000, // 10 1 0 5 + 0xAAA00000, // 10 1 1 4 + 0xAAA00000, // 10 1 2 3 + 0xAAA00000, // 10 1 3 2 + 0xAAA00000, // 10 1 4 1 + 0xAAA00000, // 10 2 0 4 + 0xAAA00000, // 10 2 1 3 + 0xAAA00000, // 10 2 2 2 + 0xAAA00000, // 10 2 3 1 + 0xAAA00000, // 10 3 0 3 + 0xAAA00000, // 10 3 1 2 + 0xAAA00000, // 10 3 2 1 + 0xAAA00000, // 10 4 0 2 + 0xAAA00000, // 10 4 1 1 + 0xAAA00000, // 10 5 0 1 + 0xAA800000, // 11 0 0 5 + 0xAA800000, // 11 0 1 4 + 0xAA800000, // 11 0 2 3 + 0xAA800000, // 11 0 3 2 + 0xAA800000, // 11 0 4 1 + 0xAA800000, // 11 1 0 4 + 0xAA800000, // 11 1 1 3 + 0xAA800000, // 11 1 2 2 + 0xAA800000, // 11 1 3 1 + 0xAA800000, // 11 2 0 3 + 0xAA800000, // 11 2 1 2 + 0xAA800000, // 11 2 2 1 + 0xAA800000, // 11 3 0 2 + 0xAA800000, // 11 3 1 1 + 0xAA800000, // 11 4 0 1 + 0xAA000000, // 12 0 0 4 + 0xAA000000, // 12 0 1 3 + 0xAA000000, // 12 0 2 2 + 0xAA000000, // 12 0 3 1 + 0xAA000000, // 12 1 0 3 + 0xAA000000, // 12 1 1 2 + 0xAA000000, // 12 1 2 1 + 0xAA000000, // 12 2 0 2 + 0xAA000000, // 12 2 1 1 + 0xAA000000, // 12 3 0 1 + 0xA8000000, // 13 0 0 3 + 0xA8000000, // 13 0 1 2 + 0xA8000000, // 13 0 2 1 + 0xA8000000, // 13 1 0 2 + 0xA8000000, // 13 1 1 1 + 0xA8000000, // 13 2 0 1 + 0xA0000000, // 14 0 0 2 + 0xA0000000, // 14 0 1 1 + 0xA0000000, // 14 1 0 1 + 0x80000000, // 15 0 0 1 + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, + 0x80000000, +}; + Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/Bitmaps.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/Bitmaps.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/Bitmaps.h @@ -1,1119 +0,0 @@ - - -/* -static void doPrecomputation() -{ - uint bitmaps[1024]; - - int indices[16]; - int num = 0; - - // Compute bitmaps with 3 clusters: - - // first cluster [0,i) is at the start - for( int m = 0; m < 16; ++m ) - { - indices[m] = 0; - } - const int imax = 15; - for( int i = imax; i >= 0; --i ) - { - // second cluster [i,j) is half along - for( int m = i; m < 16; ++m ) - { - indices[m] = 2; - } - const int jmax = ( i == 0 ) ? 15 : 16; - for( int j = jmax; j >= i; --j ) - { - // last cluster [j,k) is at the end - if( j < 16 ) - { - indices[j] = 1; - } - - uint bitmap = 0; - - for(int p = 0; p < 16; p++) { - bitmap |= indices[p] << (p * 2); - } - - bitmaps[num] = bitmap; - - num++; - } - } - nvDebugCheck(num == 151); - - // Align to 160. - for(int i = 0; i < 9; i++) - { - bitmaps[num] = 0x555AA000; - num++; - } - nvDebugCheck(num == 160); - - // Append bitmaps with 4 clusters: - - // first cluster [0,i) is at the start - for( int m = 0; m < 16; ++m ) - { - indices[m] = 0; - } - for( int i = imax; i >= 0; --i ) - { - // second cluster [i,j) is one third along - for( int m = i; m < 16; ++m ) - { - indices[m] = 2; - } - const int jmax = ( i == 0 ) ? 15 : 16; - for( int j = jmax; j >= i; --j ) - { - // third cluster [j,k) is two thirds along - for( int m = j; m < 16; ++m ) - { - indices[m] = 3; - } - - int kmax = ( j == 0 ) ? 
15 : 16; - for( int k = kmax; k >= j; --k ) - { - // last cluster [k,n) is at the end - if( k < 16 ) - { - indices[k] = 1; - } - - uint bitmap = 0; - - bool hasThree = false; - for(int p = 0; p < 16; p++) { - bitmap |= indices[p] << (p * 2); - - if (indices[p] == 3) hasThree = true; - } - - if (hasThree) { - bitmaps[num] = bitmap; - num++; - } - } - } - } - nvDebugCheck(num == 975); - - // Align to 1024. - for(int i = 0; i < 49; i++) - { - bitmaps[num] = 0x555AA000; - num++; - } - - nvDebugCheck(num == 1024); - - printf("uint bitmaps[992] =\n{\n"); - for (int i = 0; i < 992; i++) - { - printf("\t0x%.8X,\n", bitmaps[i]); - } - printf("};\n"); -} -*/ - - -const static uint s_bitmapTable[992] = -{ - 0x80000000, - 0x40000000, - 0xA0000000, - 0x60000000, - 0x50000000, - 0xA8000000, - 0x68000000, - 0x58000000, - 0x54000000, - 0xAA000000, - 0x6A000000, - 0x5A000000, - 0x56000000, - 0x55000000, - 0xAA800000, - 0x6A800000, - 0x5A800000, - 0x56800000, - 0x55800000, - 0x55400000, - 0xAAA00000, - 0x6AA00000, - 0x5AA00000, - 0x56A00000, - 0x55A00000, - 0x55600000, - 0x55500000, - 0xAAA80000, - 0x6AA80000, - 0x5AA80000, - 0x56A80000, - 0x55A80000, - 0x55680000, - 0x55580000, - 0x55540000, - 0xAAAA0000, - 0x6AAA0000, - 0x5AAA0000, - 0x56AA0000, - 0x55AA0000, - 0x556A0000, - 0x555A0000, - 0x55560000, - 0x55550000, - 0xAAAA8000, - 0x6AAA8000, - 0x5AAA8000, - 0x56AA8000, - 0x55AA8000, - 0x556A8000, - 0x555A8000, - 0x55568000, - 0x55558000, - 0x55554000, - 0xAAAAA000, - 0x6AAAA000, - 0x5AAAA000, - 0x56AAA000, - 0x55AAA000, - 0x556AA000, - 0x555AA000, - 0x5556A000, - 0x5555A000, - 0x55556000, - 0x55555000, - 0xAAAAA800, - 0x6AAAA800, - 0x5AAAA800, - 0x56AAA800, - 0x55AAA800, - 0x556AA800, - 0x555AA800, - 0x5556A800, - 0x5555A800, - 0x55556800, - 0x55555800, - 0x55555400, - 0xAAAAAA00, - 0x6AAAAA00, - 0x5AAAAA00, - 0x56AAAA00, - 0x55AAAA00, - 0x556AAA00, - 0x555AAA00, - 0x5556AA00, - 0x5555AA00, - 0x55556A00, - 0x55555A00, - 0x55555600, - 0x55555500, - 0xAAAAAA80, - 0x6AAAAA80, - 0x5AAAAA80, - 0x56AAAA80, - 0x55AAAA80, - 0x556AAA80, - 0x555AAA80, - 0x5556AA80, - 0x5555AA80, - 0x55556A80, - 0x55555A80, - 0x55555680, - 0x55555580, - 0x55555540, - 0xAAAAAAA0, - 0x6AAAAAA0, - 0x5AAAAAA0, - 0x56AAAAA0, - 0x55AAAAA0, - 0x556AAAA0, - 0x555AAAA0, - 0x5556AAA0, - 0x5555AAA0, - 0x55556AA0, - 0x55555AA0, - 0x555556A0, - 0x555555A0, - 0x55555560, - 0x55555550, - 0xAAAAAAA8, - 0x6AAAAAA8, - 0x5AAAAAA8, - 0x56AAAAA8, - 0x55AAAAA8, - 0x556AAAA8, - 0x555AAAA8, - 0x5556AAA8, - 0x5555AAA8, - 0x55556AA8, - 0x55555AA8, - 0x555556A8, - 0x555555A8, - 0x55555568, - 0x55555558, - 0x55555554, - 0x6AAAAAAA, - 0x5AAAAAAA, - 0x56AAAAAA, - 0x55AAAAAA, - 0x556AAAAA, - 0x555AAAAA, - 0x5556AAAA, - 0x5555AAAA, - 0x55556AAA, - 0x55555AAA, - 0x555556AA, - 0x555555AA, - 0x5555556A, - 0x5555555A, - 0x55555556, - 0x55555555, - 0x55555555, - 0x55555555, - 0x55555555, - 0x55555555, - 0x55555555, - 0x55555555, - 0x55555555, - 0x55555555, - 0x55555555, - 0xC0000000, - 0xE0000000, - 0xF0000000, - 0x70000000, - 0xE8000000, - 0xF8000000, - 0x78000000, - 0xFC000000, - 0x7C000000, - 0x5C000000, - 0xEA000000, - 0xFA000000, - 0x7A000000, - 0xFE000000, - 0x7E000000, - 0x5E000000, - 0xFF000000, - 0x7F000000, - 0x5F000000, - 0x57000000, - 0xEA800000, - 0xFA800000, - 0x7A800000, - 0xFE800000, - 0x7E800000, - 0x5E800000, - 0xFF800000, - 0x7F800000, - 0x5F800000, - 0x57800000, - 0xFFC00000, - 0x7FC00000, - 0x5FC00000, - 0x57C00000, - 0x55C00000, - 0xEAA00000, - 0xFAA00000, - 0x7AA00000, - 0xFEA00000, - 0x7EA00000, - 0x5EA00000, - 0xFFA00000, - 0x7FA00000, - 
0x5FA00000, - 0x57A00000, - 0xFFE00000, - 0x7FE00000, - 0x5FE00000, - 0x57E00000, - 0x55E00000, - 0xFFF00000, - 0x7FF00000, - 0x5FF00000, - 0x57F00000, - 0x55F00000, - 0x55700000, - 0xEAA80000, - 0xFAA80000, - 0x7AA80000, - 0xFEA80000, - 0x7EA80000, - 0x5EA80000, - 0xFFA80000, - 0x7FA80000, - 0x5FA80000, - 0x57A80000, - 0xFFE80000, - 0x7FE80000, - 0x5FE80000, - 0x57E80000, - 0x55E80000, - 0xFFF80000, - 0x7FF80000, - 0x5FF80000, - 0x57F80000, - 0x55F80000, - 0x55780000, - 0xFFFC0000, - 0x7FFC0000, - 0x5FFC0000, - 0x57FC0000, - 0x55FC0000, - 0x557C0000, - 0x555C0000, - 0xEAAA0000, - 0xFAAA0000, - 0x7AAA0000, - 0xFEAA0000, - 0x7EAA0000, - 0x5EAA0000, - 0xFFAA0000, - 0x7FAA0000, - 0x5FAA0000, - 0x57AA0000, - 0xFFEA0000, - 0x7FEA0000, - 0x5FEA0000, - 0x57EA0000, - 0x55EA0000, - 0xFFFA0000, - 0x7FFA0000, - 0x5FFA0000, - 0x57FA0000, - 0x55FA0000, - 0x557A0000, - 0xFFFE0000, - 0x7FFE0000, - 0x5FFE0000, - 0x57FE0000, - 0x55FE0000, - 0x557E0000, - 0x555E0000, - 0xFFFF0000, - 0x7FFF0000, - 0x5FFF0000, - 0x57FF0000, - 0x55FF0000, - 0x557F0000, - 0x555F0000, - 0x55570000, - 0xEAAA8000, - 0xFAAA8000, - 0x7AAA8000, - 0xFEAA8000, - 0x7EAA8000, - 0x5EAA8000, - 0xFFAA8000, - 0x7FAA8000, - 0x5FAA8000, - 0x57AA8000, - 0xFFEA8000, - 0x7FEA8000, - 0x5FEA8000, - 0x57EA8000, - 0x55EA8000, - 0xFFFA8000, - 0x7FFA8000, - 0x5FFA8000, - 0x57FA8000, - 0x55FA8000, - 0x557A8000, - 0xFFFE8000, - 0x7FFE8000, - 0x5FFE8000, - 0x57FE8000, - 0x55FE8000, - 0x557E8000, - 0x555E8000, - 0xFFFF8000, - 0x7FFF8000, - 0x5FFF8000, - 0x57FF8000, - 0x55FF8000, - 0x557F8000, - 0x555F8000, - 0x55578000, - 0xFFFFC000, - 0x7FFFC000, - 0x5FFFC000, - 0x57FFC000, - 0x55FFC000, - 0x557FC000, - 0x555FC000, - 0x5557C000, - 0x5555C000, - 0xEAAAA000, - 0xFAAAA000, - 0x7AAAA000, - 0xFEAAA000, - 0x7EAAA000, - 0x5EAAA000, - 0xFFAAA000, - 0x7FAAA000, - 0x5FAAA000, - 0x57AAA000, - 0xFFEAA000, - 0x7FEAA000, - 0x5FEAA000, - 0x57EAA000, - 0x55EAA000, - 0xFFFAA000, - 0x7FFAA000, - 0x5FFAA000, - 0x57FAA000, - 0x55FAA000, - 0x557AA000, - 0xFFFEA000, - 0x7FFEA000, - 0x5FFEA000, - 0x57FEA000, - 0x55FEA000, - 0x557EA000, - 0x555EA000, - 0xFFFFA000, - 0x7FFFA000, - 0x5FFFA000, - 0x57FFA000, - 0x55FFA000, - 0x557FA000, - 0x555FA000, - 0x5557A000, - 0xFFFFE000, - 0x7FFFE000, - 0x5FFFE000, - 0x57FFE000, - 0x55FFE000, - 0x557FE000, - 0x555FE000, - 0x5557E000, - 0x5555E000, - 0xFFFFF000, - 0x7FFFF000, - 0x5FFFF000, - 0x57FFF000, - 0x55FFF000, - 0x557FF000, - 0x555FF000, - 0x5557F000, - 0x5555F000, - 0x55557000, - 0xEAAAA800, - 0xFAAAA800, - 0x7AAAA800, - 0xFEAAA800, - 0x7EAAA800, - 0x5EAAA800, - 0xFFAAA800, - 0x7FAAA800, - 0x5FAAA800, - 0x57AAA800, - 0xFFEAA800, - 0x7FEAA800, - 0x5FEAA800, - 0x57EAA800, - 0x55EAA800, - 0xFFFAA800, - 0x7FFAA800, - 0x5FFAA800, - 0x57FAA800, - 0x55FAA800, - 0x557AA800, - 0xFFFEA800, - 0x7FFEA800, - 0x5FFEA800, - 0x57FEA800, - 0x55FEA800, - 0x557EA800, - 0x555EA800, - 0xFFFFA800, - 0x7FFFA800, - 0x5FFFA800, - 0x57FFA800, - 0x55FFA800, - 0x557FA800, - 0x555FA800, - 0x5557A800, - 0xFFFFE800, - 0x7FFFE800, - 0x5FFFE800, - 0x57FFE800, - 0x55FFE800, - 0x557FE800, - 0x555FE800, - 0x5557E800, - 0x5555E800, - 0xFFFFF800, - 0x7FFFF800, - 0x5FFFF800, - 0x57FFF800, - 0x55FFF800, - 0x557FF800, - 0x555FF800, - 0x5557F800, - 0x5555F800, - 0x55557800, - 0xFFFFFC00, - 0x7FFFFC00, - 0x5FFFFC00, - 0x57FFFC00, - 0x55FFFC00, - 0x557FFC00, - 0x555FFC00, - 0x5557FC00, - 0x5555FC00, - 0x55557C00, - 0x55555C00, - 0xEAAAAA00, - 0xFAAAAA00, - 0x7AAAAA00, - 0xFEAAAA00, - 0x7EAAAA00, - 0x5EAAAA00, - 0xFFAAAA00, - 0x7FAAAA00, - 0x5FAAAA00, - 0x57AAAA00, - 0xFFEAAA00, 
- 0x7FEAAA00, - 0x5FEAAA00, - 0x57EAAA00, - 0x55EAAA00, - 0xFFFAAA00, - 0x7FFAAA00, - 0x5FFAAA00, - 0x57FAAA00, - 0x55FAAA00, - 0x557AAA00, - 0xFFFEAA00, - 0x7FFEAA00, - 0x5FFEAA00, - 0x57FEAA00, - 0x55FEAA00, - 0x557EAA00, - 0x555EAA00, - 0xFFFFAA00, - 0x7FFFAA00, - 0x5FFFAA00, - 0x57FFAA00, - 0x55FFAA00, - 0x557FAA00, - 0x555FAA00, - 0x5557AA00, - 0xFFFFEA00, - 0x7FFFEA00, - 0x5FFFEA00, - 0x57FFEA00, - 0x55FFEA00, - 0x557FEA00, - 0x555FEA00, - 0x5557EA00, - 0x5555EA00, - 0xFFFFFA00, - 0x7FFFFA00, - 0x5FFFFA00, - 0x57FFFA00, - 0x55FFFA00, - 0x557FFA00, - 0x555FFA00, - 0x5557FA00, - 0x5555FA00, - 0x55557A00, - 0xFFFFFE00, - 0x7FFFFE00, - 0x5FFFFE00, - 0x57FFFE00, - 0x55FFFE00, - 0x557FFE00, - 0x555FFE00, - 0x5557FE00, - 0x5555FE00, - 0x55557E00, - 0x55555E00, - 0xFFFFFF00, - 0x7FFFFF00, - 0x5FFFFF00, - 0x57FFFF00, - 0x55FFFF00, - 0x557FFF00, - 0x555FFF00, - 0x5557FF00, - 0x5555FF00, - 0x55557F00, - 0x55555F00, - 0x55555700, - 0xEAAAAA80, - 0xFAAAAA80, - 0x7AAAAA80, - 0xFEAAAA80, - 0x7EAAAA80, - 0x5EAAAA80, - 0xFFAAAA80, - 0x7FAAAA80, - 0x5FAAAA80, - 0x57AAAA80, - 0xFFEAAA80, - 0x7FEAAA80, - 0x5FEAAA80, - 0x57EAAA80, - 0x55EAAA80, - 0xFFFAAA80, - 0x7FFAAA80, - 0x5FFAAA80, - 0x57FAAA80, - 0x55FAAA80, - 0x557AAA80, - 0xFFFEAA80, - 0x7FFEAA80, - 0x5FFEAA80, - 0x57FEAA80, - 0x55FEAA80, - 0x557EAA80, - 0x555EAA80, - 0xFFFFAA80, - 0x7FFFAA80, - 0x5FFFAA80, - 0x57FFAA80, - 0x55FFAA80, - 0x557FAA80, - 0x555FAA80, - 0x5557AA80, - 0xFFFFEA80, - 0x7FFFEA80, - 0x5FFFEA80, - 0x57FFEA80, - 0x55FFEA80, - 0x557FEA80, - 0x555FEA80, - 0x5557EA80, - 0x5555EA80, - 0xFFFFFA80, - 0x7FFFFA80, - 0x5FFFFA80, - 0x57FFFA80, - 0x55FFFA80, - 0x557FFA80, - 0x555FFA80, - 0x5557FA80, - 0x5555FA80, - 0x55557A80, - 0xFFFFFE80, - 0x7FFFFE80, - 0x5FFFFE80, - 0x57FFFE80, - 0x55FFFE80, - 0x557FFE80, - 0x555FFE80, - 0x5557FE80, - 0x5555FE80, - 0x55557E80, - 0x55555E80, - 0xFFFFFF80, - 0x7FFFFF80, - 0x5FFFFF80, - 0x57FFFF80, - 0x55FFFF80, - 0x557FFF80, - 0x555FFF80, - 0x5557FF80, - 0x5555FF80, - 0x55557F80, - 0x55555F80, - 0x55555780, - 0xFFFFFFC0, - 0x7FFFFFC0, - 0x5FFFFFC0, - 0x57FFFFC0, - 0x55FFFFC0, - 0x557FFFC0, - 0x555FFFC0, - 0x5557FFC0, - 0x5555FFC0, - 0x55557FC0, - 0x55555FC0, - 0x555557C0, - 0x555555C0, - 0xEAAAAAA0, - 0xFAAAAAA0, - 0x7AAAAAA0, - 0xFEAAAAA0, - 0x7EAAAAA0, - 0x5EAAAAA0, - 0xFFAAAAA0, - 0x7FAAAAA0, - 0x5FAAAAA0, - 0x57AAAAA0, - 0xFFEAAAA0, - 0x7FEAAAA0, - 0x5FEAAAA0, - 0x57EAAAA0, - 0x55EAAAA0, - 0xFFFAAAA0, - 0x7FFAAAA0, - 0x5FFAAAA0, - 0x57FAAAA0, - 0x55FAAAA0, - 0x557AAAA0, - 0xFFFEAAA0, - 0x7FFEAAA0, - 0x5FFEAAA0, - 0x57FEAAA0, - 0x55FEAAA0, - 0x557EAAA0, - 0x555EAAA0, - 0xFFFFAAA0, - 0x7FFFAAA0, - 0x5FFFAAA0, - 0x57FFAAA0, - 0x55FFAAA0, - 0x557FAAA0, - 0x555FAAA0, - 0x5557AAA0, - 0xFFFFEAA0, - 0x7FFFEAA0, - 0x5FFFEAA0, - 0x57FFEAA0, - 0x55FFEAA0, - 0x557FEAA0, - 0x555FEAA0, - 0x5557EAA0, - 0x5555EAA0, - 0xFFFFFAA0, - 0x7FFFFAA0, - 0x5FFFFAA0, - 0x57FFFAA0, - 0x55FFFAA0, - 0x557FFAA0, - 0x555FFAA0, - 0x5557FAA0, - 0x5555FAA0, - 0x55557AA0, - 0xFFFFFEA0, - 0x7FFFFEA0, - 0x5FFFFEA0, - 0x57FFFEA0, - 0x55FFFEA0, - 0x557FFEA0, - 0x555FFEA0, - 0x5557FEA0, - 0x5555FEA0, - 0x55557EA0, - 0x55555EA0, - 0xFFFFFFA0, - 0x7FFFFFA0, - 0x5FFFFFA0, - 0x57FFFFA0, - 0x55FFFFA0, - 0x557FFFA0, - 0x555FFFA0, - 0x5557FFA0, - 0x5555FFA0, - 0x55557FA0, - 0x55555FA0, - 0x555557A0, - 0xFFFFFFE0, - 0x7FFFFFE0, - 0x5FFFFFE0, - 0x57FFFFE0, - 0x55FFFFE0, - 0x557FFFE0, - 0x555FFFE0, - 0x5557FFE0, - 0x5555FFE0, - 0x55557FE0, - 0x55555FE0, - 0x555557E0, - 0x555555E0, - 0xFFFFFFF0, - 0x7FFFFFF0, - 0x5FFFFFF0, - 0x57FFFFF0, - 
0x55FFFFF0, - 0x557FFFF0, - 0x555FFFF0, - 0x5557FFF0, - 0x5555FFF0, - 0x55557FF0, - 0x55555FF0, - 0x555557F0, - 0x555555F0, - 0x55555570, - 0xEAAAAAA8, - 0xFAAAAAA8, - 0x7AAAAAA8, - 0xFEAAAAA8, - 0x7EAAAAA8, - 0x5EAAAAA8, - 0xFFAAAAA8, - 0x7FAAAAA8, - 0x5FAAAAA8, - 0x57AAAAA8, - 0xFFEAAAA8, - 0x7FEAAAA8, - 0x5FEAAAA8, - 0x57EAAAA8, - 0x55EAAAA8, - 0xFFFAAAA8, - 0x7FFAAAA8, - 0x5FFAAAA8, - 0x57FAAAA8, - 0x55FAAAA8, - 0x557AAAA8, - 0xFFFEAAA8, - 0x7FFEAAA8, - 0x5FFEAAA8, - 0x57FEAAA8, - 0x55FEAAA8, - 0x557EAAA8, - 0x555EAAA8, - 0xFFFFAAA8, - 0x7FFFAAA8, - 0x5FFFAAA8, - 0x57FFAAA8, - 0x55FFAAA8, - 0x557FAAA8, - 0x555FAAA8, - 0x5557AAA8, - 0xFFFFEAA8, - 0x7FFFEAA8, - 0x5FFFEAA8, - 0x57FFEAA8, - 0x55FFEAA8, - 0x557FEAA8, - 0x555FEAA8, - 0x5557EAA8, - 0x5555EAA8, - 0xFFFFFAA8, - 0x7FFFFAA8, - 0x5FFFFAA8, - 0x57FFFAA8, - 0x55FFFAA8, - 0x557FFAA8, - 0x555FFAA8, - 0x5557FAA8, - 0x5555FAA8, - 0x55557AA8, - 0xFFFFFEA8, - 0x7FFFFEA8, - 0x5FFFFEA8, - 0x57FFFEA8, - 0x55FFFEA8, - 0x557FFEA8, - 0x555FFEA8, - 0x5557FEA8, - 0x5555FEA8, - 0x55557EA8, - 0x55555EA8, - 0xFFFFFFA8, - 0x7FFFFFA8, - 0x5FFFFFA8, - 0x57FFFFA8, - 0x55FFFFA8, - 0x557FFFA8, - 0x555FFFA8, - 0x5557FFA8, - 0x5555FFA8, - 0x55557FA8, - 0x55555FA8, - 0x555557A8, - 0xFFFFFFE8, - 0x7FFFFFE8, - 0x5FFFFFE8, - 0x57FFFFE8, - 0x55FFFFE8, - 0x557FFFE8, - 0x555FFFE8, - 0x5557FFE8, - 0x5555FFE8, - 0x55557FE8, - 0x55555FE8, - 0x555557E8, - 0x555555E8, - 0xFFFFFFF8, - 0x7FFFFFF8, - 0x5FFFFFF8, - 0x57FFFFF8, - 0x55FFFFF8, - 0x557FFFF8, - 0x555FFFF8, - 0x5557FFF8, - 0x5555FFF8, - 0x55557FF8, - 0x55555FF8, - 0x555557F8, - 0x555555F8, - 0x55555578, - 0xFFFFFFFC, - 0x7FFFFFFC, - 0x5FFFFFFC, - 0x57FFFFFC, - 0x55FFFFFC, - 0x557FFFFC, - 0x555FFFFC, - 0x5557FFFC, - 0x5555FFFC, - 0x55557FFC, - 0x55555FFC, - 0x555557FC, - 0x555555FC, - 0x5555557C, - 0x5555555C, - 0xEAAAAAAA, - 0xFAAAAAAA, - 0x7AAAAAAA, - 0xFEAAAAAA, - 0x7EAAAAAA, - 0x5EAAAAAA, - 0xFFAAAAAA, - 0x7FAAAAAA, - 0x5FAAAAAA, - 0x57AAAAAA, - 0xFFEAAAAA, - 0x7FEAAAAA, - 0x5FEAAAAA, - 0x57EAAAAA, - 0x55EAAAAA, - 0xFFFAAAAA, - 0x7FFAAAAA, - 0x5FFAAAAA, - 0x57FAAAAA, - 0x55FAAAAA, - 0x557AAAAA, - 0xFFFEAAAA, - 0x7FFEAAAA, - 0x5FFEAAAA, - 0x57FEAAAA, - 0x55FEAAAA, - 0x557EAAAA, - 0x555EAAAA, - 0xFFFFAAAA, - 0x7FFFAAAA, - 0x5FFFAAAA, - 0x57FFAAAA, - 0x55FFAAAA, - 0x557FAAAA, - 0x555FAAAA, - 0x5557AAAA, - 0xFFFFEAAA, - 0x7FFFEAAA, - 0x5FFFEAAA, - 0x57FFEAAA, - 0x55FFEAAA, - 0x557FEAAA, - 0x555FEAAA, - 0x5557EAAA, - 0x5555EAAA, - 0xFFFFFAAA, - 0x7FFFFAAA, - 0x5FFFFAAA, - 0x57FFFAAA, - 0x55FFFAAA, - 0x557FFAAA, - 0x555FFAAA, - 0x5557FAAA, - 0x5555FAAA, - 0x55557AAA, - 0xFFFFFEAA, - 0x7FFFFEAA, - 0x5FFFFEAA, - 0x57FFFEAA, - 0x55FFFEAA, - 0x557FFEAA, - 0x555FFEAA, - 0x5557FEAA, - 0x5555FEAA, - 0x55557EAA, - 0x55555EAA, - 0xFFFFFFAA, - 0x7FFFFFAA, - 0x5FFFFFAA, - 0x57FFFFAA, - 0x55FFFFAA, - 0x557FFFAA, - 0x555FFFAA, - 0x5557FFAA, - 0x5555FFAA, - 0x55557FAA, - 0x55555FAA, - 0x555557AA, - 0xFFFFFFEA, - 0x7FFFFFEA, - 0x5FFFFFEA, - 0x57FFFFEA, - 0x55FFFFEA, - 0x557FFFEA, - 0x555FFFEA, - 0x5557FFEA, - 0x5555FFEA, - 0x55557FEA, - 0x55555FEA, - 0x555557EA, - 0x555555EA, - 0xFFFFFFFA, - 0x7FFFFFFA, - 0x5FFFFFFA, - 0x57FFFFFA, - 0x55FFFFFA, - 0x557FFFFA, - 0x555FFFFA, - 0x5557FFFA, - 0x5555FFFA, - 0x55557FFA, - 0x55555FFA, - 0x555557FA, - 0x555555FA, - 0x5555557A, - 0xFFFFFFFE, - 0x7FFFFFFE, - 0x5FFFFFFE, - 0x57FFFFFE, - 0x55FFFFFE, - 0x557FFFFE, - 0x555FFFFE, - 0x5557FFFE, - 0x5555FFFE, - 0x55557FFE, - 0x55555FFE, - 0x555557FE, - 0x555555FE, - 0x5555557E, - 0x5555555E, - 0x7FFFFFFF, - 0x5FFFFFFF, - 0x57FFFFFF, - 0x55FFFFFF, 
- 0x557FFFFF, - 0x555FFFFF, - 0x5557FFFF, - 0x5555FFFF, - 0x55557FFF, - 0x55555FFF, - 0x555557FF, - 0x555555FF, - 0x5555557F, - 0x5555555F, - 0x55555557, - 0x55555557, - 0x55555557, - 0x55555557, - 0x55555557, - 0x55555557, - 0x55555557, - 0x55555557, - 0x55555557, - 0x55555557, - 0x55555557, - 0x55555557, - 0x55555557, - 0x55555557, - 0x55555557, - 0x55555557, - 0x55555557, - 0x55555557, -}; Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CompressKernel.cu =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CompressKernel.cu +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CompressKernel.cu @@ -1,1122 +1,2022 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include -#include -#include -#include - -#include "CudaMath.h" - -#include "../SingleColorLookup.h" - -#define NUM_THREADS 64 // Number of threads per block. - -#if __DEVICE_EMULATION__ -#define __debugsync() __syncthreads() -#else -#define __debugsync() -#endif - -typedef unsigned char uchar; -typedef unsigned short ushort; -typedef unsigned int uint; - -template -__device__ inline void swap(T & a, T & b) -{ - T tmp = a; - a = b; - b = tmp; -} - -__constant__ float3 kColorMetric = { 1.0f, 1.0f, 1.0f }; -__constant__ float3 kColorMetricSqr = { 1.0f, 1.0f, 1.0f }; - - - -//////////////////////////////////////////////////////////////////////////////// -// Sort colors -//////////////////////////////////////////////////////////////////////////////// -__device__ void sortColors(const float * values, int * cmp) -{ - int tid = threadIdx.x; - -#if 1 - cmp[tid] = (values[0] < values[tid]); - cmp[tid] += (values[1] < values[tid]); - cmp[tid] += (values[2] < values[tid]); - cmp[tid] += (values[3] < values[tid]); - cmp[tid] += (values[4] < values[tid]); - cmp[tid] += (values[5] < values[tid]); - cmp[tid] += (values[6] < values[tid]); - cmp[tid] += (values[7] < values[tid]); - cmp[tid] += (values[8] < values[tid]); - cmp[tid] += (values[9] < values[tid]); - cmp[tid] += (values[10] < values[tid]); - cmp[tid] += (values[11] < values[tid]); - cmp[tid] += (values[12] < values[tid]); - cmp[tid] += (values[13] < values[tid]); - cmp[tid] += (values[14] < values[tid]); - cmp[tid] += (values[15] < values[tid]); - - // Resolve elements with the same index. 
- if (tid > 0 && cmp[tid] == cmp[0]) ++cmp[tid]; - if (tid > 1 && cmp[tid] == cmp[1]) ++cmp[tid]; - if (tid > 2 && cmp[tid] == cmp[2]) ++cmp[tid]; - if (tid > 3 && cmp[tid] == cmp[3]) ++cmp[tid]; - if (tid > 4 && cmp[tid] == cmp[4]) ++cmp[tid]; - if (tid > 5 && cmp[tid] == cmp[5]) ++cmp[tid]; - if (tid > 6 && cmp[tid] == cmp[6]) ++cmp[tid]; - if (tid > 7 && cmp[tid] == cmp[7]) ++cmp[tid]; - if (tid > 8 && cmp[tid] == cmp[8]) ++cmp[tid]; - if (tid > 9 && cmp[tid] == cmp[9]) ++cmp[tid]; - if (tid > 10 && cmp[tid] == cmp[10]) ++cmp[tid]; - if (tid > 11 && cmp[tid] == cmp[11]) ++cmp[tid]; - if (tid > 12 && cmp[tid] == cmp[12]) ++cmp[tid]; - if (tid > 13 && cmp[tid] == cmp[13]) ++cmp[tid]; - if (tid > 14 && cmp[tid] == cmp[14]) ++cmp[tid]; -#else - - cmp[tid] = 0; - - #pragma unroll - for (int i = 0; i < 16; i++) - { - cmp[tid] += (values[i] < values[tid]); - } - - // Resolve elements with the same index. - #pragma unroll - for (int i = 0; i < 15; i++) - { - if (tid > 0 && cmp[tid] == cmp[i]) ++cmp[tid]; - } -#endif -} - - -//////////////////////////////////////////////////////////////////////////////// -// Load color block to shared mem -//////////////////////////////////////////////////////////////////////////////// -__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor) -{ - const int bid = blockIdx.x; - const int idx = threadIdx.x; - - __shared__ float dps[16]; - - if (idx < 16) - { - // Read color and copy to shared mem. - uint c = image[(bid) * 16 + idx]; - - colors[idx].z = ((c >> 0) & 0xFF) * (1.0f / 255.0f); - colors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f); - colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f); - - // No need to synchronize, 16 < warp size. -#if __DEVICE_EMULATION__ - } __debugsync(); if (idx < 16) { -#endif - - // Sort colors along the best fit line. - colorSums(colors, sums); - float3 axis = bestFitLine(colors, sums[0], kColorMetric); - - *sameColor = (axis == make_float3(0, 0, 0)); - - dps[idx] = dot(colors[idx], axis); - -#if __DEVICE_EMULATION__ - } __debugsync(); if (idx < 16) { -#endif - - sortColors(dps, xrefs); - - float3 tmp = colors[idx]; - colors[xrefs[idx]] = tmp; - } -} - -__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor) -{ - const int bid = blockIdx.x; - const int idx = threadIdx.x; - - __shared__ float3 rawColors[16]; - __shared__ float dps[16]; - - if (idx < 16) - { - // Read color and copy to shared mem. - uint c = image[(bid) * 16 + idx]; - - rawColors[idx].z = ((c >> 0) & 0xFF) * (1.0f / 255.0f); - rawColors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f); - rawColors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f); - weights[idx] = (((c >> 24) & 0xFF) + 1) * (1.0f / 256.0f); - - colors[idx] = rawColors[idx] * weights[idx]; - - - // No need to synchronize, 16 < warp size. -#if __DEVICE_EMULATION__ - } __debugsync(); if (idx < 16) { -#endif - - // Sort colors along the best fit line. - colorSums(colors, sums); - float3 axis = bestFitLine(colors, sums[0], kColorMetric); - - *sameColor = (axis == make_float3(0, 0, 0)); - - // Single color compressor needs unweighted colors. 
- if (*sameColor) colors[idx] = rawColors[idx]; - - dps[idx] = dot(rawColors[idx], axis); - -#if __DEVICE_EMULATION__ - } __debugsync(); if (idx < 16) { -#endif - - sortColors(dps, xrefs); - - float3 tmp = colors[idx]; - colors[xrefs[idx]] = tmp; - - float w = weights[idx]; - weights[xrefs[idx]] = w; - } -} - - -//////////////////////////////////////////////////////////////////////////////// -// Round color to RGB565 and expand -//////////////////////////////////////////////////////////////////////////////// -inline __device__ float3 roundAndExpand565(float3 v, ushort * w) -{ - v.x = rintf(__saturatef(v.x) * 31.0f); - v.y = rintf(__saturatef(v.y) * 63.0f); - v.z = rintf(__saturatef(v.z) * 31.0f); - *w = ((ushort)v.x << 11) | ((ushort)v.y << 5) | (ushort)v.z; - v.x *= 0.03227752766457f; // approximate integer bit expansion. - v.y *= 0.01583151765563f; - v.z *= 0.03227752766457f; - return v; -} - - -//////////////////////////////////////////////////////////////////////////////// -// Evaluate permutations -//////////////////////////////////////////////////////////////////////////////// -__device__ float evalPermutation4(const float3 * colors, uint permutation, ushort * start, ushort * end) -{ - // Compute endpoints using least squares. - float alpha2_sum = 0.0f; - float beta2_sum = 0.0f; - float alphabeta_sum = 0.0f; - float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); - float3 betax_sum = make_float3(0.0f, 0.0f, 0.0f); - - // Compute alpha & beta for this permutation. - for (int i = 0; i < 16; i++) - { - const uint bits = permutation >> (2*i); - - float beta = (bits & 1); - if (bits & 2) beta = (1 + beta) / 3.0f; - float alpha = 1.0f - beta; - - alpha2_sum += alpha * alpha; - beta2_sum += beta * beta; - alphabeta_sum += alpha * beta; - alphax_sum += alpha * colors[i]; - betax_sum += beta * colors[i]; - } - - const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - - float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; - float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; - - // Round a, b to the closest 5-6-5 color and expand... - a = roundAndExpand565(a, start); - b = roundAndExpand565(b, end); - - // compute the error - float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); - - return dot(e, kColorMetricSqr); -} - -__device__ float evalPermutation3(const float3 * colors, uint permutation, ushort * start, ushort * end) -{ - // Compute endpoints using least squares. - float alpha2_sum = 0.0f; - float beta2_sum = 0.0f; - float alphabeta_sum = 0.0f; - float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); - float3 betax_sum = make_float3(0.0f, 0.0f, 0.0f); - - // Compute alpha & beta for this permutation. - for (int i = 0; i < 16; i++) - { - const uint bits = permutation >> (2*i); - - float beta = (bits & 1); - if (bits & 2) beta = 0.5f; - float alpha = 1.0f - beta; - - alpha2_sum += alpha * alpha; - beta2_sum += beta * beta; - alphabeta_sum += alpha * beta; - alphax_sum += alpha * colors[i]; - betax_sum += beta * colors[i]; - } - - const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - - float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; - float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; - - // Round a, b to the closest 5-6-5 color and expand... 
- a = roundAndExpand565(a, start); - b = roundAndExpand565(b, end); - - // compute the error - float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); - - return dot(e, kColorMetricSqr); -} - -__constant__ const float alphaTable4[4] = { 9.0f, 0.0f, 6.0f, 3.0f }; -__constant__ const float alphaTable3[4] = { 4.0f, 0.0f, 2.0f, 2.0f }; -__constant__ const uint prods4[4] = { 0x090000,0x000900,0x040102,0x010402 }; -__constant__ const uint prods3[4] = { 0x040000,0x000400,0x040101,0x010401 }; - -__device__ float evalPermutation4(const float3 * colors, float3 color_sum, uint permutation, ushort * start, ushort * end) -{ - // Compute endpoints using least squares. - float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); - uint akku = 0; - - // Compute alpha & beta for this permutation. - #pragma unroll - for (int i = 0; i < 16; i++) - { - const uint bits = permutation >> (2*i); - - alphax_sum += alphaTable4[bits & 3] * colors[i]; - akku += prods4[bits & 3]; - } - - float alpha2_sum = float(akku >> 16); - float beta2_sum = float((akku >> 8) & 0xff); - float alphabeta_sum = float(akku & 0xff); - float3 betax_sum = 9.0f * color_sum - alphax_sum; - - const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - - float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; - float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; - - // Round a, b to the closest 5-6-5 color and expand... - a = roundAndExpand565(a, start); - b = roundAndExpand565(b, end); - - // compute the error - float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); - - return (1.0f / 9.0f) * dot(e, kColorMetricSqr); -} - -__device__ float evalPermutation3(const float3 * colors, float3 color_sum, uint permutation, ushort * start, ushort * end) -{ - // Compute endpoints using least squares. - float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); - uint akku = 0; - - // Compute alpha & beta for this permutation. - #pragma unroll - for (int i = 0; i < 16; i++) - { - const uint bits = permutation >> (2*i); - - alphax_sum += alphaTable3[bits & 3] * colors[i]; - akku += prods3[bits & 3]; - } - - float alpha2_sum = float(akku >> 16); - float beta2_sum = float((akku >> 8) & 0xff); - float alphabeta_sum = float(akku & 0xff); - float3 betax_sum = 4.0f * color_sum - alphax_sum; - - const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - - float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; - float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; - - // Round a, b to the closest 5-6-5 color and expand... - a = roundAndExpand565(a, start); - b = roundAndExpand565(b, end); - - // compute the error - float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); - - return (1.0f / 4.0f) * dot(e, kColorMetricSqr); -} - -__device__ float evalPermutation4(const float3 * colors, const float * weights, float3 color_sum, uint permutation, ushort * start, ushort * end) -{ - // Compute endpoints using least squares. - float alpha2_sum = 0.0f; - float beta2_sum = 0.0f; - float alphabeta_sum = 0.0f; - float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); - - // Compute alpha & beta for this permutation. 
- for (int i = 0; i < 16; i++) - { - const uint bits = permutation >> (2*i); - - float beta = (bits & 1); - if (bits & 2) beta = (1 + beta) / 3.0f; - float alpha = 1.0f - beta; - - alpha2_sum += alpha * alpha * weights[i]; - beta2_sum += beta * beta * weights[i]; - alphabeta_sum += alpha * beta * weights[i]; - alphax_sum += alpha * colors[i]; - } - - float3 betax_sum = color_sum - alphax_sum; - - const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - - float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; - float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; - - // Round a, b to the closest 5-6-5 color and expand... - a = roundAndExpand565(a, start); - b = roundAndExpand565(b, end); - - // compute the error - float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); - - return dot(e, kColorMetricSqr); -} - -/* -__device__ float evalPermutation3(const float3 * colors, const float * weights, uint permutation, ushort * start, ushort * end) -{ - // Compute endpoints using least squares. - float alpha2_sum = 0.0f; - float beta2_sum = 0.0f; - float alphabeta_sum = 0.0f; - float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); - - // Compute alpha & beta for this permutation. - for (int i = 0; i < 16; i++) - { - const uint bits = permutation >> (2*i); - - float beta = (bits & 1); - if (bits & 2) beta = 0.5f; - float alpha = 1.0f - beta; - - alpha2_sum += alpha * alpha * weights[i]; - beta2_sum += beta * beta * weights[i]; - alphabeta_sum += alpha * beta * weights[i]; - alphax_sum += alpha * colors[i]; - } - - float3 betax_sum = color_sum - alphax_sum; - - const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - - float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; - float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; - - // Round a, b to the closest 5-6-5 color and expand... - a = roundAndExpand565(a, start); - b = roundAndExpand565(b, end); - - // compute the error - float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); - - return dot(e, kColorMetricSqr); -} -*/ - - -//////////////////////////////////////////////////////////////////////////////// -// Evaluate all permutations -//////////////////////////////////////////////////////////////////////////////// -__device__ void evalAllPermutations(const float3 * colors, float3 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors) -{ - const int idx = threadIdx.x; - - float bestError = FLT_MAX; - - __shared__ uint s_permutations[160]; - - for(int i = 0; i < 16; i++) - { - int pidx = idx + NUM_THREADS * i; - if (pidx >= 992) break; - - ushort start, end; - uint permutation = permutations[pidx]; - if (pidx < 160) s_permutations[pidx] = permutation; - - float error = evalPermutation4(colors, colorSum, permutation, &start, &end); - - if (error < bestError) - { - bestError = error; - bestPermutation = permutation; - bestStart = start; - bestEnd = end; - } - } - - if (bestStart < bestEnd) - { - swap(bestEnd, bestStart); - bestPermutation ^= 0x55555555; // Flip indices. 
- } - - for(int i = 0; i < 3; i++) - { - int pidx = idx + NUM_THREADS * i; - if (pidx >= 160) break; - - ushort start, end; - uint permutation = s_permutations[pidx]; - float error = evalPermutation3(colors, colorSum, permutation, &start, &end); - - if (error < bestError) - { - bestError = error; - bestPermutation = permutation; - bestStart = start; - bestEnd = end; - - if (bestStart > bestEnd) - { - swap(bestEnd, bestStart); - bestPermutation ^= (~bestPermutation >> 1) & 0x55555555; // Flip indices. - } - } - } - - errors[idx] = bestError; -} - -/* -__device__ void evalAllPermutations(const float3 * colors, const float * weights, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors) -{ - const int idx = threadIdx.x; - - float bestError = FLT_MAX; - - __shared__ uint s_permutations[160]; - - for(int i = 0; i < 16; i++) - { - int pidx = idx + NUM_THREADS * i; - if (pidx >= 992) break; - - ushort start, end; - uint permutation = permutations[pidx]; - if (pidx < 160) s_permutations[pidx] = permutation; - - float error = evalPermutation4(colors, weights, permutation, &start, &end); - - if (error < bestError) - { - bestError = error; - bestPermutation = permutation; - bestStart = start; - bestEnd = end; - } - } - - if (bestStart < bestEnd) - { - swap(bestEnd, bestStart); - bestPermutation ^= 0x55555555; // Flip indices. - } - - for(int i = 0; i < 3; i++) - { - int pidx = idx + NUM_THREADS * i; - if (pidx >= 160) break; - - ushort start, end; - uint permutation = s_permutations[pidx]; - float error = evalPermutation3(colors, weights, permutation, &start, &end); - - if (error < bestError) - { - bestError = error; - bestPermutation = permutation; - bestStart = start; - bestEnd = end; - - if (bestStart > bestEnd) - { - swap(bestEnd, bestStart); - bestPermutation ^= (~bestPermutation >> 1) & 0x55555555; // Flip indices. - } - } - } - - errors[idx] = bestError; -} -*/ - -__device__ void evalLevel4Permutations(const float3 * colors, float3 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors) -{ - const int idx = threadIdx.x; - - float bestError = FLT_MAX; - - for(int i = 0; i < 16; i++) - { - int pidx = idx + NUM_THREADS * i; - if (pidx >= 992) break; - - ushort start, end; - uint permutation = permutations[pidx]; - - float error = evalPermutation4(colors, colorSum, permutation, &start, &end); - - if (error < bestError) - { - bestError = error; - bestPermutation = permutation; - bestStart = start; - bestEnd = end; - } - } - - if (bestStart < bestEnd) - { - swap(bestEnd, bestStart); - bestPermutation ^= 0x55555555; // Flip indices. - } - - errors[idx] = bestError; -} - -__device__ void evalLevel4Permutations(const float3 * colors, const float * weights, float3 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors) -{ - const int idx = threadIdx.x; - - float bestError = FLT_MAX; - - for(int i = 0; i < 16; i++) - { - int pidx = idx + NUM_THREADS * i; - if (pidx >= 992) break; - - ushort start, end; - uint permutation = permutations[pidx]; - - float error = evalPermutation4(colors, weights, colorSum, permutation, &start, &end); - - if (error < bestError) - { - bestError = error; - bestPermutation = permutation; - bestStart = start; - bestEnd = end; - } - } - - if (bestStart < bestEnd) - { - swap(bestEnd, bestStart); - bestPermutation ^= 0x55555555; // Flip indices. 
- } - - errors[idx] = bestError; -} - - -//////////////////////////////////////////////////////////////////////////////// -// Find index with minimum error -//////////////////////////////////////////////////////////////////////////////// -__device__ int findMinError(float * errors) -{ - const int idx = threadIdx.x; - - __shared__ int indices[NUM_THREADS]; - indices[idx] = idx; - -#if __DEVICE_EMULATION__ - for(int d = NUM_THREADS/2; d > 0; d >>= 1) - { - __syncthreads(); - - if (idx < d) - { - float err0 = errors[idx]; - float err1 = errors[idx + d]; - - if (err1 < err0) { - errors[idx] = err1; - indices[idx] = indices[idx + d]; - } - } - } - -#else - for(int d = NUM_THREADS/2; d > 32; d >>= 1) - { - __syncthreads(); - - if (idx < d) - { - float err0 = errors[idx]; - float err1 = errors[idx + d]; - - if (err1 < err0) { - errors[idx] = err1; - indices[idx] = indices[idx + d]; - } - } - } - - __syncthreads(); - - // unroll last 6 iterations - if (idx < 32) - { - if (errors[idx + 32] < errors[idx]) { - errors[idx] = errors[idx + 32]; - indices[idx] = indices[idx + 32]; - } - if (errors[idx + 16] < errors[idx]) { - errors[idx] = errors[idx + 16]; - indices[idx] = indices[idx + 16]; - } - if (errors[idx + 8] < errors[idx]) { - errors[idx] = errors[idx + 8]; - indices[idx] = indices[idx + 8]; - } - if (errors[idx + 4] < errors[idx]) { - errors[idx] = errors[idx + 4]; - indices[idx] = indices[idx + 4]; - } - if (errors[idx + 2] < errors[idx]) { - errors[idx] = errors[idx + 2]; - indices[idx] = indices[idx + 2]; - } - if (errors[idx + 1] < errors[idx]) { - errors[idx] = errors[idx + 1]; - indices[idx] = indices[idx + 1]; - } - } -#endif - - __syncthreads(); - - return indices[0]; -} - - -//////////////////////////////////////////////////////////////////////////////// -// Save DXT block -//////////////////////////////////////////////////////////////////////////////// -__device__ void saveBlockDXT1(ushort start, ushort end, uint permutation, int xrefs[16], uint2 * result) -{ - const int bid = blockIdx.x; - - if (start == end) - { - permutation = 0; - } - - // Reorder permutation. - uint indices = 0; - for(int i = 0; i < 16; i++) - { - int ref = xrefs[i]; - indices |= ((permutation >> (2 * ref)) & 3) << (2 * i); - } - - // Write endpoints. - result[bid].x = (end << 16) | start; - - // Write palette indices. 
- result[bid].y = indices; -} - -__device__ void saveSingleColorBlockDXT1(float3 color, uint2 * result) -{ - const int bid = blockIdx.x; - - int r = color.x * 255; - int g = color.y * 255; - int b = color.z * 255; - - ushort color0 = (OMatch5[r][0] << 11) | (OMatch6[g][0] << 5) | OMatch5[b][0]; - ushort color1 = (OMatch5[r][1] << 11) | (OMatch6[g][1] << 5) | OMatch5[b][1]; - - if (color0 < color1) - { - result[bid].x = (color0 << 16) | color1; - result[bid].y = 0xffffffff; - } - else - { - result[bid].x = (color1 << 16) | color0; - result[bid].y = 0xaaaaaaaa; - } -} - - -//////////////////////////////////////////////////////////////////////////////// -// Compress color block -//////////////////////////////////////////////////////////////////////////////// -__global__ void compressDXT1(const uint * permutations, const uint * image, uint2 * result) -{ - __shared__ float3 colors[16]; - __shared__ float3 sums[16]; - __shared__ int xrefs[16]; - __shared__ int sameColor; - - loadColorBlock(image, colors, sums, xrefs, &sameColor); - - __syncthreads(); - - if (sameColor) - { - if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result); - return; - } - - ushort bestStart, bestEnd; - uint bestPermutation; - - __shared__ float errors[NUM_THREADS]; - - evalAllPermutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors); - - // Use a parallel reduction to find minimum error. - const int minIdx = findMinError(errors); - - // Only write the result of the winner thread. - if (threadIdx.x == minIdx) - { - saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result); - } -} - -__global__ void compressLevel4DXT1(const uint * permutations, const uint * image, uint2 * result) -{ - __shared__ float3 colors[16]; - __shared__ float3 sums[16]; - __shared__ int xrefs[16]; - __shared__ int sameColor; - - loadColorBlock(image, colors, sums, xrefs, &sameColor); - - __syncthreads(); - - if (sameColor) - { - if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result); - return; - } - - ushort bestStart, bestEnd; - uint bestPermutation; - - __shared__ float errors[NUM_THREADS]; - - evalLevel4Permutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors); - - // Use a parallel reduction to find minimum error. - const int minIdx = findMinError(errors); - - // Only write the result of the winner thread. - if (threadIdx.x == minIdx) - { - saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result); - } -} - -__global__ void compressWeightedDXT1(const uint * permutations, const uint * image, uint2 * result) -{ - __shared__ float3 colors[16]; - __shared__ float3 sums[16]; - __shared__ float weights[16]; - __shared__ int xrefs[16]; - __shared__ int sameColor; - - loadColorBlock(image, colors, sums, weights, xrefs, &sameColor); - - __syncthreads(); - - if (sameColor) - { - if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result); - return; - } - - ushort bestStart, bestEnd; - uint bestPermutation; - - __shared__ float errors[NUM_THREADS]; - - evalLevel4Permutations(colors, weights, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors); - - // Use a parallel reduction to find minimum error. - int minIdx = findMinError(errors); - - // Only write the result of the winner thread. 
- if (threadIdx.x == minIdx) - { - saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result); - } -} - - -/* -__device__ float computeError(const float weights[16], uchar a0, uchar a1) -{ - float palette[6]; - palette[0] = (6.0f/7.0f * a0 + 1.0f/7.0f * a1); - palette[1] = (5.0f/7.0f * a0 + 2.0f/7.0f * a1); - palette[2] = (4.0f/7.0f * a0 + 3.0f/7.0f * a1); - palette[3] = (3.0f/7.0f * a0 + 4.0f/7.0f * a1); - palette[4] = (2.0f/7.0f * a0 + 5.0f/7.0f * a1); - palette[5] = (1.0f/7.0f * a0 + 6.0f/7.0f * a1); - - float total = 0.0f; - - for (uint i = 0; i < 16; i++) - { - float alpha = weights[i]; - - float error = a0 - alpha; - error = min(error, palette[0] - alpha); - error = min(error, palette[1] - alpha); - error = min(error, palette[2] - alpha); - error = min(error, palette[3] - alpha); - error = min(error, palette[4] - alpha); - error = min(error, palette[5] - alpha); - error = min(error, a1 - alpha); - - total += error; - } - - return total; -} - -inline __device__ uchar roundAndExpand(float a) -{ - return rintf(__saturatef(a) * 255.0f); -} -*/ -/* -__device__ void optimizeAlpha8(const float alphas[16], uchar & a0, uchar & a1) -{ - float alpha2_sum = 0; - float beta2_sum = 0; - float alphabeta_sum = 0; - float alphax_sum = 0; - float betax_sum = 0; - - for (int i = 0; i < 16; i++) - { - uint idx = index[i]; - float alpha; - if (idx < 2) alpha = 1.0f - idx; - else alpha = (8.0f - idx) / 7.0f; - - float beta = 1 - alpha; - - alpha2_sum += alpha * alpha; - beta2_sum += beta * beta; - alphabeta_sum += alpha * beta; - alphax_sum += alpha * alphas[i]; - betax_sum += beta * alphas[i]; - } - - const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - - float a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; - float b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; - - a0 = roundAndExpand8(a); - a1 = roundAndExpand8(b); -} -*/ -/* -__device__ void compressAlpha(const float alphas[16], uint4 * result) -{ - const int tid = threadIdx.x; - - // Compress alpha block! - // Brute force approach: - // Try all color pairs: 256*256/2 = 32768, 32768/64 = 512 iterations? - - // Determine min & max alphas - - float A0, A1; - - if (tid < 16) - { - __shared__ uint s_alphas[16]; - - s_alphas[tid] = alphas[tid]; - s_alphas[tid] = min(s_alphas[tid], s_alphas[tid^8]); - s_alphas[tid] = min(s_alphas[tid], s_alphas[tid^4]); - s_alphas[tid] = min(s_alphas[tid], s_alphas[tid^2]); - s_alphas[tid] = min(s_alphas[tid], s_alphas[tid^1]); - A0 = s_alphas[tid]; - - s_alphas[tid] = alphas[tid]; - s_alphas[tid] = max(s_alphas[tid], s_alphas[tid^8]); - s_alphas[tid] = max(s_alphas[tid], s_alphas[tid^4]); - s_alphas[tid] = max(s_alphas[tid], s_alphas[tid^2]); - s_alphas[tid] = max(s_alphas[tid], s_alphas[tid^1]); - A1 = s_alphas[tid]; - } - - __syncthreads(); - - int minIdx = 0; - - if (A1 - A0 > 8) - { - float bestError = FLT_MAX; - - // 64 threads -> 8x8 - // divide [A1-A0] in partitions. - // test endpoints - - for (int i = 0; i < 128; i++) - { - uint idx = (i * NUM_THREADS + tid) * 4; - uchar a0 = idx & 255; - uchar a1 = idx >> 8; - - float error = computeError(alphas, a0, a1); - - if (error < bestError) - { - bestError = error; - A0 = a0; - A1 = a1; - } - } - - __shared__ float errors[NUM_THREADS]; - errors[tid] = bestError; - - // Minimize error. - minIdx = findMinError(errors); - - } - - if (minIdx == tid) - { - // @@ Compute indices. - - // @@ Write alpha block. 
- } -} - -__global__ void compressDXT5(const uint * permutations, const uint * image, uint4 * result) -{ - __shared__ float3 colors[16]; - __shared__ float3 sums[16]; - __shared__ float weights[16]; - __shared__ int xrefs[16]; - - loadColorBlock(image, colors, sums, weights, xrefs); - - __syncthreads(); - - compressAlpha(weights, result); - - ushort bestStart, bestEnd; - uint bestPermutation; - - __shared__ float errors[NUM_THREADS]; - - evalLevel4Permutations(colors, weights, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors); - - // Use a parallel reduction to find minimum error. - int minIdx = findMinError(errors); - - // Only write the result of the winner thread. - if (threadIdx.x == minIdx) - { - saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, (uint2 *)result); - } -} -*/ - -//////////////////////////////////////////////////////////////////////////////// -// Setup kernel -//////////////////////////////////////////////////////////////////////////////// - -extern "C" void setupCompressKernel(const float weights[3]) -{ - // Set constants. - cudaMemcpyToSymbol(kColorMetric, weights, sizeof(float) * 3, 0); - - float weightsSqr[3]; - weightsSqr[0] = weights[0] * weights[0]; - weightsSqr[1] = weights[1] * weights[1]; - weightsSqr[2] = weights[2] * weights[2]; - - cudaMemcpyToSymbol(kColorMetricSqr, weightsSqr, sizeof(float) * 3, 0); -} - - -//////////////////////////////////////////////////////////////////////////////// -// Launch kernel -//////////////////////////////////////////////////////////////////////////////// - -extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps) -{ - compressDXT1<<>>(d_bitmaps, d_data, (uint2 *)d_result); -} - -extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps) -{ - compressLevel4DXT1<<>>(d_bitmaps, d_data, (uint2 *)d_result); -} - -extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps) -{ - compressWeightedDXT1<<>>(d_bitmaps, d_data, (uint2 *)d_result); -} +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include +#include // FLT_MAX + +#include "CudaMath.h" + + +#define NUM_THREADS 64 // Number of threads per block. 
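
Background note on this hunk: the long hexadecimal tables earlier in the patch (the one added above and the s_bitmapTable being removed from cuda/Bitmaps.h) each store candidate cluster assignments for a 4x4 texel block, packed as 16 two-bit selectors per 32-bit word; the kernels in this file unpack them with (permutation >> (2*i)) & 3 and turn each selector into an interpolation weight. The following is a minimal host-side sketch of that decoding, not part of the patch; the function name decodePermutation4 is illustrative only.

#include <cstdint>
#include <cstdio>

// Decode one 32-bit candidate word into 16 per-texel weights, mirroring the
// (bits & 1) / (bits & 2) logic of evalPermutation4()/blockError4():
// selector 0 -> 0, 1 -> 1, 2 -> 1/3, 3 -> 2/3, i.e. the weight given to the
// second endpoint when a candidate assignment is evaluated.
static void decodePermutation4(uint32_t permutation, float beta[16])
{
    for (int i = 0; i < 16; ++i)
    {
        uint32_t bits = (permutation >> (2 * i)) & 3u;
        float b = float(bits & 1u);            // 0 or 1
        if (bits & 2u) b = (1.0f + b) / 3.0f;  // 1/3 or 2/3
        beta[i] = b;
    }
}

int main()
{
    float beta[16];
    decodePermutation4(0xAAAA8000u, beta); // one entry of the table above
    for (int i = 0; i < 16; ++i)
        std::printf("texel %2d: beta = %.3f\n", i, beta[i]);
    return 0;
}
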
+ +typedef unsigned char uchar; +typedef unsigned short ushort; +typedef unsigned int uint; + +template +__device__ inline void swap(T & a, T & b) +{ + T tmp = a; + a = b; + b = tmp; +} + +__constant__ uchar OMatch5[256][2]; +__constant__ uchar OMatch6[256][2]; + +__constant__ float3 kColorMetric = { 1.0f, 1.0f, 1.0f }; +__constant__ float3 kColorMetricSqr = { 1.0f, 1.0f, 1.0f }; + +// Some kernels read the input through texture. +texture tex; + + +//////////////////////////////////////////////////////////////////////////////// +// Color helpers +//////////////////////////////////////////////////////////////////////////////// + +__device__ inline uint float_to_u8(float value) +{ + return min(max(__float2int_rn((255 * value + 0.5f) / (1.0f + 1.0f/255.0f)), 0), 255); +} + +__device__ inline uint float_to_u6(float value) +{ + return min(max(__float2int_rn((63 * value + 0.5f) / (1.0f + 1.0f/63.0f)), 0), 63); +} + +__device__ inline uint float_to_u5(float value) +{ + return min(max(__float2int_rn((31 * value + 0.5f) / (1.0f + 1.0f/31.0f)), 0), 31); +} + +__device__ inline float u8_to_float(uint value) +{ + return __saturatef(__uint2float_rn(value) / 255.0f); + //return (value) / 255.0f; +} + +__device__ float3 color32ToFloat3(uint c) +{ + float3 color; + color.z = u8_to_float((c >> 0) & 0xFF); + color.y = u8_to_float((c >> 8) & 0xFF); + color.x = u8_to_float((c >> 16) & 0xFF); + return color; +} + +__device__ int3 color16ToInt3(ushort c) +{ + int3 color; + + color.z = ((c >> 0) & 0x1F); + color.z = (color.z << 3) | (color.z >> 2); + + color.y = ((c >> 5) & 0x3F); + color.y = (color.y << 2) | (color.y >> 4); + + color.x = ((c >> 11) & 0x1F); + color.x = (color.x << 3) | (color.x >> 2); + + return color; +} + +__device__ float3 color16ToFloat3(ushort c) +{ + int3 color = color16ToInt3(c); + return make_float3(color.x, color.y, color.z) * (1.0f / 255.0f); +} + +__device__ int3 float3ToInt3(float3 c) +{ + return make_int3(c.x * 255, c.y * 255, c.z * 255); +} + +__device__ float3 int3ToFloat3(int3 c) +{ + return make_float3(float_to_u8(c.x), float_to_u8(c.y), float_to_u8(c.z)); +} + + +__device__ int colorDistance(int3 c0, int3 c1) +{ + int dx = c0.x-c1.x; + int dy = c0.y-c1.y; + int dz = c0.z-c1.z; + return __mul24(dx, dx) + __mul24(dy, dy) + __mul24(dz, dz); +} + + +//////////////////////////////////////////////////////////////////////////////// +// Round color to RGB565 and expand +//////////////////////////////////////////////////////////////////////////////// + + +#if 0 +__device__ inline uint float_to_u8(float value) +{ + //uint result; + //asm("cvt.sat.rni.u8.f32 %0, %1;" : "=r" (result) : "f" (value)); + //return result; + //return __float2uint_rn(__saturatef(value) * 255.0f); + + int result = __float2int_rn((255 * value + 0.5f) / (1.0f + 1.0f/255.0f)); + result = max(result, 0); + result = min(result, 255); + return result; +} + +__device__ inline float u8_to_float(uint value) +{ + //float result; + //asm("cvt.sat.rn.f32.u8 %0, %1;" : "=f" (result) : "r" (value)); // this is wrong! 
+ //return result; + return __saturatef(__uint2float_rn(value) / 255.0f); +} + +inline __device__ float3 roundAndExpand565(float3 v, ushort * w) +{ + uint x = float_to_u8(v.x) >> 3; + uint y = float_to_u8(v.y) >> 2; + uint z = float_to_u8(v.z) >> 3; + *w = (x << 11) | (y << 5) | z; + v.x = u8_to_float((x << 3) | (x >> 2)); + v.y = u8_to_float((y << 2) | (y >> 4)); + v.z = u8_to_float((z << 3) | (z >> 2)); +// v.x = u8_to_float(x) * 255.0f / 31.0f; +// v.y = u8_to_float(y) * 255.0f / 63.0f; +// v.z = u8_to_float(z) * 255.0f / 31.0f; + return v; +} +#else + +inline __device__ float3 roundAndExpand565(float3 v, ushort * w) +{ + uint x = __float2uint_rn(__saturatef(v.x) * 31.0f); + uint y = __float2uint_rn(__saturatef(v.y) * 63.0f); + uint z = __float2uint_rn(__saturatef(v.z) * 31.0f); + + //uint x = float_to_u5(v.x); + //uint y = float_to_u6(v.y); + //uint z = float_to_u5(v.z); + + *w = (x << 11) | (y << 5) | z; + + v.x = __uint2float_rn(x) * 1.0f / 31.0f; + v.y = __uint2float_rn(y) * 1.0f / 63.0f; + v.z = __uint2float_rn(z) * 1.0f / 31.0f; + + //v.x = u8_to_float((x << 3) | (x >> 2)); + //v.y = u8_to_float((y << 2) | (y >> 4)); + //v.z = u8_to_float((z << 3) | (z >> 2)); + + return v; +} +#endif +inline __device__ float2 roundAndExpand56(float2 v, ushort * w) +{ + uint x = __float2uint_rn(__saturatef(v.x) * 31.0f); + uint y = __float2uint_rn(__saturatef(v.y) * 63.0f); + *w = (x << 11) | (y << 5); + v.x = __uint2float_rn(x) * 1.0f / 31.0f; + v.y = __uint2float_rn(y) * 1.0f / 63.0f; + return v; +} + +inline __device__ float2 roundAndExpand88(float2 v, ushort * w) +{ + uint x = __float2uint_rn(__saturatef(v.x) * 255.0f); + uint y = __float2uint_rn(__saturatef(v.y) * 255.0f); + *w = (x << 8) | y; + v.x = __uint2float_rn(x) * 1.0f / 255.0f; + v.y = __uint2float_rn(y) * 1.0f / 255.0f; + return v; +} + + +//////////////////////////////////////////////////////////////////////////////// +// Block errors +//////////////////////////////////////////////////////////////////////////////// + +__device__ float3 blockError4(const float3 * colors, uint permutation, float3 a, float3 b) +{ + float3 error = make_float3(0.0f, 0.0f, 0.0f); + + for (int i = 0; i < 16; i++) + { + const uint bits = permutation >> (2*i); + + float beta = (bits & 1); + if (bits & 2) beta = (1 + beta) / 3.0f; + float alpha = 1.0f - beta; + + float3 diff = colors[i] - (a*alpha + b*beta); + + error += diff*diff; + } + + return error; +} + +__device__ float3 blockError4(const float3 * colors, uint permutation, ushort c0, ushort c1) +{ + float3 error = make_float3(0.0f, 0.0f, 0.0f); + + int3 color0 = color16ToInt3(c0); + int3 color1 = color16ToInt3(c1); + + for (int i = 0; i < 16; i++) + { + const uint bits = permutation >> (2*i); + + int beta = (bits & 1); + if (bits & 2) beta = (1 + beta); + float alpha = 3 - beta; + + int3 color; + color.x = (color0.x * alpha + color1.x * beta) / 3; + color.y = (color0.y * alpha + color1.y * beta) / 3; + color.z = (color0.z * alpha + color1.z * beta) / 3; + + float3 diff = colors[i] - int3ToFloat3(color); + + error += diff*diff; + } + + return error; +} + + +__device__ float3 blockError3(const float3 * colors, uint permutation, float3 a, float3 b) +{ + float3 error = make_float3(0.0f, 0.0f, 0.0f); + + for (int i = 0; i < 16; i++) + { + const uint bits = permutation >> (2*i); + + float beta = (bits & 1); + if (bits & 2) beta = 0.5f; + float alpha = 1.0f - beta; + + float3 diff = colors[i] - (a*alpha + b*beta); + + error += diff*diff; + } + + return error; +} + + 
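
The sortColors() routines below compute a rank for each of the 16 projected values (the count of strictly smaller values) and then break ties in favour of later elements, so the ranks form a permutation of 0..15 that the loadColorBlock*() functions use to scatter colors along the best-fit axis. A sequential sketch of the same rank sort follows; it is illustrative only (rankSort16 is not a function in this patch) and plays all 16 threads of the per-thread kernel version in a plain loop.

#include <cstdio>

// Sequential rendition of the rank sort: each value's rank is the number of
// strictly smaller values; equal values are then disambiguated by bumping
// the rank of later elements, exactly as the kernel does per thread.
static void rankSort16(const float values[16], int ranks[16])
{
    for (int tid = 0; tid < 16; ++tid)
    {
        int rank = 0;
        for (int i = 0; i < 16; ++i)
            rank += (values[i] < values[tid]);
        ranks[tid] = rank;
    }
    // Tie resolution: compare against earlier (already final) ranks and bump
    // on collision, so duplicates still receive distinct ranks.
    for (int tid = 0; tid < 16; ++tid)
        for (int i = 0; i < 15; ++i)
            if (tid > i && ranks[tid] == ranks[i])
                ++ranks[tid];
}

int main()
{
    const float dps[16] = { 0.2f, 0.7f, 0.2f, 0.9f, 0.1f, 0.5f, 0.5f, 0.3f,
                            0.8f, 0.2f, 0.6f, 0.4f, 0.0f, 1.0f, 0.5f, 0.3f };
    int xrefs[16];
    rankSort16(dps, xrefs);
    for (int i = 0; i < 16; ++i)
        std::printf("value %.2f -> rank %d\n", dps[i], xrefs[i]);
    return 0;
}
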
+//////////////////////////////////////////////////////////////////////////////// +// Sort colors +//////////////////////////////////////////////////////////////////////////////// + +// @@ Experimental code to avoid duplicate colors for faster compression. +// We could first sort along the best fit line and only compare colors that have the same projection. +// The hardest part is to maintain the indices to map packed/sorted colors to the input colors. +// We also need to update several functions that assume the number of colors is fixed to 16. +// And compute different bit maps for the different color counts. +// This is a fairly high amount of work. +__device__ int packColors(float3 * values, float * weights, int * ranks) +{ + const int tid = threadIdx.x; + + __shared__ int count; + count = 0; + + bool alive = true; + + // Append this + for (int i = 0; i < 16; i++) + { + // One thread leads on each iteration. + if (tid == i) { + + // If thread alive, then append element. + if (alive) { + values[count] = values[i]; + weights[count] = weights[i]; + count++; + } + + // Otherwise update weight. + else { + weights[ranks[i]] += weights[i]; + } + } + + // Kill all threads that have the same element and record rank. + if (values[i] == values[tid]) { + alive = false; + ranks[tid] = count - 1; + } + } + + return count; +} + + +__device__ void sortColors(const float * values, int * ranks) +{ + const int tid = threadIdx.x; + + int rank = 0; + + #pragma unroll + for (int i = 0; i < 16; i++) + { + rank += (values[i] < values[tid]); + } + + ranks[tid] = rank; + + // Resolve elements with the same index. + #pragma unroll + for (int i = 0; i < 15; i++) + { + if ((tid > i) & (ranks[tid] == ranks[i])) ++ranks[tid]; + } +} + +__device__ void sortColors(const float * values, int * ranks, int count) +{ + const int tid = threadIdx.x; + + int rank = 0; + + #pragma unroll + for (int i = 0; i < count; i++) + { + rank += (values[i] < values[tid]); + } + + ranks[tid] = rank; + + // Resolve elements with the same index. + #pragma unroll + for (int i = 0; i < count-1; i++) + { + if ((tid > i) & (ranks[tid] == ranks[i])) ++ranks[tid]; + } +} + + + +//////////////////////////////////////////////////////////////////////////////// +// Load color block to shared mem +//////////////////////////////////////////////////////////////////////////////// + +__device__ void loadColorBlockTex(uint firstBlock, uint blockWidth, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor) +{ + const int bid = blockIdx.x; + const int idx = threadIdx.x; + + __shared__ float dps[16]; + + if (idx < 16) + { + float x = 4 * ((firstBlock + bid) % blockWidth) + idx % 4; // @@ Avoid mod and div by using 2D grid? + float y = 4 * ((firstBlock + bid) / blockWidth) + idx / 4; + + // Read color and copy to shared mem. + float4 c = tex2D(tex, x, y); + + colors[idx].x = c.z; + colors[idx].y = c.y; + colors[idx].z = c.x; + + // Sort colors along the best fit line. 
+ colorSums(colors, sums); + float3 axis = bestFitLine(colors, sums[0], kColorMetric); + + *sameColor = (axis == make_float3(0, 0, 0)); + + dps[idx] = dot(colors[idx], axis); + + sortColors(dps, xrefs); + + float3 tmp = colors[idx]; + colors[xrefs[idx]] = tmp; + } +} + +/* +__device__ void loadColorBlockTex(uint firstBlock, uint w, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor) +{ + const int bid = blockIdx.x; + const int idx = threadIdx.x; + + __shared__ float dps[16]; + + if (idx < 16) + { + float x = 4 * ((firstBlock + bid) % w) + idx % 4; // @@ Avoid mod and div by using 2D grid? + float y = 4 * ((firstBlock + bid) / w) + idx / 4; + + // Read color and copy to shared mem. + float4 c = tex2D(tex, x, y); + + colors[idx].x = c.z; + colors[idx].y = c.y; + colors[idx].z = c.x; + weights[idx] = 1; + + int count = packColors(colors, weights); + if (idx < count) + { + // Sort colors along the best fit line. + colorSums(colors, sums); + float3 axis = bestFitLine(colors, sums[0], kColorMetric); + + *sameColor = (axis == make_float3(0, 0, 0)); + + dps[idx] = dot(colors[idx], axis); + + sortColors(dps, xrefs); + + float3 tmp = colors[idx]; + colors[xrefs[idx]] = tmp; + } + } +} +*/ + +__device__ void loadColorBlockTex(uint firstBlock, uint width, float3 colors[16], float3 sums[16], float weights[16], int xrefs[16], int * sameColor) +{ + const int bid = blockIdx.x; + const int idx = threadIdx.x; + + __shared__ float3 rawColors[16]; + __shared__ float dps[16]; + + if (idx < 16) + { + float x = 4 * ((firstBlock + bid) % width) + idx % 4; // @@ Avoid mod and div by using 2D grid? + float y = 4 * ((firstBlock + bid) / width) + idx / 4; + + // Read color and copy to shared mem. + float4 c = tex2D(tex, x, y); + + rawColors[idx].x = c.z; + rawColors[idx].y = c.y; + rawColors[idx].z = c.x; + weights[idx] = c.w; + + colors[idx] = rawColors[idx] * weights[idx]; + + // Sort colors along the best fit line. + colorSums(colors, sums); + float3 axis = bestFitLine(colors, sums[0], kColorMetric); + + *sameColor = (axis == make_float3(0, 0, 0)); + + // Single color compressor needs unweighted colors. + if (*sameColor) colors[idx] = rawColors[idx]; + + dps[idx] = dot(colors[idx], axis); + + sortColors(dps, xrefs); + + float3 tmp = colors[idx]; + float w = weights[idx]; + colors[xrefs[idx]] = tmp; + weights[xrefs[idx]] = w; + } +} + +__device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sums[16], int xrefs[16], int * sameColor) +{ + const int bid = blockIdx.x; + const int idx = threadIdx.x; + + __shared__ float dps[16]; + + if (idx < 16) + { + // Read color and copy to shared mem. + uint c = image[(bid) * 16 + idx]; + + colors[idx].y = ((c >> 8) & 0xFF) * (1.0f / 255.0f); + colors[idx].x = ((c >> 16) & 0xFF) * (1.0f / 255.0f); + + // Sort colors along the best fit line. + colorSums(colors, sums); + float2 axis = bestFitLine(colors, sums[0]); + + *sameColor = (axis == make_float2(0, 0)); + + dps[idx] = dot(colors[idx], axis); + + sortColors(dps, xrefs); + + float2 tmp = colors[idx]; + colors[xrefs[idx]] = tmp; + } +} + + +//////////////////////////////////////////////////////////////////////////////// +// Evaluate permutations +//////////////////////////////////////////////////////////////////////////////// +__device__ float evalPermutation4(const float3 * colors, uint permutation, ushort * start, ushort * end) +{ + // Compute endpoints using least squares. 
+ float alpha2_sum = 0.0f; + float beta2_sum = 0.0f; + float alphabeta_sum = 0.0f; + float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); + float3 betax_sum = make_float3(0.0f, 0.0f, 0.0f); + + // Compute alpha & beta for this permutation. + for (int i = 0; i < 16; i++) + { + const uint bits = permutation >> (2*i); + + float beta = (bits & 1); + if (bits & 2) beta = (1 + beta) / 3.0f; + float alpha = 1.0f - beta; + + alpha2_sum += alpha * alpha; + beta2_sum += beta * beta; + alphabeta_sum += alpha * beta; + alphax_sum += alpha * colors[i]; + betax_sum += beta * colors[i]; + } + + const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + // Round a, b to the closest 5-6-5 color and expand... + a = roundAndExpand565(a, start); + b = roundAndExpand565(b, end); + + // compute the error + float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); + + return dot(e, kColorMetricSqr); +} + +__device__ float evalPermutation3(const float3 * colors, uint permutation, ushort * start, ushort * end) +{ + // Compute endpoints using least squares. + float alpha2_sum = 0.0f; + float beta2_sum = 0.0f; + float alphabeta_sum = 0.0f; + float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); + float3 betax_sum = make_float3(0.0f, 0.0f, 0.0f); + + // Compute alpha & beta for this permutation. + for (int i = 0; i < 16; i++) + { + const uint bits = permutation >> (2*i); + + float beta = (bits & 1); + if (bits & 2) beta = 0.5f; + float alpha = 1.0f - beta; + + alpha2_sum += alpha * alpha; + beta2_sum += beta * beta; + alphabeta_sum += alpha * beta; + alphax_sum += alpha * colors[i]; + betax_sum += beta * colors[i]; + } + + const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + // Round a, b to the closest 5-6-5 color and expand... + a = roundAndExpand565(a, start); + b = roundAndExpand565(b, end); + + // compute the error + float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); + + return dot(e, kColorMetricSqr); +} + +__constant__ const float alphaTable4[4] = { 9.0f, 0.0f, 6.0f, 3.0f }; +__constant__ const float alphaTable3[4] = { 4.0f, 0.0f, 2.0f, 2.0f }; +__constant__ const uint prods4[4] = { 0x090000,0x000900,0x040102,0x010402 }; +__constant__ const uint prods3[4] = { 0x040000,0x000400,0x040101,0x010401 }; + +__device__ float evalPermutation4(const float3 * colors, float3 color_sum, uint permutation, ushort * start, ushort * end) +{ + // Compute endpoints using least squares. + float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); + uint akku = 0; + + // Compute alpha & beta for this permutation. 
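+    // alphaTable4 holds 9*alpha for each 2-bit code, and each prods4 entry packs
+    // 9*(alpha^2, beta^2, alpha*beta) into one byte apiece, so a single integer
+    // add into akku accumulates all three scalar sums at once (16 texels * 9 = 144
+    // still fits in a byte). The factor of 9 is divided back out in the final error.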
+ #pragma unroll + for (int i = 0; i < 16; i++) + { + const uint bits = permutation >> (2*i); + + alphax_sum += alphaTable4[bits & 3] * colors[i]; + akku += prods4[bits & 3]; + } + + float alpha2_sum = float(akku >> 16); + float beta2_sum = float((akku >> 8) & 0xff); + float alphabeta_sum = float(akku & 0xff); + float3 betax_sum = 9.0f * color_sum - alphax_sum; + + const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + // Round a, b to the closest 5-6-5 color and expand... + a = roundAndExpand565(a, start); + b = roundAndExpand565(b, end); + + // compute the error + float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); + + //float3 e = blockError4(colors, permutation, *start, *end); + + return (1.0f / 9.0f) * dot(e, kColorMetricSqr); +} + +__device__ float evalPermutation3(const float3 * colors, float3 color_sum, uint permutation, ushort * start, ushort * end) +{ + // Compute endpoints using least squares. + float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); + uint akku = 0; + + // Compute alpha & beta for this permutation. + #pragma unroll + for (int i = 0; i < 16; i++) + { + const uint bits = permutation >> (2*i); + + alphax_sum += alphaTable3[bits & 3] * colors[i]; + akku += prods3[bits & 3]; + } + + float alpha2_sum = float(akku >> 16); + float beta2_sum = float((akku >> 8) & 0xff); + float alphabeta_sum = float(akku & 0xff); + float3 betax_sum = 4.0f * color_sum - alphax_sum; + + const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + // Round a, b to the closest 5-6-5 color and expand... + a = roundAndExpand565(a, start); + b = roundAndExpand565(b, end); + + // compute the error + float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); + + //float3 e = blockError3(colors, permutation, a, b); + + return (1.0f / 4.0f) * dot(e, kColorMetricSqr); +} + +__device__ float evalPermutation4(const float3 * colors, const float * weights, float3 color_sum, uint permutation, ushort * start, ushort * end) +{ + // Compute endpoints using least squares. + float alpha2_sum = 0.0f; + float beta2_sum = 0.0f; + float alphabeta_sum = 0.0f; + float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); + + // Compute alpha & beta for this permutation. + for (int i = 0; i < 16; i++) + { + const uint bits = permutation >> (2*i); + + float beta = (bits & 1); + if (bits & 2) beta = (1 + beta) / 3.0f; + float alpha = 1.0f - beta; + + alpha2_sum += alpha * alpha * weights[i]; + beta2_sum += beta * beta * weights[i]; + alphabeta_sum += alpha * beta * weights[i]; + alphax_sum += alpha * colors[i]; + } + + float3 betax_sum = color_sum - alphax_sum; + + const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + // Round a, b to the closest 5-6-5 color and expand... 
+ a = roundAndExpand565(a, start); + b = roundAndExpand565(b, end); + + // compute the error + float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); + + return dot(e, kColorMetricSqr); +} + +/* +__device__ float evalPermutation3(const float3 * colors, const float * weights, uint permutation, ushort * start, ushort * end) +{ + // Compute endpoints using least squares. + float alpha2_sum = 0.0f; + float beta2_sum = 0.0f; + float alphabeta_sum = 0.0f; + float3 alphax_sum = make_float3(0.0f, 0.0f, 0.0f); + + // Compute alpha & beta for this permutation. + for (int i = 0; i < 16; i++) + { + const uint bits = permutation >> (2*i); + + float beta = (bits & 1); + if (bits & 2) beta = 0.5f; + float alpha = 1.0f - beta; + + alpha2_sum += alpha * alpha * weights[i]; + beta2_sum += beta * beta * weights[i]; + alphabeta_sum += alpha * beta * weights[i]; + alphax_sum += alpha * colors[i]; + } + + float3 betax_sum = color_sum - alphax_sum; + + const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + float3 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + // Round a, b to the closest 5-6-5 color and expand... + a = roundAndExpand565(a, start); + b = roundAndExpand565(b, end); + + // compute the error + float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); + + return dot(e, kColorMetricSqr); +} +*/ + +__device__ float evalPermutation4(const float2 * colors, float2 color_sum, uint permutation, ushort * start, ushort * end) +{ + // Compute endpoints using least squares. + float2 alphax_sum = make_float2(0.0f, 0.0f); + uint akku = 0; + + // Compute alpha & beta for this permutation. + #pragma unroll + for (int i = 0; i < 16; i++) + { + const uint bits = permutation >> (2*i); + + alphax_sum += alphaTable4[bits & 3] * colors[i]; + akku += prods4[bits & 3]; + } + + float alpha2_sum = float(akku >> 16); + float beta2_sum = float((akku >> 8) & 0xff); + float alphabeta_sum = float(akku & 0xff); + float2 betax_sum = 9.0f * color_sum - alphax_sum; + + const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + float2 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + float2 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + // Round a, b to the closest 5-6 color and expand... + a = roundAndExpand56(a, start); + b = roundAndExpand56(b, end); + + // compute the error + float2 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); + + return (1.0f / 9.0f) * (e.x + e.y); +} + +__device__ float evalPermutation3(const float2 * colors, float2 color_sum, uint permutation, ushort * start, ushort * end) +{ + // Compute endpoints using least squares. + float2 alphax_sum = make_float2(0.0f, 0.0f); + uint akku = 0; + + // Compute alpha & beta for this permutation. 
+ #pragma unroll + for (int i = 0; i < 16; i++) + { + const uint bits = permutation >> (2*i); + + alphax_sum += alphaTable3[bits & 3] * colors[i]; + akku += prods3[bits & 3]; + } + + float alpha2_sum = float(akku >> 16); + float beta2_sum = float((akku >> 8) & 0xff); + float alphabeta_sum = float(akku & 0xff); + float2 betax_sum = 4.0f * color_sum - alphax_sum; + + const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + float2 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + float2 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + // Round a, b to the closest 5-6 color and expand... + a = roundAndExpand56(a, start); + b = roundAndExpand56(b, end); + + // compute the error + float2 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); + + return (1.0f / 4.0f) * (e.x + e.y); +} + +__device__ float evalPermutationCTX(const float2 * colors, float2 color_sum, uint permutation, ushort * start, ushort * end) +{ + // Compute endpoints using least squares. + float2 alphax_sum = make_float2(0.0f, 0.0f); + uint akku = 0; + + // Compute alpha & beta for this permutation. + #pragma unroll + for (int i = 0; i < 16; i++) + { + const uint bits = permutation >> (2*i); + + alphax_sum += alphaTable4[bits & 3] * colors[i]; + akku += prods4[bits & 3]; + } + + float alpha2_sum = float(akku >> 16); + float beta2_sum = float((akku >> 8) & 0xff); + float alphabeta_sum = float(akku & 0xff); + float2 betax_sum = 9.0f * color_sum - alphax_sum; + + const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + float2 a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + float2 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + // Round a, b to the closest 8-8 color and expand... + a = roundAndExpand88(a, start); + b = roundAndExpand88(b, end); + + // compute the error + float2 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum); + + return (1.0f / 9.0f) * (e.x + e.y); +} + + +//////////////////////////////////////////////////////////////////////////////// +// Evaluate all permutations +//////////////////////////////////////////////////////////////////////////////// +__device__ void evalAllPermutations(const float3 * colors, float3 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors) +{ + const int idx = threadIdx.x; + + float bestError = FLT_MAX; + + __shared__ uint s_permutations[160]; + + for(int i = 0; i < 16; i++) + { + int pidx = idx + NUM_THREADS * i; + if (pidx >= 992) break; + + ushort start, end; + uint permutation = permutations[pidx]; + if (pidx < 160) s_permutations[pidx] = permutation; + + float error = evalPermutation4(colors, colorSum, permutation, &start, &end); + + if (error < bestError) + { + bestError = error; + bestPermutation = permutation; + bestStart = start; + bestEnd = end; + } + } + + if (bestStart < bestEnd) + { + swap(bestEnd, bestStart); + bestPermutation ^= 0x55555555; // Flip indices. 
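+        // XOR with 0x55555555 toggles the low bit of every 2-bit code, exchanging
+        // 0<->1 and 2<->3, which re-targets the indices after the endpoint swap.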
+ } + + for(int i = 0; i < 3; i++) + { + int pidx = idx + NUM_THREADS * i; + if (pidx >= 160) break; + + ushort start, end; + uint permutation = s_permutations[pidx]; + float error = evalPermutation3(colors, colorSum, permutation, &start, &end); + + if (error < bestError) + { + bestError = error; + bestPermutation = permutation; + bestStart = start; + bestEnd = end; + + if (bestStart > bestEnd) + { + swap(bestEnd, bestStart); + bestPermutation ^= (~bestPermutation >> 1) & 0x55555555; // Flip indices. + } + } + } + + errors[idx] = bestError; +} + +/* +__device__ void evalAllPermutations(const float3 * colors, const float * weights, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors) +{ + const int idx = threadIdx.x; + + float bestError = FLT_MAX; + + __shared__ uint s_permutations[160]; + + for(int i = 0; i < 16; i++) + { + int pidx = idx + NUM_THREADS * i; + if (pidx >= 992) break; + + ushort start, end; + uint permutation = permutations[pidx]; + if (pidx < 160) s_permutations[pidx] = permutation; + + float error = evalPermutation4(colors, weights, permutation, &start, &end); + + if (error < bestError) + { + bestError = error; + bestPermutation = permutation; + bestStart = start; + bestEnd = end; + } + } + + if (bestStart < bestEnd) + { + swap(bestEnd, bestStart); + bestPermutation ^= 0x55555555; // Flip indices. + } + + for(int i = 0; i < 3; i++) + { + int pidx = idx + NUM_THREADS * i; + if (pidx >= 160) break; + + ushort start, end; + uint permutation = s_permutations[pidx]; + float error = evalPermutation3(colors, weights, permutation, &start, &end); + + if (error < bestError) + { + bestError = error; + bestPermutation = permutation; + bestStart = start; + bestEnd = end; + + if (bestStart > bestEnd) + { + swap(bestEnd, bestStart); + bestPermutation ^= (~bestPermutation >> 1) & 0x55555555; // Flip indices. + } + } + } + + errors[idx] = bestError; +} +*/ + +__device__ void evalAllPermutations(const float2 * colors, float2 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors) +{ + const int idx = threadIdx.x; + + float bestError = FLT_MAX; + + __shared__ uint s_permutations[160]; + + for(int i = 0; i < 16; i++) + { + int pidx = idx + NUM_THREADS * i; + if (pidx >= 992) break; + + ushort start, end; + uint permutation = permutations[pidx]; + if (pidx < 160) s_permutations[pidx] = permutation; + + float error = evalPermutation4(colors, colorSum, permutation, &start, &end); + + if (error < bestError) + { + bestError = error; + bestPermutation = permutation; + bestStart = start; + bestEnd = end; + } + } + + if (bestStart < bestEnd) + { + swap(bestEnd, bestStart); + bestPermutation ^= 0x55555555; // Flip indices. + } + + for(int i = 0; i < 3; i++) + { + int pidx = idx + NUM_THREADS * i; + if (pidx >= 160) break; + + ushort start, end; + uint permutation = s_permutations[pidx]; + float error = evalPermutation3(colors, colorSum, permutation, &start, &end); + + if (error < bestError) + { + bestError = error; + bestPermutation = permutation; + bestStart = start; + bestEnd = end; + + if (bestStart > bestEnd) + { + swap(bestEnd, bestStart); + bestPermutation ^= (~bestPermutation >> 1) & 0x55555555; // Flip indices. 
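+                // In the 3-color mode only codes 0 and 1 name the endpoints, so the
+                // low bit is toggled only where the high bit is clear: 0<->1 are
+                // exchanged while codes 2 and 3 are left untouched.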
+ } + } + } + + errors[idx] = bestError; +} + +__device__ void evalLevel4Permutations(const float3 * colors, float3 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors) +{ + const int idx = threadIdx.x; + + float bestError = FLT_MAX; + + for(int i = 0; i < 16; i++) + { + int pidx = idx + NUM_THREADS * i; + if (pidx >= 992) break; + + ushort start, end; + uint permutation = permutations[pidx]; + + float error = evalPermutation4(colors, colorSum, permutation, &start, &end); + + if (error < bestError) + { + bestError = error; + bestPermutation = permutation; + bestStart = start; + bestEnd = end; + } + } + + if (bestStart < bestEnd) + { + swap(bestEnd, bestStart); + bestPermutation ^= 0x55555555; // Flip indices. + } + + errors[idx] = bestError; +} + +__device__ void evalLevel4Permutations(const float3 * colors, const float * weights, float3 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors) +{ + const int idx = threadIdx.x; + + float bestError = FLT_MAX; + + for(int i = 0; i < 16; i++) + { + int pidx = idx + NUM_THREADS * i; + if (pidx >= 992) break; + + ushort start, end; + uint permutation = permutations[pidx]; + + float error = evalPermutation4(colors, weights, colorSum, permutation, &start, &end); + + if (error < bestError) + { + bestError = error; + bestPermutation = permutation; + bestStart = start; + bestEnd = end; + } + } + + if (bestStart < bestEnd) + { + swap(bestEnd, bestStart); + bestPermutation ^= 0x55555555; // Flip indices. + } + + errors[idx] = bestError; +} + +__device__ void evalAllPermutationsCTX(const float2 * colors, float2 colorSum, const uint * permutations, ushort & bestStart, ushort & bestEnd, uint & bestPermutation, float * errors) +{ + const int idx = threadIdx.x; + + float bestError = FLT_MAX; + + for(int i = 0; i < 16; i++) + { + int pidx = idx + NUM_THREADS * i; + if (pidx >= 704) break; + + ushort start, end; + uint permutation = permutations[pidx]; + + float error = evalPermutationCTX(colors, colorSum, permutation, &start, &end); + + if (error < bestError) + { + bestError = error; + bestPermutation = permutation; + bestStart = start; + bestEnd = end; + } + } + + if (bestStart < bestEnd) + { + swap(bestEnd, bestStart); + bestPermutation ^= 0x55555555; // Flip indices. 
+ } + + errors[idx] = bestError; +} + + +//////////////////////////////////////////////////////////////////////////////// +// Find index with minimum error +//////////////////////////////////////////////////////////////////////////////// +__device__ int findMinError(float * errors) +{ + const int idx = threadIdx.x; + + __shared__ int indices[NUM_THREADS]; + indices[idx] = idx; + + for(int d = NUM_THREADS/2; d > 32; d >>= 1) + { + __syncthreads(); + + if (idx < d) + { + float err0 = errors[idx]; + float err1 = errors[idx + d]; + + if (err1 < err0) { + errors[idx] = err1; + indices[idx] = indices[idx + d]; + } + } + } + + __syncthreads(); + + // unroll last 6 iterations + if (idx < 32) + { + if (errors[idx + 32] < errors[idx]) { + errors[idx] = errors[idx + 32]; + indices[idx] = indices[idx + 32]; + } + if (errors[idx + 16] < errors[idx]) { + errors[idx] = errors[idx + 16]; + indices[idx] = indices[idx + 16]; + } + if (errors[idx + 8] < errors[idx]) { + errors[idx] = errors[idx + 8]; + indices[idx] = indices[idx + 8]; + } + if (errors[idx + 4] < errors[idx]) { + errors[idx] = errors[idx + 4]; + indices[idx] = indices[idx + 4]; + } + if (errors[idx + 2] < errors[idx]) { + errors[idx] = errors[idx + 2]; + indices[idx] = indices[idx + 2]; + } + if (errors[idx + 1] < errors[idx]) { + errors[idx] = errors[idx + 1]; + indices[idx] = indices[idx + 1]; + } + } + + __syncthreads(); + + return indices[0]; +} + + +//////////////////////////////////////////////////////////////////////////////// +// Save DXT block +//////////////////////////////////////////////////////////////////////////////// +__device__ void saveBlockDXT1(ushort start, ushort end, uint permutation, int xrefs[16], uint2 * result) +{ + const int bid = blockIdx.x; + + if (start == end) + { + permutation = 0; + } + + // Reorder permutation. + uint indices = 0; + for(int i = 0; i < 16; i++) + { + int ref = xrefs[i]; + indices |= ((permutation >> (2 * ref)) & 3) << (2 * i); + } + + // Write endpoints. + result[bid].x = (end << 16) | start; + + // Write palette indices. + result[bid].y = indices; +} + +__device__ void saveBlockDXT1_Parallel(uint endpoints, float3 colors[16], int xrefs[16], uint * result) +{ + const int tid = threadIdx.x; + const int bid = blockIdx.x; + + if (tid < 16) + { + int3 color = float3ToInt3(colors[xrefs[tid]]); + + ushort endpoint0 = endpoints & 0xFFFF; + ushort endpoint1 = endpoints >> 16; + + int3 palette[4]; + palette[0] = color16ToInt3(endpoint0); + palette[1] = color16ToInt3(endpoint1); + + int d0 = colorDistance(palette[0], color); + int d1 = colorDistance(palette[1], color); + + uint index; + if (endpoint0 > endpoint1) + { + palette[2].x = (2 * palette[0].x + palette[1].x) / 3; + palette[2].y = (2 * palette[0].y + palette[1].y) / 3; + palette[2].z = (2 * palette[0].z + palette[1].z) / 3; + + palette[3].x = (2 * palette[1].x + palette[0].x) / 3; + palette[3].y = (2 * palette[1].y + palette[0].y) / 3; + palette[3].z = (2 * palette[1].z + palette[0].z) / 3; + + int d2 = colorDistance(palette[2], color); + int d3 = colorDistance(palette[3], color); + + // Compute the index that best fit color. 
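+            // The four palette entries lie evenly spaced along the segment from
+            // endpoint0 to endpoint1, so the comparisons below recover the nearest
+            // entry's 2-bit index without branching.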
+ uint b0 = d0 > d3; + uint b1 = d1 > d2; + uint b2 = d0 > d2; + uint b3 = d1 > d3; + uint b4 = d2 > d3; + + uint x0 = b1 & b2; + uint x1 = b0 & b3; + uint x2 = b0 & b4; + + index = (x2 | ((x0 | x1) << 1)); + } + else { + palette[2].x = (palette[0].x + palette[1].x) / 2; + palette[2].y = (palette[0].y + palette[1].y) / 2; + palette[2].z = (palette[0].z + palette[1].z) / 2; + + int d2 = colorDistance(palette[2], color); + + index = 0; + if (d1 < d0 && d1 < d2) index = 1; + else if (d2 < d0) index = 2; + } + + __shared__ uint indices[16]; + + indices[tid] = index << (2 * tid); + if (tid < 8) indices[tid] |= indices[tid+8]; + if (tid < 4) indices[tid] |= indices[tid+4]; + if (tid < 2) indices[tid] |= indices[tid+2]; + if (tid < 1) indices[tid] |= indices[tid+1]; + + if (tid < 2) { + result[2 * bid + tid] = tid == 0 ? endpoints : indices[0]; + } + } +} + +__device__ void saveBlockDXT1_Parallel(uint endpoints, uint permutation, int xrefs[16], uint * result) +{ + const int tid = threadIdx.x; + const int bid = blockIdx.x; + + if (tid < 16) + { + // Reorder permutation. + uint index = ((permutation >> (2 * xrefs[tid])) & 3) << (2 * tid); + __shared__ uint indices[16]; + + indices[tid] = index; + if (tid < 8) indices[tid] |= indices[tid+8]; + if (tid < 4) indices[tid] |= indices[tid+4]; + if (tid < 2) indices[tid] |= indices[tid+2]; + if (tid < 1) indices[tid] |= indices[tid+1]; + + if (tid < 2) { + result[2 * bid + tid] = tid == 0 ? endpoints : indices[0]; + } + } +} + + +__device__ void saveBlockCTX1(ushort start, ushort end, uint permutation, int xrefs[16], uint2 * result) +{ + saveBlockDXT1(start, end, permutation, xrefs, result); +} + +__device__ void saveSingleColorBlockDXT1(float3 color, uint2 * result) +{ + const int bid = blockIdx.x; + + int r = color.x * 255; + int g = color.y * 255; + int b = color.z * 255; + + ushort color0 = (OMatch5[r][0] << 11) | (OMatch6[g][0] << 5) | OMatch5[b][0]; + ushort color1 = (OMatch5[r][1] << 11) | (OMatch6[g][1] << 5) | OMatch5[b][1]; + + if (color0 < color1) + { + result[bid].x = (color0 << 16) | color1; + result[bid].y = 0xffffffff; + } + else + { + result[bid].x = (color1 << 16) | color0; + result[bid].y = 0xaaaaaaaa; + } +} + +__device__ void saveSingleColorBlockDXT1(float2 color, uint2 * result) +{ + const int bid = blockIdx.x; + + int r = color.x * 255; + int g = color.y * 255; + + ushort color0 = (OMatch5[r][0] << 11) | (OMatch6[g][0] << 5); + ushort color1 = (OMatch5[r][1] << 11) | (OMatch6[g][1] << 5); + + if (color0 < color1) + { + result[bid].x = (color0 << 16) | color1; + result[bid].y = 0xffffffff; + } + else + { + result[bid].x = (color1 << 16) | color0; + result[bid].y = 0xaaaaaaaa; + } +} + +__device__ void saveSingleColorBlockCTX1(float2 color, uint2 * result) +{ + const int bid = blockIdx.x; + + int r = color.x * 255; + int g = color.y * 255; + + ushort color0 = (r << 8) | (g); + + result[bid].x = (color0 << 16) | color0; + result[bid].y = 0x00000000; +} + + +//////////////////////////////////////////////////////////////////////////////// +// Compress color block +//////////////////////////////////////////////////////////////////////////////// + +__global__ void compressDXT1(uint firstBlock, uint blockWidth, const uint * permutations, uint2 * result) +{ + __shared__ float3 colors[16]; + __shared__ float3 sums[16]; + __shared__ int xrefs[16]; + __shared__ int sameColor; + + loadColorBlockTex(firstBlock, blockWidth, colors, sums, xrefs, &sameColor); + + __syncthreads(); + + if (sameColor) + { + if (threadIdx.x == 0) 
saveSingleColorBlockDXT1(colors[0], result); + return; + } + + ushort bestStart, bestEnd; + uint bestPermutation; + + __shared__ float errors[NUM_THREADS]; + evalAllPermutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors); + + // Use a parallel reduction to find minimum error. + const int minIdx = findMinError(errors); + + __shared__ uint s_bestEndPoints; + //__shared__ uint s_bestPermutation; + + // Only write the result of the winner thread. + if (threadIdx.x == minIdx) + { + s_bestEndPoints = (bestEnd << 16) | bestStart; + //s_bestPermutation = (bestStart != bestEnd) ? bestPermutation : 0; + } + + __syncthreads(); + + saveBlockDXT1_Parallel(s_bestEndPoints, colors, xrefs, (uint *)result); + //saveBlockDXT1_Parallel(s_bestEndPoints, s_bestPermutation, xrefs, (uint *)result); +} + + +__global__ void compressLevel4DXT1(uint firstBlock, uint blockWidth, const uint * permutations, uint2 * result) +{ + __shared__ float3 colors[16]; + __shared__ float3 sums[16]; + __shared__ int xrefs[16]; + __shared__ int sameColor; + + loadColorBlockTex(firstBlock, blockWidth, colors, sums, xrefs, &sameColor); + + __syncthreads(); + + if (sameColor) + { + if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result); + return; + } + + ushort bestStart, bestEnd; + uint bestPermutation; + + __shared__ float errors[NUM_THREADS]; + + evalLevel4Permutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors); + + // Use a parallel reduction to find minimum error. + const int minIdx = findMinError(errors); + + // Only write the result of the winner thread. + if (threadIdx.x == minIdx) + { + saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result); + } +} + +__global__ void compressWeightedDXT1(uint firstBlock, uint blockWidth, const uint * permutations, uint2 * result) +{ + __shared__ float3 colors[16]; + __shared__ float3 sums[16]; + __shared__ float weights[16]; + __shared__ int xrefs[16]; + __shared__ int sameColor; + + loadColorBlockTex(firstBlock, blockWidth, colors, sums, weights, xrefs, &sameColor); + + __syncthreads(); + + if (sameColor) + { + if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result); + return; + } + + ushort bestStart, bestEnd; + uint bestPermutation; + + __shared__ float errors[NUM_THREADS]; + + evalLevel4Permutations(colors, weights, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors); + + // Use a parallel reduction to find minimum error. + int minIdx = findMinError(errors); + + // Only write the result of the winner thread. + if (threadIdx.x == minIdx) + { + saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result); + } +} + + +__global__ void compressNormalDXT1(const uint * permutations, const uint * image, uint2 * result) +{ + __shared__ float2 colors[16]; + __shared__ float2 sums[16]; + __shared__ int xrefs[16]; + __shared__ int sameColor; + + loadColorBlock(image, colors, sums, xrefs, &sameColor); + + __syncthreads(); + + if (sameColor) + { + if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result); + return; + } + + ushort bestStart, bestEnd; + uint bestPermutation; + + __shared__ float errors[NUM_THREADS]; + + evalAllPermutations(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors); + + // Use a parallel reduction to find minimum error. + const int minIdx = findMinError(errors); + + // Only write the result of the winner thread. 
+ if (threadIdx.x == minIdx) + { + saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, result); + } +} + +__global__ void compressCTX1(const uint * permutations, const uint * image, uint2 * result) +{ + __shared__ float2 colors[16]; + __shared__ float2 sums[16]; + __shared__ int xrefs[16]; + __shared__ int sameColor; + + loadColorBlock(image, colors, sums, xrefs, &sameColor); + + __syncthreads(); + + if (sameColor) + { + if (threadIdx.x == 0) saveSingleColorBlockCTX1(colors[0], result); + return; + } + + ushort bestStart, bestEnd; + uint bestPermutation; + + __shared__ float errors[NUM_THREADS]; + + evalAllPermutationsCTX(colors, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors); + + // Use a parallel reduction to find minimum error. + const int minIdx = findMinError(errors); + + // Only write the result of the winner thread. + if (threadIdx.x == minIdx) + { + saveBlockCTX1(bestStart, bestEnd, bestPermutation, xrefs, result); + } +} + + +/* +__device__ float computeError(const float weights[16], uchar a0, uchar a1) +{ + float palette[6]; + palette[0] = (6.0f/7.0f * a0 + 1.0f/7.0f * a1); + palette[1] = (5.0f/7.0f * a0 + 2.0f/7.0f * a1); + palette[2] = (4.0f/7.0f * a0 + 3.0f/7.0f * a1); + palette[3] = (3.0f/7.0f * a0 + 4.0f/7.0f * a1); + palette[4] = (2.0f/7.0f * a0 + 5.0f/7.0f * a1); + palette[5] = (1.0f/7.0f * a0 + 6.0f/7.0f * a1); + + float total = 0.0f; + + for (uint i = 0; i < 16; i++) + { + float alpha = weights[i]; + + float error = a0 - alpha; + error = min(error, palette[0] - alpha); + error = min(error, palette[1] - alpha); + error = min(error, palette[2] - alpha); + error = min(error, palette[3] - alpha); + error = min(error, palette[4] - alpha); + error = min(error, palette[5] - alpha); + error = min(error, a1 - alpha); + + total += error; + } + + return total; +} + +inline __device__ uchar roundAndExpand(float a) +{ + return rintf(__saturatef(a) * 255.0f); +} +*/ +/* +__device__ void optimizeAlpha8(const float alphas[16], uchar & a0, uchar & a1) +{ + float alpha2_sum = 0; + float beta2_sum = 0; + float alphabeta_sum = 0; + float alphax_sum = 0; + float betax_sum = 0; + + for (int i = 0; i < 16; i++) + { + uint idx = index[i]; + float alpha; + if (idx < 2) alpha = 1.0f - idx; + else alpha = (8.0f - idx) / 7.0f; + + float beta = 1 - alpha; + + alpha2_sum += alpha * alpha; + beta2_sum += beta * beta; + alphabeta_sum += alpha * beta; + alphax_sum += alpha * alphas[i]; + betax_sum += beta * alphas[i]; + } + + const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + float a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor; + float b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor; + + a0 = roundAndExpand8(a); + a1 = roundAndExpand8(b); +} +*/ +/* +__device__ void compressAlpha(const float alphas[16], uint4 * result) +{ + const int tid = threadIdx.x; + + // Compress alpha block! + // Brute force approach: + // Try all color pairs: 256*256/2 = 32768, 32768/64 = 512 iterations? 
+ + // Determine min & max alphas + + float A0, A1; + + if (tid < 16) + { + __shared__ uint s_alphas[16]; + + s_alphas[tid] = alphas[tid]; + s_alphas[tid] = min(s_alphas[tid], s_alphas[tid^8]); + s_alphas[tid] = min(s_alphas[tid], s_alphas[tid^4]); + s_alphas[tid] = min(s_alphas[tid], s_alphas[tid^2]); + s_alphas[tid] = min(s_alphas[tid], s_alphas[tid^1]); + A0 = s_alphas[tid]; + + s_alphas[tid] = alphas[tid]; + s_alphas[tid] = max(s_alphas[tid], s_alphas[tid^8]); + s_alphas[tid] = max(s_alphas[tid], s_alphas[tid^4]); + s_alphas[tid] = max(s_alphas[tid], s_alphas[tid^2]); + s_alphas[tid] = max(s_alphas[tid], s_alphas[tid^1]); + A1 = s_alphas[tid]; + } + + __syncthreads(); + + int minIdx = 0; + + if (A1 - A0 > 8) + { + float bestError = FLT_MAX; + + // 64 threads -> 8x8 + // divide [A1-A0] in partitions. + // test endpoints + + for (int i = 0; i < 128; i++) + { + uint idx = (i * NUM_THREADS + tid) * 4; + uchar a0 = idx & 255; + uchar a1 = idx >> 8; + + float error = computeError(alphas, a0, a1); + + if (error < bestError) + { + bestError = error; + A0 = a0; + A1 = a1; + } + } + + __shared__ float errors[NUM_THREADS]; + errors[tid] = bestError; + + // Minimize error. + minIdx = findMinError(errors); + + } + + if (minIdx == tid) + { + // @@ Compute indices. + + // @@ Write alpha block. + } +} + +__global__ void compressDXT5(const uint * permutations, const uint * image, uint4 * result) +{ + __shared__ float3 colors[16]; + __shared__ float3 sums[16]; + __shared__ float weights[16]; + __shared__ int xrefs[16]; + + loadColorBlock(image, colors, sums, weights, xrefs); + + __syncthreads(); + + compressAlpha(weights, result); + + ushort bestStart, bestEnd; + uint bestPermutation; + + __shared__ float errors[NUM_THREADS]; + + evalLevel4Permutations(colors, weights, sums[0], permutations, bestStart, bestEnd, bestPermutation, errors); + + // Use a parallel reduction to find minimum error. + int minIdx = findMinError(errors); + + // Only write the result of the winner thread. + if (threadIdx.x == minIdx) + { + saveBlockDXT1(bestStart, bestEnd, bestPermutation, xrefs, (uint2 *)result); + } +} +*/ + +/*__device__ void evaluatePalette(uint alpha0, uint alpha1, uint alphas[8]) +{ + alpha[0] = alpha0; + alpha[1] = alpha1; + alpha[2] = (6 * alpha[0] + 1 * alpha[1]) / 7; // bit code 010 + alpha[3] = (5 * alpha[0] + 2 * alpha[1]) / 7; // bit code 011 + alpha[4] = (4 * alpha[0] + 3 * alpha[1]) / 7; // bit code 100 + alpha[5] = (3 * alpha[0] + 4 * alpha[1]) / 7; // bit code 101 + alpha[6] = (2 * alpha[0] + 5 * alpha[1]) / 7; // bit code 110 + alpha[7] = (1 * alpha[0] + 6 * alpha[1]) / 7; // bit code 111 +} + +__device__ uint computeAlphaError(const uint block[16], uint alpha0, uint alpha1, int bestError = INT_MAX) +{ + uint8 alphas[8]; + evaluatePalette(alpha0, alpha1, alphas); + + int totalError = 0; + + for (uint i = 0; i < 16; i++) + { + uint8 alpha = block[i]; + + // @@ It should be possible to do this much faster. + + int minDist = INT_MAX; + for (uint p = 0; p < 8; p++) + { + int dist = alphaDistance(alpha, alphas[p]); + minDist = min(dist, minDist); + } + + + + totalError += minDist; + + if (totalError > bestError) + { + // early out + return totalError; + } + } + + return totalError; +} + + +void compressDXT5A(uint alpha[16]) +{ + // Get min/max alpha. 
+ for (uint i = 0; i < 16; i++) + { + mina = min(mina, alpha[i]); + maxa = max(maxa, alpha[i]); + } + + dxtBlock->alpha0 = maxa; + dxtBlock->alpha1 = mina; + + if (maxa - mina > 8) + { + int besterror = computeAlphaError(rgba, dxtBlock); + int besta0 = maxa; + int besta1 = mina; + + // Expand search space a bit. + const int alphaExpand = 8; + mina = (mina <= alphaExpand) ? 0 : mina - alphaExpand; + maxa = (maxa <= 255-alphaExpand) ? 255 : maxa + alphaExpand; + + for (int a0 = mina+9; a0 < maxa; a0++) + { + for (int a1 = mina; a1 < a0-8; a1++) + { + nvDebugCheck(a0 - a1 > 8); + + dxtBlock->alpha0 = a0; + dxtBlock->alpha1 = a1; + int error = computeAlphaError(rgba, dxtBlock, besterror); + + if (error < besterror) + { + besterror = error; + besta0 = a0; + besta1 = a1; + } + } + } + + dxtBlock->alpha0 = besta0; + dxtBlock->alpha1 = besta1; + } +} + +__global__ void compressDXT5n(uint blockNum, uint2 * d_result) +{ + uint idx = blockIdx.x * 128 + threadIdx.x; + + if (idx >= blockNum) + { + return; + } + + // @@ Ideally we would load the data to shared mem to achieve coalesced global mem access. + // @@ Blocks would require too much shared memory (8k) and limit occupancy. + + // @@ Ideally we should use SIMD processing, multiple threads (4-8) processing the same block. + // That simplifies coalescing, and reduces divergence. + + // @@ Experiment with texture. That's probably the most simple approach. + + uint x[16]; + uint y[16]; + + +} +*/ + + +//////////////////////////////////////////////////////////////////////////////// +// Setup kernel +//////////////////////////////////////////////////////////////////////////////// + +extern "C" void setupOMatchTables(const void * OMatch5Src, size_t OMatch5Size, const void * OMatch6Src, size_t OMatch6Size) +{ + // Init single color lookup contant tables. + cudaMemcpyToSymbol(OMatch5, OMatch5Src, OMatch5Size, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(OMatch6, OMatch6Src, OMatch6Size, 0, cudaMemcpyHostToDevice); +} + +extern "C" void setupCompressKernel(const float weights[3]) +{ + // Set constants. + cudaMemcpyToSymbol(kColorMetric, weights, sizeof(float) * 3, 0); + + float weightsSqr[3]; + weightsSqr[0] = weights[0] * weights[0]; + weightsSqr[1] = weights[1] * weights[1]; + weightsSqr[2] = weights[2] * weights[2]; + + cudaMemcpyToSymbol(kColorMetricSqr, weightsSqr, sizeof(float) * 3, 0); +} + +extern "C" void bindTextureToArray(cudaArray * d_data) +{ + // Setup texture + tex.normalized = false; + tex.filterMode = cudaFilterModePoint; + tex.addressMode[0] = cudaAddressModeClamp; + tex.addressMode[1] = cudaAddressModeClamp; + + cudaBindTextureToArray(tex, d_data); +} + + + +//////////////////////////////////////////////////////////////////////////////// +// Launch kernel +//////////////////////////////////////////////////////////////////////////////// + +// DXT1 compressors: +extern "C" void compressKernelDXT1(uint firstBlock, uint blockNum, uint blockWidth, uint * d_result, uint * d_bitmaps) +{ + compressDXT1<<>>(firstBlock, blockWidth, d_bitmaps, (uint2 *)d_result); +} + +extern "C" void compressKernelDXT1_Level4(uint firstBlock, uint blockNum, uint blockWidth, uint * d_result, uint * d_bitmaps) +{ + compressLevel4DXT1<<>>(firstBlock, blockWidth, d_bitmaps, (uint2 *)d_result); +} + +extern "C" void compressWeightedKernelDXT1(uint firstBlock, uint blockNum, uint blockWidth, uint * d_result, uint * d_bitmaps) +{ + compressWeightedDXT1<<>>(firstBlock, blockWidth, d_bitmaps, (uint2 *)d_result); +} + +// @@ DXT1a compressors. 
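The three wrappers above launch one CUDA thread block per 4x4 texel block, and the device scratch buffers (see CudaContext later in this patch) hold at most MAX_BLOCKS blocks, so a caller walks a large image in chunks. A minimal host-side sketch of that dispatch pattern, assuming the extern "C" declarations and the MAX_BLOCKS constant from this patch are visible; the function and variable names are illustrative:

    #include <algorithm>
    #include <cuda_runtime.h>

    // Illustrative chunked dispatch over every 4x4 block of an image that has
    // already been uploaded and bound to the compression texture.
    void compressAllBlocksDXT1(nv::CudaContext & ctx, unsigned int blockCount,
                               unsigned int blockWidth, unsigned char * output)
    {
        unsigned int first = 0;
        while (first != blockCount)
        {
            // Never launch more blocks than the scratch buffers can hold.
            const unsigned int count = std::min(blockCount - first, MAX_BLOCKS);

            // One CUDA thread block per texel block; results land in ctx.result.
            compressKernelDXT1(first, count, blockWidth, ctx.result, ctx.bitmapTable);

            // Each DXT1 block is 8 bytes; copy this chunk back to the host buffer.
            cudaMemcpy(output + first * 8, ctx.result, count * 8, cudaMemcpyDeviceToHost);

            first += count;
        }
    }

The DXT3/DXT5 paths follow the same chunking but interleave CPU-side alpha-block compression for each chunk, as the compressDXT3/compressDXT5 implementations in this patch do.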
+ + +// @@ DXT3 compressors: +extern "C" void compressKernelDXT3(uint firstBlock, uint blockNum, uint blockWidth, uint * d_result, uint * d_bitmaps) +{ + //compressDXT3<<>>(firstBlock, blockWidth, d_bitmaps, (uint2 *)d_result); +} + +extern "C" void compressWeightedKernelDXT3(uint firstBlock, uint blockNum, uint blockWidth, uint * d_result, uint * d_bitmaps) +{ + //compressWeightedDXT3<<>>(firstBlock, blockWidth, d_bitmaps, (uint2 *)d_result); +} + + +// @@ DXT5 compressors. +extern "C" void compressKernelDXT5(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps) +{ + //compressDXT5<<>>(firstBlock, w, d_bitmaps, (uint2 *)d_result); +} + +extern "C" void compressWeightedKernelDXT5(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps) +{ + //compressWeightedDXT5<<>>(firstBlock, w, d_bitmaps, (uint2 *)d_result); +} + + + + + +/* +extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps) +{ + compressNormalDXT1<<>>(d_bitmaps, d_data, (uint2 *)d_result); +} + +extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps) +{ + compressCTX1<<>>(d_bitmaps, d_data, (uint2 *)d_result); +} +*/ +/* +extern "C" void compressKernelDXT5n(uint blockNum, cudaArray * d_data, uint * d_result) +{ +// compressDXT5n<<>>(blockNum, (uint2 *)d_result); +} +*/ Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/ConvolveKernel.cu =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/ConvolveKernel.cu +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/ConvolveKernel.cu @@ -1,4 +1,5 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano // // Permission is hereby granted, free of charge, to any person // obtaining a copy of this software and associated documentation Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaCompressDXT.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaCompressDXT.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaCompressDXT.h @@ -1,61 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
- -#ifndef NV_TT_CUDACOMPRESSDXT_H -#define NV_TT_CUDACOMPRESSDXT_H - -#include -#include - -namespace nv -{ - class Image; - - class CudaCompressor - { - public: - CudaCompressor(); - ~CudaCompressor(); - - bool isValid() const; - - void setImage(const Image * image, nvtt::AlphaMode alphaMode); - - void compressDXT1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressDXT3(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - void compressDXT5(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); - - private: - - uint * m_bitmapTable; - uint * m_data; - uint * m_result; - - const Image * m_image; - nvtt::AlphaMode m_alphaMode; - }; - -} // nv namespace - - -#endif // NV_TT_CUDAUTILS_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaCompressDXT.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaCompressDXT.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaCompressDXT.cpp @@ -1,380 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "CudaCompressDXT.h" -#include "CudaUtils.h" - - -#if defined HAVE_CUDA -#include -#endif - -#include -#include - -using namespace nv; -using namespace nvtt; - -#if defined HAVE_CUDA - -#define MAX_BLOCKS 8192U // 32768, 65535 - - -extern "C" void setupCompressKernel(const float weights[3]); -extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); -extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); -extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); - -#include "Bitmaps.h" // @@ Rename to BitmapTable.h - -// Convert linear image to block linear. 
-static void convertToBlockLinear(const Image * image, uint * blockLinearImage) -{ - const uint w = (image->width() + 3) / 4; - const uint h = (image->height() + 3) / 4; - - for(uint by = 0; by < h; by++) { - for(uint bx = 0; bx < w; bx++) { - const uint bw = min(image->width() - bx * 4, 4U); - const uint bh = min(image->height() - by * 4, 4U); - - for (uint i = 0; i < 16; i++) { - const int x = (i % 4) % bw; - const int y = (i / 4) % bh; - blockLinearImage[(by * w + bx) * 16 + i] = image->pixel(bx * 4 + x, by * 4 + y).u; - } - } - } -} - -#endif - - -CudaCompressor::CudaCompressor() : m_bitmapTable(NULL), m_data(NULL), m_result(NULL) -{ -#if defined HAVE_CUDA - // Allocate and upload bitmaps. - cudaMalloc((void**) &m_bitmapTable, 992 * sizeof(uint)); - if (m_bitmapTable != NULL) - { - cudaMemcpy(m_bitmapTable, s_bitmapTable, 992 * sizeof(uint), cudaMemcpyHostToDevice); - } - - // Allocate scratch buffers. - cudaMalloc((void**) &m_data, MAX_BLOCKS * 64U); - cudaMalloc((void**) &m_result, MAX_BLOCKS * 8U); -#endif -} - -CudaCompressor::~CudaCompressor() -{ -#if defined HAVE_CUDA - // Free device mem allocations. - cudaFree(m_data); - cudaFree(m_result); - cudaFree(m_bitmapTable); -#endif -} - -bool CudaCompressor::isValid() const -{ -#if defined HAVE_CUDA - if (cudaGetLastError() != cudaSuccess) - { - return false; - } -#endif - return m_data != NULL && m_result != NULL && m_bitmapTable != NULL; -} - -// @@ This code is very repetitive and needs to be cleaned up. - -void CudaCompressor::setImage(const Image * image, nvtt::AlphaMode alphaMode) -{ - m_image = image; - m_alphaMode = alphaMode; -} - -/// Compress image using CUDA. -void CudaCompressor::compressDXT1(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) -{ - nvDebugCheck(cuda::isHardwarePresent()); -#if defined HAVE_CUDA - - // Image size in blocks. - const uint w = (m_image->width() + 3) / 4; - const uint h = (m_image->height() + 3) / 4; - - uint imageSize = w * h * 16 * sizeof(Color32); - uint * blockLinearImage = (uint *) ::malloc(imageSize); - convertToBlockLinear(m_image, blockLinearImage); // @@ Do this in parallel with the GPU, or in the GPU! - - const uint blockNum = w * h; - const uint compressedSize = blockNum * 8; - - clock_t start = clock(); - - setupCompressKernel(compressionOptions.colorWeight.ptr()); - - // TODO: Add support for multiple GPUs. - uint bn = 0; - while(bn != blockNum) - { - uint count = min(blockNum - bn, MAX_BLOCKS); - - cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); - - // Launch kernel. - compressKernelDXT1(count, m_data, m_result, m_bitmapTable); - - // Check for errors. - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - nvDebug("CUDA Error: %s\n", cudaGetErrorString(err)); - - if (outputOptions.errorHandler != NULL) - { - outputOptions.errorHandler->error(Error_CudaError); - } - } - - // Copy result to host, overwrite swizzled image. - cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost); - - // Output result. - if (outputOptions.outputHandler != NULL) - { - outputOptions.outputHandler->writeData(blockLinearImage, count * 8); - } - - bn += count; - } - - clock_t end = clock(); - //printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC); - - free(blockLinearImage); - -#else - if (outputOptions.errorHandler != NULL) - { - outputOptions.errorHandler->error(Error_CudaError); - } -#endif -} - - -/// Compress image using CUDA. 
-void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) -{ - nvDebugCheck(cuda::isHardwarePresent()); -#if defined HAVE_CUDA - - // Image size in blocks. - const uint w = (m_image->width() + 3) / 4; - const uint h = (m_image->height() + 3) / 4; - - uint imageSize = w * h * 16 * sizeof(Color32); - uint * blockLinearImage = (uint *) ::malloc(imageSize); - convertToBlockLinear(m_image, blockLinearImage); - - const uint blockNum = w * h; - const uint compressedSize = blockNum * 8; - - AlphaBlockDXT3 * alphaBlocks = NULL; - alphaBlocks = (AlphaBlockDXT3 *)::malloc(min(compressedSize, MAX_BLOCKS * 8U)); - - setupCompressKernel(compressionOptions.colorWeight.ptr()); - - clock_t start = clock(); - - uint bn = 0; - while(bn != blockNum) - { - uint count = min(blockNum - bn, MAX_BLOCKS); - - cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); - - // Launch kernel. - if (m_alphaMode == AlphaMode_Transparency) - { - compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable); - } - else - { - compressKernelDXT1_Level4(count, m_data, m_result, m_bitmapTable); - } - - // Compress alpha in parallel with the GPU. - for (uint i = 0; i < count; i++) - { - ColorBlock rgba(blockLinearImage + (bn + i) * 16); - OptimalCompress::compressDXT3A(rgba, alphaBlocks + i); - } - - // Check for errors. - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - nvDebug("CUDA Error: %s\n", cudaGetErrorString(err)); - - if (outputOptions.errorHandler != NULL) - { - outputOptions.errorHandler->error(Error_CudaError); - } - } - - // Copy result to host, overwrite swizzled image. - cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost); - - // Output result. - if (outputOptions.outputHandler != NULL) - { - for (uint i = 0; i < count; i++) - { - outputOptions.outputHandler->writeData(alphaBlocks + i, 8); - outputOptions.outputHandler->writeData(blockLinearImage + i * 2, 8); - } - } - - bn += count; - } - - clock_t end = clock(); - //printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC); - - free(alphaBlocks); - free(blockLinearImage); - -#else - if (outputOptions.errorHandler != NULL) - { - outputOptions.errorHandler->error(Error_CudaError); - } -#endif -} - - -/// Compress image using CUDA. -void CudaCompressor::compressDXT5(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) -{ - nvDebugCheck(cuda::isHardwarePresent()); -#if defined HAVE_CUDA - - // Image size in blocks. - const uint w = (m_image->width() + 3) / 4; - const uint h = (m_image->height() + 3) / 4; - - uint imageSize = w * h * 16 * sizeof(Color32); - uint * blockLinearImage = (uint *) ::malloc(imageSize); - convertToBlockLinear(m_image, blockLinearImage); - - const uint blockNum = w * h; - const uint compressedSize = blockNum * 8; - - AlphaBlockDXT5 * alphaBlocks = NULL; - alphaBlocks = (AlphaBlockDXT5 *)::malloc(min(compressedSize, MAX_BLOCKS * 8U)); - - setupCompressKernel(compressionOptions.colorWeight.ptr()); - - clock_t start = clock(); - - uint bn = 0; - while(bn != blockNum) - { - uint count = min(blockNum - bn, MAX_BLOCKS); - - cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); - - // Launch kernel. 
- if (m_alphaMode == AlphaMode_Transparency) - { - compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable); - } - else - { - compressKernelDXT1_Level4(count, m_data, m_result, m_bitmapTable); - } - - // Compress alpha in parallel with the GPU. - for (uint i = 0; i < count; i++) - { - ColorBlock rgba(blockLinearImage + (bn + i) * 16); - QuickCompress::compressDXT5A(rgba, alphaBlocks + i); - } - - // Check for errors. - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - nvDebug("CUDA Error: %s\n", cudaGetErrorString(err)); - - if (outputOptions.errorHandler != NULL) - { - outputOptions.errorHandler->error(Error_CudaError); - } - } - - // Copy result to host, overwrite swizzled image. - cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost); - - // Output result. - if (outputOptions.outputHandler != NULL) - { - for (uint i = 0; i < count; i++) - { - outputOptions.outputHandler->writeData(alphaBlocks + i, 8); - outputOptions.outputHandler->writeData(blockLinearImage + i * 2, 8); - } - } - - bn += count; - } - - clock_t end = clock(); - //printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC); - - free(alphaBlocks); - free(blockLinearImage); - -#else - if (outputOptions.errorHandler != NULL) - { - outputOptions.errorHandler->error(Error_CudaError); - } -#endif -} - - Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaCompressorDXT.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaCompressorDXT.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaCompressorDXT.h @@ -0,0 +1,113 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#ifndef NV_TT_CUDACOMPRESSORDXT_H +#define NV_TT_CUDACOMPRESSORDXT_H + +#include "nvtt/nvtt.h" +#include "nvtt/Compressor.h" // CompressorInterface + +struct cudaArray; + +namespace nv +{ + class CudaContext + { + public: + CudaContext(); + ~CudaContext(); + + bool isValid() const; + + public: + // Device pointers. 
+ uint * bitmapTable; + uint * bitmapTableCTX; + uint * data; + uint * result; + }; + +#if defined HAVE_CUDA + + struct CudaCompressor : public CompressorInterface + { + CudaCompressor(CudaContext & ctx); + + virtual void compress(nvtt::AlphaMode alphaMode, uint w, uint h, uint d, const float * data, nvtt::TaskDispatcher * dispatcher, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions); + + virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions) = 0; + virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0; + virtual uint blockSize() const = 0; + + protected: + CudaContext & m_ctx; + }; + + struct CudaCompressorDXT1 : public CudaCompressor + { + CudaCompressorDXT1(CudaContext & ctx) : CudaCompressor(ctx) {} + + virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions); + virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 8; }; + }; + + /*struct CudaCompressorDXT1n : public CudaCompressor + { + virtual void setup(const CompressionOptions::Private & compressionOptions); + virtual void compressBlocks(uint blockCount, const void * input, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0; + virtual uint blockSize() const { return 8; }; + };*/ + + struct CudaCompressorDXT3 : public CudaCompressor + { + CudaCompressorDXT3(CudaContext & ctx) : CudaCompressor(ctx) {} + + virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions); + virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; }; + }; + + struct CudaCompressorDXT5 : public CudaCompressor + { + CudaCompressorDXT5(CudaContext & ctx) : CudaCompressor(ctx) {} + + virtual void setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions); + virtual void compressBlocks(uint first, uint count, uint w, uint h, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output); + virtual uint blockSize() const { return 16; }; + }; + + /*struct CudaCompressorCXT1 : public CudaCompressor + { + virtual void setup(const CompressionOptions::Private & compressionOptions); + virtual void compressBlocks(uint blockCount, const void * input, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) = 0; + virtual uint blockSize() const { return 8; }; + };*/ + +#endif // defined HAVE_CUDA + +} // nv namespace + + +#endif // NV_TT_CUDAUTILS_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaCompressorDXT.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaCompressorDXT.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaCompressorDXT.cpp @@ -0,0 +1,608 @@ +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and 
associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include "CudaCompressorDXT.h" +#include "CudaUtils.h" + +#include "nvcore/Debug.h" +#include "nvmath/Color.h" +#include "nvmath/Vector.inl" +#include "nvimage/Image.h" +#include "nvimage/ColorBlock.h" +#include "nvimage/BlockDXT.h" +#include "nvtt/CompressionOptions.h" +#include "nvtt/OutputOptions.h" +#include "nvtt/QuickCompressDXT.h" +#include "nvtt/OptimalCompressDXT.h" + +#include +#include + +#if defined HAVE_CUDA +#include + +#define MAX_BLOCKS 8192U // 32768, 65535 // @@ Limit number of blocks on slow devices to prevent hitting the watchdog timer. + +extern "C" void setupOMatchTables(const void * OMatch5Src, size_t OMatch5Size, const void * OMatch6Src, size_t OMatch6Size); +extern "C" void setupCompressKernel(const float weights[3]); +extern "C" void bindTextureToArray(cudaArray * d_data); + +extern "C" void compressKernelDXT1(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps); +extern "C" void compressKernelDXT1_Level4(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); +extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); +extern "C" void compressKernelDXT3(uint firstBlock, uint blockNum, uint w, uint * d_result, uint * d_bitmaps); +//extern "C" void compressNormalKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); +//extern "C" void compressKernelCTX1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps); + +#include "BitmapTable.h" +#include "nvtt/SingleColorLookup.h" + +#endif + +using namespace nv; +using namespace nvtt; + + +CudaContext::CudaContext() : + bitmapTable(NULL), + bitmapTableCTX(NULL), + data(NULL), + result(NULL) +{ +#if defined HAVE_CUDA + // Allocate and upload bitmaps. + cudaMalloc((void**) &bitmapTable, 992 * sizeof(uint)); + if (bitmapTable != NULL) + { + cudaMemcpy(bitmapTable, s_bitmapTable, 992 * sizeof(uint), cudaMemcpyHostToDevice); + } + + cudaMalloc((void**) &bitmapTableCTX, 704 * sizeof(uint)); + if (bitmapTableCTX != NULL) + { + cudaMemcpy(bitmapTableCTX, s_bitmapTableCTX, 704 * sizeof(uint), cudaMemcpyHostToDevice); + } + + // Allocate scratch buffers. + cudaMalloc((void**) &data, MAX_BLOCKS * 64U); + cudaMalloc((void**) &result, MAX_BLOCKS * 8U); + + // Init single color lookup contant tables. + setupOMatchTables(OMatch5, sizeof(OMatch5), OMatch6, sizeof(OMatch6)); +#endif +} + +CudaContext::~CudaContext() +{ +#if defined HAVE_CUDA + // Free device mem allocations. 
+ cudaFree(bitmapTableCTX); + cudaFree(bitmapTable); + cudaFree(data); + cudaFree(result); +#endif +} + +bool CudaContext::isValid() const +{ +#if defined HAVE_CUDA + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(err)); + return false; + } +#endif + return bitmapTable != NULL && bitmapTableCTX != NULL && data != NULL && result != NULL; +} + + +#if defined HAVE_CUDA + +CudaCompressor::CudaCompressor(CudaContext & ctx) : m_ctx(ctx) +{ + +} + +void CudaCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, uint d, const float * data, nvtt::TaskDispatcher * dispatcher, const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ + nvDebugCheck(d == 1); + nvDebugCheck(cuda::isHardwarePresent()); + +#if defined HAVE_CUDA + + // Allocate image as a cuda array. + const uint count = w * h; + Color32 * tmp = malloc(count); + for (uint i = 0; i < count; i++) { + tmp[i].r = uint8(clamp(data[i + count*0], 0.0f, 1.0f) * 255); + tmp[i].g = uint8(clamp(data[i + count*1], 0.0f, 1.0f) * 255); + tmp[i].b = uint8(clamp(data[i + count*2], 0.0f, 1.0f) * 255); + tmp[i].a = uint8(clamp(data[i + count*3], 0.0f, 1.0f) * 255); + } + + cudaArray * d_image; + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned); + cudaMallocArray(&d_image, &channelDesc, w, h); + + cudaMemcpyToArray(d_image, 0, 0, tmp, count * sizeof(Color32), cudaMemcpyHostToDevice); + + free(tmp); + + // To avoid the copy we could keep the data in floating point format, but the channels are not interleaved like the kernel expects. + /* + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat); + cudaMallocArray(&d_image, &channelDesc, w, h); + + const int imageSize = w * h * sizeof(float) * 4; + cudaMemcpyToArray(d_image, 0, 0, data, imageSize, cudaMemcpyHostToDevice); + */ + + // Image size in blocks. + const uint bw = (w + 3) / 4; + const uint bh = (h + 3) / 4; + const uint bs = blockSize(); + const uint blockNum = bw * bh; + //const uint compressedSize = blockNum * bs; + + void * h_result = ::malloc(min(blockNum, MAX_BLOCKS) * bs); + + setup(d_image, compressionOptions); + + // Timer timer; + // timer.start(); + + uint bn = 0; + while (bn != blockNum) + { + uint count = min(blockNum - bn, MAX_BLOCKS); + + compressBlocks(bn, count, bw, bh, alphaMode, compressionOptions, h_result); + + // Check for errors. + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + //nvDebug("CUDA Error: %s\n", cudaGetErrorString(err)); + outputOptions.error(Error_CudaError); + } + + // Output result. + outputOptions.writeData(h_result, count * bs); + + bn += count; + } + + //timer.stop(); + //printf("\rCUDA time taken: %.3f seconds\n", timer.elapsed() / CLOCKS_PER_SEC); + + free(h_result); + cudaFreeArray(d_image); + +#else + outputOptions.error(Error_CudaError); +#endif +} + +#if defined HAVE_CUDA + +void CudaCompressorDXT1::setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions) +{ + setupCompressKernel(compressionOptions.colorWeight.ptr()); + bindTextureToArray(image); +} + +void CudaCompressorDXT1::compressBlocks(uint first, uint count, uint bw, uint bh, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + // Launch kernel. + compressKernelDXT1(first, count, bw, m_ctx.result, m_ctx.bitmapTable); + + // Copy result to host. 
+ cudaMemcpy(output, m_ctx.result, count * 8, cudaMemcpyDeviceToHost); +} + + +void CudaCompressorDXT3::setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions) +{ + setupCompressKernel(compressionOptions.colorWeight.ptr()); + bindTextureToArray(image); +} + +void CudaCompressorDXT3::compressBlocks(uint first, uint count, uint bw, uint bh, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + // Launch kernel. + compressKernelDXT3(first, count, bw, m_ctx.result, m_ctx.bitmapTable); + + // Copy result to host. + cudaMemcpy(output, m_ctx.result, count * 16, cudaMemcpyDeviceToHost); +} + + +void CudaCompressorDXT5::setup(cudaArray * image, const nvtt::CompressionOptions::Private & compressionOptions) +{ + setupCompressKernel(compressionOptions.colorWeight.ptr()); + bindTextureToArray(image); +} + +void CudaCompressorDXT5::compressBlocks(uint first, uint count, uint bw, uint bh, nvtt::AlphaMode alphaMode, const nvtt::CompressionOptions::Private & compressionOptions, void * output) +{ + /*// Launch kernel. + compressKernelDXT5(first, count, bw, m_ctx.result, m_ctx.bitmapTable); + + // Copy result to host. + cudaMemcpy(output, m_ctx.result, count * 16, cudaMemcpyDeviceToHost);*/ + + // Launch kernel. + if (alphaMode == AlphaMode_Transparency) + { + // compressWeightedKernelDXT1(first, count, bw, m_ctx.result, m_ctx.bitmapTable); + } + else + { + // compressKernelDXT1_Level4(first, count, w, m_ctx.result, m_ctx.bitmapTable); + } + + // Compress alpha in parallel with the GPU. + for (uint i = 0; i < count; i++) + { + //ColorBlock rgba(blockLinearImage + (first + i) * 16); + //OptimalCompress::compressDXT3A(rgba, alphaBlocks + i); + } + + // Copy result to host. + cudaMemcpy(output, m_ctx.result, count * 8, cudaMemcpyDeviceToHost); + + // @@ Interleave color and alpha blocks. + +} + +#endif // defined HAVE_CUDA + + + + +// @@ This code is very repetitive and needs to be cleaned up. + +#if 0 + + +/* +// Convert linear image to block linear. +static void convertToBlockLinear(const Image * image, uint * blockLinearImage) +{ + const uint w = (image->width() + 3) / 4; + const uint h = (image->height() + 3) / 4; + + for(uint by = 0; by < h; by++) { + for(uint bx = 0; bx < w; bx++) { + const uint bw = min(image->width() - bx * 4, 4U); + const uint bh = min(image->height() - by * 4, 4U); + + for (uint i = 0; i < 16; i++) { + const int x = (i % 4) % bw; + const int y = (i / 4) % bh; + blockLinearImage[(by * w + bx) * 16 + i] = image->pixel(bx * 4 + x, by * 4 + y).u; + } + } + } +} +*/ + + +/// Compress image using CUDA. +void CudaCompressor::compressDXT3(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) +{ + nvDebugCheck(cuda::isHardwarePresent()); +#if defined HAVE_CUDA + + // Image size in blocks. 
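The batching pattern above is worth spelling out: CudaCompressor::compress() splits the image into 4x4 blocks and feeds them to compressBlocks() in chunks of at most MAX_BLOCKS per kernel launch, so a single launch stays short enough not to trip the driver watchdog timer, and each chunk then emits count * blockSize() bytes (8 for DXT1/BC1, 16 for DXT3 and DXT5). A minimal host-only sketch of that arithmetic, with a made-up image size, not part of the patch:

#include <algorithm>
#include <cstdio>

int main()
{
    const unsigned MAX_BLOCKS = 8192U;   // same limit as in CudaCompressorDXT.cpp above
    const unsigned w = 1024, h = 768;    // example image size (assumption)
    const unsigned bw = (w + 3) / 4;     // width in 4x4 blocks
    const unsigned bh = (h + 3) / 4;     // height in 4x4 blocks
    const unsigned blockNum = bw * bh;
    const unsigned bs = 8;               // blockSize() for DXT1; 16 for DXT3/DXT5

    unsigned bn = 0;
    while (bn != blockNum)
    {
        const unsigned count = std::min(blockNum - bn, MAX_BLOCKS);
        // compressBlocks(bn, count, bw, bh, ...) runs here and the compressed
        // chunk is handed to outputOptions.writeData(h_result, count * bs).
        std::printf("batch at block %u: %u blocks, %u bytes\n", bn, count, count * bs);
        bn += count;
    }
}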
+ const uint w = (m_image->width() + 3) / 4; + const uint h = (m_image->height() + 3) / 4; + + uint imageSize = w * h * 16 * sizeof(Color32); + uint * blockLinearImage = (uint *) malloc(imageSize); + convertToBlockLinear(m_image, blockLinearImage); + + const uint blockNum = w * h; + const uint compressedSize = blockNum * 8; + + AlphaBlockDXT3 * alphaBlocks = NULL; + alphaBlocks = (AlphaBlockDXT3 *)malloc(min(compressedSize, MAX_BLOCKS * 8U)); + + setupCompressKernel(compressionOptions.colorWeight.ptr()); + + clock_t start = clock(); + + uint bn = 0; + while(bn != blockNum) + { + uint count = min(blockNum - bn, MAX_BLOCKS); + + cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); + + // Launch kernel. + if (m_alphaMode == AlphaMode_Transparency) + { + compressWeightedKernelDXT1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable); + } + else + { + compressKernelDXT1_Level4(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable); + } + + // Compress alpha in parallel with the GPU. + for (uint i = 0; i < count; i++) + { + ColorBlock rgba(blockLinearImage + (bn + i) * 16); + OptimalCompress::compressDXT3A(rgba, alphaBlocks + i); + } + + // Check for errors. + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + nvDebug("CUDA Error: %s\n", cudaGetErrorString(err)); + outputOptions.error(Error_CudaError); + } + + // Copy result to host, overwrite swizzled image. + cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost); + + // Output result. + for (uint i = 0; i < count; i++) + { + outputOptions.writeData(alphaBlocks + i, 8); + outputOptions.writeData(blockLinearImage + i * 2, 8); + } + + bn += count; + } + + clock_t end = clock(); + //printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC); + + free(alphaBlocks); + free(blockLinearImage); + +#else + outputOptions.error(Error_CudaError); +#endif +} + + +/// Compress image using CUDA. +void CudaCompressor::compressDXT5(const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) +{ + nvDebugCheck(cuda::isHardwarePresent()); +#if defined HAVE_CUDA + + // Image size in blocks. + const uint w = (m_image->width() + 3) / 4; + const uint h = (m_image->height() + 3) / 4; + + uint imageSize = w * h * 16 * sizeof(Color32); + uint * blockLinearImage = (uint *) malloc(imageSize); + convertToBlockLinear(m_image, blockLinearImage); + + const uint blockNum = w * h; + const uint compressedSize = blockNum * 8; + + AlphaBlockDXT5 * alphaBlocks = NULL; + alphaBlocks = (AlphaBlockDXT5 *)malloc(min(compressedSize, MAX_BLOCKS * 8U)); + + setupCompressKernel(compressionOptions.colorWeight.ptr()); + + clock_t start = clock(); + + uint bn = 0; + while(bn != blockNum) + { + uint count = min(blockNum - bn, MAX_BLOCKS); + + cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); + + // Launch kernel. + if (m_alphaMode == AlphaMode_Transparency) + { + compressWeightedKernelDXT1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable); + } + else + { + compressKernelDXT1_Level4(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable); + } + + // Compress alpha in parallel with the GPU. + for (uint i = 0; i < count; i++) + { + ColorBlock rgba(blockLinearImage + (bn + i) * 16); + QuickCompress::compressDXT5A(rgba, alphaBlocks + i); + } + + // Check for errors. 
+ cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + nvDebug("CUDA Error: %s\n", cudaGetErrorString(err)); + outputOptions.error(Error_CudaError); + } + + // Copy result to host, overwrite swizzled image. + cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost); + + // Output result. + for (uint i = 0; i < count; i++) + { + outputOptions.writeData(alphaBlocks + i, 8); + outputOptions.writeData(blockLinearImage + i * 2, 8); + } + + bn += count; + } + + clock_t end = clock(); + //printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC); + + free(alphaBlocks); + free(blockLinearImage); + +#else + outputOptions.error(Error_CudaError); +#endif +} + + +void CudaCompressor::compressDXT1n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ + nvDebugCheck(cuda::isHardwarePresent()); +#if defined HAVE_CUDA + + // Image size in blocks. + const uint w = (m_image->width() + 3) / 4; + const uint h = (m_image->height() + 3) / 4; + + uint imageSize = w * h * 16 * sizeof(Color32); + uint * blockLinearImage = (uint *) malloc(imageSize); + convertToBlockLinear(m_image, blockLinearImage); // @@ Do this in parallel with the GPU, or in the GPU! + + const uint blockNum = w * h; + const uint compressedSize = blockNum * 8; + + clock_t start = clock(); + + setupCompressKernel(compressionOptions.colorWeight.ptr()); + + // TODO: Add support for multiple GPUs. + uint bn = 0; + while(bn != blockNum) + { + uint count = min(blockNum - bn, MAX_BLOCKS); + + cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); + + // Launch kernel. + compressNormalKernelDXT1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTable); + + // Check for errors. + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + nvDebug("CUDA Error: %s\n", cudaGetErrorString(err)); + outputOptions.error(Error_CudaError); + } + + // Copy result to host, overwrite swizzled image. + cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost); + + // Output result. + outputOptions.writeData(blockLinearImage, count * 8); + + bn += count; + } + + clock_t end = clock(); + //printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC); + + free(blockLinearImage); + +#else + outputOptions.error(Error_CudaError); +#endif +} + + +void CudaCompressor::compressCTX1(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ + nvDebugCheck(cuda::isHardwarePresent()); +#if defined HAVE_CUDA + + // Image size in blocks. + const uint w = (m_image->width() + 3) / 4; + const uint h = (m_image->height() + 3) / 4; + + uint imageSize = w * h * 16 * sizeof(Color32); + uint * blockLinearImage = (uint *) malloc(imageSize); + convertToBlockLinear(m_image, blockLinearImage); // @@ Do this in parallel with the GPU, or in the GPU! + + const uint blockNum = w * h; + const uint compressedSize = blockNum * 8; + + clock_t start = clock(); + + setupCompressKernel(compressionOptions.colorWeight.ptr()); + + // TODO: Add support for multiple GPUs. + uint bn = 0; + while(bn != blockNum) + { + uint count = min(blockNum - bn, MAX_BLOCKS); + + cudaMemcpy(m_ctx.data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice); + + // Launch kernel. + compressKernelCTX1(count, m_ctx.data, m_ctx.result, m_ctx.bitmapTableCTX); + + // Check for errors. 
+ cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + nvDebug("CUDA Error: %s\n", cudaGetErrorString(err)); + + outputOptions.error(Error_CudaError); + } + + // Copy result to host, overwrite swizzled image. + cudaMemcpy(blockLinearImage, m_ctx.result, count * 8, cudaMemcpyDeviceToHost); + + // Output result. + outputOptions.writeData(blockLinearImage, count * 8); + + bn += count; + } + + clock_t end = clock(); + //printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC); + + free(blockLinearImage); + +#else + outputOptions.error(Error_CudaError); +#endif +} + + +void CudaCompressor::compressDXT5n(const nvtt::CompressionOptions::Private & compressionOptions, const nvtt::OutputOptions::Private & outputOptions) +{ + nvDebugCheck(cuda::isHardwarePresent()); +#if defined HAVE_CUDA + + // @@ TODO + +#else + outputOptions.error(Error_CudaError); +#endif +} + +#endif // 0 + +#endif // defined HAVE_CUDA Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaMath.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaMath.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaMath.h @@ -1,260 +1,433 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -// Math functions and operators to be used with vector types. 
- -#ifndef CUDAMATH_H -#define CUDAMATH_H - -#include - - -inline __device__ __host__ float3 operator *(float3 a, float3 b) -{ - return make_float3(a.x*b.x, a.y*b.y, a.z*b.z); -} - -inline __device__ __host__ float3 operator *(float f, float3 v) -{ - return make_float3(v.x*f, v.y*f, v.z*f); -} - -inline __device__ __host__ float3 operator *(float3 v, float f) -{ - return make_float3(v.x*f, v.y*f, v.z*f); -} - -inline __device__ __host__ float3 operator +(float3 a, float3 b) -{ - return make_float3(a.x+b.x, a.y+b.y, a.z+b.z); -} - -inline __device__ __host__ void operator +=(float3 & b, float3 a) -{ - b.x += a.x; - b.y += a.y; - b.z += a.z; -} - -inline __device__ __host__ float3 operator -(float3 a, float3 b) -{ - return make_float3(a.x-b.x, a.y-b.y, a.z-b.z); -} - -inline __device__ __host__ void operator -=(float3 & b, float3 a) -{ - b.x -= a.x; - b.y -= a.y; - b.z -= a.z; -} - -inline __device__ __host__ float3 operator /(float3 v, float f) -{ - float inv = 1.0f / f; - return v * inv; -} - -inline __device__ __host__ void operator /=(float3 & b, float f) -{ - float inv = 1.0f / f; - b.x *= inv; - b.y *= inv; - b.z *= inv; -} - -inline __device__ __host__ bool operator ==(float3 a, float3 b) -{ - return a.x == b.x && a.y == b.y && a.z == b.z; -} - -inline __device__ __host__ float dot(float3 a, float3 b) -{ - return a.x * b.x + a.y * b.y + a.z * b.z; -} - -inline __device__ __host__ float dot(float4 a, float4 b) -{ - return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; -} - -inline __device__ __host__ float clamp(float f, float a, float b) -{ - return max(a, min(f, b)); -} - -inline __device__ __host__ float3 clamp(float3 v, float a, float b) -{ - return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); -} - -inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) -{ - return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); -} - - -inline __device__ __host__ float3 normalize(float3 v) -{ - float len = 1.0f / sqrtf(dot(v, v)); - return make_float3(v.x * len, v.y * len, v.z * len); -} - - - - -// Use power method to find the first eigenvector. -// http://www.miislita.com/information-retrieval-tutorial/matrix-tutorial-3-eigenvalues-eigenvectors.html -inline __device__ __host__ float3 firstEigenVector( float matrix[6] ) -{ - // 8 iterations seems to be more than enough. 
- - float3 row0 = make_float3(matrix[0], matrix[1], matrix[2]); - float3 row1 = make_float3(matrix[1], matrix[3], matrix[4]); - float3 row2 = make_float3(matrix[2], matrix[4], matrix[5]); - - float r0 = dot(row0, row0); - float r1 = dot(row1, row1); - float r2 = dot(row2, row2); - - float3 v; - if (r0 > r1 && r0 > r2) v = row0; - else if (r1 > r2) v = row1; - else v = row2; - - //float3 v = make_float3(1.0f, 1.0f, 1.0f); - for(int i = 0; i < 8; i++) { - float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2]; - float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4]; - float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5]; - float m = max(max(x, y), z); - float iv = 1.0f / m; - if (m == 0.0f) iv = 0.0f; - v = make_float3(x*iv, y*iv, z*iv); - } - - return v; -} - -inline __device__ bool singleColor(const float3 * colors) -{ -#if __DEVICE_EMULATION__ - bool sameColor = false; - for (int i = 0; i < 16; i++) - { - sameColor &= (colors[i] == colors[0]); - } - return sameColor; -#else - __shared__ int sameColor[16]; - - const int idx = threadIdx.x; - - sameColor[idx] = (colors[idx] == colors[0]); - sameColor[idx] &= sameColor[idx^8]; - sameColor[idx] &= sameColor[idx^4]; - sameColor[idx] &= sameColor[idx^2]; - sameColor[idx] &= sameColor[idx^1]; - - return sameColor[0]; -#endif -} - -inline __device__ void colorSums(const float3 * colors, float3 * sums) -{ -#if __DEVICE_EMULATION__ - float3 color_sum = make_float3(0.0f, 0.0f, 0.0f); - for (int i = 0; i < 16; i++) - { - color_sum += colors[i]; - } - - for (int i = 0; i < 16; i++) - { - sums[i] = color_sum; - } -#else - - const int idx = threadIdx.x; - - sums[idx] = colors[idx]; - sums[idx] += sums[idx^8]; - sums[idx] += sums[idx^4]; - sums[idx] += sums[idx^2]; - sums[idx] += sums[idx^1]; - -#endif -} - -inline __device__ float3 bestFitLine(const float3 * colors, float3 color_sum, float3 colorMetric) -{ - // Compute covariance matrix of the given colors. -#if __DEVICE_EMULATION__ - float covariance[6] = {0, 0, 0, 0, 0, 0}; - for (int i = 0; i < 16; i++) - { - float3 a = (colors[i] - color_sum * (1.0f / 16.0f)) * colorMetric; - covariance[0] += a.x * a.x; - covariance[1] += a.x * a.y; - covariance[2] += a.x * a.z; - covariance[3] += a.y * a.y; - covariance[4] += a.y * a.z; - covariance[5] += a.z * a.z; - } -#else - - const int idx = threadIdx.x; - - float3 diff = (colors[idx] - color_sum * (1.0f / 16.0f)) * colorMetric; - - // @@ Eliminate two-way bank conflicts here. - // @@ It seems that doing that and unrolling the reduction doesn't help... - __shared__ float covariance[16*6]; - - covariance[6 * idx + 0] = diff.x * diff.x; // 0, 6, 12, 2, 8, 14, 4, 10, 0 - covariance[6 * idx + 1] = diff.x * diff.y; - covariance[6 * idx + 2] = diff.x * diff.z; - covariance[6 * idx + 3] = diff.y * diff.y; - covariance[6 * idx + 4] = diff.y * diff.z; - covariance[6 * idx + 5] = diff.z * diff.z; - - for(int d = 8; d > 0; d >>= 1) - { - if (idx < d) - { - covariance[6 * idx + 0] += covariance[6 * (idx+d) + 0]; - covariance[6 * idx + 1] += covariance[6 * (idx+d) + 1]; - covariance[6 * idx + 2] += covariance[6 * (idx+d) + 2]; - covariance[6 * idx + 3] += covariance[6 * (idx+d) + 3]; - covariance[6 * idx + 4] += covariance[6 * (idx+d) + 4]; - covariance[6 * idx + 5] += covariance[6 * (idx+d) + 5]; - } - } - -#endif - - // Compute first eigen vector. 
- return firstEigenVector(covariance); -} - - -#endif // CUDAMATH_H +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +// Math functions and operators to be used with vector types. + +#ifndef CUDAMATH_H +#define CUDAMATH_H + + + +inline __device__ __host__ float3 operator *(float3 a, float3 b) +{ + return make_float3(a.x*b.x, a.y*b.y, a.z*b.z); +} + +inline __device__ __host__ float3 operator *(float f, float3 v) +{ + return make_float3(v.x*f, v.y*f, v.z*f); +} + +inline __device__ __host__ float3 operator *(float3 v, float f) +{ + return make_float3(v.x*f, v.y*f, v.z*f); +} + +inline __device__ __host__ float3 operator +(float3 a, float3 b) +{ + return make_float3(a.x+b.x, a.y+b.y, a.z+b.z); +} + +inline __device__ __host__ void operator +=(float3 & b, float3 a) +{ + b.x += a.x; + b.y += a.y; + b.z += a.z; +} + +inline __device__ __host__ float3 operator -(float3 a, float3 b) +{ + return make_float3(a.x-b.x, a.y-b.y, a.z-b.z); +} + +inline __device__ __host__ void operator -=(float3 & b, float3 a) +{ + b.x -= a.x; + b.y -= a.y; + b.z -= a.z; +} + +inline __device__ __host__ float3 operator /(float3 v, float f) +{ + float inv = 1.0f / f; + return v * inv; +} + +inline __device__ __host__ void operator /=(float3 & b, float f) +{ + float inv = 1.0f / f; + b.x *= inv; + b.y *= inv; + b.z *= inv; +} + +inline __device__ __host__ bool operator ==(float3 a, float3 b) +{ + return a.x == b.x && a.y == b.y && a.z == b.z; +} + + +// float2 operators +inline __device__ __host__ float2 operator *(float2 a, float2 b) +{ + return make_float2(a.x*b.x, a.y*b.y); +} + +inline __device__ __host__ float2 operator *(float f, float2 v) +{ + return make_float2(v.x*f, v.y*f); +} + +inline __device__ __host__ float2 operator *(float2 v, float f) +{ + return make_float2(v.x*f, v.y*f); +} + +inline __device__ __host__ float2 operator +(float2 a, float2 b) +{ + return make_float2(a.x+b.x, a.y+b.y); +} + +inline __device__ __host__ void operator +=(float2 & b, float2 a) +{ + b.x += a.x; + b.y += a.y; +} + +inline __device__ __host__ float2 operator -(float2 a, float2 b) +{ + return make_float2(a.x-b.x, a.y-b.y); +} + +inline __device__ __host__ void operator -=(float2 & b, float2 a) +{ + b.x -= a.x; + b.y -= a.y; +} + +inline __device__ __host__ float2 operator /(float2 v, float f) +{ + float inv = 1.0f / f; + return v * inv; +} + +inline __device__ __host__ void operator 
/=(float2 & b, float f) +{ + float inv = 1.0f / f; + b.x *= inv; + b.y *= inv; +} + +inline __device__ __host__ bool operator ==(float2 a, float2 b) +{ + return a.x == b.x && a.y == b.y; +} + + +inline __device__ __host__ float dot(float2 a, float2 b) +{ + return a.x * b.x + a.y * b.y; +} + +inline __device__ __host__ float dot(float3 a, float3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +inline __device__ __host__ float dot(float4 a, float4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +inline __device__ __host__ float clamp(float f, float a, float b) +{ + return max(a, min(f, b)); +} + +inline __device__ __host__ float3 clamp(float3 v, float a, float b) +{ + return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} + +inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) +{ + return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} + + +inline __device__ __host__ float3 normalize(float3 v) +{ + float len = 1.0f / sqrtf(dot(v, v)); + return make_float3(v.x * len, v.y * len, v.z * len); +} + +inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) +{ + const float s = 1.0f - t; + return make_float3(s * a.x + t * b.x, s * a.y + t * b.y, s * a.z + t * b.z); +} + +inline __device__ __host__ float lengthSquared(float3 a) +{ + return dot(a, a); +} + +inline __device__ __host__ float lengthSquared(float2 a) +{ + return dot(a, a); +} + + +// Use power method to find the first eigenvector. +// http://www.miislita.com/information-retrieval-tutorial/matrix-tutorial-3-eigenvalues-eigenvectors.html +inline __device__ __host__ float3 firstEigenVector( float matrix[6] ) +{ + // 8 iterations seems to be more than enough. + + float3 row0 = make_float3(matrix[0], matrix[1], matrix[2]); + float3 row1 = make_float3(matrix[1], matrix[3], matrix[4]); + float3 row2 = make_float3(matrix[2], matrix[4], matrix[5]); + + float r0 = dot(row0, row0); + float r1 = dot(row1, row1); + float r2 = dot(row2, row2); + + float3 v; + if (r0 > r1 && r0 > r2) v = row0; + else if (r1 > r2) v = row1; + else v = row2; + + //float3 v = make_float3(1.0f, 1.0f, 1.0f); + for(int i = 0; i < 8; i++) { + float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2]; + float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4]; + float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5]; + float m = max(max(x, y), z); + float iv = 1.0f / m; + if (m == 0.0f) iv = 0.0f; + v = make_float3(x*iv, y*iv, z*iv); + } + + return v; +} + + +inline __device__ bool singleColor(const float3 * colors) +{ +#if __DEVICE_EMULATION__ + bool sameColor = false; + for (int i = 0; i < 16; i++) + { + sameColor &= (colors[i] == colors[0]); + } + return sameColor; +#else + __shared__ int sameColor[16]; + + const int idx = threadIdx.x; + + sameColor[idx] = (colors[idx] == colors[0]); + sameColor[idx] &= sameColor[idx^8]; + sameColor[idx] &= sameColor[idx^4]; + sameColor[idx] &= sameColor[idx^2]; + sameColor[idx] &= sameColor[idx^1]; + + return sameColor[0]; +#endif +} + +inline __device__ void colorSums(const float3 * colors, float3 * sums) +{ +#if __DEVICE_EMULATION__ + float3 color_sum = make_float3(0.0f, 0.0f, 0.0f); + for (int i = 0; i < 16; i++) + { + color_sum += colors[i]; + } + + for (int i = 0; i < 16; i++) + { + sums[i] = color_sum; + } +#else + + const int idx = threadIdx.x; + + sums[idx] = colors[idx]; + sums[idx] += sums[idx^8]; + sums[idx] += sums[idx^4]; + sums[idx] += sums[idx^2]; + sums[idx] += sums[idx^1]; + +#endif +} + +inline 
__device__ float3 bestFitLine(const float3 * colors, float3 color_sum, float3 colorMetric) +{ + // Compute covariance matrix of the given colors. +#if __DEVICE_EMULATION__ + float covariance[6] = {0, 0, 0, 0, 0, 0}; + for (int i = 0; i < 16; i++) + { + float3 a = (colors[i] - color_sum * (1.0f / 16.0f)) * colorMetric; + covariance[0] += a.x * a.x; + covariance[1] += a.x * a.y; + covariance[2] += a.x * a.z; + covariance[3] += a.y * a.y; + covariance[4] += a.y * a.z; + covariance[5] += a.z * a.z; + } +#else + + const int idx = threadIdx.x; + + float3 diff = (colors[idx] - color_sum * (1.0f / 16.0f)) * colorMetric; + + // @@ Eliminate two-way bank conflicts here. + // @@ It seems that doing that and unrolling the reduction doesn't help... + __shared__ float covariance[16*6]; + + covariance[6 * idx + 0] = diff.x * diff.x; // 0, 6, 12, 2, 8, 14, 4, 10, 0 + covariance[6 * idx + 1] = diff.x * diff.y; + covariance[6 * idx + 2] = diff.x * diff.z; + covariance[6 * idx + 3] = diff.y * diff.y; + covariance[6 * idx + 4] = diff.y * diff.z; + covariance[6 * idx + 5] = diff.z * diff.z; + + for(int d = 8; d > 0; d >>= 1) + { + if (idx < d) + { + covariance[6 * idx + 0] += covariance[6 * (idx+d) + 0]; + covariance[6 * idx + 1] += covariance[6 * (idx+d) + 1]; + covariance[6 * idx + 2] += covariance[6 * (idx+d) + 2]; + covariance[6 * idx + 3] += covariance[6 * (idx+d) + 3]; + covariance[6 * idx + 4] += covariance[6 * (idx+d) + 4]; + covariance[6 * idx + 5] += covariance[6 * (idx+d) + 5]; + } + } + +#endif + + // Compute first eigen vector. + return firstEigenVector(covariance); +} + + +// @@ For 2D this may not be the most efficient method. It's a quadratic equation, right? +inline __device__ __host__ float2 firstEigenVector2D( float matrix[3] ) +{ + // @@ 8 iterations is probably more than enough. + + const float2 row0 = make_float2(matrix[0], matrix[1]); + const float2 row1 = make_float2(matrix[1], matrix[2]); + + float r0 = lengthSquared(row0); + float r1 = lengthSquared(row1); + + float2 v; + if (r0 > r1) v = row0; + v = row1; + + //float2 v = make_float2(1.0f, 1.0f); + for(int i = 0; i < 8; i++) { + float x = v.x * matrix[0] + v.y * matrix[1]; + float y = v.x * matrix[1] + v.y * matrix[2]; + float m = max(x, y); + float iv = 1.0f / m; + if (m == 0.0f) iv = 0.0f; + v = make_float2(x*iv, y*iv); + } + + return v; +} + +inline __device__ void colorSums(const float2 * colors, float2 * sums) +{ +#if __DEVICE_EMULATION__ + float2 color_sum = make_float2(0.0f, 0.0f); + for (int i = 0; i < 16; i++) + { + color_sum += colors[i]; + } + + for (int i = 0; i < 16; i++) + { + sums[i] = color_sum; + } +#else + + const int idx = threadIdx.x; + + sums[idx] = colors[idx]; + sums[idx] += sums[idx^8]; + sums[idx] += sums[idx^4]; + sums[idx] += sums[idx^2]; + sums[idx] += sums[idx^1]; + +#endif +} + +inline __device__ float2 bestFitLine(const float2 * colors, float2 color_sum) +{ + // Compute covariance matrix of the given colors. 
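For orientation, firstEigenVector() above (and its 2D counterpart firstEigenVector2D()) estimates the dominant eigenvector of the symmetric covariance matrix, stored as six floats {m00, m01, m02, m11, m12, m22}, by eight rounds of power iteration; that eigenvector is the best-fit line direction through the 16 colors of a block. A plain host C++ restatement of the same loop, with a covariance matrix made up for the demo:

#include <algorithm>
#include <cstdio>

int main()
{
    // Example covariance {m00, m01, m02, m11, m12, m22} (made up for the demo).
    const float m[6] = { 4.0f, 1.0f, 0.5f, 3.0f, 0.25f, 1.0f };
    // The kernel seeds v with the largest row; (1, 1, 1), the commented-out
    // alternative in the source, works just as well for this illustration.
    float v[3] = { 1.0f, 1.0f, 1.0f };

    for (int i = 0; i < 8; i++) // "8 iterations seems to be more than enough."
    {
        const float x = v[0] * m[0] + v[1] * m[1] + v[2] * m[2];
        const float y = v[0] * m[1] + v[1] * m[3] + v[2] * m[4];
        const float z = v[0] * m[2] + v[1] * m[4] + v[2] * m[5];
        const float mx = std::max(std::max(x, y), z);
        const float iv = (mx == 0.0f) ? 0.0f : 1.0f / mx; // same zero guard as the kernel
        v[0] = x * iv; v[1] = y * iv; v[2] = z * iv;
    }
    std::printf("dominant eigenvector ~ (%f, %f, %f)\n", v[0], v[1], v[2]);
    return 0;
}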
+#if __DEVICE_EMULATION__ + float covariance[3] = {0, 0, 0}; + for (int i = 0; i < 16; i++) + { + float2 a = (colors[i] - color_sum * (1.0f / 16.0f)); + covariance[0] += a.x * a.x; + covariance[1] += a.x * a.y; + covariance[2] += a.y * a.y; + } +#else + + const int idx = threadIdx.x; + + float2 diff = (colors[idx] - color_sum * (1.0f / 16.0f)); + + __shared__ float covariance[16*3]; + + covariance[3 * idx + 0] = diff.x * diff.x; + covariance[3 * idx + 1] = diff.x * diff.y; + covariance[3 * idx + 2] = diff.y * diff.y; + + for(int d = 8; d > 0; d >>= 1) + { + if (idx < d) + { + covariance[3 * idx + 0] += covariance[3 * (idx+d) + 0]; + covariance[3 * idx + 1] += covariance[3 * (idx+d) + 1]; + covariance[3 * idx + 2] += covariance[3 * (idx+d) + 2]; + } + } + +#endif + + // Compute first eigen vector. + return firstEigenVector2D(covariance); +} + + +#endif // CUDAMATH_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaUtils.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaUtils.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaUtils.h @@ -1,4 +1,5 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano // // Permission is hereby granted, free of charge, to any person // obtaining a copy of this software and associated documentation @@ -32,10 +33,8 @@ bool isHardwarePresent(); int deviceCount(); int getFastestDevice(); - bool isValidDevice(int i); - - bool initDevice(int * device_ptr); - void exitDevice(); + bool setDevice(int i); + void exit(); }; } // nv namespace Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaUtils.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaUtils.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/cuda/CudaUtils.cpp @@ -1,300 +1,239 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include -#include -#include "CudaUtils.h" - -#if defined HAVE_CUDA -#include -#include -#endif - -using namespace nv; -using namespace cuda; - -/* @@ Move this to win32 utils or somewhere else. 
-#if NV_OS_WIN32 - -#define WINDOWS_LEAN_AND_MEAN -#include - -static bool isWindowsVista() -{ -OSVERSIONINFO osvi; -osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); - -::GetVersionEx(&osvi); -return osvi.dwMajorVersion >= 6; -} - - -typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL); - -static bool isWow32() -{ -LPFN_ISWOW64PROCESS fnIsWow64Process = (LPFN_ISWOW64PROCESS)GetProcAddress(GetModuleHandle("kernel32"), "IsWow64Process"); - -BOOL bIsWow64 = FALSE; - -if (NULL != fnIsWow64Process) -{ -if (!fnIsWow64Process(GetCurrentProcess(), &bIsWow64)) -{ -// Assume 32 bits. -return true; -} -} - -return !bIsWow64; -} - -#endif -*/ - - -static bool isCudaDriverAvailable(int version) -{ -#if defined HAVE_CUDA -#if NV_OS_WIN32 - Library nvcuda("nvcuda.dll"); -#else - Library nvcuda(NV_LIBRARY_NAME(cuda)); -#endif - - if (!nvcuda.isValid()) - { - nvDebug("*** CUDA driver not found.\n"); - return false; - } - - if (version >= 2000) - { - void * address = nvcuda.bindSymbol("cuStreamCreate"); - if (address == NULL) { - nvDebug("*** CUDA driver version < 2.0.\n"); - return false; - } - } - - if (version >= 2010) - { - void * address = nvcuda.bindSymbol("cuModuleLoadDataEx"); - if (address == NULL) { - nvDebug("*** CUDA driver version < 2.1.\n"); - return false; - } - } - - if (version >= 2020) - { - typedef CUresult (CUDAAPI * PFCU_DRIVERGETVERSION)(int * version); - - PFCU_DRIVERGETVERSION driverGetVersion = (PFCU_DRIVERGETVERSION)nvcuda.bindSymbol("cuDriverGetVersion"); - if (driverGetVersion == NULL) { - nvDebug("*** CUDA driver version < 2.2.\n"); - return false; - } - - int driverVersion; - CUresult err = driverGetVersion(&driverVersion); - if (err != CUDA_SUCCESS) { - nvDebug("*** Error querying driver version: '%s'.\n", cudaGetErrorString((cudaError_t)err)); - return false; - } - - return driverVersion >= version; - } -#endif // HAVE_CUDA - - return true; -} - - -/// Determine if CUDA is available. -bool nv::cuda::isHardwarePresent() -{ -#if defined HAVE_CUDA - // Make sure that CUDA driver matches CUDA runtime. - if (!isCudaDriverAvailable(CUDART_VERSION)) - { - nvDebug("CUDA driver not available for CUDA runtime %d\n", CUDART_VERSION); - return false; - } - - int count = deviceCount(); - if (count == 1) - { - // Make sure it's not an emulation device. - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, 0); - - // deviceProp.name != Device Emulation (CPU) - if (deviceProp.major == -1 || deviceProp.minor == -1) - { - return false; - } - } - - // @@ Make sure that warp size == 32 - - // @@ Make sure available GPU is faster than the CPU. - - return count > 0; -#else - return false; -#endif -} - -/// Get number of CUDA enabled devices. -int nv::cuda::deviceCount() -{ -#if defined HAVE_CUDA - int gpuCount = 0; - - cudaError_t result = cudaGetDeviceCount(&gpuCount); - - if (result == cudaSuccess) - { - return gpuCount; - } -#endif - return 0; -} - - -// Make sure device meets requirements: -// - Not an emulation device. -// - Not an integrated device? -// - Faster than CPU. -bool nv::cuda::isValidDevice(int i) -{ -#if defined HAVE_CUDA - - cudaDeviceProp device_properties; - cudaGetDeviceProperties(&device_properties, i); - int gflops = device_properties.multiProcessorCount * device_properties.clockRate; - - if (device_properties.major == -1 || device_properties.minor == -1) { - // Emulation device. - return false; - } - -#if CUDART_VERSION >= 2030 // 2.3 - /*if (device_properties.integrated) - { - // Integrated devices. 
- return false; - }*/ -#endif - - return true; -#else - return false; -#endif -} - -int nv::cuda::getFastestDevice() -{ - int max_gflops_device = -1; -#if defined HAVE_CUDA - int max_gflops = 0; - - const int device_count = deviceCount(); - for (int i = 0; i < device_count; i++) - { - if (isValidDevice(i)) - { - cudaDeviceProp device_properties; - cudaGetDeviceProperties(&device_properties, i); - int gflops = device_properties.multiProcessorCount * device_properties.clockRate; - - if (gflops > max_gflops) - { - max_gflops = gflops; - max_gflops_device = i; - } - } - } -#endif - return max_gflops_device; -} - - -/// Activate the given devices. -bool nv::cuda::initDevice(int * device_ptr) -{ - nvDebugCheck(device_ptr != NULL); -#if defined HAVE_CUDA - -#if CUDART_VERSION >= 2030 // 2.3 - - // Set device flags to yield in order to play nice with other threads and to find out if CUDA was already active. - cudaError_t resul = cudaSetDeviceFlags(cudaDeviceScheduleYield); - -#endif - - int device = getFastestDevice(); - - if (device == -1) - { - // No device is fast enough. - *device_ptr = -1; - return false; - } - - // Select CUDA device. - cudaError_t result = cudaSetDevice(device); - - if (result == cudaErrorSetOnActiveProcess) - { - int device; - result = cudaGetDevice(&device); - - *device_ptr = -1; // No device to cleanup. - return isValidDevice(device); // Return true if device is valid. - } - else if (result != cudaSuccess) - { - nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(result)); - *device_ptr = -1; - return false; - } - - *device_ptr = device; - return true; -#else - return false; -#endif -} - -void nv::cuda::exitDevice() -{ -#if defined HAVE_CUDA - cudaError_t result = cudaThreadExit(); - - if (result != cudaSuccess) { - nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(result)); - } -#endif -} +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include "nvcore/Debug.h" +#include "CudaUtils.h" + +#if defined HAVE_CUDA +#include +#include +#endif + +using namespace nv; +using namespace cuda; + +/* @@ Move this to win32 utils or somewhere else. 
+#if NV_OS_WIN32 + +#define WINDOWS_LEAN_AND_MEAN +#include + +static bool isWindowsVista() +{ + OSVERSIONINFO osvi; + osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); + + ::GetVersionEx(&osvi); + return osvi.dwMajorVersion >= 6; +} + + +typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL); + +static bool isWow32() +{ + LPFN_ISWOW64PROCESS fnIsWow64Process = (LPFN_ISWOW64PROCESS)GetProcAddress(GetModuleHandle("kernel32"), "IsWow64Process"); + + BOOL bIsWow64 = FALSE; + + if (NULL != fnIsWow64Process) + { + if (!fnIsWow64Process(GetCurrentProcess(), &bIsWow64)) + { + // Assume 32 bits. + return true; + } + } + + return !bIsWow64; +} + +#endif +*/ + + +static bool isCudaDriverAvailable(int version) +{ +#if defined HAVE_CUDA +#if NV_OS_WIN32 + Library nvcuda("nvcuda.dll"); +#else + Library nvcuda(NV_LIBRARY_NAME(cuda)); +#endif + + if (!nvcuda.isValid()) + { + nvDebug("*** CUDA driver not found.\n"); + return false; + } + + if (version >= 2000) + { + void * address = nvcuda.bindSymbol("cuStreamCreate"); + if (address == NULL) { + nvDebug("*** CUDA driver version < 2.0.\n"); + return false; + } + } + + if (version >= 2010) + { + void * address = nvcuda.bindSymbol("cuModuleLoadDataEx"); + if (address == NULL) { + nvDebug("*** CUDA driver version < 2.1.\n"); + return false; + } + } + + if (version >= 2020) + { + typedef CUresult (CUDAAPI * PFCU_DRIVERGETVERSION)(int * version); + + PFCU_DRIVERGETVERSION driverGetVersion = (PFCU_DRIVERGETVERSION)nvcuda.bindSymbol("cuDriverGetVersion"); + if (driverGetVersion == NULL) { + nvDebug("*** CUDA driver version < 2.2.\n"); + return false; + } + + int driverVersion; + CUresult err = driverGetVersion(&driverVersion); + if (err != CUDA_SUCCESS) { + nvDebug("*** Error querying driver version: '%s'.\n", cudaGetErrorString((cudaError_t)err)); + return false; + } + + return driverVersion >= version; + } +#endif // HAVE_CUDA + + return true; +} + + +/// Determine if CUDA is available. +bool nv::cuda::isHardwarePresent() +{ +#if defined HAVE_CUDA + // Make sure that CUDA driver matches CUDA runtime. + if (!isCudaDriverAvailable(CUDART_VERSION)) + { + nvDebug("CUDA driver not available for CUDA runtime %d\n", CUDART_VERSION); + return false; + } + + int count = deviceCount(); + if (count == 1) + { + // Make sure it's not an emulation device. + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, 0); + + // deviceProp.name != Device Emulation (CPU) + if (deviceProp.major == -1 || deviceProp.minor == -1) + { + return false; + } + } + + // @@ Make sure that warp size == 32 + + return count > 0; +#else + return false; +#endif +} + +/// Get number of CUDA enabled devices. 
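The simplified nv::cuda interface declared in CudaUtils.h above (isHardwarePresent, deviceCount, getFastestDevice, setDevice, exit) replaces the old initDevice/exitDevice/isValidDevice trio. A sketch of how a caller is presumably expected to drive it; the surrounding control flow is an assumption, only the helpers shown in this patch are used:

#include "CudaUtils.h"

// Hypothetical caller: pick a device, make it current, run the GPU path,
// then tear the CUDA context down.  With the 2.1 helpers, getFastestDevice()
// always returns a device index (0 by default) and setDevice() reports
// failure instead of handing back a device pointer to clean up.
static bool runGpuCompression()
{
    if (!nv::cuda::isHardwarePresent())
        return false;                         // fall back to the CPU compressors

    const int device = nv::cuda::getFastestDevice();
    if (!nv::cuda::setDevice(device))
        return false;

    // ... construct a CudaContext and the per-format CudaCompressor here ...

    nv::cuda::exit();                         // wraps cudaThreadExit()
    return true;
}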
+int nv::cuda::deviceCount() +{ +#if defined HAVE_CUDA + int gpuCount = 0; + + cudaError_t result = cudaGetDeviceCount(&gpuCount); + + if (result == cudaSuccess) + { + return gpuCount; + } +#endif + return 0; +} + +int nv::cuda::getFastestDevice() +{ + int max_gflops_device = 0; +#if defined HAVE_CUDA + int max_gflops = 0; + + const int device_count = deviceCount(); + int current_device = 0; + while (current_device < device_count) + { + cudaDeviceProp device_properties; + cudaGetDeviceProperties(&device_properties, current_device); + int gflops = device_properties.multiProcessorCount * device_properties.clockRate; + + if (device_properties.major != -1 && device_properties.minor != -1) + { + if( gflops > max_gflops ) + { + max_gflops = gflops; + max_gflops_device = current_device; + } + } + + current_device++; + } +#endif + return max_gflops_device; +} + + +/// Activate the given devices. +bool nv::cuda::setDevice(int i) +{ + nvCheck(i < deviceCount()); +#if defined HAVE_CUDA + cudaError_t result = cudaSetDevice(i); + + if (result != cudaSuccess) { + nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(result)); + } + + return result == cudaSuccess; +#else + return false; +#endif +} + +void nv::cuda::exit() +{ +#if defined HAVE_CUDA + cudaError_t result = cudaThreadExit(); + + if (result != cudaSuccess) { + nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(result)); + } +#endif +} Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/experimental/nvtt_experimental.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/experimental/nvtt_experimental.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/experimental/nvtt_experimental.h @@ -0,0 +1,103 @@ + +#ifndef NVTT_EXPERIMENTAL_H +#define NVTT_EXPERIMENTAL_H + +#include + +typedef struct NvttTexture NvttTexture; +typedef struct NvttOutputOptions NvttOutputOptions; + + +// Global functions +void nvttInitialize(...); +unsigned int nvttGetVersion(); +const char * nvttGetErrorString(unsigned int error); + + +// Texture functions +NvttTexture * nvttCreateTexture(); +void nvttDestroyTexture(NvttTexture * tex); + +void nvttSetTexture2D(NvttTexture * tex, NvttInputFormat format, uint w, uint h, uint idx, void * data); + +void nvttResize(NvttTexture * img, uint w, uint h); +unsigned int nvttDownsample(NvttTexture * img); + +void nvttOutputCompressed(NvttTexture * img, NvttOutputFormat format); +void nvttOutputPixelFormat(NvttTexture * img, NvttOutputFormat format); + + + + +// How to control the compression parameters? + +// Using many arguments: +// void nvttCompressImage(img, format, quality, r, g, b, a, ...); + +// Using existing compression option class: +// compressionOptions = nvttCreateCompressionOptions(); +// nvttSetCompressionOptionsFormat(compressionOptions, format); +// nvttSetCompressionOptionsQuality(compressionOptions, quality); +// nvttSetCompressionOptionsQuality(compressionOptions, quality); +// nvttSetCompressionOptionsColorWeights(compressionOptions, r, g, b, a); +// ... +// nvttCompressImage(img, compressionOptions); + +// Using thread local context state: +// void nvttSetCompressionFormat(format); +// void nvttSetCompressionQuality(quality); +// void nvttSetCompressionColorWeights(r, g, b, a); +// ... 
+// nvttCompressImage(img); + +// Using thread local context state, but with GL style function arguments: +// nvttCompressorParameteri(NVTT_FORMAT, format); +// nvttCompressorParameteri(NVTT_QUALITY, quality); +// nvttCompressorParameterf(NVTT_COLOR_WEIGHT_RED, r); +// nvttCompressorParameterf(NVTT_COLOR_WEIGHT_GREEN, g); +// nvttCompressorParameterf(NVTT_COLOR_WEIGHT_BLUE, b); +// nvttCompressorParameterf(NVTT_COLOR_WEIGHT_ALPHA, a); +// or nvttCompressorParameter4f(NVTT_COLOR_WEIGHTS, r, g, b, a); +// ... +// nvttCompressImage(img); + +// How do we get the compressed output? +// - Using callbacks. (via new entrypoints, or through outputOptions) +// - Return it explicitely from nvttCompressImage. +// - Store it along the image, retrieve later explicitely with 'nvttGetCompressedData(img, ...)' + +/* + +// Global functions +void nvttInitialize(...); +unsigned int nvttGetVersion(); +const char * nvttGetErrorString(unsigned int error); + +// Context object +void nvttCreateContext(); +void nvttDestroyContext(); + +void nvttSetParameter1i(unsigned int name, int value); + +void nvttSetParameter1f(unsigned int name, float value); +void nvttSetParameter2f(unsigned int name, float v0, float v1); +void nvttSetParameter3f(unsigned int name, float v0, float v1, float v2); +void nvttSetParameter4f(unsigned int name, float v0, float v1, float v2, float v3); + +// Image object +NvttImage * nvttCreateImage(); +void nvttDestroyImage(NvttImage * img); + +void nvttSetImageData(NvttImage * image, NvttInputFormat format, unsigned int w, unsigned int h, void * data); + +void nvttSetImageParameter1i(NvttImage * image, unsigned int name, int value); +void nvttSetImageParameter1f(NvttImage * image, unsigned int name, float value); + +void nvttResizeImage(NvttImage * image, unsigned int w, unsigned int h); +void nvttQuantizeImage(NvttImage * image, bool dither, unsigned int rbits, unsigned int gbits, unsigned int bbits, unsigned int abits); +void nvttCompressImage(NvttImage * image, void * buffer, int size); + +*/ + + +#endif // NVTT_EXPERIMENTAL_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/experimental/nvtt_experimental.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/experimental/nvtt_experimental.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/experimental/nvtt_experimental.cpp @@ -0,0 +1,57 @@ + +#include "nvtt_experimental.h" + +struct NvttTexture +{ + NvttTexture() : + m_constant(false), + m_image(NULL), + m_floatImage(NULL) + { + } + + ~NvttTexture() + { + if (m_constant && m_image) m_image->unwrap(); + delete m_image; + delete m_floatImage; + } + + bool m_constant; + Image * m_image; + FloatImage * m_floatImage; +}; + +NvttTexture * nvttCreateTexture() +{ + return new NvttTexture(); +} + +void nvttDestroyTexture(NvttTexture * tex) +{ + delete tex; +} + +void nvttSetImageData(NvttImage * img, NvttInputFormat format, uint w, uint h, void * data) +{ + nvCheck(img != NULL); + + if (format == NVTT_InputFormat_BGRA_8UB) + { + img->m_constant = false; + img->m_image->allocate(w, h); + memcpy(img->m_image->pixels(), data, w * h * 4); + } + else + { + nvCheck(false); + } +} + +void nvttCompressImage(NvttImage * img, NvttFormat format) +{ + nvCheck(img != NULL); + + // @@ Invoke appropriate compressor. 
+} + Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/experimental/test.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/experimental/test.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/experimental/test.cpp @@ -0,0 +1,61 @@ + +#include "nvtt_experimental.h" + +/* +Errors in the original API: +- Too many memory copies. +- Implementation too complicated. +- Error output should not be in output options. +- Data driven interface. Follows the dialog model. Provide all the data upfront. +*/ + + +// Output texture with mipmaps +void example0() +{ + CompressionOptions compressionOptions; + OutputOptions outputOptions; + + Texture img; + img.setTexture2D(format, w, h, 0, data); + + Compressor context; + context.outputHeader(outputOptions); + context.outputCompressed(img, compressionOptions, outputOptions); + + img.toLinear(2.2); + while (img.downsample(NVTT_FILTER_BOX)) + { + img.toGamma(2.2); + outputCompressed(img, compressionOptions, outputOptions); + } +} + + +// Output texture with colored mipmaps +void example1() +{ + CompressionOptions compressionOptions; + OutputOptions outputOptions; + + Texture img; + img.setTexture2D(format, w, h, 0, data); + + Compressor context; + context.outputHeader(outputOptions); + context.outputCompressed(img, compressionOptions, outputOptions); + + img.toLinear(2.2); + while (img.downsample(NVTT_FILTER_BOX)) + { + img.toGamma(2.2); + + Texture mipmap = img; + mipmap.blend(color[i].r, color[i].g, color[i].b, 0.5f); + + context.outputCompressed(mipmap, compressionOptions, outputOptions); + } +} + + + Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/nvtt.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/nvtt.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/nvtt.h @@ -1,308 +1,676 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
- -#ifndef NV_TT_H -#define NV_TT_H - -// Function linkage -#if NVTT_SHARED - -#if defined _WIN32 || defined WIN32 || defined __NT__ || defined __WIN32__ || defined __MINGW32__ -# ifdef NVTT_EXPORTS -# define NVTT_API __declspec(dllexport) -# else -# define NVTT_API __declspec(dllimport) -# endif -#endif - -#if defined __GNUC__ >= 4 -# ifdef NVTT_EXPORTS -# define NVTT_API __attribute__((visibility("default"))) -# endif -#endif - -#endif // NVTT_SHARED - -#if !defined NVTT_API -# define NVTT_API -#endif - -#define NVTT_VERSION 200 - -#define NVTT_DECLARE_PIMPL(Class) \ - private: \ - Class(const Class &); \ - void operator=(const Class &); \ - public: \ - struct Private; \ - Private & m - - -// Public interface. -namespace nvtt -{ - /// Supported compression formats. - enum Format - { - // No compression. - Format_RGB, - Format_RGBA = Format_RGB, - - // DX9 formats. - Format_DXT1, - Format_DXT1a, // DXT1 with binary alpha. - Format_DXT3, - Format_DXT5, - Format_DXT5n, // Compressed HILO: R=1, G=y, B=0, A=x - - // DX10 formats. - Format_BC1 = Format_DXT1, - Format_BC1a = Format_DXT1a, - Format_BC2 = Format_DXT3, - Format_BC3 = Format_DXT5, - Format_BC3n = Format_DXT5n, - Format_BC4, // ATI1 - Format_BC5, // 3DC, ATI2 - }; - - /// Quality modes. - enum Quality - { - Quality_Fastest, - Quality_Normal, - Quality_Production, - Quality_Highest, - }; - - /// Compression options. This class describes the desired compression format and other compression settings. - struct CompressionOptions - { - NVTT_DECLARE_PIMPL(CompressionOptions); - - NVTT_API CompressionOptions(); - NVTT_API ~CompressionOptions(); - - NVTT_API void reset(); - - NVTT_API void setFormat(Format format); - NVTT_API void setQuality(Quality quality); - NVTT_API void setColorWeights(float red, float green, float blue, float alpha = 1.0f); - - NVTT_API void setExternalCompressor(const char * name); - - // Set color mask to describe the RGB/RGBA format. - NVTT_API void setPixelFormat(unsigned int bitcount, unsigned int rmask, unsigned int gmask, unsigned int bmask, unsigned int amask); - - NVTT_API void setQuantization(bool colorDithering, bool alphaDithering, bool binaryAlpha, int alphaThreshold = 127); - }; - - - /// Wrap modes. - enum WrapMode - { - WrapMode_Clamp, - WrapMode_Repeat, - WrapMode_Mirror, - }; - - /// Texture types. - enum TextureType - { - TextureType_2D, - TextureType_Cube, - // TextureType_3D, - }; - - /// Input formats. - enum InputFormat - { - InputFormat_BGRA_8UB, - // InputFormat_RGBE_8UB, - // InputFormat_BGRA_32F, - }; - - /// Mipmap downsampling filters. - enum MipmapFilter - { - MipmapFilter_Box, ///< Box filter is quite good and very fast. - MipmapFilter_Triangle, ///< Triangle filter blurs the results too much, but that might be what you want. - MipmapFilter_Kaiser, ///< Kaiser-windowed Sinc filter is the best downsampling filter. - }; - - /// Color transformation. - enum ColorTransform - { - ColorTransform_None, - ColorTransform_Linear, - }; - - /// Extents rounding mode. - enum RoundMode - { - RoundMode_None, - RoundMode_ToNextPowerOfTwo, - RoundMode_ToNearestPowerOfTwo, - RoundMode_ToPreviousPowerOfTwo, - }; - - /// Alpha mode. - enum AlphaMode - { - AlphaMode_None, - AlphaMode_Transparency, - AlphaMode_Premultiplied, - }; - - /// Input options. Specify format and layout of the input texture. - struct InputOptions - { - NVTT_DECLARE_PIMPL(InputOptions); - - NVTT_API InputOptions(); - NVTT_API ~InputOptions(); - - // Set default options. - NVTT_API void reset(); - - // Setup input layout. 
- NVTT_API void setTextureLayout(TextureType type, int w, int h, int d = 1); - NVTT_API void resetTextureLayout(); - - // Set mipmap data. Copies the data. - NVTT_API bool setMipmapData(const void * data, int w, int h, int d = 1, int face = 0, int mipmap = 0); - - // Describe the format of the input. - NVTT_API void setFormat(InputFormat format); - - // Set the way the input alpha channel is interpreted. - NVTT_API void setAlphaMode(AlphaMode alphaMode); - - // Set gamma settings. - NVTT_API void setGamma(float inputGamma, float outputGamma); - - // Set texture wrappign mode. - NVTT_API void setWrapMode(WrapMode mode); - - // Set mipmapping options. - NVTT_API void setMipmapFilter(MipmapFilter filter); - NVTT_API void setMipmapGeneration(bool enabled, int maxLevel = -1); - NVTT_API void setKaiserParameters(float width, float alpha, float stretch); - - // Set normal map options. - NVTT_API void setNormalMap(bool b); - NVTT_API void setConvertToNormalMap(bool convert); - NVTT_API void setHeightEvaluation(float redScale, float greenScale, float blueScale, float alphaScale); - NVTT_API void setNormalFilter(float sm, float medium, float big, float large); - NVTT_API void setNormalizeMipmaps(bool b); - - // Set color transforms. @@ Not implemented! - NVTT_API void setColorTransform(ColorTransform t); - NVTT_API void setLinearTransform(int channel, float w0, float w1, float w2, float w3); - - // Set resizing options. - NVTT_API void setMaxExtents(int d); - NVTT_API void setRoundMode(RoundMode mode); - }; - - - /// Output handler. - struct OutputHandler - { - virtual ~OutputHandler() {} - - /// Indicate the start of a new compressed image that's part of the final texture. - virtual void beginImage(int size, int width, int height, int depth, int face, int miplevel) = 0; - - /// Output data. Compressed data is output as soon as it's generated to minimize memory allocations. - virtual bool writeData(const void * data, int size) = 0; - }; - - /// Error codes. - enum Error - { - Error_Unknown, - Error_InvalidInput, - Error_UnsupportedFeature, - Error_CudaError, - Error_FileOpen, - Error_FileWrite, - }; - - /// Error handler. - struct ErrorHandler - { - virtual ~ErrorHandler() {} - - // Signal error. - virtual void error(Error e) = 0; - }; - - - /// Output Options. This class holds pointers to the interfaces that are used to report the output of - /// the compressor to the user. - struct OutputOptions - { - NVTT_DECLARE_PIMPL(OutputOptions); - - NVTT_API OutputOptions(); - NVTT_API ~OutputOptions(); - - // Set default options. - NVTT_API void reset(); - - NVTT_API void setFileName(const char * fileName); - - NVTT_API void setOutputHandler(OutputHandler * outputHandler); - NVTT_API void setErrorHandler(ErrorHandler * errorHandler); - NVTT_API void setOutputHeader(bool outputHeader); - }; - - - /// Texture compressor. - struct Compressor - { - NVTT_DECLARE_PIMPL(Compressor); - - NVTT_API Compressor(); - NVTT_API ~Compressor(); - - NVTT_API void enableCudaAcceleration(bool enable); - NVTT_API bool isCudaAccelerationEnabled() const; - - // Main entrypoint of the compression library. - NVTT_API bool process(const InputOptions & inputOptions, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; - - // Estimate the size of compressing the input with the given options. - NVTT_API int estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions) const; - }; - - - // Return string for the given error code. 
- NVTT_API const char * errorString(Error e); - - // Return NVTT version. - NVTT_API unsigned int version(); - -} // nvtt namespace - -#endif // NV_TT_H +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#pragma once +#ifndef NVTT_H +#define NVTT_H + +// Function linkage +#if NVTT_SHARED + +#if defined _WIN32 || defined WIN32 || defined __NT__ || defined __WIN32__ || defined __MINGW32__ +# ifdef NVTT_EXPORTS +# define NVTT_API __declspec(dllexport) +# else +# define NVTT_API __declspec(dllimport) +# endif +#endif + +#if defined __GNUC__ >= 4 +# ifdef NVTT_EXPORTS +# define NVTT_API __attribute__((visibility("default"))) +# endif +#endif + +#endif // NVTT_SHARED + +#if !defined NVTT_API +# define NVTT_API +#endif + +#define NVTT_VERSION 20100 + +#define NVTT_FORBID_COPY(Class) \ + private: \ + Class(const Class &); \ + void operator=(const Class &); \ + public: + +#define NVTT_DECLARE_PIMPL(Class) \ + public: \ + struct Private; \ + Private & m + + +// Public interface. +namespace nvtt +{ + // Forward declarations. + struct Surface; + struct CubeSurface; + + + // Supported block-compression formats. + // @@ I wish I had distinguished between "formats" and compressors. + // That is: + // - 'DXT1' is a format 'DXT1a' and 'DXT1n' are DXT1 compressors. + // - 'DXT3' is a format 'DXT3n' is a DXT3 compressor. + // Having multiple enums for the same ids only creates confusion. Clean this up. + enum Format + { + // No block-compression (linear). + Format_RGB, + Format_RGBA = Format_RGB, + + // DX9 formats. + Format_DXT1, + Format_DXT1a, // DXT1 with binary alpha. + Format_DXT3, + Format_DXT5, + Format_DXT5n, // Compressed HILO: R=1, G=y, B=0, A=x + + // DX10 formats. + Format_BC1 = Format_DXT1, + Format_BC1a = Format_DXT1a, + Format_BC2 = Format_DXT3, + Format_BC3 = Format_DXT5, + Format_BC3n = Format_DXT5n, + Format_BC4, // ATI1 + Format_BC5, // 3DC, ATI2 + + Format_DXT1n, // Not supported. + Format_CTX1, // Not supported. + + Format_BC6, + Format_BC7, + + Format_BC3_RGBM, // + + Format_Count + }; + + // Pixel types. These basically indicate how the output should be interpreted, but do not have any influence over the input. They are only relevant in RGBA mode. + enum PixelType + { + PixelType_UnsignedNorm = 0, + PixelType_SignedNorm = 1, // Not supported yet. + PixelType_UnsignedInt = 2, // Not supported yet. 
+ PixelType_SignedInt = 3, // Not supported yet. + PixelType_Float = 4, + PixelType_UnsignedFloat = 5, + PixelType_SharedExp = 6, // Shared exponent. + }; + + // Quality modes. + enum Quality + { + Quality_Fastest, + Quality_Normal, + Quality_Production, + Quality_Highest, + }; + + // DXT decoder. + enum Decoder + { + Decoder_D3D10, + Decoder_D3D9, + Decoder_NV5x, + //Decoder_RSX, // To take advantage of DXT5 bug. + }; + + + // Compression options. This class describes the desired compression format and other compression settings. + struct CompressionOptions + { + NVTT_FORBID_COPY(CompressionOptions); + NVTT_DECLARE_PIMPL(CompressionOptions); + + NVTT_API CompressionOptions(); + NVTT_API ~CompressionOptions(); + + NVTT_API void reset(); + + NVTT_API void setFormat(Format format); + NVTT_API void setQuality(Quality quality); + NVTT_API void setColorWeights(float red, float green, float blue, float alpha = 1.0f); + + NVTT_API void setExternalCompressor(const char * name); + + // Set color mask to describe the RGB/RGBA format. + NVTT_API void setPixelFormat(unsigned int bitcount, unsigned int rmask, unsigned int gmask, unsigned int bmask, unsigned int amask); + NVTT_API void setPixelFormat(unsigned char rsize, unsigned char gsize, unsigned char bsize, unsigned char asize); + + NVTT_API void setPixelType(PixelType pixelType); + + NVTT_API void setPitchAlignment(int pitchAlignment); + + // @@ I wish this wasn't part of the compression options. Quantization is applied before compression. We don't have compressors with error diffusion. + // @@ These options are only taken into account when using the InputOptions API. + NVTT_API void setQuantization(bool colorDithering, bool alphaDithering, bool binaryAlpha, int alphaThreshold = 127); + + NVTT_API void setTargetDecoder(Decoder decoder); + + // Translate to and from D3D formats. + NVTT_API unsigned int d3d9Format() const; + //NVTT_API bool setD3D9Format(unsigned int format); + //NVTT_API unsigned int dxgiFormat() const; + //NVTT_API bool setDxgiFormat(unsigned int format); + }; + + /* + // DXGI_FORMAT_R16G16_FLOAT + compressionOptions.setPixelType(PixelType_Float); + compressionOptions.setPixelFormat2(16, 16, 0, 0); + + // DXGI_FORMAT_R32G32B32A32_FLOAT + compressionOptions.setPixelType(PixelType_Float); + compressionOptions.setPixelFormat2(32, 32, 32, 32); + */ + + + // Wrap modes. + enum WrapMode + { + WrapMode_Clamp, + WrapMode_Repeat, + WrapMode_Mirror, + }; + + // Texture types. + enum TextureType + { + TextureType_2D, + TextureType_Cube, + TextureType_3D, + TextureType_Array, + }; + + // Input formats. + enum InputFormat + { + InputFormat_BGRA_8UB, // Normalized [0, 1] 8 bit fixed point. + InputFormat_RGBA_16F, // 16 bit floating point. + InputFormat_RGBA_32F, // 32 bit floating point. + InputFormat_R_32F, // Single channel 32 bit floating point. + }; + + // Mipmap downsampling filters. + enum MipmapFilter + { + MipmapFilter_Box, // Box filter is quite good and very fast. + MipmapFilter_Triangle, // Triangle filter blurs the results too much, but that might be what you want. + MipmapFilter_Kaiser, // Kaiser-windowed Sinc filter is the best downsampling filter. + }; + + // Texture resize filters. + enum ResizeFilter + { + ResizeFilter_Box, + ResizeFilter_Triangle, + ResizeFilter_Kaiser, + ResizeFilter_Mitchell, + }; + + // Extents rounding mode. 
+ enum RoundMode + { + RoundMode_None, + RoundMode_ToNextPowerOfTwo, + RoundMode_ToNearestPowerOfTwo, + RoundMode_ToPreviousPowerOfTwo, + RoundMode_ToNextMultipleOfFour, // (New in NVTT 2.1) + RoundMode_ToNearestMultipleOfFour, // (New in NVTT 2.1) + RoundMode_ToPreviousMultipleOfFour, // (New in NVTT 2.1) + }; + + // Alpha mode. + enum AlphaMode + { + AlphaMode_None, + AlphaMode_Transparency, + AlphaMode_Premultiplied, + }; + + // Input options. Specify format and layout of the input texture. (Deprecated in NVTT 2.1) + struct InputOptions + { + NVTT_FORBID_COPY(InputOptions); + NVTT_DECLARE_PIMPL(InputOptions); + + NVTT_API InputOptions(); + NVTT_API ~InputOptions(); + + // Set default options. + NVTT_API void reset(); + + // Setup input layout. + NVTT_API void setTextureLayout(TextureType type, int w, int h, int d = 1, int arraySize = 1); + NVTT_API void resetTextureLayout(); + + // Set mipmap data. Copies the data. + NVTT_API bool setMipmapData(const void * data, int w, int h, int d = 1, int face = 0, int mipmap = 0); + + // Describe the format of the input. + NVTT_API void setFormat(InputFormat format); + + // Set the way the input alpha channel is interpreted. @@ Not implemented! + NVTT_API void setAlphaMode(AlphaMode alphaMode); + + // Set gamma settings. + NVTT_API void setGamma(float inputGamma, float outputGamma); + + // Set texture wrapping mode. + NVTT_API void setWrapMode(WrapMode mode); + + // Set mipmapping options. + NVTT_API void setMipmapFilter(MipmapFilter filter); + NVTT_API void setMipmapGeneration(bool enabled, int maxLevel = -1); + NVTT_API void setKaiserParameters(float width, float alpha, float stretch); + + // Set normal map options. + NVTT_API void setNormalMap(bool b); + NVTT_API void setConvertToNormalMap(bool convert); + NVTT_API void setHeightEvaluation(float redScale, float greenScale, float blueScale, float alphaScale); + NVTT_API void setNormalFilter(float sm, float medium, float big, float large); + NVTT_API void setNormalizeMipmaps(bool b); + + // Set resizing options. + NVTT_API void setMaxExtents(int d); + NVTT_API void setRoundMode(RoundMode mode); + }; + + + // Output handler. + struct OutputHandler + { + virtual ~OutputHandler() {} + + // Indicate the start of a new compressed image that's part of the final texture. + virtual void beginImage(int size, int width, int height, int depth, int face, int miplevel) = 0; + + // Output data. Compressed data is output as soon as it's generated to minimize memory allocations. + virtual bool writeData(const void * data, int size) = 0; + + // Indicate the end of the compressed image. (New in NVTT 2.1) + virtual void endImage() = 0; + }; + + // Error codes. + enum Error + { + Error_Unknown, + Error_InvalidInput, + Error_UnsupportedFeature, + Error_CudaError, + Error_FileOpen, + Error_FileWrite, + Error_UnsupportedOutputFormat, + Error_Count + }; + + // Error handler. + struct ErrorHandler + { + virtual ~ErrorHandler() {} + + // Signal error. + virtual void error(Error e) = 0; + }; + + // Container. + enum Container + { + Container_DDS, + Container_DDS10, + // Container_KTX, // Khronos Texture: http://www.khronos.org/opengles/sdk/tools/KTX/ + // Container_VTF, // Valve Texture Format: http://developer.valvesoftware.com/wiki/Valve_Texture_Format + }; + + + // Output Options. This class holds pointers to the interfaces that are used to report the output of + // the compressor to the user. 
+ struct OutputOptions + { + NVTT_FORBID_COPY(OutputOptions); + NVTT_DECLARE_PIMPL(OutputOptions); + + NVTT_API OutputOptions(); + NVTT_API ~OutputOptions(); + + // Set default options. + NVTT_API void reset(); + + NVTT_API void setFileName(const char * fileName); + NVTT_API void setFileHandle(void * fp); + + NVTT_API void setOutputHandler(OutputHandler * outputHandler); + NVTT_API void setErrorHandler(ErrorHandler * errorHandler); + + NVTT_API void setOutputHeader(bool outputHeader); + NVTT_API void setContainer(Container container); + NVTT_API void setUserVersion(int version); + NVTT_API void setSrgbFlag(bool b); + }; + + // (New in NVTT 2.1) + typedef void Task(void * context, int id); + + // (New in NVTT 2.1) + struct TaskDispatcher + { + virtual ~TaskDispatcher() {} + + virtual void dispatch(Task * task, void * context, int count) = 0; + }; + + // Context. + struct Compressor + { + NVTT_FORBID_COPY(Compressor); + NVTT_DECLARE_PIMPL(Compressor); + + NVTT_API Compressor(); + NVTT_API ~Compressor(); + + // Context settings. + NVTT_API void enableCudaAcceleration(bool enable); + NVTT_API bool isCudaAccelerationEnabled() const; + NVTT_API void setTaskDispatcher(TaskDispatcher * disp); // (New in NVTT 2.1) + + // InputOptions API. + NVTT_API bool process(const InputOptions & inputOptions, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; + NVTT_API int estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions) const; + + // Surface API. (New in NVTT 2.1) + NVTT_API bool outputHeader(const Surface & img, int mipmapCount, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; + NVTT_API bool compress(const Surface & img, int face, int mipmap, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; + NVTT_API int estimateSize(const Surface & img, int mipmapCount, const CompressionOptions & compressionOptions) const; + + // CubeSurface API. (New in NVTT 2.1) + NVTT_API bool outputHeader(const CubeSurface & cube, int mipmapCount, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; + NVTT_API bool compress(const CubeSurface & cube, int mipmap, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; + NVTT_API int estimateSize(const CubeSurface & cube, int mipmapCount, const CompressionOptions & compressionOptions) const; + + // Raw API. (New in NVTT 2.1) + NVTT_API bool outputHeader(TextureType type, int w, int h, int d, int arraySize, int mipmapCount, bool isNormalMap, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; + NVTT_API bool compress(int w, int h, int d, int face, int mipmap, const float * rgba, const CompressionOptions & compressionOptions, const OutputOptions & outputOptions) const; + NVTT_API int estimateSize(int w, int h, int d, int mipmapCount, const CompressionOptions & compressionOptions) const; + }; + + // "Compressor" is deprecated. This should have been called "Context" + typedef Compressor Context; + + // (New in NVTT 2.1) + enum NormalTransform { + NormalTransform_Orthographic, + NormalTransform_Stereographic, + NormalTransform_Paraboloid, + NormalTransform_Quartic + //NormalTransform_DualParaboloid, + }; + + // (New in NVTT 2.1) + enum ToneMapper { + ToneMapper_Linear, + ToneMapper_Reindhart, + ToneMapper_Halo, + ToneMapper_Lightmap, + }; + + + // A surface is one level of a 2D or 3D texture. 
(New in NVTT 2.1) + // @@ It would be nice to add support for texture borders for correct resizing of tiled textures and constrained DXT compression. + struct Surface + { + NVTT_API Surface(); + NVTT_API Surface(const Surface & img); + NVTT_API ~Surface(); + + NVTT_API void operator=(const Surface & img); + + // Texture parameters. + NVTT_API void setWrapMode(WrapMode mode); + NVTT_API void setAlphaMode(AlphaMode alphaMode); + NVTT_API void setNormalMap(bool isNormalMap); + + // Queries. + NVTT_API bool isNull() const; + NVTT_API int width() const; + NVTT_API int height() const; + NVTT_API int depth() const; + NVTT_API TextureType type() const; + NVTT_API WrapMode wrapMode() const; + NVTT_API AlphaMode alphaMode() const; + NVTT_API bool isNormalMap() const; + NVTT_API int countMipmaps() const; + NVTT_API int countMipmaps(int min_size) const; + NVTT_API float alphaTestCoverage(float alphaRef = 0.5, int alpha_channel = 3) const; + NVTT_API float average(int channel, int alpha_channel = -1, float gamma = 2.2f) const; + NVTT_API const float * data() const; + NVTT_API const float * channel(int i) const; + NVTT_API void histogram(int channel, float rangeMin, float rangeMax, int binCount, int * binPtr) const; + NVTT_API void range(int channel, float * rangeMin, float * rangeMax, int alpha_channel = -1, float alpha_ref = 0.f) const; + + // Texture data. + NVTT_API bool load(const char * fileName, bool * hasAlpha = 0); + NVTT_API bool save(const char * fileName, bool hasAlpha = 0, bool hdr = 0) const; + NVTT_API bool setImage(int w, int h, int d); + NVTT_API bool setImage(InputFormat format, int w, int h, int d, const void * data); + NVTT_API bool setImage(InputFormat format, int w, int h, int d, const void * r, const void * g, const void * b, const void * a); + NVTT_API bool setImage2D(Format format, Decoder decoder, int w, int h, const void * data); + + // Resizing methods. + NVTT_API void resize(int w, int h, int d, ResizeFilter filter); + NVTT_API void resize(int w, int h, int d, ResizeFilter filter, float filterWidth, const float * params = 0); + NVTT_API void resize(int maxExtent, RoundMode mode, ResizeFilter filter); + NVTT_API void resize(int maxExtent, RoundMode mode, ResizeFilter filter, float filterWidth, const float * params = 0); + NVTT_API void resize_make_square(int maxExtent, RoundMode roundMode, ResizeFilter filter); + + NVTT_API bool buildNextMipmap(MipmapFilter filter, int min_size = 1); + NVTT_API bool buildNextMipmap(MipmapFilter filter, float filterWidth, const float * params = 0, int min_size = 1); + NVTT_API bool buildNextMipmapSolidColor(const float * const color_components); + NVTT_API void canvasSize(int w, int h, int d); + // associated to resizing: + NVTT_API bool canMakeNextMipmap(int min_size = 1); + + // Color transforms. 
+ NVTT_API void toLinear(float gamma); + NVTT_API void toGamma(float gamma); + NVTT_API void toLinear(int channel, float gamma); + NVTT_API void toGamma(int channel, float gamma); + NVTT_API void toSrgb(); + NVTT_API void toLinearFromSrgb(); + NVTT_API void toXenonSrgb(); + NVTT_API void transform(const float w0[4], const float w1[4], const float w2[4], const float w3[4], const float offset[4]); + NVTT_API void swizzle(int r, int g, int b, int a); + NVTT_API void scaleBias(int channel, float scale, float bias); + NVTT_API void clamp(int channel, float low = 0.0f, float high = 1.0f); + NVTT_API void blend(float r, float g, float b, float a, float t); + NVTT_API void premultiplyAlpha(); + NVTT_API void toGreyScale(float redScale, float greenScale, float blueScale, float alphaScale); + NVTT_API void setBorder(float r, float g, float b, float a); + NVTT_API void fill(float r, float g, float b, float a); + NVTT_API void scaleAlphaToCoverage(float coverage, float alphaRef = 0.5f, int alpha_channel = 3); + NVTT_API void toRGBM(float range = 1.0f, float threshold = 0.25f); + NVTT_API void fromRGBM(float range = 1.0f, float threshold = 0.25f); + NVTT_API void toLM(float range = 1.0f, float threshold = 0.0f); + NVTT_API void toRGBE(int mantissaBits, int exponentBits); + NVTT_API void fromRGBE(int mantissaBits, int exponentBits); + NVTT_API void toYCoCg(); + NVTT_API void blockScaleCoCg(int bits = 5, float threshold = 0.0f); + NVTT_API void fromYCoCg(); + NVTT_API void toLUVW(float range = 1.0f); + NVTT_API void fromLUVW(float range = 1.0f); + NVTT_API void abs(int channel); + NVTT_API void convolve(int channel, int kernelSize, float * kernelData); + NVTT_API void toLogScale(int channel, float base); + NVTT_API void fromLogScale(int channel, float base); + NVTT_API void setAtlasBorder(int w, int h, float r, float g, float b, float a); + + NVTT_API void toneMap(ToneMapper tm, float * parameters); + + //NVTT_API void blockLuminanceScale(float scale); + + // Color quantization. + NVTT_API void binarize(int channel, float threshold, bool dither); + NVTT_API void quantize(int channel, int bits, bool exactEndPoints, bool dither); + + // Normal map transforms. + NVTT_API void toNormalMap(float sm, float medium, float big, float large); + NVTT_API void normalizeNormalMap(); + NVTT_API void transformNormals(NormalTransform xform); + NVTT_API void reconstructNormals(NormalTransform xform); + NVTT_API void toCleanNormalMap(); + NVTT_API void packNormals(float scale = 0.5f, float bias = 0.5f); // [-1,1] -> [ 0,1] + NVTT_API void expandNormals(float scale = 2.0f, float bias = -1.0f); // [ 0,1] -> [-1,1] + NVTT_API Surface createToksvigMap(float power) const; + NVTT_API Surface createCleanMap() const; + + // Geometric transforms. + NVTT_API void flipX(); + NVTT_API void flipY(); + NVTT_API void flipZ(); + NVTT_API Surface createSubImage(int x0, int x1, int y0, int y1, int z0, int z1) const; + + // Copy image data. + NVTT_API bool copyChannel(const Surface & srcImage, int srcChannel); + NVTT_API bool copyChannel(const Surface & srcImage, int srcChannel, int dstChannel); + + NVTT_API bool addChannel(const Surface & img, int srcChannel, int dstChannel, float scale); + + NVTT_API bool copy(const Surface & src, int xsrc, int ysrc, int zsrc, int xsize, int ysize, int zsize, int xdst, int ydst, int zdst); + + + //private: + void detach(); + + struct Private; + Private * m; + }; + + + // Cube layout formats. 
(New in NVTT 2.1) + enum CubeLayout { + CubeLayout_VerticalCross, + CubeLayout_HorizontalCross, + CubeLayout_Column, + CubeLayout_Row, + CubeLayout_LatitudeLongitude + }; + + // (New in NVTT 2.1) + enum EdgeFixup { + EdgeFixup_None, + EdgeFixup_Stretch, + EdgeFixup_Warp, + EdgeFixup_Average, + }; + + // A CubeSurface is one level of a cube map texture. (New in NVTT 2.1) + struct CubeSurface + { + NVTT_API CubeSurface(); + NVTT_API CubeSurface(const CubeSurface & img); + NVTT_API ~CubeSurface(); + + NVTT_API void operator=(const CubeSurface & img); + + // Queries. + NVTT_API bool isNull() const; + NVTT_API int edgeLength() const; + NVTT_API int countMipmaps() const; + + // Texture data. + NVTT_API bool load(const char * fileName, int mipmap); + NVTT_API bool save(const char * fileName) const; + + NVTT_API Surface & face(int face); + NVTT_API const Surface & face(int face) const; + + // Layout conversion. @@ Not implemented. + NVTT_API void fold(const Surface & img, CubeLayout layout); + NVTT_API Surface unfold(CubeLayout layout) const; + + // @@ Angular extent filtering. + + // @@ Add resizing methods. + + // @@ Add edge fixup methods. + + NVTT_API float average(int channel) const; + NVTT_API void range(int channel, float * minimum_ptr, float * maximum_ptr) const; + NVTT_API void clamp(int channel, float low = 0.0f, float high = 1.0f); + + + // Filtering. + NVTT_API CubeSurface irradianceFilter(int size, EdgeFixup fixupMethod) const; + NVTT_API CubeSurface cosinePowerFilter(int size, float cosinePower, EdgeFixup fixupMethod) const; + + NVTT_API CubeSurface fastResample(int size, EdgeFixup fixupMethod) const; + + + /* + NVTT_API void resize(int w, int h, ResizeFilter filter); + NVTT_API void resize(int w, int h, ResizeFilter filter, float filterWidth, const float * params = 0); + NVTT_API void resize(int maxExtent, RoundMode mode, ResizeFilter filter); + NVTT_API void resize(int maxExtent, RoundMode mode, ResizeFilter filter, float filterWidth, const float * params = 0); + NVTT_API bool buildNextMipmap(MipmapFilter filter); + NVTT_API bool buildNextMipmap(MipmapFilter filter, float filterWidth, const float * params = 0); + */ + + // Color transforms. + NVTT_API void toLinear(float gamma); + NVTT_API void toGamma(float gamma); + + //private: + void detach(); + + struct Private; + Private * m; + }; + + + // Return string for the given error code. + NVTT_API const char * errorString(Error e); + + // Return NVTT version. + NVTT_API unsigned int version(); + + // Image comparison and error measurement functions. 
(New in NVTT 2.1) + NVTT_API float rmsError(const Surface & reference, const Surface & img); + NVTT_API float rmsAlphaError(const Surface & reference, const Surface & img); + NVTT_API float cieLabError(const Surface & reference, const Surface & img); + NVTT_API float angularError(const Surface & reference, const Surface & img); + NVTT_API Surface diff(const Surface & reference, const Surface & img, float scale); + + NVTT_API float rmsToneMappedError(const Surface & reference, const Surface & img, float exposure); + + + NVTT_API Surface histogram(const Surface & img, int width, int height); + NVTT_API Surface histogram(const Surface & img, float minRange, float maxRange, int width, int height); + +} // nvtt namespace + +#endif // NVTT_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/nvtt.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/nvtt.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/nvtt.cpp @@ -1,55 +1,59 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include "nvtt.h" - -using namespace nvtt; - -/// Return a string for the given error. -const char * nvtt::errorString(Error e) -{ - switch(e) - { - case Error_Unknown: - return "Unknown error"; - case Error_InvalidInput: - return "Invalid input"; - case Error_UnsupportedFeature: - return "Unsupported feature"; - case Error_CudaError: - return "CUDA error"; - case Error_FileOpen: - return "Error opening file"; - case Error_FileWrite: - return "Error writing through output handler"; - } - - return "Invalid error"; -} - -/// Return NVTT version. -unsigned int nvtt::version() -{ - return NVTT_VERSION; -} - +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +#include "nvtt.h" +#include "nvcore/nvcore.h" + +using namespace nvtt; + +// Return a string for the given error. +const char * nvtt::errorString(Error e) +{ + NV_COMPILER_CHECK(Error_Count == 7); + switch(e) + { + case Error_Unknown: + return "Unknown error"; + case Error_InvalidInput: + return "Invalid input"; + case Error_UnsupportedFeature: + return "Unsupported feature"; + case Error_CudaError: + return "CUDA error"; + case Error_FileOpen: + return "Error opening file"; + case Error_FileWrite: + return "Error writing through output handler"; + case Error_UnsupportedOutputFormat: + return "The container file does not support the selected output format"; + } + + return "Invalid error"; +} + +// Return NVTT version. +unsigned int nvtt::version() +{ + return NVTT_VERSION; +} Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/nvtt_wrapper.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/nvtt_wrapper.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/nvtt_wrapper.h @@ -1,241 +1,235 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
- -#ifndef NVTT_WRAPPER_H -#define NVTT_WRAPPER_H - -// Function linkage -#if NVTT_SHARED - -#if defined _WIN32 || defined WIN32 || defined __NT__ || defined __WIN32__ || defined __MINGW32__ -# ifdef NVTT_EXPORTS -# define NVTT_API __declspec(dllexport) -# else -# define NVTT_API __declspec(dllimport) -# endif -#endif - -#if defined __GNUC__ >= 4 -# ifdef NVTT_EXPORTS -# define NVTT_API __attribute__((visibility("default"))) -# endif -#endif - -#endif // NVTT_SHARED - -#if !defined NVTT_API -# define NVTT_API -#endif - -#define NVTT_VERSION 200 - -#ifdef __cplusplus -typedef struct nvtt::InputOptions NvttInputOptions; -typedef struct nvtt::CompressionOptions NvttCompressionOptions; -typedef struct nvtt::OutputOptions NvttOutputOptions; -typedef struct nvtt::Compressor NvttCompressor; -#else -typedef struct NvttInputOptions NvttInputOptions; -typedef struct NvttCompressionOptions NvttCompressionOptions; -typedef struct NvttOutputOptions NvttOutputOptions; -typedef struct NvttCompressor NvttCompressor; -#endif - -/// Supported compression formats. -typedef enum -{ - // No compression. - NVTT_Format_RGB, - NVTT_Format_RGBA = NVTT_Format_RGB, - - // DX9 formats. - NVTT_Format_DXT1, - NVTT_Format_DXT1a, - NVTT_Format_DXT3, - NVTT_Format_DXT5, - NVTT_Format_DXT5n, - - // DX10 formats. - NVTT_Format_BC1 = NVTT_Format_DXT1, - NVTT_Format_BC1a = NVTT_Format_DXT1a, - NVTT_Format_BC2 = NVTT_Format_DXT3, - NVTT_Format_BC3 = NVTT_Format_DXT5, - NVTT_Format_BC3n = NVTT_Format_DXT5n, - NVTT_Format_BC4, - NVTT_Format_BC5, -} NvttFormat; - -/// Quality modes. -typedef enum -{ - NVTT_Quality_Fastest, - NVTT_Quality_Normal, - NVTT_Quality_Production, - NVTT_Quality_Highest, -} NvttQuality; - -/// Wrap modes. -typedef enum -{ - NVTT_WrapMode_Clamp, - NVTT_WrapMode_Repeat, - NVTT_WrapMode_Mirror, -} NvttWrapMode; - -/// Texture types. -typedef enum -{ - NVTT_TextureType_2D, - NVTT_TextureType_Cube, -} NvttTextureType; - -/// Input formats. -typedef enum -{ - NVTT_InputFormat_BGRA_8UB, -} NvttInputFormat; - -/// Mipmap downsampling filters. -typedef enum -{ - NVTT_MipmapFilter_Box, - NVTT_MipmapFilter_Triangle, - NVTT_MipmapFilter_Kaiser, -} NvttMipmapFilter; - -/// Color transformation. -typedef enum -{ - NVTT_ColorTransform_None, - NVTT_ColorTransform_Linear, -} NvttColorTransform; - -/// Extents rounding mode. -typedef enum -{ - NVTT_RoundMode_None, - NVTT_RoundMode_ToNextPowerOfTwo, - NVTT_RoundMode_ToNearestPowerOfTwo, - NVTT_RoundMode_ToPreviousPowerOfTwo, -} NvttRoundMode; - -/// Alpha mode. -typedef enum -{ - NVTT_AlphaMode_None, - NVTT_AlphaMode_Transparency, - NVTT_AlphaMode_Premultiplied, -} NvttAlphaMode; - -typedef enum -{ - NVTT_Error_InvalidInput, - NVTT_Error_UserInterruption, - NVTT_Error_UnsupportedFeature, - NVTT_Error_CudaError, - NVTT_Error_Unknown, - NVTT_Error_FileOpen, - NVTT_Error_FileWrite, -} NvttError; - -typedef enum -{ - NVTT_False, - NVTT_True, -} NvttBoolean; - - -#ifdef __cplusplus -extern "C" { -#endif - -// Callbacks -//typedef void (* nvttErrorHandler)(NvttError e); -//typedef void (* nvttOutputHandler)(const void * data, int size); -//typedef void (* nvttImageHandler)(int size, int width, int height, int depth, int face, int miplevel); - - -// InputOptions class. 
-NVTT_API NvttInputOptions * nvttCreateInputOptions(); -NVTT_API void nvttDestroyInputOptions(NvttInputOptions * inputOptions); - -NVTT_API void nvttSetInputOptionsTextureLayout(NvttInputOptions * inputOptions, NvttTextureType type, int w, int h, int d); -NVTT_API void nvttResetInputOptionsTextureLayout(NvttInputOptions * inputOptions); -NVTT_API NvttBoolean nvttSetInputOptionsMipmapData(NvttInputOptions * inputOptions, const void * data, int w, int h, int d, int face, int mipmap); -NVTT_API void nvttSetInputOptionsFormat(NvttInputOptions * inputOptions, NvttInputFormat format); -NVTT_API void nvttSetInputOptionsAlphaMode(NvttInputOptions * inputOptions, NvttAlphaMode alphaMode); -NVTT_API void nvttSetInputOptionsGamma(NvttInputOptions * inputOptions, float inputGamma, float outputGamma); -NVTT_API void nvttSetInputOptionsWrapMode(NvttInputOptions * inputOptions, NvttWrapMode mode); -NVTT_API void nvttSetInputOptionsMipmapFilter(NvttInputOptions * inputOptions, NvttMipmapFilter filter); -NVTT_API void nvttSetInputOptionsMipmapGeneration(NvttInputOptions * inputOptions, NvttBoolean enabled, int maxLevel); -NVTT_API void nvttSetInputOptionsKaiserParameters(NvttInputOptions * inputOptions, float width, float alpha, float stretch); -NVTT_API void nvttSetInputOptionsNormalMap(NvttInputOptions * inputOptions, NvttBoolean b); -NVTT_API void nvttSetInputOptionsConvertToNormalMap(NvttInputOptions * inputOptions, NvttBoolean convert); -NVTT_API void nvttSetInputOptionsHeightEvaluation(NvttInputOptions * inputOptions, float redScale, float greenScale, float blueScale, float alphaScale); -NVTT_API void nvttSetInputOptionsNormalFilter(NvttInputOptions * inputOptions, float sm, float medium, float big, float large); -NVTT_API void nvttSetInputOptionsNormalizeMipmaps(NvttInputOptions * inputOptions, NvttBoolean b); -NVTT_API void nvttSetInputOptionsColorTransform(NvttInputOptions * inputOptions, NvttColorTransform t); -NVTT_API void nvttSetInputOptionsLinearTransform(NvttInputOptions * inputOptions, int channel, float w0, float w1, float w2, float w3); -NVTT_API void nvttSetInputOptionsMaxExtents(NvttInputOptions * inputOptions, int dim); -NVTT_API void nvttSetInputOptionsRoundMode(NvttInputOptions * inputOptions, NvttRoundMode mode); - - -// CompressionOptions class. -NVTT_API NvttCompressionOptions * nvttCreateCompressionOptions(); -NVTT_API void nvttDestroyCompressionOptions(NvttCompressionOptions * compressionOptions); - -NVTT_API void nvttSetCompressionOptionsFormat(NvttCompressionOptions * compressionOptions, NvttFormat format); -NVTT_API void nvttSetCompressionOptionsQuality(NvttCompressionOptions * compressionOptions, NvttQuality quality); -NVTT_API void nvttSetCompressionOptionsColorWeights(NvttCompressionOptions * compressionOptions, float red, float green, float blue, float alpha); -NVTT_API void nvttSetCompressionOptionsPixelFormat(NvttCompressionOptions * compressionOptions, unsigned int bitcount, unsigned int rmask, unsigned int gmask, unsigned int bmask, unsigned int amask); -NVTT_API void nvttSetCompressionOptionsQuantization(NvttCompressionOptions * compressionOptions, NvttBoolean colorDithering, NvttBoolean alphaDithering, NvttBoolean binaryAlpha, int alphaThreshold); - - -// OutputOptions class. 
-NVTT_API NvttOutputOptions * nvttCreateOutputOptions(); -NVTT_API void nvttDestroyOutputOptions(NvttOutputOptions * outputOptions); - -NVTT_API void nvttSetOutputOptionsFileName(NvttOutputOptions * outputOptions, const char * fileName); -NVTT_API void nvttSetOutputOptionsOutputHeader(NvttOutputOptions * outputOptions, NvttBoolean b); -//NVTT_API void nvttSetOutputOptionsErrorHandler(NvttOutputOptions * outputOptions, nvttErrorHandler errorHandler); -//NVTT_API void nvttSetOutputOptionsOutputHandler(NvttOutputOptions * outputOptions, nvttOutputHandler outputHandler, nvttImageHandler imageHandler); - - -// Compressor class. -NVTT_API NvttCompressor * nvttCreateCompressor(); -NVTT_API void nvttDestroyCompressor(NvttCompressor * compressor); - -NVTT_API NvttBoolean nvttCompress(const NvttCompressor * compressor, const NvttInputOptions * inputOptions, const NvttCompressionOptions * compressionOptions, const NvttOutputOptions * outputOptions); -NVTT_API int nvttEstimateSize(const NvttCompressor * compressor, const NvttInputOptions * inputOptions, const NvttCompressionOptions * compressionOptions); - - -// Global functions. -NVTT_API const char * nvttErrorString(NvttError e); -NVTT_API unsigned int nvttVersion(); - - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // NVTT_WRAPPER_H +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef NVTT_WRAPPER_H +#define NVTT_WRAPPER_H + +// Function linkage +#if NVTT_SHARED + +#if defined _WIN32 || defined WIN32 || defined __NT__ || defined __WIN32__ || defined __MINGW32__ +# ifdef NVTT_EXPORTS +# define NVTT_API __declspec(dllexport) +# else +# define NVTT_API __declspec(dllimport) +# endif +#endif + +#if defined __GNUC__ >= 4 +# ifdef NVTT_EXPORTS +# define NVTT_API __attribute__((visibility("default"))) +# endif +#endif + +#endif // NVTT_SHARED + +#if !defined NVTT_API +# define NVTT_API +#endif + +#define NVTT_VERSION 20100 + +#ifdef __cplusplus +typedef struct nvtt::InputOptions NvttInputOptions; +typedef struct nvtt::CompressionOptions NvttCompressionOptions; +typedef struct nvtt::OutputOptions NvttOutputOptions; +typedef struct nvtt::Compressor NvttCompressor; +#else +typedef struct NvttInputOptions NvttInputOptions; +typedef struct NvttCompressionOptions NvttCompressionOptions; +typedef struct NvttOutputOptions NvttOutputOptions; +typedef struct NvttCompressor NvttCompressor; +#endif + +/// Supported compression formats. +typedef enum +{ + // No compression. + NVTT_Format_RGB, + NVTT_Format_RGBA = NVTT_Format_RGB, + + // DX9 formats. + NVTT_Format_DXT1, + NVTT_Format_DXT1a, + NVTT_Format_DXT3, + NVTT_Format_DXT5, + NVTT_Format_DXT5n, + + // DX10 formats. + NVTT_Format_BC1 = NVTT_Format_DXT1, + NVTT_Format_BC1a = NVTT_Format_DXT1a, + NVTT_Format_BC2 = NVTT_Format_DXT3, + NVTT_Format_BC3 = NVTT_Format_DXT5, + NVTT_Format_BC3n = NVTT_Format_DXT5n, + NVTT_Format_BC4, + NVTT_Format_BC5, +} NvttFormat; + +/// Quality modes. +typedef enum +{ + NVTT_Quality_Fastest, + NVTT_Quality_Normal, + NVTT_Quality_Production, + NVTT_Quality_Highest, +} NvttQuality; + +/// Wrap modes. +typedef enum +{ + NVTT_WrapMode_Clamp, + NVTT_WrapMode_Repeat, + NVTT_WrapMode_Mirror, +} NvttWrapMode; + +/// Texture types. +typedef enum +{ + NVTT_TextureType_2D, + NVTT_TextureType_Cube, +} NvttTextureType; + +/// Input formats. +typedef enum +{ + NVTT_InputFormat_BGRA_8UB, +} NvttInputFormat; + +/// Mipmap downsampling filters. +typedef enum +{ + NVTT_MipmapFilter_Box, + NVTT_MipmapFilter_Triangle, + NVTT_MipmapFilter_Kaiser, +} NvttMipmapFilter; + +/// Extents rounding mode. +typedef enum +{ + NVTT_RoundMode_None, + NVTT_RoundMode_ToNextPowerOfTwo, + NVTT_RoundMode_ToNearestPowerOfTwo, + NVTT_RoundMode_ToPreviousPowerOfTwo, +} NvttRoundMode; + +/// Alpha mode. +typedef enum +{ + NVTT_AlphaMode_None, + NVTT_AlphaMode_Transparency, + NVTT_AlphaMode_Premultiplied, +} NvttAlphaMode; + +typedef enum +{ + NVTT_Error_InvalidInput, + NVTT_Error_UserInterruption, + NVTT_Error_UnsupportedFeature, + NVTT_Error_CudaError, + NVTT_Error_Unknown, + NVTT_Error_FileOpen, + NVTT_Error_FileWrite, + NVTT_Error_UnsupportedOutputFormat, +} NvttError; + +typedef enum +{ + NVTT_False, + NVTT_True, +} NvttBoolean; + + +#ifdef __cplusplus +extern "C" { +#endif + +// Callbacks +//typedef void (* nvttErrorHandler)(NvttError e); +typedef void (* nvttBeginImageHandler)(int size, int width, int height, int depth, int face, int miplevel); +typedef bool (* nvttOutputHandler)(const void * data, int size); +typedef void (* nvttEndImageHandler)(); + + +// InputOptions class. 
+NVTT_API NvttInputOptions * nvttCreateInputOptions(); +NVTT_API void nvttDestroyInputOptions(NvttInputOptions * inputOptions); + +NVTT_API void nvttSetInputOptionsTextureLayout(NvttInputOptions * inputOptions, NvttTextureType type, int w, int h, int d); +NVTT_API void nvttResetInputOptionsTextureLayout(NvttInputOptions * inputOptions); +NVTT_API NvttBoolean nvttSetInputOptionsMipmapData(NvttInputOptions * inputOptions, const void * data, int w, int h, int d, int face, int mipmap); +NVTT_API void nvttSetInputOptionsFormat(NvttInputOptions * inputOptions, NvttInputFormat format); +NVTT_API void nvttSetInputOptionsAlphaMode(NvttInputOptions * inputOptions, NvttAlphaMode alphaMode); +NVTT_API void nvttSetInputOptionsGamma(NvttInputOptions * inputOptions, float inputGamma, float outputGamma); +NVTT_API void nvttSetInputOptionsWrapMode(NvttInputOptions * inputOptions, NvttWrapMode mode); +NVTT_API void nvttSetInputOptionsMipmapFilter(NvttInputOptions * inputOptions, NvttMipmapFilter filter); +NVTT_API void nvttSetInputOptionsMipmapGeneration(NvttInputOptions * inputOptions, NvttBoolean enabled, int maxLevel); +NVTT_API void nvttSetInputOptionsKaiserParameters(NvttInputOptions * inputOptions, float width, float alpha, float stretch); +NVTT_API void nvttSetInputOptionsNormalMap(NvttInputOptions * inputOptions, NvttBoolean b); +NVTT_API void nvttSetInputOptionsConvertToNormalMap(NvttInputOptions * inputOptions, NvttBoolean convert); +NVTT_API void nvttSetInputOptionsHeightEvaluation(NvttInputOptions * inputOptions, float redScale, float greenScale, float blueScale, float alphaScale); +NVTT_API void nvttSetInputOptionsNormalFilter(NvttInputOptions * inputOptions, float sm, float medium, float big, float large); +NVTT_API void nvttSetInputOptionsNormalizeMipmaps(NvttInputOptions * inputOptions, NvttBoolean b); +NVTT_API void nvttSetInputOptionsMaxExtents(NvttInputOptions * inputOptions, int dim); +NVTT_API void nvttSetInputOptionsRoundMode(NvttInputOptions * inputOptions, NvttRoundMode mode); + + +// CompressionOptions class. +NVTT_API NvttCompressionOptions * nvttCreateCompressionOptions(); +NVTT_API void nvttDestroyCompressionOptions(NvttCompressionOptions * compressionOptions); + +NVTT_API void nvttSetCompressionOptionsFormat(NvttCompressionOptions * compressionOptions, NvttFormat format); +NVTT_API void nvttSetCompressionOptionsQuality(NvttCompressionOptions * compressionOptions, NvttQuality quality); +NVTT_API void nvttSetCompressionOptionsColorWeights(NvttCompressionOptions * compressionOptions, float red, float green, float blue, float alpha); +NVTT_API void nvttSetCompressionOptionsPixelFormat(NvttCompressionOptions * compressionOptions, unsigned int bitcount, unsigned int rmask, unsigned int gmask, unsigned int bmask, unsigned int amask); +NVTT_API void nvttSetCompressionOptionsQuantization(NvttCompressionOptions * compressionOptions, NvttBoolean colorDithering, NvttBoolean alphaDithering, NvttBoolean binaryAlpha, int alphaThreshold); + + +// OutputOptions class. 
+NVTT_API NvttOutputOptions * nvttCreateOutputOptions(); +NVTT_API void nvttDestroyOutputOptions(NvttOutputOptions * outputOptions); + +NVTT_API void nvttSetOutputOptionsFileName(NvttOutputOptions * outputOptions, const char * fileName); +NVTT_API void nvttSetOutputOptionsOutputHeader(NvttOutputOptions * outputOptions, NvttBoolean b); +//NVTT_API void nvttSetOutputOptionsErrorHandler(NvttOutputOptions * outputOptions, nvttErrorHandler errorHandler); +NVTT_API void nvttSetOutputOptionsOutputHandler(NvttOutputOptions * outputOptions, nvttBeginImageHandler beginImageHandler, nvttOutputHandler outputHandler, nvttEndImageHandler endImageHandler); + + +// Compressor class. +NVTT_API NvttCompressor * nvttCreateCompressor(); +NVTT_API void nvttDestroyCompressor(NvttCompressor * compressor); + +NVTT_API NvttBoolean nvttCompress(const NvttCompressor * compressor, const NvttInputOptions * inputOptions, const NvttCompressionOptions * compressionOptions, const NvttOutputOptions * outputOptions); +NVTT_API int nvttEstimateSize(const NvttCompressor * compressor, const NvttInputOptions * inputOptions, const NvttCompressionOptions * compressionOptions); + + +// Global functions. +NVTT_API const char * nvttErrorString(NvttError e); +NVTT_API unsigned int nvttVersion(); + + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // NVTT_WRAPPER_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/nvtt_wrapper.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/nvtt_wrapper.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/nvtt_wrapper.cpp @@ -1,208 +1,293 @@ - -#include "nvtt.h" -#include "nvtt_wrapper.h" - - -// InputOptions class. -NvttInputOptions * nvttCreateInputOptions() -{ - return new nvtt::InputOptions(); -} - -void nvttDestroyInputOptions(NvttInputOptions * inputOptions) -{ - delete inputOptions; -} - -void nvttSetInputOptionsTextureLayout(NvttInputOptions * inputOptions, NvttTextureType type, int w, int h, int d) -{ - inputOptions->setTextureLayout((nvtt::TextureType)type, w, h, d); -} - -void nvttResetInputOptionsTextureLayout(NvttInputOptions * inputOptions) -{ - inputOptions->resetTextureLayout(); -} - -NvttBoolean nvttSetInputOptionsMipmapData(NvttInputOptions * inputOptions, const void * data, int w, int h, int d, int face, int mipmap) -{ - return (NvttBoolean)inputOptions->setMipmapData(data, w, h, d, face, mipmap); -} - -void nvttSetInputOptionsFormat(NvttInputOptions * inputOptions, NvttInputFormat format) -{ - inputOptions->setFormat((nvtt::InputFormat)format); -} - -void nvttSetInputOptionsAlphaMode(NvttInputOptions * inputOptions, NvttAlphaMode alphaMode) -{ - inputOptions->setAlphaMode((nvtt::AlphaMode)alphaMode); -} - -void nvttSetInputOptionsGamma(NvttInputOptions * inputOptions, float inputGamma, float outputGamma) -{ - inputOptions->setGamma(inputGamma, outputGamma); -} - -void nvttSetInputOptionsWrapMode(NvttInputOptions * inputOptions, NvttWrapMode mode) -{ - inputOptions->setWrapMode((nvtt::WrapMode)mode); -} - -void nvttSetInputOptionsMipmapFilter(NvttInputOptions * inputOptions, NvttMipmapFilter filter) -{ - inputOptions->setMipmapFilter((nvtt::MipmapFilter)filter); -} - -void nvttSetInputOptionsMipmapGeneration(NvttInputOptions * inputOptions, NvttBoolean enabled, int maxLevel) -{ - inputOptions->setMipmapGeneration(enabled != NVTT_False, maxLevel); -} - -void nvttSetInputOptionsKaiserParameters(NvttInputOptions * inputOptions, float width, float alpha, float stretch) -{ - 
inputOptions->setKaiserParameters(width, alpha, stretch); -} - -void nvttSetInputOptionsNormalMap(NvttInputOptions * inputOptions, NvttBoolean b) -{ - inputOptions->setNormalMap(b != NVTT_False); -} - -void nvttSetInputOptionsConvertToNormalMap(NvttInputOptions * inputOptions, NvttBoolean convert) -{ - inputOptions->setConvertToNormalMap(convert != NVTT_False); -} - -void nvttSetInputOptionsHeightEvaluation(NvttInputOptions * inputOptions, float redScale, float greenScale, float blueScale, float alphaScale) -{ - inputOptions->setHeightEvaluation(redScale, greenScale, blueScale, alphaScale); -} - -void nvttSetInputOptionsNormalFilter(NvttInputOptions * inputOptions, float small, float medium, float big, float large) -{ - inputOptions->setNormalFilter(small, medium, big, large); -} - -void nvttSetInputOptionsNormalizeMipmaps(NvttInputOptions * inputOptions, NvttBoolean b) -{ - inputOptions->setNormalizeMipmaps(b != NVTT_False); -} - -void nvttSetInputOptionsColorTransform(NvttInputOptions * inputOptions, NvttColorTransform t) -{ - inputOptions->setColorTransform((nvtt::ColorTransform)t); -} - -void nvttSetInputOptionsLinearTransfrom(NvttInputOptions * inputOptions, int channel, float w0, float w1, float w2, float w3) -{ - inputOptions->setLinearTransform(channel, w0, w1, w2, w3); -} - -void nvttSetInputOptionsMaxExtents(NvttInputOptions * inputOptions, int dim) -{ - inputOptions->setMaxExtents(dim); -} - -void nvttSetInputOptionsRoundMode(NvttInputOptions * inputOptions, NvttRoundMode mode) -{ - inputOptions->setRoundMode((nvtt::RoundMode)mode); -} - - -// CompressionOptions class. -NvttCompressionOptions * nvttCreateCompressionOptions() -{ - return new nvtt::CompressionOptions(); -} - -void nvttDestroyCompressionOptions(NvttCompressionOptions * compressionOptions) -{ - delete compressionOptions; -} - -void nvttSetCompressionOptionsFormat(NvttCompressionOptions * compressionOptions, NvttFormat format) -{ - compressionOptions->setFormat((nvtt::Format)format); -} - -void nvttSetCompressionOptionsQuality(NvttCompressionOptions * compressionOptions, NvttQuality quality) -{ - compressionOptions->setQuality((nvtt::Quality)quality); -} - -void nvttSetCompressionOptionsColorWeights(NvttCompressionOptions * compressionOptions, float red, float green, float blue, float alpha) -{ - compressionOptions->setColorWeights(red, green, blue, alpha); -} - -/*void nvttEnableCompressionOptionsCudaCompression(NvttCompressionOptions * compressionOptions, NvttBoolean enable) -{ - compressionOptions->enableCudaCompression(enable != NVTT_False); -}*/ - -void nvttSetCompressionOptionsPixelFormat(NvttCompressionOptions * compressionOptions, unsigned int bitcount, unsigned int rmask, unsigned int gmask, unsigned int bmask, unsigned int amask) -{ - compressionOptions->setPixelFormat(bitcount, rmask, gmask, bmask, amask); -} - -void nvttSetCompressionOptionsQuantization(NvttCompressionOptions * compressionOptions, NvttBoolean colorDithering, NvttBoolean alphaDithering, NvttBoolean binaryAlpha, int alphaThreshold) -{ - compressionOptions->setQuantization(colorDithering != NVTT_False, alphaDithering != NVTT_False, binaryAlpha != NVTT_False, alphaThreshold); -} - - -// OutputOptions class. 
-NvttOutputOptions * nvttCreateOutputOptions() -{ - return new nvtt::OutputOptions(); -} - -void nvttDestroyOutputOptions(NvttOutputOptions * outputOptions) -{ - delete outputOptions; -} - -void nvttSetOutputOptionsFileName(NvttOutputOptions * outputOptions, const char * fileName) -{ - outputOptions->setFileName(fileName); -} - -void nvttSetOutputOptionsOutputHeader(NvttOutputOptions * outputOptions, NvttBoolean b) -{ - outputOptions->setOutputHeader(b != NVTT_False); -} -/* -void nvttSetOutputOptionsErrorHandler(NvttOutputOptions * outputOptions, nvttErrorHandler errorHandler) -{ - outputOptions->setErrorHandler(errorHandler); -} - -void nvttSetOutputOptionsOutputHandler(NvttOutputOptions * outputOptions, nvttOutputHandler outputHandler, nvttImageHandler imageHandler) -{ -} -*/ - - -// Compressor class. -NvttBoolean nvttCompress(const NvttCompressor * compressor, const NvttInputOptions * inputOptions, const NvttCompressionOptions * compressionOptions, const NvttOutputOptions * outputOptions) -{ - return (NvttBoolean)compressor->process(*inputOptions, *compressionOptions, *outputOptions); -} - -int nvttEstimateSize(const NvttCompressor * compressor, const NvttInputOptions * inputOptions, const NvttCompressionOptions * compressionOptions) -{ - return compressor->estimateSize(*inputOptions, *compressionOptions); -} - - -// Global functions. -const char * nvttErrorString(NvttError e) -{ - return nvtt::errorString((nvtt::Error)e); -} - -unsigned int nvttVersion() -{ - return nvtt::version(); -} +// Copyright (c) 2009-2011 Ignacio Castano +// Copyright (c) 2007-2009 NVIDIA Corporation -- Ignacio Castano +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "nvtt.h" +#include "nvtt_wrapper.h" + +#include "OutputOptions.h" + +// An OutputHandler that sets and calls function pointers, rather than +// requiring interfaces to derive from OutputHandler itself +struct HandlerProxy : public nvtt::OutputHandler +{ +public: + + HandlerProxy() {} + + nvttBeginImageHandler beginImageHandler; + nvttOutputHandler writeDataHandler; + nvttEndImageHandler endImageHandler; + + virtual void beginImage(int size, int width, int height, int depth, int face, int miplevel) + { + if (beginImageHandler != NULL) + { + beginImageHandler(size, width, height, depth, face, miplevel); + } + } + + + virtual bool writeData(const void * data, int size) + { + if (writeDataHandler != NULL) + { + return writeDataHandler(data, size); + } + return false; + } + + virtual void endImage() + { + if (endImageHandler != NULL) + { + endImageHandler(); + } + } +}; + + +// InputOptions class. +NvttInputOptions * nvttCreateInputOptions() +{ + return new nvtt::InputOptions(); +} + +void nvttDestroyInputOptions(NvttInputOptions * inputOptions) +{ + delete inputOptions; +} + +void nvttSetInputOptionsTextureLayout(NvttInputOptions * inputOptions, NvttTextureType type, int w, int h, int d) +{ + inputOptions->setTextureLayout((nvtt::TextureType)type, w, h, d); +} + +void nvttResetInputOptionsTextureLayout(NvttInputOptions * inputOptions) +{ + inputOptions->resetTextureLayout(); +} + +NvttBoolean nvttSetInputOptionsMipmapData(NvttInputOptions * inputOptions, const void * data, int w, int h, int d, int face, int mipmap) +{ + return (NvttBoolean)inputOptions->setMipmapData(data, w, h, d, face, mipmap); +} + +void nvttSetInputOptionsFormat(NvttInputOptions * inputOptions, NvttInputFormat format) +{ + inputOptions->setFormat((nvtt::InputFormat)format); +} + +void nvttSetInputOptionsAlphaMode(NvttInputOptions * inputOptions, NvttAlphaMode alphaMode) +{ + inputOptions->setAlphaMode((nvtt::AlphaMode)alphaMode); +} + +void nvttSetInputOptionsGamma(NvttInputOptions * inputOptions, float inputGamma, float outputGamma) +{ + inputOptions->setGamma(inputGamma, outputGamma); +} + +void nvttSetInputOptionsWrapMode(NvttInputOptions * inputOptions, NvttWrapMode mode) +{ + inputOptions->setWrapMode((nvtt::WrapMode)mode); +} + +void nvttSetInputOptionsMipmapFilter(NvttInputOptions * inputOptions, NvttMipmapFilter filter) +{ + inputOptions->setMipmapFilter((nvtt::MipmapFilter)filter); +} + +void nvttSetInputOptionsMipmapGeneration(NvttInputOptions * inputOptions, NvttBoolean enabled, int maxLevel) +{ + inputOptions->setMipmapGeneration(enabled != NVTT_False, maxLevel); +} + +void nvttSetInputOptionsKaiserParameters(NvttInputOptions * inputOptions, float width, float alpha, float stretch) +{ + inputOptions->setKaiserParameters(width, alpha, stretch); +} + +void nvttSetInputOptionsNormalMap(NvttInputOptions * inputOptions, NvttBoolean b) +{ + inputOptions->setNormalMap(b != NVTT_False); +} + +void nvttSetInputOptionsConvertToNormalMap(NvttInputOptions * inputOptions, NvttBoolean convert) +{ + inputOptions->setConvertToNormalMap(convert != NVTT_False); +} + +void nvttSetInputOptionsHeightEvaluation(NvttInputOptions * inputOptions, float redScale, float greenScale, float blueScale, float alphaScale) +{ + inputOptions->setHeightEvaluation(redScale, greenScale, blueScale, alphaScale); +} + +void nvttSetInputOptionsNormalFilter(NvttInputOptions * inputOptions, float small, float medium, float big, float large) +{ + inputOptions->setNormalFilter(small, medium, big, large); +} + +void 
nvttSetInputOptionsNormalizeMipmaps(NvttInputOptions * inputOptions, NvttBoolean b) +{ + inputOptions->setNormalizeMipmaps(b != NVTT_False); +} + +void nvttSetInputOptionsMaxExtents(NvttInputOptions * inputOptions, int dim) +{ + inputOptions->setMaxExtents(dim); +} + +void nvttSetInputOptionsRoundMode(NvttInputOptions * inputOptions, NvttRoundMode mode) +{ + inputOptions->setRoundMode((nvtt::RoundMode)mode); +} + + +// CompressionOptions class. +NvttCompressionOptions * nvttCreateCompressionOptions() +{ + return new nvtt::CompressionOptions(); +} + +void nvttDestroyCompressionOptions(NvttCompressionOptions * compressionOptions) +{ + delete compressionOptions; +} + +void nvttSetCompressionOptionsFormat(NvttCompressionOptions * compressionOptions, NvttFormat format) +{ + compressionOptions->setFormat((nvtt::Format)format); +} + +void nvttSetCompressionOptionsQuality(NvttCompressionOptions * compressionOptions, NvttQuality quality) +{ + compressionOptions->setQuality((nvtt::Quality)quality); +} + +void nvttSetCompressionOptionsColorWeights(NvttCompressionOptions * compressionOptions, float red, float green, float blue, float alpha) +{ + compressionOptions->setColorWeights(red, green, blue, alpha); +} + +/*void nvttEnableCompressionOptionsCudaCompression(NvttCompressionOptions * compressionOptions, NvttBoolean enable) +{ +compressionOptions->enableCudaCompression(enable != NVTT_False); +}*/ + +void nvttSetCompressionOptionsPixelFormat(NvttCompressionOptions * compressionOptions, unsigned int bitcount, unsigned int rmask, unsigned int gmask, unsigned int bmask, unsigned int amask) +{ + compressionOptions->setPixelFormat(bitcount, rmask, gmask, bmask, amask); +} + +void nvttSetCompressionOptionsQuantization(NvttCompressionOptions * compressionOptions, NvttBoolean colorDithering, NvttBoolean alphaDithering, NvttBoolean binaryAlpha, int alphaThreshold) +{ + compressionOptions->setQuantization(colorDithering != NVTT_False, alphaDithering != NVTT_False, binaryAlpha != NVTT_False, alphaThreshold); +} + + +// OutputOptions class. 
+NvttOutputOptions * nvttCreateOutputOptions() +{ + nvtt::OutputOptions * outputOptions = new nvtt::OutputOptions(); + HandlerProxy * handlerProxy = new HandlerProxy(); + + outputOptions->m.wrapperProxy = handlerProxy; + + return outputOptions; +} + +void nvttDestroyOutputOptions(NvttOutputOptions * outputOptions) +{ + HandlerProxy * handlerProxy = (HandlerProxy *)outputOptions->m.wrapperProxy; + delete handlerProxy; + delete outputOptions; +} + +void nvttSetOutputOptionsFileName(NvttOutputOptions * outputOptions, const char * fileName) +{ + outputOptions->setFileName(fileName); +} + +void nvttSetOutputOptionsOutputHeader(NvttOutputOptions * outputOptions, NvttBoolean b) +{ + outputOptions->setOutputHeader(b != NVTT_False); +} +/* +void nvttSetOutputOptionsErrorHandler(NvttOutputOptions * outputOptions, nvttErrorHandler errorHandler) +{ + outputOptions->setErrorHandler(errorHandler); +} +*/ + +void nvttSetOutputOptionsOutputHandler(NvttOutputOptions * outputOptions, nvttBeginImageHandler beginImageHandler, nvttOutputHandler writeDataHandler, nvttEndImageHandler endImageHandler) +{ + HandlerProxy * handler = (HandlerProxy *)outputOptions->m.wrapperProxy; + + handler->beginImageHandler = beginImageHandler; + handler->writeDataHandler = writeDataHandler; + handler->endImageHandler = endImageHandler; + + if(beginImageHandler == NULL && writeDataHandler == NULL && endImageHandler == NULL) + { + outputOptions->setOutputHandler(NULL); + } + else + { + outputOptions->setOutputHandler(handler); + } +} + + +// Compressor class. +NvttCompressor * nvttCreateCompressor() +{ + return new nvtt::Compressor(); +} + +void nvttDestroyCompressor(NvttCompressor * compressor) +{ + delete compressor; +} + +NvttBoolean nvttCompress(const NvttCompressor * compressor, const NvttInputOptions * inputOptions, const NvttCompressionOptions * compressionOptions, const NvttOutputOptions * outputOptions) +{ + return (NvttBoolean)compressor->process(*inputOptions, *compressionOptions, *outputOptions); +} + +int nvttEstimateSize(const NvttCompressor * compressor, const NvttInputOptions * inputOptions, const NvttCompressionOptions * compressionOptions) +{ + return compressor->estimateSize(*inputOptions, *compressionOptions); +} + + +// Global functions. 
+const char * nvttErrorString(NvttError e) +{ + return nvtt::errorString((nvtt::Error)e); +} + +unsigned int nvttVersion() +{ + return nvtt::version(); +} Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/CMakeLists.txt =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/CMakeLists.txt +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/CMakeLists.txt @@ -22,11 +22,14 @@ ADD_LIBRARY(squish STATIC ${SQUISH_SRCS}) -IF("${CMAKE_CXX_COMPILER}" MATCHES "clang(\\+\\+)?$" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - SET(CMAKE_COMPILER_IS_CLANGXX 1) -ENDIF() - -IF(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) - SET_TARGET_PROPERTIES(squish PROPERTIES COMPILE_FLAGS -fPIC) -ENDIF(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) +IF(NOT WIN32) + + IF("${CMAKE_CXX_COMPILER}" MATCHES "clang(\\+\\+)?$" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + SET(CMAKE_COMPILER_IS_CLANGXX 1) + ENDIF() + + IF(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) + SET_TARGET_PROPERTIES(squish PROPERTIES COMPILE_FLAGS -fPIC) + ENDIF(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) +ENDIF(NOT WIN32) Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/alpha.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/alpha.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/alpha.h @@ -26,7 +26,7 @@ #ifndef SQUISH_ALPHA_H #define SQUISH_ALPHA_H -#include +#include "squish.h" namespace squish { Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/clusterfit.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/clusterfit.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/clusterfit.h @@ -23,15 +23,15 @@ -------------------------------------------------------------------------- */ -#ifndef SQUISH_CLUSTERFIT_H -#define SQUISH_CLUSTERFIT_H +#ifndef NV_SQUISH_CLUSTERFIT_H +#define NV_SQUISH_CLUSTERFIT_H #include "squish.h" #include "maths.h" #include "simd.h" #include "colourfit.h" -namespace squish { +namespace nvsquish { class ClusterFit : public ColourFit { Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/clusterfit.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/clusterfit.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/clusterfit.cpp @@ -28,7 +28,7 @@ #include "colourblock.h" #include -namespace squish { +namespace nvsquish { ClusterFit::ClusterFit() { @@ -109,7 +109,7 @@ float ClusterFit::GetBestError() const { #if SQUISH_USE_SIMD - return m_besterror.GetVec3().X(); + return m_besterror.GetX(); #else return m_besterror; #endif @@ -280,15 +280,6 @@ m_beta[k] = m_weights[k]; } - /*unsigned int permutation = 0; - for(int p = 0; p < 16; p++) { - permutation |= indices[p] << (p * 2); - } - if (debug) printf("%X:\t", permutation); - - if (debug && permutation == 0x55FFFFAA) __debugbreak(); - */ - // solve a least squares problem to place the endpoints #if SQUISH_USE_SIMD Vec4 start, end; @@ -392,8 +383,7 @@ // clamp to the grid Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f ); -// Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); - Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f ); // IC: use approximate grid fitting. 
+ Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); Vec4 const onethird = VEC4_CONST( 1.0f/3.0f ); Vec4 const twothirds = VEC4_CONST( 2.0f/3.0f ); a = Truncate( MultiplyAdd( grid, a, half ) )*gridrcp; @@ -468,8 +458,7 @@ // clamp to the grid Vec3 const grid( 31.0f, 63.0f, 31.0f ); - //Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f ); - Vec3 const gridrcp(0.03227752766457f, 0.01583151765563f, 0.03227752766457f); // IC: use approximate grid fitting. + Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f ); Vec3 const half( 0.5f ); a = Floor( grid*a + half )*gridrcp; b = Floor( grid*b + half )*gridrcp; Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourblock.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourblock.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourblock.h @@ -23,13 +23,13 @@ -------------------------------------------------------------------------- */ -#ifndef SQUISH_COLOURBLOCK_H -#define SQUISH_COLOURBLOCK_H +#ifndef NV_SQUISH_COLOURBLOCK_H +#define NV_SQUISH_COLOURBLOCK_H #include "squish.h" #include "maths.h" -namespace squish { +namespace nvsquish { void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block ); void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block ); Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourblock.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourblock.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourblock.cpp @@ -25,7 +25,7 @@ #include "colourblock.h" -namespace squish { +namespace nvsquish { static int FloatToInt( float a, int limit ) { Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourfit.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourfit.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourfit.h @@ -23,13 +23,13 @@ -------------------------------------------------------------------------- */ -#ifndef SQUISH_COLOURFIT_H -#define SQUISH_COLOURFIT_H +#ifndef NV_SQUISH_COLOURFIT_H +#define NV_SQUISH_COLOURFIT_H #include "squish.h" #include "maths.h" -namespace squish { +namespace nvsquish { class ColourSet; Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourfit.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourfit.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourfit.cpp @@ -22,11 +22,11 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------- */ - + #include "colourfit.h" #include "colourset.h" -namespace squish { +namespace nvsquish { ColourFit::ColourFit() { Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourset.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourset.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourset.h @@ -23,21 +23,21 @@ -------------------------------------------------------------------------- */ -#ifndef SQUISH_COLOURSET_H -#define SQUISH_COLOURSET_H +#ifndef NV_SQUISH_COLOURSET_H +#define NV_SQUISH_COLOURSET_H #include "squish.h" #include "maths.h" #include "simd.h" -namespace squish { +namespace nvsquish { /*! 
@brief Represents a set of block colours */ class ColourSet { public: - ColourSet( u8 const* rgba, int flags, bool createMinimalSet = false ); + ColourSet( u8 const* rgba, int flags, bool createMinimalSet = true ); int GetCount() const { return m_count; } Vec3 const* GetPoints() const { return m_points; } Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourset.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourset.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/colourset.cpp @@ -25,7 +25,7 @@ #include "colourset.h" -namespace squish { +namespace nvsquish { // @@ Add flags: // - MatchTransparent Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/extra/squishgen2.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/extra/squishgen2.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/extra/squishgen2.cpp @@ -0,0 +1,113 @@ +/* ----------------------------------------------------------------------------- + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + Copyright (c) 2008 Ignacio Castano castano@gmail.com + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ + +#include +#include +#include + +struct Precomp { + float alpha2_sum; + float beta2_sum; + float alphabeta_sum; + float factor; +}; + + +int main() +{ + int i = 0; + + printf("struct Precomp {\n"); + printf("\tfloat alpha2_sum;\n"); + printf("\tfloat beta2_sum;\n"); + printf("\tfloat alphabeta_sum;\n"); + printf("\tfloat factor;\n"); + printf("};\n\n"); + + printf("static const SQUISH_ALIGN_16 Precomp s_threeElement[153] = {\n"); + + // Three element clusters: + for( int c0 = 0; c0 <= 16; c0++) // At least two clusters. 
+ { + for( int c1 = 0; c1 <= 16-c0; c1++) + { + int c2 = 16 - c0 - c1; + + Precomp p; + p.alpha2_sum = c0 + c1 * 0.25f; + p.beta2_sum = c2 + c1 * 0.25f; + p.alphabeta_sum = c1 * 0.25f; + p.factor = 1.0f / (p.alpha2_sum * p.beta2_sum - p.alphabeta_sum * p.alphabeta_sum); + + if (isfinite(p.factor)) + { + printf("\t{ %ff, %ff, %ff, %ff }, // %d (%d %d %d)\n", p.alpha2_sum, p.beta2_sum, p.alphabeta_sum, p.factor, i, c0, c1, c2); + } + else + { + printf("\t{ %ff, %ff, %ff, FLT_MAX }, // %d (%d %d %d)\n", p.alpha2_sum, p.beta2_sum, p.alphabeta_sum, i, c0, c1, c2); + } + + i++; + } + } + printf("}; // %d three cluster elements\n\n", i); + + printf("static const SQUISH_ALIGN_16 Precomp s_fourElement[969] = {\n"); + + // Four element clusters: + i = 0; + for( int c0 = 0; c0 <= 16; c0++) + { + for( int c1 = 0; c1 <= 16-c0; c1++) + { + for( int c2 = 0; c2 <= 16-c0-c1; c2++) + { + int c3 = 16 - c0 - c1 - c2; + + Precomp p; + p.alpha2_sum = c0 + c1 * (4.0f/9.0f) + c2 * (1.0f/9.0f); + p.beta2_sum = c3 + c2 * (4.0f/9.0f) + c1 * (1.0f/9.0f); + p.alphabeta_sum = (c1 + c2) * (2.0f/9.0f); + p.factor = 1.0f / (p.alpha2_sum * p.beta2_sum - p.alphabeta_sum * p.alphabeta_sum); + + if (isfinite(p.factor)) + { + printf("\t{ %ff, %ff, %ff, %ff }, // %d (%d %d %d %d)\n", p.alpha2_sum, p.beta2_sum, p.alphabeta_sum, p.factor, i, c0, c1, c2, c3); + } + else + { + printf("\t{ %ff, %ff, %ff, FLT_MAX }, // %d (%d %d %d %d)\n", p.alpha2_sum, p.beta2_sum, p.alphabeta_sum, i, c0, c1, c2, c3); + } + + i++; + } + } + } + printf("}; // %d four cluster elements\n\n", i); + + return 0; +} Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/fastclusterfit.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/fastclusterfit.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/fastclusterfit.h @@ -24,15 +24,15 @@ -------------------------------------------------------------------------- */ -#ifndef SQUISH_FASTCLUSTERFIT_H -#define SQUISH_FASTCLUSTERFIT_H +#ifndef NV_SQUISH_FASTCLUSTERFIT_H +#define NV_SQUISH_FASTCLUSTERFIT_H #include "squish.h" #include "maths.h" #include "simd.h" #include "colourfit.h" -namespace squish { +namespace nvsquish { class FastClusterFit : public ColourFit { @@ -53,14 +53,14 @@ Vec3 m_principle; #if SQUISH_USE_SIMD - Vec4 m_unweighted[16]; + Vec4 m_unweighted[17]; Vec4 m_metric; Vec4 m_metricSqr; Vec4 m_xxsum; Vec4 m_xsum; Vec4 m_besterror; #else - Vec3 m_unweighted[16]; + Vec3 m_unweighted[17]; Vec3 m_metric; Vec3 m_metricSqr; Vec3 m_xxsum; Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/fastclusterfit.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/fastclusterfit.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/fastclusterfit.cpp @@ -31,7 +31,7 @@ #include "fastclusterlookup.inl" -namespace squish { +namespace nvsquish { FastClusterFit::FastClusterFit() { @@ -129,6 +129,8 @@ Vec4 const zero = VEC4_CONST(0.0f); Vec4 const half = VEC4_CONST(0.5f); Vec4 const two = VEC4_CONST(2.0); + Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f ); + Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); // declare variables Vec4 beststart = VEC4_CONST( 0.0f ); @@ -160,25 +162,22 @@ Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor; Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor; - // clamp the output to [0, 1] + // clamp to the grid a = Min( one, Max( 
zero, a ) ); b = Min( one, Max( zero, b ) ); - - // clamp to the grid - Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f ); - Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f ); a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp; b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp; - // compute the error - Vec4 e1 = MultiplyAdd( a, alphax_sum, b*betax_sum ); - Vec4 e2 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); - Vec4 e3 = MultiplyAdd( a*b*alphabeta_sum - e1, two, e2 ); - + // compute the error (we skip the constant xxsum) + Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); + Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum ); + Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 ); + Vec4 e4 = MultiplyAdd( two, e3, e1 ); + // apply the metric to the error term - Vec4 e4 = e3 * m_metricSqr; - Vec4 error = e4.SplatX() + e4.SplatY() + e4.SplatZ(); - + Vec4 e5 = e4 * m_metricSqr; + Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ(); + // keep the solution if it wins if( CompareAnyLessThan( error, besterror ) ) { @@ -274,7 +273,7 @@ Vec4 const factor = constants.SplatW(); i++; - Vec4 const alphax_sum = x0 + MultiplyAdd(x1, twothirds, x2 * onethird); + Vec4 const alphax_sum = MultiplyAdd(x2, onethird, MultiplyAdd(x1, twothirds, x0)); Vec4 const betax_sum = m_xsum - alphax_sum; Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor; @@ -286,18 +285,19 @@ // clamp to the grid Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f ); - Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f ); + Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp; b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp; - // compute the error - Vec4 e1 = MultiplyAdd( a, alphax_sum, b*betax_sum ); - Vec4 e2 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); - Vec4 e3 = MultiplyAdd( a*b*alphabeta_sum - e1, two, e2 ); - + // compute the error (we skip the constant xxsum) + Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); + Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum ); + Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 ); + Vec4 e4 = MultiplyAdd( two, e3, e1 ); + // apply the metric to the error term - Vec4 e4 = e3 * m_metricSqr; - Vec4 error = e4.SplatX() + e4.SplatY() + e4.SplatZ(); + Vec4 e5 = e4 * m_metricSqr; + Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ(); // keep the solution if it wins if( CompareAnyLessThan( error, besterror ) ) @@ -370,6 +370,12 @@ void FastClusterFit::Compress3( void* block ) { + Vec3 const one( 1.0f ); + Vec3 const zero( 0.0f ); + Vec3 const half( 0.5f ); + Vec3 const grid( 31.0f, 63.0f, 31.0f ); + Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f ); + // declare variables Vec3 beststart( 0.0f ); Vec3 bestend( 0.0f ); @@ -399,16 +405,9 @@ Vec3 a = (alphax_sum*beta2_sum - betax_sum*alphabeta_sum) * factor; Vec3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor; - // clamp the output to [0, 1] - Vec3 const one( 1.0f ); - Vec3 const zero( 0.0f ); + // clamp to the grid a = Min( one, Max( zero, a ) ); b = Min( one, Max( zero, b ) ); - - // clamp to the grid - Vec3 const grid( 31.0f, 63.0f, 31.0f ); - Vec3 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f ); - Vec3 const half( 0.5f ); a = Floor( grid*a + half )*gridrcp; b = Floor( grid*b + half )*gridrcp; @@ -477,6 +476,12 @@ void FastClusterFit::Compress4( void* block ) { + Vec3 
const one( 1.0f ); + Vec3 const zero( 0.0f ); + Vec3 const half( 0.5f ); + Vec3 const grid( 31.0f, 63.0f, 31.0f ); + Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f ); + // declare variables Vec3 beststart( 0.0f ); Vec3 bestend( 0.0f ); @@ -511,16 +516,9 @@ Vec3 a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor; Vec3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor; - // clamp the output to [0, 1] - Vec3 const one( 1.0f ); - Vec3 const zero( 0.0f ); + // clamp to the grid a = Min( one, Max( zero, a ) ); b = Min( one, Max( zero, b ) ); - - // clamp to the grid - Vec3 const grid( 31.0f, 63.0f, 31.0f ); - Vec3 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f ); - Vec3 const half( 0.5f ); a = Floor( grid*a + half )*gridrcp; b = Floor( grid*b + half )*gridrcp; Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/fastclusterlookup.inl =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/fastclusterlookup.inl +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/fastclusterlookup.inl @@ -1,1135 +1,1135 @@ -struct Precomp { - float alpha2_sum; - float beta2_sum; - float alphabeta_sum; - float factor; -}; - -static const SQUISH_ALIGN_16 Precomp s_threeElement[153] = { - { 0.000000f, 16.000000f, 0.000000f, FLT_MAX }, // 0 (0 0 16) - { 0.250000f, 15.250000f, 0.250000f, 0.266667f }, // 1 (0 1 15) - { 0.500000f, 14.500000f, 0.500000f, 0.142857f }, // 2 (0 2 14) - { 0.750000f, 13.750000f, 0.750000f, 0.102564f }, // 3 (0 3 13) - { 1.000000f, 13.000000f, 1.000000f, 0.083333f }, // 4 (0 4 12) - { 1.250000f, 12.250000f, 1.250000f, 0.072727f }, // 5 (0 5 11) - { 1.500000f, 11.500000f, 1.500000f, 0.066667f }, // 6 (0 6 10) - { 1.750000f, 10.750000f, 1.750000f, 0.063492f }, // 7 (0 7 9) - { 2.000000f, 10.000000f, 2.000000f, 0.062500f }, // 8 (0 8 8) - { 2.250000f, 9.250000f, 2.250000f, 0.063492f }, // 9 (0 9 7) - { 2.500000f, 8.500000f, 2.500000f, 0.066667f }, // 10 (0 10 6) - { 2.750000f, 7.750000f, 2.750000f, 0.072727f }, // 11 (0 11 5) - { 3.000000f, 7.000000f, 3.000000f, 0.083333f }, // 12 (0 12 4) - { 3.250000f, 6.250000f, 3.250000f, 0.102564f }, // 13 (0 13 3) - { 3.500000f, 5.500000f, 3.500000f, 0.142857f }, // 14 (0 14 2) - { 3.750000f, 4.750000f, 3.750000f, 0.266667f }, // 15 (0 15 1) - { 4.000000f, 4.000000f, 4.000000f, FLT_MAX }, // 16 (0 16 0) - { 1.000000f, 15.000000f, 0.000000f, 0.066667f }, // 17 (1 0 15) - { 1.250000f, 14.250000f, 0.250000f, 0.056338f }, // 18 (1 1 14) - { 1.500000f, 13.500000f, 0.500000f, 0.050000f }, // 19 (1 2 13) - { 1.750000f, 12.750000f, 0.750000f, 0.045977f }, // 20 (1 3 12) - { 2.000000f, 12.000000f, 1.000000f, 0.043478f }, // 21 (1 4 11) - { 2.250000f, 11.250000f, 1.250000f, 0.042105f }, // 22 (1 5 10) - { 2.500000f, 10.500000f, 1.500000f, 0.041667f }, // 23 (1 6 9) - { 2.750000f, 9.750000f, 1.750000f, 0.042105f }, // 24 (1 7 8) - { 3.000000f, 9.000000f, 2.000000f, 0.043478f }, // 25 (1 8 7) - { 3.250000f, 8.250000f, 2.250000f, 0.045977f }, // 26 (1 9 6) - { 3.500000f, 7.500000f, 2.500000f, 0.050000f }, // 27 (1 10 5) - { 3.750000f, 6.750000f, 2.750000f, 0.056338f }, // 28 (1 11 4) - { 4.000000f, 6.000000f, 3.000000f, 0.066667f }, // 29 (1 12 3) - { 4.250000f, 5.250000f, 3.250000f, 0.085106f }, // 30 (1 13 2) - { 4.500000f, 4.500000f, 3.500000f, 0.125000f }, // 31 (1 14 1) - { 4.750000f, 3.750000f, 3.750000f, 0.266667f }, // 32 (1 15 0) - { 2.000000f, 14.000000f, 0.000000f, 0.035714f }, // 33 (2 0 14) - { 2.250000f, 13.250000f, 0.250000f, 
0.033613f }, // 34 (2 1 13) - { 2.500000f, 12.500000f, 0.500000f, 0.032258f }, // 35 (2 2 12) - { 2.750000f, 11.750000f, 0.750000f, 0.031496f }, // 36 (2 3 11) - { 3.000000f, 11.000000f, 1.000000f, 0.031250f }, // 37 (2 4 10) - { 3.250000f, 10.250000f, 1.250000f, 0.031496f }, // 38 (2 5 9) - { 3.500000f, 9.500000f, 1.500000f, 0.032258f }, // 39 (2 6 8) - { 3.750000f, 8.750000f, 1.750000f, 0.033613f }, // 40 (2 7 7) - { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 41 (2 8 6) - { 4.250000f, 7.250000f, 2.250000f, 0.038835f }, // 42 (2 9 5) - { 4.500000f, 6.500000f, 2.500000f, 0.043478f }, // 43 (2 10 4) - { 4.750000f, 5.750000f, 2.750000f, 0.050633f }, // 44 (2 11 3) - { 5.000000f, 5.000000f, 3.000000f, 0.062500f }, // 45 (2 12 2) - { 5.250000f, 4.250000f, 3.250000f, 0.085106f }, // 46 (2 13 1) - { 5.500000f, 3.500000f, 3.500000f, 0.142857f }, // 47 (2 14 0) - { 3.000000f, 13.000000f, 0.000000f, 0.025641f }, // 48 (3 0 13) - { 3.250000f, 12.250000f, 0.250000f, 0.025157f }, // 49 (3 1 12) - { 3.500000f, 11.500000f, 0.500000f, 0.025000f }, // 50 (3 2 11) - { 3.750000f, 10.750000f, 0.750000f, 0.025157f }, // 51 (3 3 10) - { 4.000000f, 10.000000f, 1.000000f, 0.025641f }, // 52 (3 4 9) - { 4.250000f, 9.250000f, 1.250000f, 0.026490f }, // 53 (3 5 8) - { 4.500000f, 8.500000f, 1.500000f, 0.027778f }, // 54 (3 6 7) - { 4.750000f, 7.750000f, 1.750000f, 0.029630f }, // 55 (3 7 6) - { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 56 (3 8 5) - { 5.250000f, 6.250000f, 2.250000f, 0.036036f }, // 57 (3 9 4) - { 5.500000f, 5.500000f, 2.500000f, 0.041667f }, // 58 (3 10 3) - { 5.750000f, 4.750000f, 2.750000f, 0.050633f }, // 59 (3 11 2) - { 6.000000f, 4.000000f, 3.000000f, 0.066667f }, // 60 (3 12 1) - { 6.250000f, 3.250000f, 3.250000f, 0.102564f }, // 61 (3 13 0) - { 4.000000f, 12.000000f, 0.000000f, 0.020833f }, // 62 (4 0 12) - { 4.250000f, 11.250000f, 0.250000f, 0.020942f }, // 63 (4 1 11) - { 4.500000f, 10.500000f, 0.500000f, 0.021277f }, // 64 (4 2 10) - { 4.750000f, 9.750000f, 0.750000f, 0.021858f }, // 65 (4 3 9) - { 5.000000f, 9.000000f, 1.000000f, 0.022727f }, // 66 (4 4 8) - { 5.250000f, 8.250000f, 1.250000f, 0.023952f }, // 67 (4 5 7) - { 5.500000f, 7.500000f, 1.500000f, 0.025641f }, // 68 (4 6 6) - { 5.750000f, 6.750000f, 1.750000f, 0.027972f }, // 69 (4 7 5) - { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 70 (4 8 4) - { 6.250000f, 5.250000f, 2.250000f, 0.036036f }, // 71 (4 9 3) - { 6.500000f, 4.500000f, 2.500000f, 0.043478f }, // 72 (4 10 2) - { 6.750000f, 3.750000f, 2.750000f, 0.056338f }, // 73 (4 11 1) - { 7.000000f, 3.000000f, 3.000000f, 0.083333f }, // 74 (4 12 0) - { 5.000000f, 11.000000f, 0.000000f, 0.018182f }, // 75 (5 0 11) - { 5.250000f, 10.250000f, 0.250000f, 0.018605f }, // 76 (5 1 10) - { 5.500000f, 9.500000f, 0.500000f, 0.019231f }, // 77 (5 2 9) - { 5.750000f, 8.750000f, 0.750000f, 0.020101f }, // 78 (5 3 8) - { 6.000000f, 8.000000f, 1.000000f, 0.021277f }, // 79 (5 4 7) - { 6.250000f, 7.250000f, 1.250000f, 0.022857f }, // 80 (5 5 6) - { 6.500000f, 6.500000f, 1.500000f, 0.025000f }, // 81 (5 6 5) - { 6.750000f, 5.750000f, 1.750000f, 0.027972f }, // 82 (5 7 4) - { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 83 (5 8 3) - { 7.250000f, 4.250000f, 2.250000f, 0.038835f }, // 84 (5 9 2) - { 7.500000f, 3.500000f, 2.500000f, 0.050000f }, // 85 (5 10 1) - { 7.750000f, 2.750000f, 2.750000f, 0.072727f }, // 86 (5 11 0) - { 6.000000f, 10.000000f, 0.000000f, 0.016667f }, // 87 (6 0 10) - { 6.250000f, 9.250000f, 0.250000f, 0.017316f }, // 88 (6 1 9) - { 6.500000f, 8.500000f, 
0.500000f, 0.018182f }, // 89 (6 2 8) - { 6.750000f, 7.750000f, 0.750000f, 0.019324f }, // 90 (6 3 7) - { 7.000000f, 7.000000f, 1.000000f, 0.020833f }, // 91 (6 4 6) - { 7.250000f, 6.250000f, 1.250000f, 0.022857f }, // 92 (6 5 5) - { 7.500000f, 5.500000f, 1.500000f, 0.025641f }, // 93 (6 6 4) - { 7.750000f, 4.750000f, 1.750000f, 0.029630f }, // 94 (6 7 3) - { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 95 (6 8 2) - { 8.250000f, 3.250000f, 2.250000f, 0.045977f }, // 96 (6 9 1) - { 8.500000f, 2.500000f, 2.500000f, 0.066667f }, // 97 (6 10 0) - { 7.000000f, 9.000000f, 0.000000f, 0.015873f }, // 98 (7 0 9) - { 7.250000f, 8.250000f, 0.250000f, 0.016736f }, // 99 (7 1 8) - { 7.500000f, 7.500000f, 0.500000f, 0.017857f }, // 100 (7 2 7) - { 7.750000f, 6.750000f, 0.750000f, 0.019324f }, // 101 (7 3 6) - { 8.000000f, 6.000000f, 1.000000f, 0.021277f }, // 102 (7 4 5) - { 8.250000f, 5.250000f, 1.250000f, 0.023952f }, // 103 (7 5 4) - { 8.500000f, 4.500000f, 1.500000f, 0.027778f }, // 104 (7 6 3) - { 8.750000f, 3.750000f, 1.750000f, 0.033613f }, // 105 (7 7 2) - { 9.000000f, 3.000000f, 2.000000f, 0.043478f }, // 106 (7 8 1) - { 9.250000f, 2.250000f, 2.250000f, 0.063492f }, // 107 (7 9 0) - { 8.000000f, 8.000000f, 0.000000f, 0.015625f }, // 108 (8 0 8) - { 8.250000f, 7.250000f, 0.250000f, 0.016736f }, // 109 (8 1 7) - { 8.500000f, 6.500000f, 0.500000f, 0.018182f }, // 110 (8 2 6) - { 8.750000f, 5.750000f, 0.750000f, 0.020101f }, // 111 (8 3 5) - { 9.000000f, 5.000000f, 1.000000f, 0.022727f }, // 112 (8 4 4) - { 9.250000f, 4.250000f, 1.250000f, 0.026490f }, // 113 (8 5 3) - { 9.500000f, 3.500000f, 1.500000f, 0.032258f }, // 114 (8 6 2) - { 9.750000f, 2.750000f, 1.750000f, 0.042105f }, // 115 (8 7 1) - { 10.000000f, 2.000000f, 2.000000f, 0.062500f }, // 116 (8 8 0) - { 9.000000f, 7.000000f, 0.000000f, 0.015873f }, // 117 (9 0 7) - { 9.250000f, 6.250000f, 0.250000f, 0.017316f }, // 118 (9 1 6) - { 9.500000f, 5.500000f, 0.500000f, 0.019231f }, // 119 (9 2 5) - { 9.750000f, 4.750000f, 0.750000f, 0.021858f }, // 120 (9 3 4) - { 10.000000f, 4.000000f, 1.000000f, 0.025641f }, // 121 (9 4 3) - { 10.250000f, 3.250000f, 1.250000f, 0.031496f }, // 122 (9 5 2) - { 10.500000f, 2.500000f, 1.500000f, 0.041667f }, // 123 (9 6 1) - { 10.750000f, 1.750000f, 1.750000f, 0.063492f }, // 124 (9 7 0) - { 10.000000f, 6.000000f, 0.000000f, 0.016667f }, // 125 (10 0 6) - { 10.250000f, 5.250000f, 0.250000f, 0.018605f }, // 126 (10 1 5) - { 10.500000f, 4.500000f, 0.500000f, 0.021277f }, // 127 (10 2 4) - { 10.750000f, 3.750000f, 0.750000f, 0.025157f }, // 128 (10 3 3) - { 11.000000f, 3.000000f, 1.000000f, 0.031250f }, // 129 (10 4 2) - { 11.250000f, 2.250000f, 1.250000f, 0.042105f }, // 130 (10 5 1) - { 11.500000f, 1.500000f, 1.500000f, 0.066667f }, // 131 (10 6 0) - { 11.000000f, 5.000000f, 0.000000f, 0.018182f }, // 132 (11 0 5) - { 11.250000f, 4.250000f, 0.250000f, 0.020942f }, // 133 (11 1 4) - { 11.500000f, 3.500000f, 0.500000f, 0.025000f }, // 134 (11 2 3) - { 11.750000f, 2.750000f, 0.750000f, 0.031496f }, // 135 (11 3 2) - { 12.000000f, 2.000000f, 1.000000f, 0.043478f }, // 136 (11 4 1) - { 12.250000f, 1.250000f, 1.250000f, 0.072727f }, // 137 (11 5 0) - { 12.000000f, 4.000000f, 0.000000f, 0.020833f }, // 138 (12 0 4) - { 12.250000f, 3.250000f, 0.250000f, 0.025157f }, // 139 (12 1 3) - { 12.500000f, 2.500000f, 0.500000f, 0.032258f }, // 140 (12 2 2) - { 12.750000f, 1.750000f, 0.750000f, 0.045977f }, // 141 (12 3 1) - { 13.000000f, 1.000000f, 1.000000f, 0.083333f }, // 142 (12 4 0) - { 13.000000f, 3.000000f, 0.000000f, 
0.025641f }, // 143 (13 0 3) - { 13.250000f, 2.250000f, 0.250000f, 0.033613f }, // 144 (13 1 2) - { 13.500000f, 1.500000f, 0.500000f, 0.050000f }, // 145 (13 2 1) - { 13.750000f, 0.750000f, 0.750000f, 0.102564f }, // 146 (13 3 0) - { 14.000000f, 2.000000f, 0.000000f, 0.035714f }, // 147 (14 0 2) - { 14.250000f, 1.250000f, 0.250000f, 0.056338f }, // 148 (14 1 1) - { 14.500000f, 0.500000f, 0.500000f, 0.142857f }, // 149 (14 2 0) - { 15.000000f, 1.000000f, 0.000000f, 0.066667f }, // 150 (15 0 1) - { 15.250000f, 0.250000f, 0.250000f, 0.266667f }, // 151 (15 1 0) - { 16.000000f, 0.000000f, 0.000000f, FLT_MAX }, // 152 (16 0 0) -}; // 153 three cluster elements - -static const SQUISH_ALIGN_16 Precomp s_fourElement[969] = { - { 0.000000f, 16.000000f, 0.000000f, FLT_MAX }, // 0 (0 0 0 16) - { 0.111111f, 15.444445f, 0.222222f, 0.600000f }, // 1 (0 0 1 15) - { 0.222222f, 14.888889f, 0.444444f, 0.321429f }, // 2 (0 0 2 14) - { 0.333333f, 14.333333f, 0.666667f, 0.230769f }, // 3 (0 0 3 13) - { 0.444444f, 13.777778f, 0.888889f, 0.187500f }, // 4 (0 0 4 12) - { 0.555556f, 13.222222f, 1.111111f, 0.163636f }, // 5 (0 0 5 11) - { 0.666667f, 12.666667f, 1.333333f, 0.150000f }, // 6 (0 0 6 10) - { 0.777778f, 12.111111f, 1.555556f, 0.142857f }, // 7 (0 0 7 9) - { 0.888889f, 11.555555f, 1.777778f, 0.140625f }, // 8 (0 0 8 8) - { 1.000000f, 11.000000f, 2.000000f, 0.142857f }, // 9 (0 0 9 7) - { 1.111111f, 10.444445f, 2.222222f, 0.150000f }, // 10 (0 0 10 6) - { 1.222222f, 9.888889f, 2.444444f, 0.163636f }, // 11 (0 0 11 5) - { 1.333333f, 9.333333f, 2.666667f, 0.187500f }, // 12 (0 0 12 4) - { 1.444444f, 8.777778f, 2.888889f, 0.230769f }, // 13 (0 0 13 3) - { 1.555556f, 8.222222f, 3.111111f, 0.321429f }, // 14 (0 0 14 2) - { 1.666667f, 7.666667f, 3.333333f, 0.600000f }, // 15 (0 0 15 1) - { 1.777778f, 7.111111f, 3.555556f, FLT_MAX }, // 16 (0 0 16 0) - { 0.444444f, 15.111111f, 0.222222f, 0.150000f }, // 17 (0 1 0 15) - { 0.555556f, 14.555555f, 0.444444f, 0.126761f }, // 18 (0 1 1 14) - { 0.666667f, 14.000000f, 0.666667f, 0.112500f }, // 19 (0 1 2 13) - { 0.777778f, 13.444445f, 0.888889f, 0.103448f }, // 20 (0 1 3 12) - { 0.888889f, 12.888889f, 1.111111f, 0.097826f }, // 21 (0 1 4 11) - { 1.000000f, 12.333333f, 1.333333f, 0.094737f }, // 22 (0 1 5 10) - { 1.111111f, 11.777778f, 1.555556f, 0.093750f }, // 23 (0 1 6 9) - { 1.222222f, 11.222222f, 1.777778f, 0.094737f }, // 24 (0 1 7 8) - { 1.333333f, 10.666667f, 2.000000f, 0.097826f }, // 25 (0 1 8 7) - { 1.444444f, 10.111111f, 2.222222f, 0.103448f }, // 26 (0 1 9 6) - { 1.555556f, 9.555555f, 2.444444f, 0.112500f }, // 27 (0 1 10 5) - { 1.666667f, 9.000000f, 2.666667f, 0.126761f }, // 28 (0 1 11 4) - { 1.777778f, 8.444445f, 2.888889f, 0.150000f }, // 29 (0 1 12 3) - { 1.888889f, 7.888889f, 3.111111f, 0.191489f }, // 30 (0 1 13 2) - { 2.000000f, 7.333333f, 3.333333f, 0.281250f }, // 31 (0 1 14 1) - { 2.111111f, 6.777778f, 3.555556f, 0.600000f }, // 32 (0 1 15 0) - { 0.888889f, 14.222222f, 0.444444f, 0.080357f }, // 33 (0 2 0 14) - { 1.000000f, 13.666667f, 0.666667f, 0.075630f }, // 34 (0 2 1 13) - { 1.111111f, 13.111111f, 0.888889f, 0.072581f }, // 35 (0 2 2 12) - { 1.222222f, 12.555555f, 1.111111f, 0.070866f }, // 36 (0 2 3 11) - { 1.333333f, 12.000000f, 1.333333f, 0.070313f }, // 37 (0 2 4 10) - { 1.444444f, 11.444445f, 1.555556f, 0.070866f }, // 38 (0 2 5 9) - { 1.555556f, 10.888889f, 1.777778f, 0.072581f }, // 39 (0 2 6 8) - { 1.666667f, 10.333333f, 2.000000f, 0.075630f }, // 40 (0 2 7 7) - { 1.777778f, 9.777778f, 2.222222f, 0.080357f }, // 41 (0 2 8 6) - { 
1.888889f, 9.222222f, 2.444444f, 0.087379f }, // 42 (0 2 9 5) - { 2.000000f, 8.666667f, 2.666667f, 0.097826f }, // 43 (0 2 10 4) - { 2.111111f, 8.111111f, 2.888889f, 0.113924f }, // 44 (0 2 11 3) - { 2.222222f, 7.555556f, 3.111111f, 0.140625f }, // 45 (0 2 12 2) - { 2.333333f, 7.000000f, 3.333333f, 0.191489f }, // 46 (0 2 13 1) - { 2.444444f, 6.444445f, 3.555556f, 0.321429f }, // 47 (0 2 14 0) - { 1.333333f, 13.333333f, 0.666667f, 0.057692f }, // 48 (0 3 0 13) - { 1.444444f, 12.777778f, 0.888889f, 0.056604f }, // 49 (0 3 1 12) - { 1.555556f, 12.222222f, 1.111111f, 0.056250f }, // 50 (0 3 2 11) - { 1.666667f, 11.666667f, 1.333333f, 0.056604f }, // 51 (0 3 3 10) - { 1.777778f, 11.111111f, 1.555556f, 0.057692f }, // 52 (0 3 4 9) - { 1.888889f, 10.555555f, 1.777778f, 0.059603f }, // 53 (0 3 5 8) - { 2.000000f, 10.000000f, 2.000000f, 0.062500f }, // 54 (0 3 6 7) - { 2.111111f, 9.444445f, 2.222222f, 0.066667f }, // 55 (0 3 7 6) - { 2.222222f, 8.888889f, 2.444444f, 0.072581f }, // 56 (0 3 8 5) - { 2.333333f, 8.333333f, 2.666667f, 0.081081f }, // 57 (0 3 9 4) - { 2.444444f, 7.777778f, 2.888889f, 0.093750f }, // 58 (0 3 10 3) - { 2.555556f, 7.222222f, 3.111111f, 0.113924f }, // 59 (0 3 11 2) - { 2.666667f, 6.666667f, 3.333333f, 0.150000f }, // 60 (0 3 12 1) - { 2.777778f, 6.111111f, 3.555556f, 0.230769f }, // 61 (0 3 13 0) - { 1.777778f, 12.444445f, 0.888889f, 0.046875f }, // 62 (0 4 0 12) - { 1.888889f, 11.888889f, 1.111111f, 0.047120f }, // 63 (0 4 1 11) - { 2.000000f, 11.333333f, 1.333333f, 0.047872f }, // 64 (0 4 2 10) - { 2.111111f, 10.777778f, 1.555556f, 0.049180f }, // 65 (0 4 3 9) - { 2.222222f, 10.222222f, 1.777778f, 0.051136f }, // 66 (0 4 4 8) - { 2.333333f, 9.666667f, 2.000000f, 0.053892f }, // 67 (0 4 5 7) - { 2.444444f, 9.111111f, 2.222222f, 0.057692f }, // 68 (0 4 6 6) - { 2.555556f, 8.555555f, 2.444444f, 0.062937f }, // 69 (0 4 7 5) - { 2.666667f, 8.000000f, 2.666667f, 0.070313f }, // 70 (0 4 8 4) - { 2.777778f, 7.444445f, 2.888889f, 0.081081f }, // 71 (0 4 9 3) - { 2.888889f, 6.888889f, 3.111111f, 0.097826f }, // 72 (0 4 10 2) - { 3.000000f, 6.333333f, 3.333333f, 0.126761f }, // 73 (0 4 11 1) - { 3.111111f, 5.777778f, 3.555556f, 0.187500f }, // 74 (0 4 12 0) - { 2.222222f, 11.555555f, 1.111111f, 0.040909f }, // 75 (0 5 0 11) - { 2.333333f, 11.000000f, 1.333333f, 0.041860f }, // 76 (0 5 1 10) - { 2.444444f, 10.444445f, 1.555556f, 0.043269f }, // 77 (0 5 2 9) - { 2.555556f, 9.888889f, 1.777778f, 0.045226f }, // 78 (0 5 3 8) - { 2.666667f, 9.333333f, 2.000000f, 0.047872f }, // 79 (0 5 4 7) - { 2.777778f, 8.777778f, 2.222222f, 0.051429f }, // 80 (0 5 5 6) - { 2.888889f, 8.222222f, 2.444444f, 0.056250f }, // 81 (0 5 6 5) - { 3.000000f, 7.666667f, 2.666667f, 0.062937f }, // 82 (0 5 7 4) - { 3.111111f, 7.111111f, 2.888889f, 0.072581f }, // 83 (0 5 8 3) - { 3.222222f, 6.555556f, 3.111111f, 0.087379f }, // 84 (0 5 9 2) - { 3.333333f, 6.000000f, 3.333333f, 0.112500f }, // 85 (0 5 10 1) - { 3.444444f, 5.444445f, 3.555556f, 0.163636f }, // 86 (0 5 11 0) - { 2.666667f, 10.666667f, 1.333333f, 0.037500f }, // 87 (0 6 0 10) - { 2.777778f, 10.111111f, 1.555556f, 0.038961f }, // 88 (0 6 1 9) - { 2.888889f, 9.555555f, 1.777778f, 0.040909f }, // 89 (0 6 2 8) - { 3.000000f, 9.000000f, 2.000000f, 0.043478f }, // 90 (0 6 3 7) - { 3.111111f, 8.444445f, 2.222222f, 0.046875f }, // 91 (0 6 4 6) - { 3.222222f, 7.888889f, 2.444444f, 0.051429f }, // 92 (0 6 5 5) - { 3.333333f, 7.333333f, 2.666667f, 0.057692f }, // 93 (0 6 6 4) - { 3.444444f, 6.777778f, 2.888889f, 0.066667f }, // 94 (0 6 7 3) - { 3.555556f, 
6.222222f, 3.111111f, 0.080357f }, // 95 (0 6 8 2) - { 3.666667f, 5.666667f, 3.333333f, 0.103448f }, // 96 (0 6 9 1) - { 3.777778f, 5.111111f, 3.555556f, 0.150000f }, // 97 (0 6 10 0) - { 3.111111f, 9.777778f, 1.555556f, 0.035714f }, // 98 (0 7 0 9) - { 3.222222f, 9.222222f, 1.777778f, 0.037657f }, // 99 (0 7 1 8) - { 3.333333f, 8.666667f, 2.000000f, 0.040179f }, // 100 (0 7 2 7) - { 3.444444f, 8.111111f, 2.222222f, 0.043478f }, // 101 (0 7 3 6) - { 3.555556f, 7.555555f, 2.444444f, 0.047872f }, // 102 (0 7 4 5) - { 3.666667f, 7.000000f, 2.666667f, 0.053892f }, // 103 (0 7 5 4) - { 3.777778f, 6.444445f, 2.888889f, 0.062500f }, // 104 (0 7 6 3) - { 3.888889f, 5.888889f, 3.111111f, 0.075630f }, // 105 (0 7 7 2) - { 4.000000f, 5.333333f, 3.333333f, 0.097826f }, // 106 (0 7 8 1) - { 4.111111f, 4.777778f, 3.555556f, 0.142857f }, // 107 (0 7 9 0) - { 3.555556f, 8.888889f, 1.777778f, 0.035156f }, // 108 (0 8 0 8) - { 3.666667f, 8.333333f, 2.000000f, 0.037657f }, // 109 (0 8 1 7) - { 3.777778f, 7.777778f, 2.222222f, 0.040909f }, // 110 (0 8 2 6) - { 3.888889f, 7.222222f, 2.444444f, 0.045226f }, // 111 (0 8 3 5) - { 4.000000f, 6.666667f, 2.666667f, 0.051136f }, // 112 (0 8 4 4) - { 4.111111f, 6.111111f, 2.888889f, 0.059603f }, // 113 (0 8 5 3) - { 4.222222f, 5.555555f, 3.111111f, 0.072581f }, // 114 (0 8 6 2) - { 4.333333f, 5.000000f, 3.333333f, 0.094737f }, // 115 (0 8 7 1) - { 4.444445f, 4.444445f, 3.555556f, 0.140625f }, // 116 (0 8 8 0) - { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 117 (0 9 0 7) - { 4.111111f, 7.444445f, 2.222222f, 0.038961f }, // 118 (0 9 1 6) - { 4.222222f, 6.888889f, 2.444444f, 0.043269f }, // 119 (0 9 2 5) - { 4.333333f, 6.333333f, 2.666667f, 0.049180f }, // 120 (0 9 3 4) - { 4.444445f, 5.777778f, 2.888889f, 0.057692f }, // 121 (0 9 4 3) - { 4.555556f, 5.222222f, 3.111111f, 0.070866f }, // 122 (0 9 5 2) - { 4.666667f, 4.666667f, 3.333333f, 0.093750f }, // 123 (0 9 6 1) - { 4.777778f, 4.111111f, 3.555556f, 0.142857f }, // 124 (0 9 7 0) - { 4.444445f, 7.111111f, 2.222222f, 0.037500f }, // 125 (0 10 0 6) - { 4.555556f, 6.555555f, 2.444444f, 0.041860f }, // 126 (0 10 1 5) - { 4.666667f, 6.000000f, 2.666667f, 0.047872f }, // 127 (0 10 2 4) - { 4.777778f, 5.444445f, 2.888889f, 0.056604f }, // 128 (0 10 3 3) - { 4.888889f, 4.888889f, 3.111111f, 0.070313f }, // 129 (0 10 4 2) - { 5.000000f, 4.333333f, 3.333333f, 0.094737f }, // 130 (0 10 5 1) - { 5.111111f, 3.777778f, 3.555556f, 0.150000f }, // 131 (0 10 6 0) - { 4.888889f, 6.222222f, 2.444444f, 0.040909f }, // 132 (0 11 0 5) - { 5.000000f, 5.666667f, 2.666667f, 0.047120f }, // 133 (0 11 1 4) - { 5.111111f, 5.111111f, 2.888889f, 0.056250f }, // 134 (0 11 2 3) - { 5.222222f, 4.555555f, 3.111111f, 0.070866f }, // 135 (0 11 3 2) - { 5.333333f, 4.000000f, 3.333333f, 0.097826f }, // 136 (0 11 4 1) - { 5.444445f, 3.444444f, 3.555556f, 0.163636f }, // 137 (0 11 5 0) - { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 138 (0 12 0 4) - { 5.444445f, 4.777778f, 2.888889f, 0.056604f }, // 139 (0 12 1 3) - { 5.555556f, 4.222222f, 3.111111f, 0.072581f }, // 140 (0 12 2 2) - { 5.666667f, 3.666667f, 3.333333f, 0.103448f }, // 141 (0 12 3 1) - { 5.777778f, 3.111111f, 3.555556f, 0.187500f }, // 142 (0 12 4 0) - { 5.777778f, 4.444445f, 2.888889f, 0.057692f }, // 143 (0 13 0 3) - { 5.888889f, 3.888889f, 3.111111f, 0.075630f }, // 144 (0 13 1 2) - { 6.000000f, 3.333333f, 3.333333f, 0.112500f }, // 145 (0 13 2 1) - { 6.111111f, 2.777778f, 3.555556f, 0.230769f }, // 146 (0 13 3 0) - { 6.222222f, 3.555556f, 3.111111f, 0.080357f }, // 147 (0 14 0 2) 
- { 6.333333f, 3.000000f, 3.333333f, 0.126761f }, // 148 (0 14 1 1) - { 6.444445f, 2.444444f, 3.555556f, 0.321429f }, // 149 (0 14 2 0) - { 6.666667f, 2.666667f, 3.333333f, 0.150000f }, // 150 (0 15 0 1) - { 6.777778f, 2.111111f, 3.555556f, 0.600000f }, // 151 (0 15 1 0) - { 7.111111f, 1.777778f, 3.555556f, FLT_MAX }, // 152 (0 16 0 0) - { 1.000000f, 15.000000f, 0.000000f, 0.066667f }, // 153 (1 0 0 15) - { 1.111111f, 14.444445f, 0.222222f, 0.062500f }, // 154 (1 0 1 14) - { 1.222222f, 13.888889f, 0.444444f, 0.059603f }, // 155 (1 0 2 13) - { 1.333333f, 13.333333f, 0.666667f, 0.057692f }, // 156 (1 0 3 12) - { 1.444444f, 12.777778f, 0.888889f, 0.056604f }, // 157 (1 0 4 11) - { 1.555556f, 12.222222f, 1.111111f, 0.056250f }, // 158 (1 0 5 10) - { 1.666667f, 11.666667f, 1.333333f, 0.056604f }, // 159 (1 0 6 9) - { 1.777778f, 11.111111f, 1.555556f, 0.057692f }, // 160 (1 0 7 8) - { 1.888889f, 10.555555f, 1.777778f, 0.059603f }, // 161 (1 0 8 7) - { 2.000000f, 10.000000f, 2.000000f, 0.062500f }, // 162 (1 0 9 6) - { 2.111111f, 9.444445f, 2.222222f, 0.066667f }, // 163 (1 0 10 5) - { 2.222222f, 8.888889f, 2.444444f, 0.072581f }, // 164 (1 0 11 4) - { 2.333333f, 8.333333f, 2.666667f, 0.081081f }, // 165 (1 0 12 3) - { 2.444444f, 7.777778f, 2.888889f, 0.093750f }, // 166 (1 0 13 2) - { 2.555556f, 7.222222f, 3.111111f, 0.113924f }, // 167 (1 0 14 1) - { 2.666667f, 6.666667f, 3.333333f, 0.150000f }, // 168 (1 0 15 0) - { 1.444444f, 14.111111f, 0.222222f, 0.049180f }, // 169 (1 1 0 14) - { 1.555556f, 13.555555f, 0.444444f, 0.047872f }, // 170 (1 1 1 13) - { 1.666667f, 13.000000f, 0.666667f, 0.047120f }, // 171 (1 1 2 12) - { 1.777778f, 12.444445f, 0.888889f, 0.046875f }, // 172 (1 1 3 11) - { 1.888889f, 11.888889f, 1.111111f, 0.047120f }, // 173 (1 1 4 10) - { 2.000000f, 11.333333f, 1.333333f, 0.047872f }, // 174 (1 1 5 9) - { 2.111111f, 10.777778f, 1.555556f, 0.049180f }, // 175 (1 1 6 8) - { 2.222222f, 10.222222f, 1.777778f, 0.051136f }, // 176 (1 1 7 7) - { 2.333333f, 9.666667f, 2.000000f, 0.053892f }, // 177 (1 1 8 6) - { 2.444444f, 9.111111f, 2.222222f, 0.057692f }, // 178 (1 1 9 5) - { 2.555556f, 8.555555f, 2.444444f, 0.062937f }, // 179 (1 1 10 4) - { 2.666667f, 8.000000f, 2.666667f, 0.070313f }, // 180 (1 1 11 3) - { 2.777778f, 7.444445f, 2.888889f, 0.081081f }, // 181 (1 1 12 2) - { 2.888889f, 6.888889f, 3.111111f, 0.097826f }, // 182 (1 1 13 1) - { 3.000000f, 6.333333f, 3.333333f, 0.126761f }, // 183 (1 1 14 0) - { 1.888889f, 13.222222f, 0.444444f, 0.040359f }, // 184 (1 2 0 13) - { 2.000000f, 12.666667f, 0.666667f, 0.040179f }, // 185 (1 2 1 12) - { 2.111111f, 12.111111f, 0.888889f, 0.040359f }, // 186 (1 2 2 11) - { 2.222222f, 11.555555f, 1.111111f, 0.040909f }, // 187 (1 2 3 10) - { 2.333333f, 11.000000f, 1.333333f, 0.041860f }, // 188 (1 2 4 9) - { 2.444444f, 10.444445f, 1.555556f, 0.043269f }, // 189 (1 2 5 8) - { 2.555556f, 9.888889f, 1.777778f, 0.045226f }, // 190 (1 2 6 7) - { 2.666667f, 9.333333f, 2.000000f, 0.047872f }, // 191 (1 2 7 6) - { 2.777778f, 8.777778f, 2.222222f, 0.051429f }, // 192 (1 2 8 5) - { 2.888889f, 8.222222f, 2.444444f, 0.056250f }, // 193 (1 2 9 4) - { 3.000000f, 7.666667f, 2.666667f, 0.062937f }, // 194 (1 2 10 3) - { 3.111111f, 7.111111f, 2.888889f, 0.072581f }, // 195 (1 2 11 2) - { 3.222222f, 6.555556f, 3.111111f, 0.087379f }, // 196 (1 2 12 1) - { 3.333333f, 6.000000f, 3.333333f, 0.112500f }, // 197 (1 2 13 0) - { 2.333333f, 12.333333f, 0.666667f, 0.035294f }, // 198 (1 3 0 12) - { 2.444444f, 11.777778f, 0.888889f, 0.035714f }, // 199 (1 3 1 11) - { 
2.555556f, 11.222222f, 1.111111f, 0.036437f }, // 200 (1 3 2 10) - { 2.666667f, 10.666667f, 1.333333f, 0.037500f }, // 201 (1 3 3 9) - { 2.777778f, 10.111111f, 1.555556f, 0.038961f }, // 202 (1 3 4 8) - { 2.888889f, 9.555555f, 1.777778f, 0.040909f }, // 203 (1 3 5 7) - { 3.000000f, 9.000000f, 2.000000f, 0.043478f }, // 204 (1 3 6 6) - { 3.111111f, 8.444445f, 2.222222f, 0.046875f }, // 205 (1 3 7 5) - { 3.222222f, 7.888889f, 2.444444f, 0.051429f }, // 206 (1 3 8 4) - { 3.333333f, 7.333333f, 2.666667f, 0.057692f }, // 207 (1 3 9 3) - { 3.444444f, 6.777778f, 2.888889f, 0.066667f }, // 208 (1 3 10 2) - { 3.555556f, 6.222222f, 3.111111f, 0.080357f }, // 209 (1 3 11 1) - { 3.666667f, 5.666667f, 3.333333f, 0.103448f }, // 210 (1 3 12 0) - { 2.777778f, 11.444445f, 0.888889f, 0.032258f }, // 211 (1 4 0 11) - { 2.888889f, 10.888889f, 1.111111f, 0.033088f }, // 212 (1 4 1 10) - { 3.000000f, 10.333333f, 1.333333f, 0.034221f }, // 213 (1 4 2 9) - { 3.111111f, 9.777778f, 1.555556f, 0.035714f }, // 214 (1 4 3 8) - { 3.222222f, 9.222222f, 1.777778f, 0.037657f }, // 215 (1 4 4 7) - { 3.333333f, 8.666667f, 2.000000f, 0.040179f }, // 216 (1 4 5 6) - { 3.444444f, 8.111111f, 2.222222f, 0.043478f }, // 217 (1 4 6 5) - { 3.555556f, 7.555555f, 2.444444f, 0.047872f }, // 218 (1 4 7 4) - { 3.666667f, 7.000000f, 2.666667f, 0.053892f }, // 219 (1 4 8 3) - { 3.777778f, 6.444445f, 2.888889f, 0.062500f }, // 220 (1 4 9 2) - { 3.888889f, 5.888889f, 3.111111f, 0.075630f }, // 221 (1 4 10 1) - { 4.000000f, 5.333333f, 3.333333f, 0.097826f }, // 222 (1 4 11 0) - { 3.222222f, 10.555555f, 1.111111f, 0.030508f }, // 223 (1 5 0 10) - { 3.333333f, 10.000000f, 1.333333f, 0.031690f }, // 224 (1 5 1 9) - { 3.444444f, 9.444445f, 1.555556f, 0.033210f }, // 225 (1 5 2 8) - { 3.555556f, 8.888889f, 1.777778f, 0.035156f }, // 226 (1 5 3 7) - { 3.666667f, 8.333333f, 2.000000f, 0.037657f }, // 227 (1 5 4 6) - { 3.777778f, 7.777778f, 2.222222f, 0.040909f }, // 228 (1 5 5 5) - { 3.888889f, 7.222222f, 2.444444f, 0.045226f }, // 229 (1 5 6 4) - { 4.000000f, 6.666667f, 2.666667f, 0.051136f }, // 230 (1 5 7 3) - { 4.111111f, 6.111111f, 2.888889f, 0.059603f }, // 231 (1 5 8 2) - { 4.222222f, 5.555556f, 3.111111f, 0.072581f }, // 232 (1 5 9 1) - { 4.333333f, 5.000000f, 3.333333f, 0.094737f }, // 233 (1 5 10 0) - { 3.666667f, 9.666667f, 1.333333f, 0.029703f }, // 234 (1 6 0 9) - { 3.777778f, 9.111111f, 1.555556f, 0.031250f }, // 235 (1 6 1 8) - { 3.888889f, 8.555555f, 1.777778f, 0.033210f }, // 236 (1 6 2 7) - { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 237 (1 6 3 6) - { 4.111111f, 7.444445f, 2.222222f, 0.038961f }, // 238 (1 6 4 5) - { 4.222222f, 6.888889f, 2.444444f, 0.043269f }, // 239 (1 6 5 4) - { 4.333333f, 6.333333f, 2.666667f, 0.049180f }, // 240 (1 6 6 3) - { 4.444445f, 5.777778f, 2.888889f, 0.057692f }, // 241 (1 6 7 2) - { 4.555555f, 5.222222f, 3.111111f, 0.070866f }, // 242 (1 6 8 1) - { 4.666667f, 4.666667f, 3.333333f, 0.093750f }, // 243 (1 6 9 0) - { 4.111111f, 8.777778f, 1.555556f, 0.029703f }, // 244 (1 7 0 8) - { 4.222222f, 8.222222f, 1.777778f, 0.031690f }, // 245 (1 7 1 7) - { 4.333333f, 7.666667f, 2.000000f, 0.034221f }, // 246 (1 7 2 6) - { 4.444445f, 7.111111f, 2.222222f, 0.037500f }, // 247 (1 7 3 5) - { 4.555555f, 6.555555f, 2.444444f, 0.041860f }, // 248 (1 7 4 4) - { 4.666667f, 6.000000f, 2.666667f, 0.047872f }, // 249 (1 7 5 3) - { 4.777778f, 5.444445f, 2.888889f, 0.056604f }, // 250 (1 7 6 2) - { 4.888889f, 4.888889f, 3.111111f, 0.070313f }, // 251 (1 7 7 1) - { 5.000000f, 4.333333f, 3.333333f, 0.094737f }, // 252 
(1 7 8 0) - { 4.555555f, 7.888889f, 1.777778f, 0.030508f }, // 253 (1 8 0 7) - { 4.666667f, 7.333333f, 2.000000f, 0.033088f }, // 254 (1 8 1 6) - { 4.777778f, 6.777778f, 2.222222f, 0.036437f }, // 255 (1 8 2 5) - { 4.888889f, 6.222222f, 2.444444f, 0.040909f }, // 256 (1 8 3 4) - { 5.000000f, 5.666667f, 2.666667f, 0.047120f }, // 257 (1 8 4 3) - { 5.111111f, 5.111111f, 2.888889f, 0.056250f }, // 258 (1 8 5 2) - { 5.222222f, 4.555555f, 3.111111f, 0.070866f }, // 259 (1 8 6 1) - { 5.333333f, 4.000000f, 3.333333f, 0.097826f }, // 260 (1 8 7 0) - { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 261 (1 9 0 6) - { 5.111111f, 6.444445f, 2.222222f, 0.035714f }, // 262 (1 9 1 5) - { 5.222222f, 5.888889f, 2.444444f, 0.040359f }, // 263 (1 9 2 4) - { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 264 (1 9 3 3) - { 5.444445f, 4.777778f, 2.888889f, 0.056604f }, // 265 (1 9 4 2) - { 5.555556f, 4.222222f, 3.111111f, 0.072581f }, // 266 (1 9 5 1) - { 5.666667f, 3.666667f, 3.333333f, 0.103448f }, // 267 (1 9 6 0) - { 5.444445f, 6.111111f, 2.222222f, 0.035294f }, // 268 (1 10 0 5) - { 5.555556f, 5.555555f, 2.444444f, 0.040179f }, // 269 (1 10 1 4) - { 5.666667f, 5.000000f, 2.666667f, 0.047120f }, // 270 (1 10 2 3) - { 5.777778f, 4.444445f, 2.888889f, 0.057692f }, // 271 (1 10 3 2) - { 5.888889f, 3.888889f, 3.111111f, 0.075630f }, // 272 (1 10 4 1) - { 6.000000f, 3.333333f, 3.333333f, 0.112500f }, // 273 (1 10 5 0) - { 5.888889f, 5.222222f, 2.444444f, 0.040359f }, // 274 (1 11 0 4) - { 6.000000f, 4.666667f, 2.666667f, 0.047872f }, // 275 (1 11 1 3) - { 6.111111f, 4.111111f, 2.888889f, 0.059603f }, // 276 (1 11 2 2) - { 6.222222f, 3.555556f, 3.111111f, 0.080357f }, // 277 (1 11 3 1) - { 6.333333f, 3.000000f, 3.333333f, 0.126761f }, // 278 (1 11 4 0) - { 6.333333f, 4.333333f, 2.666667f, 0.049180f }, // 279 (1 12 0 3) - { 6.444445f, 3.777778f, 2.888889f, 0.062500f }, // 280 (1 12 1 2) - { 6.555556f, 3.222222f, 3.111111f, 0.087379f }, // 281 (1 12 2 1) - { 6.666667f, 2.666667f, 3.333333f, 0.150000f }, // 282 (1 12 3 0) - { 6.777778f, 3.444444f, 2.888889f, 0.066667f }, // 283 (1 13 0 2) - { 6.888889f, 2.888889f, 3.111111f, 0.097826f }, // 284 (1 13 1 1) - { 7.000000f, 2.333333f, 3.333333f, 0.191489f }, // 285 (1 13 2 0) - { 7.222222f, 2.555556f, 3.111111f, 0.113924f }, // 286 (1 14 0 1) - { 7.333333f, 2.000000f, 3.333333f, 0.281250f }, // 287 (1 14 1 0) - { 7.666667f, 1.666667f, 3.333333f, 0.600000f }, // 288 (1 15 0 0) - { 2.000000f, 14.000000f, 0.000000f, 0.035714f }, // 289 (2 0 0 14) - { 2.111111f, 13.444445f, 0.222222f, 0.035294f }, // 290 (2 0 1 13) - { 2.222222f, 12.888889f, 0.444444f, 0.035156f }, // 291 (2 0 2 12) - { 2.333333f, 12.333333f, 0.666667f, 0.035294f }, // 292 (2 0 3 11) - { 2.444444f, 11.777778f, 0.888889f, 0.035714f }, // 293 (2 0 4 10) - { 2.555556f, 11.222222f, 1.111111f, 0.036437f }, // 294 (2 0 5 9) - { 2.666667f, 10.666667f, 1.333333f, 0.037500f }, // 295 (2 0 6 8) - { 2.777778f, 10.111111f, 1.555556f, 0.038961f }, // 296 (2 0 7 7) - { 2.888889f, 9.555555f, 1.777778f, 0.040909f }, // 297 (2 0 8 6) - { 3.000000f, 9.000000f, 2.000000f, 0.043478f }, // 298 (2 0 9 5) - { 3.111111f, 8.444445f, 2.222222f, 0.046875f }, // 299 (2 0 10 4) - { 3.222222f, 7.888889f, 2.444444f, 0.051429f }, // 300 (2 0 11 3) - { 3.333333f, 7.333333f, 2.666667f, 0.057692f }, // 301 (2 0 12 2) - { 3.444444f, 6.777778f, 2.888889f, 0.066667f }, // 302 (2 0 13 1) - { 3.555556f, 6.222222f, 3.111111f, 0.080357f }, // 303 (2 0 14 0) - { 2.444444f, 13.111111f, 0.222222f, 0.031250f }, // 304 (2 1 0 13) - { 2.555556f, 
12.555555f, 0.444444f, 0.031359f }, // 305 (2 1 1 12) - { 2.666667f, 12.000000f, 0.666667f, 0.031690f }, // 306 (2 1 2 11) - { 2.777778f, 11.444445f, 0.888889f, 0.032258f }, // 307 (2 1 3 10) - { 2.888889f, 10.888889f, 1.111111f, 0.033088f }, // 308 (2 1 4 9) - { 3.000000f, 10.333333f, 1.333333f, 0.034221f }, // 309 (2 1 5 8) - { 3.111111f, 9.777778f, 1.555556f, 0.035714f }, // 310 (2 1 6 7) - { 3.222222f, 9.222222f, 1.777778f, 0.037657f }, // 311 (2 1 7 6) - { 3.333333f, 8.666667f, 2.000000f, 0.040179f }, // 312 (2 1 8 5) - { 3.444444f, 8.111111f, 2.222222f, 0.043478f }, // 313 (2 1 9 4) - { 3.555556f, 7.555556f, 2.444444f, 0.047872f }, // 314 (2 1 10 3) - { 3.666667f, 7.000000f, 2.666667f, 0.053892f }, // 315 (2 1 11 2) - { 3.777778f, 6.444445f, 2.888889f, 0.062500f }, // 316 (2 1 12 1) - { 3.888889f, 5.888889f, 3.111111f, 0.075630f }, // 317 (2 1 13 0) - { 2.888889f, 12.222222f, 0.444444f, 0.028481f }, // 318 (2 2 0 12) - { 3.000000f, 11.666667f, 0.666667f, 0.028939f }, // 319 (2 2 1 11) - { 3.111111f, 11.111111f, 0.888889f, 0.029605f }, // 320 (2 2 2 10) - { 3.222222f, 10.555555f, 1.111111f, 0.030508f }, // 321 (2 2 3 9) - { 3.333333f, 10.000000f, 1.333333f, 0.031690f }, // 322 (2 2 4 8) - { 3.444444f, 9.444445f, 1.555556f, 0.033210f }, // 323 (2 2 5 7) - { 3.555556f, 8.888889f, 1.777778f, 0.035156f }, // 324 (2 2 6 6) - { 3.666667f, 8.333333f, 2.000000f, 0.037657f }, // 325 (2 2 7 5) - { 3.777778f, 7.777778f, 2.222222f, 0.040909f }, // 326 (2 2 8 4) - { 3.888889f, 7.222222f, 2.444444f, 0.045226f }, // 327 (2 2 9 3) - { 4.000000f, 6.666667f, 2.666667f, 0.051136f }, // 328 (2 2 10 2) - { 4.111111f, 6.111111f, 2.888889f, 0.059603f }, // 329 (2 2 11 1) - { 4.222222f, 5.555556f, 3.111111f, 0.072581f }, // 330 (2 2 12 0) - { 3.333333f, 11.333333f, 0.666667f, 0.026786f }, // 331 (2 3 0 11) - { 3.444444f, 10.777778f, 0.888889f, 0.027523f }, // 332 (2 3 1 10) - { 3.555556f, 10.222222f, 1.111111f, 0.028481f }, // 333 (2 3 2 9) - { 3.666667f, 9.666667f, 1.333333f, 0.029703f }, // 334 (2 3 3 8) - { 3.777778f, 9.111111f, 1.555556f, 0.031250f }, // 335 (2 3 4 7) - { 3.888889f, 8.555555f, 1.777778f, 0.033210f }, // 336 (2 3 5 6) - { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 337 (2 3 6 5) - { 4.111111f, 7.444445f, 2.222222f, 0.038961f }, // 338 (2 3 7 4) - { 4.222222f, 6.888889f, 2.444444f, 0.043269f }, // 339 (2 3 8 3) - { 4.333333f, 6.333333f, 2.666667f, 0.049180f }, // 340 (2 3 9 2) - { 4.444445f, 5.777778f, 2.888889f, 0.057692f }, // 341 (2 3 10 1) - { 4.555555f, 5.222222f, 3.111111f, 0.070866f }, // 342 (2 3 11 0) - { 3.777778f, 10.444445f, 0.888889f, 0.025862f }, // 343 (2 4 0 10) - { 3.888889f, 9.888889f, 1.111111f, 0.026866f }, // 344 (2 4 1 9) - { 4.000000f, 9.333333f, 1.333333f, 0.028125f }, // 345 (2 4 2 8) - { 4.111111f, 8.777778f, 1.555556f, 0.029703f }, // 346 (2 4 3 7) - { 4.222222f, 8.222222f, 1.777778f, 0.031690f }, // 347 (2 4 4 6) - { 4.333333f, 7.666667f, 2.000000f, 0.034221f }, // 348 (2 4 5 5) - { 4.444445f, 7.111111f, 2.222222f, 0.037500f }, // 349 (2 4 6 4) - { 4.555555f, 6.555555f, 2.444444f, 0.041860f }, // 350 (2 4 7 3) - { 4.666667f, 6.000000f, 2.666667f, 0.047872f }, // 351 (2 4 8 2) - { 4.777778f, 5.444445f, 2.888889f, 0.056604f }, // 352 (2 4 9 1) - { 4.888889f, 4.888889f, 3.111111f, 0.070313f }, // 353 (2 4 10 0) - { 4.222222f, 9.555555f, 1.111111f, 0.025568f }, // 354 (2 5 0 9) - { 4.333333f, 9.000000f, 1.333333f, 0.026866f }, // 355 (2 5 1 8) - { 4.444445f, 8.444445f, 1.555556f, 0.028481f }, // 356 (2 5 2 7) - { 4.555555f, 7.888889f, 1.777778f, 0.030508f }, // 
357 (2 5 3 6) - { 4.666667f, 7.333333f, 2.000000f, 0.033088f }, // 358 (2 5 4 5) - { 4.777778f, 6.777778f, 2.222222f, 0.036437f }, // 359 (2 5 5 4) - { 4.888889f, 6.222222f, 2.444444f, 0.040909f }, // 360 (2 5 6 3) - { 5.000000f, 5.666667f, 2.666667f, 0.047120f }, // 361 (2 5 7 2) - { 5.111111f, 5.111111f, 2.888889f, 0.056250f }, // 362 (2 5 8 1) - { 5.222222f, 4.555556f, 3.111111f, 0.070866f }, // 363 (2 5 9 0) - { 4.666667f, 8.666667f, 1.333333f, 0.025862f }, // 364 (2 6 0 8) - { 4.777778f, 8.111111f, 1.555556f, 0.027523f }, // 365 (2 6 1 7) - { 4.888889f, 7.555555f, 1.777778f, 0.029605f }, // 366 (2 6 2 6) - { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 367 (2 6 3 5) - { 5.111111f, 6.444445f, 2.222222f, 0.035714f }, // 368 (2 6 4 4) - { 5.222222f, 5.888889f, 2.444444f, 0.040359f }, // 369 (2 6 5 3) - { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 370 (2 6 6 2) - { 5.444445f, 4.777778f, 2.888889f, 0.056604f }, // 371 (2 6 7 1) - { 5.555555f, 4.222222f, 3.111111f, 0.072581f }, // 372 (2 6 8 0) - { 5.111111f, 7.777778f, 1.555556f, 0.026786f }, // 373 (2 7 0 7) - { 5.222222f, 7.222222f, 1.777778f, 0.028939f }, // 374 (2 7 1 6) - { 5.333333f, 6.666667f, 2.000000f, 0.031690f }, // 375 (2 7 2 5) - { 5.444445f, 6.111111f, 2.222222f, 0.035294f }, // 376 (2 7 3 4) - { 5.555555f, 5.555555f, 2.444444f, 0.040179f }, // 377 (2 7 4 3) - { 5.666667f, 5.000000f, 2.666667f, 0.047120f }, // 378 (2 7 5 2) - { 5.777778f, 4.444445f, 2.888889f, 0.057692f }, // 379 (2 7 6 1) - { 5.888889f, 3.888889f, 3.111111f, 0.075630f }, // 380 (2 7 7 0) - { 5.555555f, 6.888889f, 1.777778f, 0.028481f }, // 381 (2 8 0 6) - { 5.666667f, 6.333333f, 2.000000f, 0.031359f }, // 382 (2 8 1 5) - { 5.777778f, 5.777778f, 2.222222f, 0.035156f }, // 383 (2 8 2 4) - { 5.888889f, 5.222222f, 2.444444f, 0.040359f }, // 384 (2 8 3 3) - { 6.000000f, 4.666667f, 2.666667f, 0.047872f }, // 385 (2 8 4 2) - { 6.111111f, 4.111111f, 2.888889f, 0.059603f }, // 386 (2 8 5 1) - { 6.222222f, 3.555556f, 3.111111f, 0.080357f }, // 387 (2 8 6 0) - { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 388 (2 9 0 5) - { 6.111111f, 5.444445f, 2.222222f, 0.035294f }, // 389 (2 9 1 4) - { 6.222222f, 4.888889f, 2.444444f, 0.040909f }, // 390 (2 9 2 3) - { 6.333333f, 4.333333f, 2.666667f, 0.049180f }, // 391 (2 9 3 2) - { 6.444445f, 3.777778f, 2.888889f, 0.062500f }, // 392 (2 9 4 1) - { 6.555556f, 3.222222f, 3.111111f, 0.087379f }, // 393 (2 9 5 0) - { 6.444445f, 5.111111f, 2.222222f, 0.035714f }, // 394 (2 10 0 4) - { 6.555556f, 4.555555f, 2.444444f, 0.041860f }, // 395 (2 10 1 3) - { 6.666667f, 4.000000f, 2.666667f, 0.051136f }, // 396 (2 10 2 2) - { 6.777778f, 3.444444f, 2.888889f, 0.066667f }, // 397 (2 10 3 1) - { 6.888889f, 2.888889f, 3.111111f, 0.097826f }, // 398 (2 10 4 0) - { 6.888889f, 4.222222f, 2.444444f, 0.043269f }, // 399 (2 11 0 3) - { 7.000000f, 3.666667f, 2.666667f, 0.053892f }, // 400 (2 11 1 2) - { 7.111111f, 3.111111f, 2.888889f, 0.072581f }, // 401 (2 11 2 1) - { 7.222222f, 2.555556f, 3.111111f, 0.113924f }, // 402 (2 11 3 0) - { 7.333333f, 3.333333f, 2.666667f, 0.057692f }, // 403 (2 12 0 2) - { 7.444445f, 2.777778f, 2.888889f, 0.081081f }, // 404 (2 12 1 1) - { 7.555556f, 2.222222f, 3.111111f, 0.140625f }, // 405 (2 12 2 0) - { 7.777778f, 2.444444f, 2.888889f, 0.093750f }, // 406 (2 13 0 1) - { 7.888889f, 1.888889f, 3.111111f, 0.191489f }, // 407 (2 13 1 0) - { 8.222222f, 1.555556f, 3.111111f, 0.321429f }, // 408 (2 14 0 0) - { 3.000000f, 13.000000f, 0.000000f, 0.025641f }, // 409 (3 0 0 13) - { 3.111111f, 12.444445f, 0.222222f, 
0.025862f }, // 410 (3 0 1 12) - { 3.222222f, 11.888889f, 0.444444f, 0.026239f }, // 411 (3 0 2 11) - { 3.333333f, 11.333333f, 0.666667f, 0.026786f }, // 412 (3 0 3 10) - { 3.444444f, 10.777778f, 0.888889f, 0.027523f }, // 413 (3 0 4 9) - { 3.555556f, 10.222222f, 1.111111f, 0.028481f }, // 414 (3 0 5 8) - { 3.666667f, 9.666667f, 1.333333f, 0.029703f }, // 415 (3 0 6 7) - { 3.777778f, 9.111111f, 1.555556f, 0.031250f }, // 416 (3 0 7 6) - { 3.888889f, 8.555555f, 1.777778f, 0.033210f }, // 417 (3 0 8 5) - { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 418 (3 0 9 4) - { 4.111111f, 7.444445f, 2.222222f, 0.038961f }, // 419 (3 0 10 3) - { 4.222222f, 6.888889f, 2.444444f, 0.043269f }, // 420 (3 0 11 2) - { 4.333333f, 6.333333f, 2.666667f, 0.049180f }, // 421 (3 0 12 1) - { 4.444445f, 5.777778f, 2.888889f, 0.057692f }, // 422 (3 0 13 0) - { 3.444444f, 12.111111f, 0.222222f, 0.024000f }, // 423 (3 1 0 12) - { 3.555556f, 11.555555f, 0.444444f, 0.024457f }, // 424 (3 1 1 11) - { 3.666667f, 11.000000f, 0.666667f, 0.025070f }, // 425 (3 1 2 10) - { 3.777778f, 10.444445f, 0.888889f, 0.025862f }, // 426 (3 1 3 9) - { 3.888889f, 9.888889f, 1.111111f, 0.026866f }, // 427 (3 1 4 8) - { 4.000000f, 9.333333f, 1.333333f, 0.028125f }, // 428 (3 1 5 7) - { 4.111111f, 8.777778f, 1.555556f, 0.029703f }, // 429 (3 1 6 6) - { 4.222222f, 8.222222f, 1.777778f, 0.031690f }, // 430 (3 1 7 5) - { 4.333333f, 7.666667f, 2.000000f, 0.034221f }, // 431 (3 1 8 4) - { 4.444445f, 7.111111f, 2.222222f, 0.037500f }, // 432 (3 1 9 3) - { 4.555555f, 6.555556f, 2.444444f, 0.041860f }, // 433 (3 1 10 2) - { 4.666667f, 6.000000f, 2.666667f, 0.047872f }, // 434 (3 1 11 1) - { 4.777778f, 5.444445f, 2.888889f, 0.056604f }, // 435 (3 1 12 0) - { 3.888889f, 11.222222f, 0.444444f, 0.023018f }, // 436 (3 2 0 11) - { 4.000000f, 10.666667f, 0.666667f, 0.023684f }, // 437 (3 2 1 10) - { 4.111111f, 10.111111f, 0.888889f, 0.024523f }, // 438 (3 2 2 9) - { 4.222222f, 9.555555f, 1.111111f, 0.025568f }, // 439 (3 2 3 8) - { 4.333333f, 9.000000f, 1.333333f, 0.026866f }, // 440 (3 2 4 7) - { 4.444445f, 8.444445f, 1.555556f, 0.028481f }, // 441 (3 2 5 6) - { 4.555555f, 7.888889f, 1.777778f, 0.030508f }, // 442 (3 2 6 5) - { 4.666667f, 7.333333f, 2.000000f, 0.033088f }, // 443 (3 2 7 4) - { 4.777778f, 6.777778f, 2.222222f, 0.036437f }, // 444 (3 2 8 3) - { 4.888889f, 6.222222f, 2.444444f, 0.040909f }, // 445 (3 2 9 2) - { 5.000000f, 5.666667f, 2.666667f, 0.047120f }, // 446 (3 2 10 1) - { 5.111111f, 5.111111f, 2.888889f, 0.056250f }, // 447 (3 2 11 0) - { 4.333333f, 10.333333f, 0.666667f, 0.022556f }, // 448 (3 3 0 10) - { 4.444445f, 9.777778f, 0.888889f, 0.023438f }, // 449 (3 3 1 9) - { 4.555555f, 9.222222f, 1.111111f, 0.024523f }, // 450 (3 3 2 8) - { 4.666667f, 8.666667f, 1.333333f, 0.025862f }, // 451 (3 3 3 7) - { 4.777778f, 8.111111f, 1.555556f, 0.027523f }, // 452 (3 3 4 6) - { 4.888889f, 7.555555f, 1.777778f, 0.029605f }, // 453 (3 3 5 5) - { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 454 (3 3 6 4) - { 5.111111f, 6.444445f, 2.222222f, 0.035714f }, // 455 (3 3 7 3) - { 5.222222f, 5.888889f, 2.444444f, 0.040359f }, // 456 (3 3 8 2) - { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 457 (3 3 9 1) - { 5.444445f, 4.777778f, 2.888889f, 0.056604f }, // 458 (3 3 10 0) - { 4.777778f, 9.444445f, 0.888889f, 0.022556f }, // 459 (3 4 0 9) - { 4.888889f, 8.888889f, 1.111111f, 0.023684f }, // 460 (3 4 1 8) - { 5.000000f, 8.333333f, 1.333333f, 0.025070f }, // 461 (3 4 2 7) - { 5.111111f, 7.777778f, 1.555556f, 0.026786f }, // 462 (3 4 3 6) - { 
5.222222f, 7.222222f, 1.777778f, 0.028939f }, // 463 (3 4 4 5) - { 5.333333f, 6.666667f, 2.000000f, 0.031690f }, // 464 (3 4 5 4) - { 5.444445f, 6.111111f, 2.222222f, 0.035294f }, // 465 (3 4 6 3) - { 5.555555f, 5.555555f, 2.444444f, 0.040179f }, // 466 (3 4 7 2) - { 5.666667f, 5.000000f, 2.666667f, 0.047120f }, // 467 (3 4 8 1) - { 5.777778f, 4.444445f, 2.888889f, 0.057692f }, // 468 (3 4 9 0) - { 5.222222f, 8.555555f, 1.111111f, 0.023018f }, // 469 (3 5 0 8) - { 5.333333f, 8.000000f, 1.333333f, 0.024457f }, // 470 (3 5 1 7) - { 5.444445f, 7.444445f, 1.555556f, 0.026239f }, // 471 (3 5 2 6) - { 5.555555f, 6.888889f, 1.777778f, 0.028481f }, // 472 (3 5 3 5) - { 5.666667f, 6.333333f, 2.000000f, 0.031359f }, // 473 (3 5 4 4) - { 5.777778f, 5.777778f, 2.222222f, 0.035156f }, // 474 (3 5 5 3) - { 5.888889f, 5.222222f, 2.444444f, 0.040359f }, // 475 (3 5 6 2) - { 6.000000f, 4.666667f, 2.666667f, 0.047872f }, // 476 (3 5 7 1) - { 6.111111f, 4.111111f, 2.888889f, 0.059603f }, // 477 (3 5 8 0) - { 5.666667f, 7.666667f, 1.333333f, 0.024000f }, // 478 (3 6 0 7) - { 5.777778f, 7.111111f, 1.555556f, 0.025862f }, // 479 (3 6 1 6) - { 5.888889f, 6.555555f, 1.777778f, 0.028213f }, // 480 (3 6 2 5) - { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 481 (3 6 3 4) - { 6.111111f, 5.444445f, 2.222222f, 0.035294f }, // 482 (3 6 4 3) - { 6.222222f, 4.888889f, 2.444444f, 0.040909f }, // 483 (3 6 5 2) - { 6.333333f, 4.333333f, 2.666667f, 0.049180f }, // 484 (3 6 6 1) - { 6.444445f, 3.777778f, 2.888889f, 0.062500f }, // 485 (3 6 7 0) - { 6.111111f, 6.777778f, 1.555556f, 0.025641f }, // 486 (3 7 0 6) - { 6.222222f, 6.222222f, 1.777778f, 0.028125f }, // 487 (3 7 1 5) - { 6.333333f, 5.666667f, 2.000000f, 0.031359f }, // 488 (3 7 2 4) - { 6.444445f, 5.111111f, 2.222222f, 0.035714f }, // 489 (3 7 3 3) - { 6.555555f, 4.555555f, 2.444444f, 0.041860f }, // 490 (3 7 4 2) - { 6.666667f, 4.000000f, 2.666667f, 0.051136f }, // 491 (3 7 5 1) - { 6.777778f, 3.444444f, 2.888889f, 0.066667f }, // 492 (3 7 6 0) - { 6.555555f, 5.888889f, 1.777778f, 0.028213f }, // 493 (3 8 0 5) - { 6.666667f, 5.333333f, 2.000000f, 0.031690f }, // 494 (3 8 1 4) - { 6.777778f, 4.777778f, 2.222222f, 0.036437f }, // 495 (3 8 2 3) - { 6.888889f, 4.222222f, 2.444444f, 0.043269f }, // 496 (3 8 3 2) - { 7.000000f, 3.666667f, 2.666667f, 0.053892f }, // 497 (3 8 4 1) - { 7.111111f, 3.111111f, 2.888889f, 0.072581f }, // 498 (3 8 5 0) - { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 499 (3 9 0 4) - { 7.111111f, 4.444445f, 2.222222f, 0.037500f }, // 500 (3 9 1 3) - { 7.222222f, 3.888889f, 2.444444f, 0.045226f }, // 501 (3 9 2 2) - { 7.333333f, 3.333333f, 2.666667f, 0.057692f }, // 502 (3 9 3 1) - { 7.444445f, 2.777778f, 2.888889f, 0.081081f }, // 503 (3 9 4 0) - { 7.444445f, 4.111111f, 2.222222f, 0.038961f }, // 504 (3 10 0 3) - { 7.555556f, 3.555556f, 2.444444f, 0.047872f }, // 505 (3 10 1 2) - { 7.666667f, 3.000000f, 2.666667f, 0.062937f }, // 506 (3 10 2 1) - { 7.777778f, 2.444444f, 2.888889f, 0.093750f }, // 507 (3 10 3 0) - { 7.888889f, 3.222222f, 2.444444f, 0.051429f }, // 508 (3 11 0 2) - { 8.000000f, 2.666667f, 2.666667f, 0.070313f }, // 509 (3 11 1 1) - { 8.111111f, 2.111111f, 2.888889f, 0.113924f }, // 510 (3 11 2 0) - { 8.333333f, 2.333333f, 2.666667f, 0.081081f }, // 511 (3 12 0 1) - { 8.444445f, 1.777778f, 2.888889f, 0.150000f }, // 512 (3 12 1 0) - { 8.777778f, 1.444444f, 2.888889f, 0.230769f }, // 513 (3 13 0 0) - { 4.000000f, 12.000000f, 0.000000f, 0.020833f }, // 514 (4 0 0 12) - { 4.111111f, 11.444445f, 0.222222f, 0.021277f }, // 515 (4 0 
1 11) - { 4.222222f, 10.888889f, 0.444444f, 0.021845f }, // 516 (4 0 2 10) - { 4.333333f, 10.333333f, 0.666667f, 0.022556f }, // 517 (4 0 3 9) - { 4.444445f, 9.777778f, 0.888889f, 0.023438f }, // 518 (4 0 4 8) - { 4.555555f, 9.222222f, 1.111111f, 0.024523f }, // 519 (4 0 5 7) - { 4.666667f, 8.666667f, 1.333333f, 0.025862f }, // 520 (4 0 6 6) - { 4.777778f, 8.111111f, 1.555556f, 0.027523f }, // 521 (4 0 7 5) - { 4.888889f, 7.555555f, 1.777778f, 0.029605f }, // 522 (4 0 8 4) - { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 523 (4 0 9 3) - { 5.111111f, 6.444445f, 2.222222f, 0.035714f }, // 524 (4 0 10 2) - { 5.222222f, 5.888889f, 2.444444f, 0.040359f }, // 525 (4 0 11 1) - { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 526 (4 0 12 0) - { 4.444445f, 11.111111f, 0.222222f, 0.020270f }, // 527 (4 1 0 11) - { 4.555555f, 10.555555f, 0.444444f, 0.020882f }, // 528 (4 1 1 10) - { 4.666667f, 10.000000f, 0.666667f, 0.021635f }, // 529 (4 1 2 9) - { 4.777778f, 9.444445f, 0.888889f, 0.022556f }, // 530 (4 1 3 8) - { 4.888889f, 8.888889f, 1.111111f, 0.023684f }, // 531 (4 1 4 7) - { 5.000000f, 8.333333f, 1.333333f, 0.025070f }, // 532 (4 1 5 6) - { 5.111111f, 7.777778f, 1.555556f, 0.026786f }, // 533 (4 1 6 5) - { 5.222222f, 7.222222f, 1.777778f, 0.028939f }, // 534 (4 1 7 4) - { 5.333333f, 6.666667f, 2.000000f, 0.031690f }, // 535 (4 1 8 3) - { 5.444445f, 6.111111f, 2.222222f, 0.035294f }, // 536 (4 1 9 2) - { 5.555555f, 5.555556f, 2.444444f, 0.040179f }, // 537 (4 1 10 1) - { 5.666667f, 5.000000f, 2.666667f, 0.047120f }, // 538 (4 1 11 0) - { 4.888889f, 10.222222f, 0.444444f, 0.020089f }, // 539 (4 2 0 10) - { 5.000000f, 9.666667f, 0.666667f, 0.020882f }, // 540 (4 2 1 9) - { 5.111111f, 9.111111f, 0.888889f, 0.021845f }, // 541 (4 2 2 8) - { 5.222222f, 8.555555f, 1.111111f, 0.023018f }, // 542 (4 2 3 7) - { 5.333333f, 8.000000f, 1.333333f, 0.024457f }, // 543 (4 2 4 6) - { 5.444445f, 7.444445f, 1.555556f, 0.026239f }, // 544 (4 2 5 5) - { 5.555555f, 6.888889f, 1.777778f, 0.028481f }, // 545 (4 2 6 4) - { 5.666667f, 6.333333f, 2.000000f, 0.031359f }, // 546 (4 2 7 3) - { 5.777778f, 5.777778f, 2.222222f, 0.035156f }, // 547 (4 2 8 2) - { 5.888889f, 5.222222f, 2.444444f, 0.040359f }, // 548 (4 2 9 1) - { 6.000000f, 4.666667f, 2.666667f, 0.047872f }, // 549 (4 2 10 0) - { 5.333333f, 9.333333f, 0.666667f, 0.020270f }, // 550 (4 3 0 9) - { 5.444445f, 8.777778f, 0.888889f, 0.021277f }, // 551 (4 3 1 8) - { 5.555555f, 8.222222f, 1.111111f, 0.022500f }, // 552 (4 3 2 7) - { 5.666667f, 7.666667f, 1.333333f, 0.024000f }, // 553 (4 3 3 6) - { 5.777778f, 7.111111f, 1.555556f, 0.025862f }, // 554 (4 3 4 5) - { 5.888889f, 6.555555f, 1.777778f, 0.028213f }, // 555 (4 3 5 4) - { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 556 (4 3 6 3) - { 6.111111f, 5.444445f, 2.222222f, 0.035294f }, // 557 (4 3 7 2) - { 6.222222f, 4.888889f, 2.444444f, 0.040909f }, // 558 (4 3 8 1) - { 6.333333f, 4.333333f, 2.666667f, 0.049180f }, // 559 (4 3 9 0) - { 5.777778f, 8.444445f, 0.888889f, 0.020833f }, // 560 (4 4 0 8) - { 5.888889f, 7.888889f, 1.111111f, 0.022113f }, // 561 (4 4 1 7) - { 6.000000f, 7.333333f, 1.333333f, 0.023684f }, // 562 (4 4 2 6) - { 6.111111f, 6.777778f, 1.555556f, 0.025641f }, // 563 (4 4 3 5) - { 6.222222f, 6.222222f, 1.777778f, 0.028125f }, // 564 (4 4 4 4) - { 6.333333f, 5.666667f, 2.000000f, 0.031359f }, // 565 (4 4 5 3) - { 6.444445f, 5.111111f, 2.222222f, 0.035714f }, // 566 (4 4 6 2) - { 6.555555f, 4.555555f, 2.444444f, 0.041860f }, // 567 (4 4 7 1) - { 6.666667f, 4.000000f, 2.666667f, 0.051136f 
}, // 568 (4 4 8 0) - { 6.222222f, 7.555555f, 1.111111f, 0.021845f }, // 569 (4 5 0 7) - { 6.333333f, 7.000000f, 1.333333f, 0.023499f }, // 570 (4 5 1 6) - { 6.444445f, 6.444445f, 1.555556f, 0.025568f }, // 571 (4 5 2 5) - { 6.555555f, 5.888889f, 1.777778f, 0.028213f }, // 572 (4 5 3 4) - { 6.666667f, 5.333333f, 2.000000f, 0.031690f }, // 573 (4 5 4 3) - { 6.777778f, 4.777778f, 2.222222f, 0.036437f }, // 574 (4 5 5 2) - { 6.888889f, 4.222222f, 2.444444f, 0.043269f }, // 575 (4 5 6 1) - { 7.000000f, 3.666667f, 2.666667f, 0.053892f }, // 576 (4 5 7 0) - { 6.666667f, 6.666667f, 1.333333f, 0.023438f }, // 577 (4 6 0 6) - { 6.777778f, 6.111111f, 1.555556f, 0.025641f }, // 578 (4 6 1 5) - { 6.888889f, 5.555555f, 1.777778f, 0.028481f }, // 579 (4 6 2 4) - { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 580 (4 6 3 3) - { 7.111111f, 4.444445f, 2.222222f, 0.037500f }, // 581 (4 6 4 2) - { 7.222222f, 3.888889f, 2.444444f, 0.045226f }, // 582 (4 6 5 1) - { 7.333333f, 3.333333f, 2.666667f, 0.057692f }, // 583 (4 6 6 0) - { 7.111111f, 5.777778f, 1.555556f, 0.025862f }, // 584 (4 7 0 5) - { 7.222222f, 5.222222f, 1.777778f, 0.028939f }, // 585 (4 7 1 4) - { 7.333333f, 4.666667f, 2.000000f, 0.033088f }, // 586 (4 7 2 3) - { 7.444445f, 4.111111f, 2.222222f, 0.038961f }, // 587 (4 7 3 2) - { 7.555555f, 3.555556f, 2.444444f, 0.047872f }, // 588 (4 7 4 1) - { 7.666667f, 3.000000f, 2.666667f, 0.062937f }, // 589 (4 7 5 0) - { 7.555555f, 4.888889f, 1.777778f, 0.029605f }, // 590 (4 8 0 4) - { 7.666667f, 4.333333f, 2.000000f, 0.034221f }, // 591 (4 8 1 3) - { 7.777778f, 3.777778f, 2.222222f, 0.040909f }, // 592 (4 8 2 2) - { 7.888889f, 3.222222f, 2.444444f, 0.051429f }, // 593 (4 8 3 1) - { 8.000000f, 2.666667f, 2.666667f, 0.070313f }, // 594 (4 8 4 0) - { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 595 (4 9 0 3) - { 8.111111f, 3.444444f, 2.222222f, 0.043478f }, // 596 (4 9 1 2) - { 8.222222f, 2.888889f, 2.444444f, 0.056250f }, // 597 (4 9 2 1) - { 8.333333f, 2.333333f, 2.666667f, 0.081081f }, // 598 (4 9 3 0) - { 8.444445f, 3.111111f, 2.222222f, 0.046875f }, // 599 (4 10 0 2) - { 8.555555f, 2.555556f, 2.444444f, 0.062937f }, // 600 (4 10 1 1) - { 8.666667f, 2.000000f, 2.666667f, 0.097826f }, // 601 (4 10 2 0) - { 8.888889f, 2.222222f, 2.444444f, 0.072581f }, // 602 (4 11 0 1) - { 9.000000f, 1.666667f, 2.666667f, 0.126761f }, // 603 (4 11 1 0) - { 9.333333f, 1.333333f, 2.666667f, 0.187500f }, // 604 (4 12 0 0) - { 5.000000f, 11.000000f, 0.000000f, 0.018182f }, // 605 (5 0 0 11) - { 5.111111f, 10.444445f, 0.222222f, 0.018750f }, // 606 (5 0 1 10) - { 5.222222f, 9.888889f, 0.444444f, 0.019438f }, // 607 (5 0 2 9) - { 5.333333f, 9.333333f, 0.666667f, 0.020270f }, // 608 (5 0 3 8) - { 5.444445f, 8.777778f, 0.888889f, 0.021277f }, // 609 (5 0 4 7) - { 5.555555f, 8.222222f, 1.111111f, 0.022500f }, // 610 (5 0 5 6) - { 5.666667f, 7.666667f, 1.333333f, 0.024000f }, // 611 (5 0 6 5) - { 5.777778f, 7.111111f, 1.555556f, 0.025862f }, // 612 (5 0 7 4) - { 5.888889f, 6.555555f, 1.777778f, 0.028213f }, // 613 (5 0 8 3) - { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 614 (5 0 9 2) - { 6.111111f, 5.444445f, 2.222222f, 0.035294f }, // 615 (5 0 10 1) - { 6.222222f, 4.888889f, 2.444444f, 0.040909f }, // 616 (5 0 11 0) - { 5.444445f, 10.111111f, 0.222222f, 0.018182f }, // 617 (5 1 0 10) - { 5.555555f, 9.555555f, 0.444444f, 0.018908f }, // 618 (5 1 1 9) - { 5.666667f, 9.000000f, 0.666667f, 0.019780f }, // 619 (5 1 2 8) - { 5.777778f, 8.444445f, 0.888889f, 0.020833f }, // 620 (5 1 3 7) - { 5.888889f, 7.888889f, 1.111111f, 
0.022113f }, // 621 (5 1 4 6) - { 6.000000f, 7.333333f, 1.333333f, 0.023684f }, // 622 (5 1 5 5) - { 6.111111f, 6.777778f, 1.555556f, 0.025641f }, // 623 (5 1 6 4) - { 6.222222f, 6.222222f, 1.777778f, 0.028125f }, // 624 (5 1 7 3) - { 6.333333f, 5.666667f, 2.000000f, 0.031359f }, // 625 (5 1 8 2) - { 6.444445f, 5.111111f, 2.222222f, 0.035714f }, // 626 (5 1 9 1) - { 6.555555f, 4.555556f, 2.444444f, 0.041860f }, // 627 (5 1 10 0) - { 5.888889f, 9.222222f, 0.444444f, 0.018480f }, // 628 (5 2 0 9) - { 6.000000f, 8.666667f, 0.666667f, 0.019397f }, // 629 (5 2 1 8) - { 6.111111f, 8.111111f, 0.888889f, 0.020501f }, // 630 (5 2 2 7) - { 6.222222f, 7.555555f, 1.111111f, 0.021845f }, // 631 (5 2 3 6) - { 6.333333f, 7.000000f, 1.333333f, 0.023499f }, // 632 (5 2 4 5) - { 6.444445f, 6.444445f, 1.555556f, 0.025568f }, // 633 (5 2 5 4) - { 6.555555f, 5.888889f, 1.777778f, 0.028213f }, // 634 (5 2 6 3) - { 6.666667f, 5.333333f, 2.000000f, 0.031690f }, // 635 (5 2 7 2) - { 6.777778f, 4.777778f, 2.222222f, 0.036437f }, // 636 (5 2 8 1) - { 6.888889f, 4.222222f, 2.444444f, 0.043269f }, // 637 (5 2 9 0) - { 6.333333f, 8.333333f, 0.666667f, 0.019108f }, // 638 (5 3 0 8) - { 6.444445f, 7.777778f, 0.888889f, 0.020270f }, // 639 (5 3 1 7) - { 6.555555f, 7.222222f, 1.111111f, 0.021687f }, // 640 (5 3 2 6) - { 6.666667f, 6.666667f, 1.333333f, 0.023438f }, // 641 (5 3 3 5) - { 6.777778f, 6.111111f, 1.555556f, 0.025641f }, // 642 (5 3 4 4) - { 6.888889f, 5.555555f, 1.777778f, 0.028481f }, // 643 (5 3 5 3) - { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 644 (5 3 6 2) - { 7.111111f, 4.444445f, 2.222222f, 0.037500f }, // 645 (5 3 7 1) - { 7.222222f, 3.888889f, 2.444444f, 0.045226f }, // 646 (5 3 8 0) - { 6.777778f, 7.444445f, 0.888889f, 0.020134f }, // 647 (5 4 0 7) - { 6.888889f, 6.888889f, 1.111111f, 0.021635f }, // 648 (5 4 1 6) - { 7.000000f, 6.333333f, 1.333333f, 0.023499f }, // 649 (5 4 2 5) - { 7.111111f, 5.777778f, 1.555556f, 0.025862f }, // 650 (5 4 3 4) - { 7.222222f, 5.222222f, 1.777778f, 0.028939f }, // 651 (5 4 4 3) - { 7.333333f, 4.666667f, 2.000000f, 0.033088f }, // 652 (5 4 5 2) - { 7.444445f, 4.111111f, 2.222222f, 0.038961f }, // 653 (5 4 6 1) - { 7.555555f, 3.555556f, 2.444444f, 0.047872f }, // 654 (5 4 7 0) - { 7.222222f, 6.555555f, 1.111111f, 0.021687f }, // 655 (5 5 0 6) - { 7.333333f, 6.000000f, 1.333333f, 0.023684f }, // 656 (5 5 1 5) - { 7.444445f, 5.444445f, 1.555556f, 0.026239f }, // 657 (5 5 2 4) - { 7.555555f, 4.888889f, 1.777778f, 0.029605f }, // 658 (5 5 3 3) - { 7.666667f, 4.333333f, 2.000000f, 0.034221f }, // 659 (5 5 4 2) - { 7.777778f, 3.777778f, 2.222222f, 0.040909f }, // 660 (5 5 5 1) - { 7.888889f, 3.222222f, 2.444444f, 0.051429f }, // 661 (5 5 6 0) - { 7.666667f, 5.666667f, 1.333333f, 0.024000f }, // 662 (5 6 0 5) - { 7.777778f, 5.111111f, 1.555556f, 0.026786f }, // 663 (5 6 1 4) - { 7.888889f, 4.555555f, 1.777778f, 0.030508f }, // 664 (5 6 2 3) - { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 665 (5 6 3 2) - { 8.111111f, 3.444444f, 2.222222f, 0.043478f }, // 666 (5 6 4 1) - { 8.222222f, 2.888889f, 2.444444f, 0.056250f }, // 667 (5 6 5 0) - { 8.111111f, 4.777778f, 1.555556f, 0.027523f }, // 668 (5 7 0 4) - { 8.222222f, 4.222222f, 1.777778f, 0.031690f }, // 669 (5 7 1 3) - { 8.333333f, 3.666667f, 2.000000f, 0.037657f }, // 670 (5 7 2 2) - { 8.444445f, 3.111111f, 2.222222f, 0.046875f }, // 671 (5 7 3 1) - { 8.555555f, 2.555556f, 2.444444f, 0.062937f }, // 672 (5 7 4 0) - { 8.555555f, 3.888889f, 1.777778f, 0.033210f }, // 673 (5 8 0 3) - { 8.666667f, 3.333333f, 2.000000f, 
0.040179f }, // 674 (5 8 1 2) - { 8.777778f, 2.777778f, 2.222222f, 0.051429f }, // 675 (5 8 2 1) - { 8.888889f, 2.222222f, 2.444444f, 0.072581f }, // 676 (5 8 3 0) - { 9.000000f, 3.000000f, 2.000000f, 0.043478f }, // 677 (5 9 0 2) - { 9.111111f, 2.444444f, 2.222222f, 0.057692f }, // 678 (5 9 1 1) - { 9.222222f, 1.888889f, 2.444444f, 0.087379f }, // 679 (5 9 2 0) - { 9.444445f, 2.111111f, 2.222222f, 0.066667f }, // 680 (5 10 0 1) - { 9.555555f, 1.555556f, 2.444444f, 0.112500f }, // 681 (5 10 1 0) - { 9.888889f, 1.222222f, 2.444444f, 0.163636f }, // 682 (5 11 0 0) - { 6.000000f, 10.000000f, 0.000000f, 0.016667f }, // 683 (6 0 0 10) - { 6.111111f, 9.444445f, 0.222222f, 0.017341f }, // 684 (6 0 1 9) - { 6.222222f, 8.888889f, 0.444444f, 0.018145f }, // 685 (6 0 2 8) - { 6.333333f, 8.333333f, 0.666667f, 0.019108f }, // 686 (6 0 3 7) - { 6.444445f, 7.777778f, 0.888889f, 0.020270f }, // 687 (6 0 4 6) - { 6.555555f, 7.222222f, 1.111111f, 0.021687f }, // 688 (6 0 5 5) - { 6.666667f, 6.666667f, 1.333333f, 0.023438f }, // 689 (6 0 6 4) - { 6.777778f, 6.111111f, 1.555556f, 0.025641f }, // 690 (6 0 7 3) - { 6.888889f, 5.555555f, 1.777778f, 0.028481f }, // 691 (6 0 8 2) - { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 692 (6 0 9 1) - { 7.111111f, 4.444445f, 2.222222f, 0.037500f }, // 693 (6 0 10 0) - { 6.444445f, 9.111111f, 0.222222f, 0.017045f }, // 694 (6 1 0 9) - { 6.555555f, 8.555555f, 0.444444f, 0.017893f }, // 695 (6 1 1 8) - { 6.666667f, 8.000000f, 0.666667f, 0.018908f }, // 696 (6 1 2 7) - { 6.777778f, 7.444445f, 0.888889f, 0.020134f }, // 697 (6 1 3 6) - { 6.888889f, 6.888889f, 1.111111f, 0.021635f }, // 698 (6 1 4 5) - { 7.000000f, 6.333333f, 1.333333f, 0.023499f }, // 699 (6 1 5 4) - { 7.111111f, 5.777778f, 1.555556f, 0.025862f }, // 700 (6 1 6 3) - { 7.222222f, 5.222222f, 1.777778f, 0.028939f }, // 701 (6 1 7 2) - { 7.333333f, 4.666667f, 2.000000f, 0.033088f }, // 702 (6 1 8 1) - { 7.444445f, 4.111111f, 2.222222f, 0.038961f }, // 703 (6 1 9 0) - { 6.888889f, 8.222222f, 0.444444f, 0.017717f }, // 704 (6 2 0 8) - { 7.000000f, 7.666667f, 0.666667f, 0.018789f }, // 705 (6 2 1 7) - { 7.111111f, 7.111111f, 0.888889f, 0.020089f }, // 706 (6 2 2 6) - { 7.222222f, 6.555555f, 1.111111f, 0.021687f }, // 707 (6 2 3 5) - { 7.333333f, 6.000000f, 1.333333f, 0.023684f }, // 708 (6 2 4 4) - { 7.444445f, 5.444445f, 1.555556f, 0.026239f }, // 709 (6 2 5 3) - { 7.555555f, 4.888889f, 1.777778f, 0.029605f }, // 710 (6 2 6 2) - { 7.666667f, 4.333333f, 2.000000f, 0.034221f }, // 711 (6 2 7 1) - { 7.777778f, 3.777778f, 2.222222f, 0.040909f }, // 712 (6 2 8 0) - { 7.333333f, 7.333333f, 0.666667f, 0.018750f }, // 713 (6 3 0 7) - { 7.444445f, 6.777778f, 0.888889f, 0.020134f }, // 714 (6 3 1 6) - { 7.555555f, 6.222222f, 1.111111f, 0.021845f }, // 715 (6 3 2 5) - { 7.666667f, 5.666667f, 1.333333f, 0.024000f }, // 716 (6 3 3 4) - { 7.777778f, 5.111111f, 1.555556f, 0.026786f }, // 717 (6 3 4 3) - { 7.888889f, 4.555555f, 1.777778f, 0.030508f }, // 718 (6 3 5 2) - { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 719 (6 3 6 1) - { 8.111111f, 3.444444f, 2.222222f, 0.043478f }, // 720 (6 3 7 0) - { 7.777778f, 6.444445f, 0.888889f, 0.020270f }, // 721 (6 4 0 6) - { 7.888889f, 5.888889f, 1.111111f, 0.022113f }, // 722 (6 4 1 5) - { 8.000000f, 5.333333f, 1.333333f, 0.024457f }, // 723 (6 4 2 4) - { 8.111111f, 4.777778f, 1.555556f, 0.027523f }, // 724 (6 4 3 3) - { 8.222222f, 4.222222f, 1.777778f, 0.031690f }, // 725 (6 4 4 2) - { 8.333333f, 3.666667f, 2.000000f, 0.037657f }, // 726 (6 4 5 1) - { 8.444445f, 3.111111f, 
2.222222f, 0.046875f }, // 727 (6 4 6 0) - { 8.222222f, 5.555555f, 1.111111f, 0.022500f }, // 728 (6 5 0 5) - { 8.333333f, 5.000000f, 1.333333f, 0.025070f }, // 729 (6 5 1 4) - { 8.444445f, 4.444445f, 1.555556f, 0.028481f }, // 730 (6 5 2 3) - { 8.555555f, 3.888889f, 1.777778f, 0.033210f }, // 731 (6 5 3 2) - { 8.666667f, 3.333333f, 2.000000f, 0.040179f }, // 732 (6 5 4 1) - { 8.777778f, 2.777778f, 2.222222f, 0.051429f }, // 733 (6 5 5 0) - { 8.666667f, 4.666667f, 1.333333f, 0.025862f }, // 734 (6 6 0 4) - { 8.777778f, 4.111111f, 1.555556f, 0.029703f }, // 735 (6 6 1 3) - { 8.888889f, 3.555556f, 1.777778f, 0.035156f }, // 736 (6 6 2 2) - { 9.000000f, 3.000000f, 2.000000f, 0.043478f }, // 737 (6 6 3 1) - { 9.111111f, 2.444444f, 2.222222f, 0.057692f }, // 738 (6 6 4 0) - { 9.111111f, 3.777778f, 1.555556f, 0.031250f }, // 739 (6 7 0 3) - { 9.222222f, 3.222222f, 1.777778f, 0.037657f }, // 740 (6 7 1 2) - { 9.333333f, 2.666667f, 2.000000f, 0.047872f }, // 741 (6 7 2 1) - { 9.444445f, 2.111111f, 2.222222f, 0.066667f }, // 742 (6 7 3 0) - { 9.555555f, 2.888889f, 1.777778f, 0.040909f }, // 743 (6 8 0 2) - { 9.666667f, 2.333333f, 2.000000f, 0.053892f }, // 744 (6 8 1 1) - { 9.777778f, 1.777778f, 2.222222f, 0.080357f }, // 745 (6 8 2 0) - { 10.000000f, 2.000000f, 2.000000f, 0.062500f }, // 746 (6 9 0 1) - { 10.111111f, 1.444444f, 2.222222f, 0.103448f }, // 747 (6 9 1 0) - { 10.444445f, 1.111111f, 2.222222f, 0.150000f }, // 748 (6 10 0 0) - { 7.000000f, 9.000000f, 0.000000f, 0.015873f }, // 749 (7 0 0 9) - { 7.111111f, 8.444445f, 0.222222f, 0.016667f }, // 750 (7 0 1 8) - { 7.222222f, 7.888889f, 0.444444f, 0.017613f }, // 751 (7 0 2 7) - { 7.333333f, 7.333333f, 0.666667f, 0.018750f }, // 752 (7 0 3 6) - { 7.444445f, 6.777778f, 0.888889f, 0.020134f }, // 753 (7 0 4 5) - { 7.555555f, 6.222222f, 1.111111f, 0.021845f }, // 754 (7 0 5 4) - { 7.666667f, 5.666667f, 1.333333f, 0.024000f }, // 755 (7 0 6 3) - { 7.777778f, 5.111111f, 1.555556f, 0.026786f }, // 756 (7 0 7 2) - { 7.888889f, 4.555555f, 1.777778f, 0.030508f }, // 757 (7 0 8 1) - { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 758 (7 0 9 0) - { 7.444445f, 8.111111f, 0.222222f, 0.016575f }, // 759 (7 1 0 8) - { 7.555555f, 7.555555f, 0.444444f, 0.017578f }, // 760 (7 1 1 7) - { 7.666667f, 7.000000f, 0.666667f, 0.018789f }, // 761 (7 1 2 6) - { 7.777778f, 6.444445f, 0.888889f, 0.020270f }, // 762 (7 1 3 5) - { 7.888889f, 5.888889f, 1.111111f, 0.022113f }, // 763 (7 1 4 4) - { 8.000000f, 5.333333f, 1.333333f, 0.024457f }, // 764 (7 1 5 3) - { 8.111111f, 4.777778f, 1.555556f, 0.027523f }, // 765 (7 1 6 2) - { 8.222222f, 4.222222f, 1.777778f, 0.031690f }, // 766 (7 1 7 1) - { 8.333333f, 3.666667f, 2.000000f, 0.037657f }, // 767 (7 1 8 0) - { 7.888889f, 7.222222f, 0.444444f, 0.017613f }, // 768 (7 2 0 7) - { 8.000000f, 6.666667f, 0.666667f, 0.018908f }, // 769 (7 2 1 6) - { 8.111111f, 6.111111f, 0.888889f, 0.020501f }, // 770 (7 2 2 5) - { 8.222222f, 5.555555f, 1.111111f, 0.022500f }, // 771 (7 2 3 4) - { 8.333333f, 5.000000f, 1.333333f, 0.025070f }, // 772 (7 2 4 3) - { 8.444445f, 4.444445f, 1.555556f, 0.028481f }, // 773 (7 2 5 2) - { 8.555555f, 3.888889f, 1.777778f, 0.033210f }, // 774 (7 2 6 1) - { 8.666667f, 3.333333f, 2.000000f, 0.040179f }, // 775 (7 2 7 0) - { 8.333333f, 6.333333f, 0.666667f, 0.019108f }, // 776 (7 3 0 6) - { 8.444445f, 5.777778f, 0.888889f, 0.020833f }, // 777 (7 3 1 5) - { 8.555555f, 5.222222f, 1.111111f, 0.023018f }, // 778 (7 3 2 4) - { 8.666667f, 4.666667f, 1.333333f, 0.025862f }, // 779 (7 3 3 3) - { 8.777778f, 4.111111f, 
1.555556f, 0.029703f }, // 780 (7 3 4 2) - { 8.888889f, 3.555556f, 1.777778f, 0.035156f }, // 781 (7 3 5 1) - { 9.000000f, 3.000000f, 2.000000f, 0.043478f }, // 782 (7 3 6 0) - { 8.777778f, 5.444445f, 0.888889f, 0.021277f }, // 783 (7 4 0 5) - { 8.888889f, 4.888889f, 1.111111f, 0.023684f }, // 784 (7 4 1 4) - { 9.000000f, 4.333333f, 1.333333f, 0.026866f }, // 785 (7 4 2 3) - { 9.111111f, 3.777778f, 1.555556f, 0.031250f }, // 786 (7 4 3 2) - { 9.222222f, 3.222222f, 1.777778f, 0.037657f }, // 787 (7 4 4 1) - { 9.333333f, 2.666667f, 2.000000f, 0.047872f }, // 788 (7 4 5 0) - { 9.222222f, 4.555555f, 1.111111f, 0.024523f }, // 789 (7 5 0 4) - { 9.333333f, 4.000000f, 1.333333f, 0.028125f }, // 790 (7 5 1 3) - { 9.444445f, 3.444444f, 1.555556f, 0.033210f }, // 791 (7 5 2 2) - { 9.555555f, 2.888889f, 1.777778f, 0.040909f }, // 792 (7 5 3 1) - { 9.666667f, 2.333333f, 2.000000f, 0.053892f }, // 793 (7 5 4 0) - { 9.666667f, 3.666667f, 1.333333f, 0.029703f }, // 794 (7 6 0 3) - { 9.777778f, 3.111111f, 1.555556f, 0.035714f }, // 795 (7 6 1 2) - { 9.888889f, 2.555556f, 1.777778f, 0.045226f }, // 796 (7 6 2 1) - { 10.000000f, 2.000000f, 2.000000f, 0.062500f }, // 797 (7 6 3 0) - { 10.111111f, 2.777778f, 1.555556f, 0.038961f }, // 798 (7 7 0 2) - { 10.222222f, 2.222222f, 1.777778f, 0.051136f }, // 799 (7 7 1 1) - { 10.333333f, 1.666667f, 2.000000f, 0.075630f }, // 800 (7 7 2 0) - { 10.555555f, 1.888889f, 1.777778f, 0.059603f }, // 801 (7 8 0 1) - { 10.666667f, 1.333333f, 2.000000f, 0.097826f }, // 802 (7 8 1 0) - { 11.000000f, 1.000000f, 2.000000f, 0.142857f }, // 803 (7 9 0 0) - { 8.000000f, 8.000000f, 0.000000f, 0.015625f }, // 804 (8 0 0 8) - { 8.111111f, 7.444445f, 0.222222f, 0.016575f }, // 805 (8 0 1 7) - { 8.222222f, 6.888889f, 0.444444f, 0.017717f }, // 806 (8 0 2 6) - { 8.333333f, 6.333333f, 0.666667f, 0.019108f }, // 807 (8 0 3 5) - { 8.444445f, 5.777778f, 0.888889f, 0.020833f }, // 808 (8 0 4 4) - { 8.555555f, 5.222222f, 1.111111f, 0.023018f }, // 809 (8 0 5 3) - { 8.666667f, 4.666667f, 1.333333f, 0.025862f }, // 810 (8 0 6 2) - { 8.777778f, 4.111111f, 1.555556f, 0.029703f }, // 811 (8 0 7 1) - { 8.888889f, 3.555556f, 1.777778f, 0.035156f }, // 812 (8 0 8 0) - { 8.444445f, 7.111111f, 0.222222f, 0.016667f }, // 813 (8 1 0 7) - { 8.555555f, 6.555555f, 0.444444f, 0.017893f }, // 814 (8 1 1 6) - { 8.666667f, 6.000000f, 0.666667f, 0.019397f }, // 815 (8 1 2 5) - { 8.777778f, 5.444445f, 0.888889f, 0.021277f }, // 816 (8 1 3 4) - { 8.888889f, 4.888889f, 1.111111f, 0.023684f }, // 817 (8 1 4 3) - { 9.000000f, 4.333333f, 1.333333f, 0.026866f }, // 818 (8 1 5 2) - { 9.111111f, 3.777778f, 1.555556f, 0.031250f }, // 819 (8 1 6 1) - { 9.222222f, 3.222222f, 1.777778f, 0.037657f }, // 820 (8 1 7 0) - { 8.888889f, 6.222222f, 0.444444f, 0.018145f }, // 821 (8 2 0 6) - { 9.000000f, 5.666667f, 0.666667f, 0.019780f }, // 822 (8 2 1 5) - { 9.111111f, 5.111111f, 0.888889f, 0.021845f }, // 823 (8 2 2 4) - { 9.222222f, 4.555555f, 1.111111f, 0.024523f }, // 824 (8 2 3 3) - { 9.333333f, 4.000000f, 1.333333f, 0.028125f }, // 825 (8 2 4 2) - { 9.444445f, 3.444444f, 1.555556f, 0.033210f }, // 826 (8 2 5 1) - { 9.555555f, 2.888889f, 1.777778f, 0.040909f }, // 827 (8 2 6 0) - { 9.333333f, 5.333333f, 0.666667f, 0.020270f }, // 828 (8 3 0 5) - { 9.444445f, 4.777778f, 0.888889f, 0.022556f }, // 829 (8 3 1 4) - { 9.555555f, 4.222222f, 1.111111f, 0.025568f }, // 830 (8 3 2 3) - { 9.666667f, 3.666667f, 1.333333f, 0.029703f }, // 831 (8 3 3 2) - { 9.777778f, 3.111111f, 1.555556f, 0.035714f }, // 832 (8 3 4 1) - { 9.888889f, 
2.555556f, 1.777778f, 0.045226f }, // 833 (8 3 5 0) - { 9.777778f, 4.444445f, 0.888889f, 0.023438f }, // 834 (8 4 0 4) - { 9.888889f, 3.888889f, 1.111111f, 0.026866f }, // 835 (8 4 1 3) - { 10.000000f, 3.333333f, 1.333333f, 0.031690f }, // 836 (8 4 2 2) - { 10.111111f, 2.777778f, 1.555556f, 0.038961f }, // 837 (8 4 3 1) - { 10.222222f, 2.222222f, 1.777778f, 0.051136f }, // 838 (8 4 4 0) - { 10.222222f, 3.555556f, 1.111111f, 0.028481f }, // 839 (8 5 0 3) - { 10.333333f, 3.000000f, 1.333333f, 0.034221f }, // 840 (8 5 1 2) - { 10.444445f, 2.444444f, 1.555556f, 0.043269f }, // 841 (8 5 2 1) - { 10.555555f, 1.888889f, 1.777778f, 0.059603f }, // 842 (8 5 3 0) - { 10.666667f, 2.666667f, 1.333333f, 0.037500f }, // 843 (8 6 0 2) - { 10.777778f, 2.111111f, 1.555556f, 0.049180f }, // 844 (8 6 1 1) - { 10.888889f, 1.555556f, 1.777778f, 0.072581f }, // 845 (8 6 2 0) - { 11.111111f, 1.777778f, 1.555556f, 0.057692f }, // 846 (8 7 0 1) - { 11.222222f, 1.222222f, 1.777778f, 0.094737f }, // 847 (8 7 1 0) - { 11.555555f, 0.888889f, 1.777778f, 0.140625f }, // 848 (8 8 0 0) - { 9.000000f, 7.000000f, 0.000000f, 0.015873f }, // 849 (9 0 0 7) - { 9.111111f, 6.444445f, 0.222222f, 0.017045f }, // 850 (9 0 1 6) - { 9.222222f, 5.888889f, 0.444444f, 0.018480f }, // 851 (9 0 2 5) - { 9.333333f, 5.333333f, 0.666667f, 0.020270f }, // 852 (9 0 3 4) - { 9.444445f, 4.777778f, 0.888889f, 0.022556f }, // 853 (9 0 4 3) - { 9.555555f, 4.222222f, 1.111111f, 0.025568f }, // 854 (9 0 5 2) - { 9.666667f, 3.666667f, 1.333333f, 0.029703f }, // 855 (9 0 6 1) - { 9.777778f, 3.111111f, 1.555556f, 0.035714f }, // 856 (9 0 7 0) - { 9.444445f, 6.111111f, 0.222222f, 0.017341f }, // 857 (9 1 0 6) - { 9.555555f, 5.555555f, 0.444444f, 0.018908f }, // 858 (9 1 1 5) - { 9.666667f, 5.000000f, 0.666667f, 0.020882f }, // 859 (9 1 2 4) - { 9.777778f, 4.444445f, 0.888889f, 0.023438f }, // 860 (9 1 3 3) - { 9.888889f, 3.888889f, 1.111111f, 0.026866f }, // 861 (9 1 4 2) - { 10.000000f, 3.333333f, 1.333333f, 0.031690f }, // 862 (9 1 5 1) - { 10.111111f, 2.777778f, 1.555556f, 0.038961f }, // 863 (9 1 6 0) - { 9.888889f, 5.222222f, 0.444444f, 0.019438f }, // 864 (9 2 0 5) - { 10.000000f, 4.666667f, 0.666667f, 0.021635f }, // 865 (9 2 1 4) - { 10.111111f, 4.111111f, 0.888889f, 0.024523f }, // 866 (9 2 2 3) - { 10.222222f, 3.555556f, 1.111111f, 0.028481f }, // 867 (9 2 3 2) - { 10.333333f, 3.000000f, 1.333333f, 0.034221f }, // 868 (9 2 4 1) - { 10.444445f, 2.444444f, 1.555556f, 0.043269f }, // 869 (9 2 5 0) - { 10.333333f, 4.333333f, 0.666667f, 0.022556f }, // 870 (9 3 0 4) - { 10.444445f, 3.777778f, 0.888889f, 0.025862f }, // 871 (9 3 1 3) - { 10.555555f, 3.222222f, 1.111111f, 0.030508f }, // 872 (9 3 2 2) - { 10.666667f, 2.666667f, 1.333333f, 0.037500f }, // 873 (9 3 3 1) - { 10.777778f, 2.111111f, 1.555556f, 0.049180f }, // 874 (9 3 4 0) - { 10.777778f, 3.444444f, 0.888889f, 0.027523f }, // 875 (9 4 0 3) - { 10.888889f, 2.888889f, 1.111111f, 0.033088f }, // 876 (9 4 1 2) - { 11.000000f, 2.333333f, 1.333333f, 0.041860f }, // 877 (9 4 2 1) - { 11.111111f, 1.777778f, 1.555556f, 0.057692f }, // 878 (9 4 3 0) - { 11.222222f, 2.555556f, 1.111111f, 0.036437f }, // 879 (9 5 0 2) - { 11.333333f, 2.000000f, 1.333333f, 0.047872f }, // 880 (9 5 1 1) - { 11.444445f, 1.444444f, 1.555556f, 0.070866f }, // 881 (9 5 2 0) - { 11.666667f, 1.666667f, 1.333333f, 0.056604f }, // 882 (9 6 0 1) - { 11.777778f, 1.111111f, 1.555556f, 0.093750f }, // 883 (9 6 1 0) - { 12.111111f, 0.777778f, 1.555556f, 0.142857f }, // 884 (9 7 0 0) - { 10.000000f, 6.000000f, 0.000000f, 0.016667f }, 
// 885 (10 0 0 6) - { 10.111111f, 5.444445f, 0.222222f, 0.018182f }, // 886 (10 0 1 5) - { 10.222222f, 4.888889f, 0.444444f, 0.020089f }, // 887 (10 0 2 4) - { 10.333333f, 4.333333f, 0.666667f, 0.022556f }, // 888 (10 0 3 3) - { 10.444445f, 3.777778f, 0.888889f, 0.025862f }, // 889 (10 0 4 2) - { 10.555555f, 3.222222f, 1.111111f, 0.030508f }, // 890 (10 0 5 1) - { 10.666667f, 2.666667f, 1.333333f, 0.037500f }, // 891 (10 0 6 0) - { 10.444445f, 5.111111f, 0.222222f, 0.018750f }, // 892 (10 1 0 5) - { 10.555555f, 4.555555f, 0.444444f, 0.020882f }, // 893 (10 1 1 4) - { 10.666667f, 4.000000f, 0.666667f, 0.023684f }, // 894 (10 1 2 3) - { 10.777778f, 3.444444f, 0.888889f, 0.027523f }, // 895 (10 1 3 2) - { 10.888889f, 2.888889f, 1.111111f, 0.033088f }, // 896 (10 1 4 1) - { 11.000000f, 2.333333f, 1.333333f, 0.041860f }, // 897 (10 1 5 0) - { 10.888889f, 4.222222f, 0.444444f, 0.021845f }, // 898 (10 2 0 4) - { 11.000000f, 3.666667f, 0.666667f, 0.025070f }, // 899 (10 2 1 3) - { 11.111111f, 3.111111f, 0.888889f, 0.029605f }, // 900 (10 2 2 2) - { 11.222222f, 2.555556f, 1.111111f, 0.036437f }, // 901 (10 2 3 1) - { 11.333333f, 2.000000f, 1.333333f, 0.047872f }, // 902 (10 2 4 0) - { 11.333333f, 3.333333f, 0.666667f, 0.026786f }, // 903 (10 3 0 3) - { 11.444445f, 2.777778f, 0.888889f, 0.032258f }, // 904 (10 3 1 2) - { 11.555555f, 2.222222f, 1.111111f, 0.040909f }, // 905 (10 3 2 1) - { 11.666667f, 1.666667f, 1.333333f, 0.056604f }, // 906 (10 3 3 0) - { 11.777778f, 2.444444f, 0.888889f, 0.035714f }, // 907 (10 4 0 2) - { 11.888889f, 1.888889f, 1.111111f, 0.047120f }, // 908 (10 4 1 1) - { 12.000000f, 1.333333f, 1.333333f, 0.070313f }, // 909 (10 4 2 0) - { 12.222222f, 1.555556f, 1.111111f, 0.056250f }, // 910 (10 5 0 1) - { 12.333333f, 1.000000f, 1.333333f, 0.094737f }, // 911 (10 5 1 0) - { 12.666667f, 0.666667f, 1.333333f, 0.150000f }, // 912 (10 6 0 0) - { 11.000000f, 5.000000f, 0.000000f, 0.018182f }, // 913 (11 0 0 5) - { 11.111111f, 4.444445f, 0.222222f, 0.020270f }, // 914 (11 0 1 4) - { 11.222222f, 3.888889f, 0.444444f, 0.023018f }, // 915 (11 0 2 3) - { 11.333333f, 3.333333f, 0.666667f, 0.026786f }, // 916 (11 0 3 2) - { 11.444445f, 2.777778f, 0.888889f, 0.032258f }, // 917 (11 0 4 1) - { 11.555555f, 2.222222f, 1.111111f, 0.040909f }, // 918 (11 0 5 0) - { 11.444445f, 4.111111f, 0.222222f, 0.021277f }, // 919 (11 1 0 4) - { 11.555555f, 3.555556f, 0.444444f, 0.024457f }, // 920 (11 1 1 3) - { 11.666667f, 3.000000f, 0.666667f, 0.028939f }, // 921 (11 1 2 2) - { 11.777778f, 2.444444f, 0.888889f, 0.035714f }, // 922 (11 1 3 1) - { 11.888889f, 1.888889f, 1.111111f, 0.047120f }, // 923 (11 1 4 0) - { 11.888889f, 3.222222f, 0.444444f, 0.026239f }, // 924 (11 2 0 3) - { 12.000000f, 2.666667f, 0.666667f, 0.031690f }, // 925 (11 2 1 2) - { 12.111111f, 2.111111f, 0.888889f, 0.040359f }, // 926 (11 2 2 1) - { 12.222222f, 1.555556f, 1.111111f, 0.056250f }, // 927 (11 2 3 0) - { 12.333333f, 2.333333f, 0.666667f, 0.035294f }, // 928 (11 3 0 2) - { 12.444445f, 1.777778f, 0.888889f, 0.046875f }, // 929 (11 3 1 1) - { 12.555555f, 1.222222f, 1.111111f, 0.070866f }, // 930 (11 3 2 0) - { 12.777778f, 1.444444f, 0.888889f, 0.056604f }, // 931 (11 4 0 1) - { 12.888889f, 0.888889f, 1.111111f, 0.097826f }, // 932 (11 4 1 0) - { 13.222222f, 0.555556f, 1.111111f, 0.163636f }, // 933 (11 5 0 0) - { 12.000000f, 4.000000f, 0.000000f, 0.020833f }, // 934 (12 0 0 4) - { 12.111111f, 3.444444f, 0.222222f, 0.024000f }, // 935 (12 0 1 3) - { 12.222222f, 2.888889f, 0.444444f, 0.028481f }, // 936 (12 0 2 2) - { 12.333333f, 
2.333333f, 0.666667f, 0.035294f }, // 937 (12 0 3 1)
- { 12.444445f, 1.777778f, 0.888889f, 0.046875f }, // 938 (12 0 4 0)
- { 12.444445f, 3.111111f, 0.222222f, 0.025862f }, // 939 (12 1 0 3)
- { 12.555555f, 2.555556f, 0.444444f, 0.031359f }, // 940 (12 1 1 2)
- { 12.666667f, 2.000000f, 0.666667f, 0.040179f }, // 941 (12 1 2 1)
- { 12.777778f, 1.444444f, 0.888889f, 0.056604f }, // 942 (12 1 3 0)
- { 12.888889f, 2.222222f, 0.444444f, 0.035156f }, // 943 (12 2 0 2)
- { 13.000000f, 1.666667f, 0.666667f, 0.047120f }, // 944 (12 2 1 1)
- { 13.111111f, 1.111111f, 0.888889f, 0.072581f }, // 945 (12 2 2 0)
- { 13.333333f, 1.333333f, 0.666667f, 0.057692f }, // 946 (12 3 0 1)
- { 13.444445f, 0.777778f, 0.888889f, 0.103448f }, // 947 (12 3 1 0)
- { 13.777778f, 0.444444f, 0.888889f, 0.187500f }, // 948 (12 4 0 0)
- { 13.000000f, 3.000000f, 0.000000f, 0.025641f }, // 949 (13 0 0 3)
- { 13.111111f, 2.444444f, 0.222222f, 0.031250f }, // 950 (13 0 1 2)
- { 13.222222f, 1.888889f, 0.444444f, 0.040359f }, // 951 (13 0 2 1)
- { 13.333333f, 1.333333f, 0.666667f, 0.057692f }, // 952 (13 0 3 0)
- { 13.444445f, 2.111111f, 0.222222f, 0.035294f }, // 953 (13 1 0 2)
- { 13.555555f, 1.555556f, 0.444444f, 0.047872f }, // 954 (13 1 1 1)
- { 13.666667f, 1.000000f, 0.666667f, 0.075630f }, // 955 (13 1 2 0)
- { 13.888889f, 1.222222f, 0.444444f, 0.059603f }, // 956 (13 2 0 1)
- { 14.000000f, 0.666667f, 0.666667f, 0.112500f }, // 957 (13 2 1 0)
- { 14.333333f, 0.333333f, 0.666667f, 0.230769f }, // 958 (13 3 0 0)
- { 14.000000f, 2.000000f, 0.000000f, 0.035714f }, // 959 (14 0 0 2)
- { 14.111111f, 1.444444f, 0.222222f, 0.049180f }, // 960 (14 0 1 1)
- { 14.222222f, 0.888889f, 0.444444f, 0.080357f }, // 961 (14 0 2 0)
- { 14.444445f, 1.111111f, 0.222222f, 0.062500f }, // 962 (14 1 0 1)
- { 14.555555f, 0.555556f, 0.444444f, 0.126761f }, // 963 (14 1 1 0)
- { 14.888889f, 0.222222f, 0.444444f, 0.321429f }, // 964 (14 2 0 0)
- { 15.000000f, 1.000000f, 0.000000f, 0.066667f }, // 965 (15 0 0 1)
- { 15.111111f, 0.444444f, 0.222222f, 0.150000f }, // 966 (15 0 1 0)
- { 15.444445f, 0.111111f, 0.222222f, 0.600000f }, // 967 (15 1 0 0)
- { 16.000000f, 0.000000f, 0.000000f, FLT_MAX }, // 968 (16 0 0 0)
-}; // 969 four cluster elements
-
+struct Precomp {
+ float alpha2_sum;
+ float beta2_sum;
+ float alphabeta_sum;
+ float factor;
+};
+
+static const SQUISH_ALIGN_16 Precomp s_threeElement[153] = {
+ { 0.000000f, 16.000000f, 0.000000f, FLT_MAX }, // 0 (0 0 16)
+ { 0.250000f, 15.250000f, 0.250000f, 0.266667f }, // 1 (0 1 15)
+ { 0.500000f, 14.500000f, 0.500000f, 0.142857f }, // 2 (0 2 14)
+ { 0.750000f, 13.750000f, 0.750000f, 0.102564f }, // 3 (0 3 13)
+ { 1.000000f, 13.000000f, 1.000000f, 0.083333f }, // 4 (0 4 12)
+ { 1.250000f, 12.250000f, 1.250000f, 0.072727f }, // 5 (0 5 11)
+ { 1.500000f, 11.500000f, 1.500000f, 0.066667f }, // 6 (0 6 10)
+ { 1.750000f, 10.750000f, 1.750000f, 0.063492f }, // 7 (0 7 9)
+ { 2.000000f, 10.000000f, 2.000000f, 0.062500f }, // 8 (0 8 8)
+ { 2.250000f, 9.250000f, 2.250000f, 0.063492f }, // 9 (0 9 7)
+ { 2.500000f, 8.500000f, 2.500000f, 0.066667f }, // 10 (0 10 6)
+ { 2.750000f, 7.750000f, 2.750000f, 0.072727f }, // 11 (0 11 5)
+ { 3.000000f, 7.000000f, 3.000000f, 0.083333f }, // 12 (0 12 4)
+ { 3.250000f, 6.250000f, 3.250000f, 0.102564f }, // 13 (0 13 3)
+ { 3.500000f, 5.500000f, 3.500000f, 0.142857f }, // 14 (0 14 2)
+ { 3.750000f, 4.750000f, 3.750000f, 0.266667f }, // 15 (0 15 1)
+ { 4.000000f, 4.000000f, 4.000000f, FLT_MAX }, // 16 (0 16 0)
+ { 1.000000f, 15.000000f, 0.000000f, 0.066667f }, // 17 (1 0 15)
+
{ 1.250000f, 14.250000f, 0.250000f, 0.056338f }, // 18 (1 1 14) + { 1.500000f, 13.500000f, 0.500000f, 0.050000f }, // 19 (1 2 13) + { 1.750000f, 12.750000f, 0.750000f, 0.045977f }, // 20 (1 3 12) + { 2.000000f, 12.000000f, 1.000000f, 0.043478f }, // 21 (1 4 11) + { 2.250000f, 11.250000f, 1.250000f, 0.042105f }, // 22 (1 5 10) + { 2.500000f, 10.500000f, 1.500000f, 0.041667f }, // 23 (1 6 9) + { 2.750000f, 9.750000f, 1.750000f, 0.042105f }, // 24 (1 7 8) + { 3.000000f, 9.000000f, 2.000000f, 0.043478f }, // 25 (1 8 7) + { 3.250000f, 8.250000f, 2.250000f, 0.045977f }, // 26 (1 9 6) + { 3.500000f, 7.500000f, 2.500000f, 0.050000f }, // 27 (1 10 5) + { 3.750000f, 6.750000f, 2.750000f, 0.056338f }, // 28 (1 11 4) + { 4.000000f, 6.000000f, 3.000000f, 0.066667f }, // 29 (1 12 3) + { 4.250000f, 5.250000f, 3.250000f, 0.085106f }, // 30 (1 13 2) + { 4.500000f, 4.500000f, 3.500000f, 0.125000f }, // 31 (1 14 1) + { 4.750000f, 3.750000f, 3.750000f, 0.266667f }, // 32 (1 15 0) + { 2.000000f, 14.000000f, 0.000000f, 0.035714f }, // 33 (2 0 14) + { 2.250000f, 13.250000f, 0.250000f, 0.033613f }, // 34 (2 1 13) + { 2.500000f, 12.500000f, 0.500000f, 0.032258f }, // 35 (2 2 12) + { 2.750000f, 11.750000f, 0.750000f, 0.031496f }, // 36 (2 3 11) + { 3.000000f, 11.000000f, 1.000000f, 0.031250f }, // 37 (2 4 10) + { 3.250000f, 10.250000f, 1.250000f, 0.031496f }, // 38 (2 5 9) + { 3.500000f, 9.500000f, 1.500000f, 0.032258f }, // 39 (2 6 8) + { 3.750000f, 8.750000f, 1.750000f, 0.033613f }, // 40 (2 7 7) + { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 41 (2 8 6) + { 4.250000f, 7.250000f, 2.250000f, 0.038835f }, // 42 (2 9 5) + { 4.500000f, 6.500000f, 2.500000f, 0.043478f }, // 43 (2 10 4) + { 4.750000f, 5.750000f, 2.750000f, 0.050633f }, // 44 (2 11 3) + { 5.000000f, 5.000000f, 3.000000f, 0.062500f }, // 45 (2 12 2) + { 5.250000f, 4.250000f, 3.250000f, 0.085106f }, // 46 (2 13 1) + { 5.500000f, 3.500000f, 3.500000f, 0.142857f }, // 47 (2 14 0) + { 3.000000f, 13.000000f, 0.000000f, 0.025641f }, // 48 (3 0 13) + { 3.250000f, 12.250000f, 0.250000f, 0.025157f }, // 49 (3 1 12) + { 3.500000f, 11.500000f, 0.500000f, 0.025000f }, // 50 (3 2 11) + { 3.750000f, 10.750000f, 0.750000f, 0.025157f }, // 51 (3 3 10) + { 4.000000f, 10.000000f, 1.000000f, 0.025641f }, // 52 (3 4 9) + { 4.250000f, 9.250000f, 1.250000f, 0.026490f }, // 53 (3 5 8) + { 4.500000f, 8.500000f, 1.500000f, 0.027778f }, // 54 (3 6 7) + { 4.750000f, 7.750000f, 1.750000f, 0.029630f }, // 55 (3 7 6) + { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 56 (3 8 5) + { 5.250000f, 6.250000f, 2.250000f, 0.036036f }, // 57 (3 9 4) + { 5.500000f, 5.500000f, 2.500000f, 0.041667f }, // 58 (3 10 3) + { 5.750000f, 4.750000f, 2.750000f, 0.050633f }, // 59 (3 11 2) + { 6.000000f, 4.000000f, 3.000000f, 0.066667f }, // 60 (3 12 1) + { 6.250000f, 3.250000f, 3.250000f, 0.102564f }, // 61 (3 13 0) + { 4.000000f, 12.000000f, 0.000000f, 0.020833f }, // 62 (4 0 12) + { 4.250000f, 11.250000f, 0.250000f, 0.020942f }, // 63 (4 1 11) + { 4.500000f, 10.500000f, 0.500000f, 0.021277f }, // 64 (4 2 10) + { 4.750000f, 9.750000f, 0.750000f, 0.021858f }, // 65 (4 3 9) + { 5.000000f, 9.000000f, 1.000000f, 0.022727f }, // 66 (4 4 8) + { 5.250000f, 8.250000f, 1.250000f, 0.023952f }, // 67 (4 5 7) + { 5.500000f, 7.500000f, 1.500000f, 0.025641f }, // 68 (4 6 6) + { 5.750000f, 6.750000f, 1.750000f, 0.027972f }, // 69 (4 7 5) + { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 70 (4 8 4) + { 6.250000f, 5.250000f, 2.250000f, 0.036036f }, // 71 (4 9 3) + { 6.500000f, 4.500000f, 2.500000f, 0.043478f 
}, // 72 (4 10 2) + { 6.750000f, 3.750000f, 2.750000f, 0.056338f }, // 73 (4 11 1) + { 7.000000f, 3.000000f, 3.000000f, 0.083333f }, // 74 (4 12 0) + { 5.000000f, 11.000000f, 0.000000f, 0.018182f }, // 75 (5 0 11) + { 5.250000f, 10.250000f, 0.250000f, 0.018605f }, // 76 (5 1 10) + { 5.500000f, 9.500000f, 0.500000f, 0.019231f }, // 77 (5 2 9) + { 5.750000f, 8.750000f, 0.750000f, 0.020101f }, // 78 (5 3 8) + { 6.000000f, 8.000000f, 1.000000f, 0.021277f }, // 79 (5 4 7) + { 6.250000f, 7.250000f, 1.250000f, 0.022857f }, // 80 (5 5 6) + { 6.500000f, 6.500000f, 1.500000f, 0.025000f }, // 81 (5 6 5) + { 6.750000f, 5.750000f, 1.750000f, 0.027972f }, // 82 (5 7 4) + { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 83 (5 8 3) + { 7.250000f, 4.250000f, 2.250000f, 0.038835f }, // 84 (5 9 2) + { 7.500000f, 3.500000f, 2.500000f, 0.050000f }, // 85 (5 10 1) + { 7.750000f, 2.750000f, 2.750000f, 0.072727f }, // 86 (5 11 0) + { 6.000000f, 10.000000f, 0.000000f, 0.016667f }, // 87 (6 0 10) + { 6.250000f, 9.250000f, 0.250000f, 0.017316f }, // 88 (6 1 9) + { 6.500000f, 8.500000f, 0.500000f, 0.018182f }, // 89 (6 2 8) + { 6.750000f, 7.750000f, 0.750000f, 0.019324f }, // 90 (6 3 7) + { 7.000000f, 7.000000f, 1.000000f, 0.020833f }, // 91 (6 4 6) + { 7.250000f, 6.250000f, 1.250000f, 0.022857f }, // 92 (6 5 5) + { 7.500000f, 5.500000f, 1.500000f, 0.025641f }, // 93 (6 6 4) + { 7.750000f, 4.750000f, 1.750000f, 0.029630f }, // 94 (6 7 3) + { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 95 (6 8 2) + { 8.250000f, 3.250000f, 2.250000f, 0.045977f }, // 96 (6 9 1) + { 8.500000f, 2.500000f, 2.500000f, 0.066667f }, // 97 (6 10 0) + { 7.000000f, 9.000000f, 0.000000f, 0.015873f }, // 98 (7 0 9) + { 7.250000f, 8.250000f, 0.250000f, 0.016736f }, // 99 (7 1 8) + { 7.500000f, 7.500000f, 0.500000f, 0.017857f }, // 100 (7 2 7) + { 7.750000f, 6.750000f, 0.750000f, 0.019324f }, // 101 (7 3 6) + { 8.000000f, 6.000000f, 1.000000f, 0.021277f }, // 102 (7 4 5) + { 8.250000f, 5.250000f, 1.250000f, 0.023952f }, // 103 (7 5 4) + { 8.500000f, 4.500000f, 1.500000f, 0.027778f }, // 104 (7 6 3) + { 8.750000f, 3.750000f, 1.750000f, 0.033613f }, // 105 (7 7 2) + { 9.000000f, 3.000000f, 2.000000f, 0.043478f }, // 106 (7 8 1) + { 9.250000f, 2.250000f, 2.250000f, 0.063492f }, // 107 (7 9 0) + { 8.000000f, 8.000000f, 0.000000f, 0.015625f }, // 108 (8 0 8) + { 8.250000f, 7.250000f, 0.250000f, 0.016736f }, // 109 (8 1 7) + { 8.500000f, 6.500000f, 0.500000f, 0.018182f }, // 110 (8 2 6) + { 8.750000f, 5.750000f, 0.750000f, 0.020101f }, // 111 (8 3 5) + { 9.000000f, 5.000000f, 1.000000f, 0.022727f }, // 112 (8 4 4) + { 9.250000f, 4.250000f, 1.250000f, 0.026490f }, // 113 (8 5 3) + { 9.500000f, 3.500000f, 1.500000f, 0.032258f }, // 114 (8 6 2) + { 9.750000f, 2.750000f, 1.750000f, 0.042105f }, // 115 (8 7 1) + { 10.000000f, 2.000000f, 2.000000f, 0.062500f }, // 116 (8 8 0) + { 9.000000f, 7.000000f, 0.000000f, 0.015873f }, // 117 (9 0 7) + { 9.250000f, 6.250000f, 0.250000f, 0.017316f }, // 118 (9 1 6) + { 9.500000f, 5.500000f, 0.500000f, 0.019231f }, // 119 (9 2 5) + { 9.750000f, 4.750000f, 0.750000f, 0.021858f }, // 120 (9 3 4) + { 10.000000f, 4.000000f, 1.000000f, 0.025641f }, // 121 (9 4 3) + { 10.250000f, 3.250000f, 1.250000f, 0.031496f }, // 122 (9 5 2) + { 10.500000f, 2.500000f, 1.500000f, 0.041667f }, // 123 (9 6 1) + { 10.750000f, 1.750000f, 1.750000f, 0.063492f }, // 124 (9 7 0) + { 10.000000f, 6.000000f, 0.000000f, 0.016667f }, // 125 (10 0 6) + { 10.250000f, 5.250000f, 0.250000f, 0.018605f }, // 126 (10 1 5) + { 10.500000f, 4.500000f, 
0.500000f, 0.021277f }, // 127 (10 2 4) + { 10.750000f, 3.750000f, 0.750000f, 0.025157f }, // 128 (10 3 3) + { 11.000000f, 3.000000f, 1.000000f, 0.031250f }, // 129 (10 4 2) + { 11.250000f, 2.250000f, 1.250000f, 0.042105f }, // 130 (10 5 1) + { 11.500000f, 1.500000f, 1.500000f, 0.066667f }, // 131 (10 6 0) + { 11.000000f, 5.000000f, 0.000000f, 0.018182f }, // 132 (11 0 5) + { 11.250000f, 4.250000f, 0.250000f, 0.020942f }, // 133 (11 1 4) + { 11.500000f, 3.500000f, 0.500000f, 0.025000f }, // 134 (11 2 3) + { 11.750000f, 2.750000f, 0.750000f, 0.031496f }, // 135 (11 3 2) + { 12.000000f, 2.000000f, 1.000000f, 0.043478f }, // 136 (11 4 1) + { 12.250000f, 1.250000f, 1.250000f, 0.072727f }, // 137 (11 5 0) + { 12.000000f, 4.000000f, 0.000000f, 0.020833f }, // 138 (12 0 4) + { 12.250000f, 3.250000f, 0.250000f, 0.025157f }, // 139 (12 1 3) + { 12.500000f, 2.500000f, 0.500000f, 0.032258f }, // 140 (12 2 2) + { 12.750000f, 1.750000f, 0.750000f, 0.045977f }, // 141 (12 3 1) + { 13.000000f, 1.000000f, 1.000000f, 0.083333f }, // 142 (12 4 0) + { 13.000000f, 3.000000f, 0.000000f, 0.025641f }, // 143 (13 0 3) + { 13.250000f, 2.250000f, 0.250000f, 0.033613f }, // 144 (13 1 2) + { 13.500000f, 1.500000f, 0.500000f, 0.050000f }, // 145 (13 2 1) + { 13.750000f, 0.750000f, 0.750000f, 0.102564f }, // 146 (13 3 0) + { 14.000000f, 2.000000f, 0.000000f, 0.035714f }, // 147 (14 0 2) + { 14.250000f, 1.250000f, 0.250000f, 0.056338f }, // 148 (14 1 1) + { 14.500000f, 0.500000f, 0.500000f, 0.142857f }, // 149 (14 2 0) + { 15.000000f, 1.000000f, 0.000000f, 0.066667f }, // 150 (15 0 1) + { 15.250000f, 0.250000f, 0.250000f, 0.266667f }, // 151 (15 1 0) + { 16.000000f, 0.000000f, 0.000000f, FLT_MAX }, // 152 (16 0 0) +}; // 153 three cluster elements + +static const SQUISH_ALIGN_16 Precomp s_fourElement[969] = { + { 0.000000f, 16.000000f, 0.000000f, FLT_MAX }, // 0 (0 0 0 16) + { 0.111111f, 15.444445f, 0.222222f, 0.600000f }, // 1 (0 0 1 15) + { 0.222222f, 14.888889f, 0.444444f, 0.321429f }, // 2 (0 0 2 14) + { 0.333333f, 14.333333f, 0.666667f, 0.230769f }, // 3 (0 0 3 13) + { 0.444444f, 13.777778f, 0.888889f, 0.187500f }, // 4 (0 0 4 12) + { 0.555556f, 13.222222f, 1.111111f, 0.163636f }, // 5 (0 0 5 11) + { 0.666667f, 12.666667f, 1.333333f, 0.150000f }, // 6 (0 0 6 10) + { 0.777778f, 12.111111f, 1.555556f, 0.142857f }, // 7 (0 0 7 9) + { 0.888889f, 11.555555f, 1.777778f, 0.140625f }, // 8 (0 0 8 8) + { 1.000000f, 11.000000f, 2.000000f, 0.142857f }, // 9 (0 0 9 7) + { 1.111111f, 10.444445f, 2.222222f, 0.150000f }, // 10 (0 0 10 6) + { 1.222222f, 9.888889f, 2.444444f, 0.163636f }, // 11 (0 0 11 5) + { 1.333333f, 9.333333f, 2.666667f, 0.187500f }, // 12 (0 0 12 4) + { 1.444444f, 8.777778f, 2.888889f, 0.230769f }, // 13 (0 0 13 3) + { 1.555556f, 8.222222f, 3.111111f, 0.321429f }, // 14 (0 0 14 2) + { 1.666667f, 7.666667f, 3.333333f, 0.600000f }, // 15 (0 0 15 1) + { 1.777778f, 7.111111f, 3.555556f, FLT_MAX }, // 16 (0 0 16 0) + { 0.444444f, 15.111111f, 0.222222f, 0.150000f }, // 17 (0 1 0 15) + { 0.555556f, 14.555555f, 0.444444f, 0.126761f }, // 18 (0 1 1 14) + { 0.666667f, 14.000000f, 0.666667f, 0.112500f }, // 19 (0 1 2 13) + { 0.777778f, 13.444445f, 0.888889f, 0.103448f }, // 20 (0 1 3 12) + { 0.888889f, 12.888889f, 1.111111f, 0.097826f }, // 21 (0 1 4 11) + { 1.000000f, 12.333333f, 1.333333f, 0.094737f }, // 22 (0 1 5 10) + { 1.111111f, 11.777778f, 1.555556f, 0.093750f }, // 23 (0 1 6 9) + { 1.222222f, 11.222222f, 1.777778f, 0.094737f }, // 24 (0 1 7 8) + { 1.333333f, 10.666667f, 2.000000f, 0.097826f }, // 25 (0 1 8 7) + 
{ 1.444444f, 10.111111f, 2.222222f, 0.103448f }, // 26 (0 1 9 6) + { 1.555556f, 9.555555f, 2.444444f, 0.112500f }, // 27 (0 1 10 5) + { 1.666667f, 9.000000f, 2.666667f, 0.126761f }, // 28 (0 1 11 4) + { 1.777778f, 8.444445f, 2.888889f, 0.150000f }, // 29 (0 1 12 3) + { 1.888889f, 7.888889f, 3.111111f, 0.191489f }, // 30 (0 1 13 2) + { 2.000000f, 7.333333f, 3.333333f, 0.281250f }, // 31 (0 1 14 1) + { 2.111111f, 6.777778f, 3.555556f, 0.600000f }, // 32 (0 1 15 0) + { 0.888889f, 14.222222f, 0.444444f, 0.080357f }, // 33 (0 2 0 14) + { 1.000000f, 13.666667f, 0.666667f, 0.075630f }, // 34 (0 2 1 13) + { 1.111111f, 13.111111f, 0.888889f, 0.072581f }, // 35 (0 2 2 12) + { 1.222222f, 12.555555f, 1.111111f, 0.070866f }, // 36 (0 2 3 11) + { 1.333333f, 12.000000f, 1.333333f, 0.070313f }, // 37 (0 2 4 10) + { 1.444444f, 11.444445f, 1.555556f, 0.070866f }, // 38 (0 2 5 9) + { 1.555556f, 10.888889f, 1.777778f, 0.072581f }, // 39 (0 2 6 8) + { 1.666667f, 10.333333f, 2.000000f, 0.075630f }, // 40 (0 2 7 7) + { 1.777778f, 9.777778f, 2.222222f, 0.080357f }, // 41 (0 2 8 6) + { 1.888889f, 9.222222f, 2.444444f, 0.087379f }, // 42 (0 2 9 5) + { 2.000000f, 8.666667f, 2.666667f, 0.097826f }, // 43 (0 2 10 4) + { 2.111111f, 8.111111f, 2.888889f, 0.113924f }, // 44 (0 2 11 3) + { 2.222222f, 7.555556f, 3.111111f, 0.140625f }, // 45 (0 2 12 2) + { 2.333333f, 7.000000f, 3.333333f, 0.191489f }, // 46 (0 2 13 1) + { 2.444444f, 6.444445f, 3.555556f, 0.321429f }, // 47 (0 2 14 0) + { 1.333333f, 13.333333f, 0.666667f, 0.057692f }, // 48 (0 3 0 13) + { 1.444444f, 12.777778f, 0.888889f, 0.056604f }, // 49 (0 3 1 12) + { 1.555556f, 12.222222f, 1.111111f, 0.056250f }, // 50 (0 3 2 11) + { 1.666667f, 11.666667f, 1.333333f, 0.056604f }, // 51 (0 3 3 10) + { 1.777778f, 11.111111f, 1.555556f, 0.057692f }, // 52 (0 3 4 9) + { 1.888889f, 10.555555f, 1.777778f, 0.059603f }, // 53 (0 3 5 8) + { 2.000000f, 10.000000f, 2.000000f, 0.062500f }, // 54 (0 3 6 7) + { 2.111111f, 9.444445f, 2.222222f, 0.066667f }, // 55 (0 3 7 6) + { 2.222222f, 8.888889f, 2.444444f, 0.072581f }, // 56 (0 3 8 5) + { 2.333333f, 8.333333f, 2.666667f, 0.081081f }, // 57 (0 3 9 4) + { 2.444444f, 7.777778f, 2.888889f, 0.093750f }, // 58 (0 3 10 3) + { 2.555556f, 7.222222f, 3.111111f, 0.113924f }, // 59 (0 3 11 2) + { 2.666667f, 6.666667f, 3.333333f, 0.150000f }, // 60 (0 3 12 1) + { 2.777778f, 6.111111f, 3.555556f, 0.230769f }, // 61 (0 3 13 0) + { 1.777778f, 12.444445f, 0.888889f, 0.046875f }, // 62 (0 4 0 12) + { 1.888889f, 11.888889f, 1.111111f, 0.047120f }, // 63 (0 4 1 11) + { 2.000000f, 11.333333f, 1.333333f, 0.047872f }, // 64 (0 4 2 10) + { 2.111111f, 10.777778f, 1.555556f, 0.049180f }, // 65 (0 4 3 9) + { 2.222222f, 10.222222f, 1.777778f, 0.051136f }, // 66 (0 4 4 8) + { 2.333333f, 9.666667f, 2.000000f, 0.053892f }, // 67 (0 4 5 7) + { 2.444444f, 9.111111f, 2.222222f, 0.057692f }, // 68 (0 4 6 6) + { 2.555556f, 8.555555f, 2.444444f, 0.062937f }, // 69 (0 4 7 5) + { 2.666667f, 8.000000f, 2.666667f, 0.070313f }, // 70 (0 4 8 4) + { 2.777778f, 7.444445f, 2.888889f, 0.081081f }, // 71 (0 4 9 3) + { 2.888889f, 6.888889f, 3.111111f, 0.097826f }, // 72 (0 4 10 2) + { 3.000000f, 6.333333f, 3.333333f, 0.126761f }, // 73 (0 4 11 1) + { 3.111111f, 5.777778f, 3.555556f, 0.187500f }, // 74 (0 4 12 0) + { 2.222222f, 11.555555f, 1.111111f, 0.040909f }, // 75 (0 5 0 11) + { 2.333333f, 11.000000f, 1.333333f, 0.041860f }, // 76 (0 5 1 10) + { 2.444444f, 10.444445f, 1.555556f, 0.043269f }, // 77 (0 5 2 9) + { 2.555556f, 9.888889f, 1.777778f, 0.045226f }, // 78 (0 5 3 8) + 
{ 2.666667f, 9.333333f, 2.000000f, 0.047872f }, // 79 (0 5 4 7) + { 2.777778f, 8.777778f, 2.222222f, 0.051429f }, // 80 (0 5 5 6) + { 2.888889f, 8.222222f, 2.444444f, 0.056250f }, // 81 (0 5 6 5) + { 3.000000f, 7.666667f, 2.666667f, 0.062937f }, // 82 (0 5 7 4) + { 3.111111f, 7.111111f, 2.888889f, 0.072581f }, // 83 (0 5 8 3) + { 3.222222f, 6.555556f, 3.111111f, 0.087379f }, // 84 (0 5 9 2) + { 3.333333f, 6.000000f, 3.333333f, 0.112500f }, // 85 (0 5 10 1) + { 3.444444f, 5.444445f, 3.555556f, 0.163636f }, // 86 (0 5 11 0) + { 2.666667f, 10.666667f, 1.333333f, 0.037500f }, // 87 (0 6 0 10) + { 2.777778f, 10.111111f, 1.555556f, 0.038961f }, // 88 (0 6 1 9) + { 2.888889f, 9.555555f, 1.777778f, 0.040909f }, // 89 (0 6 2 8) + { 3.000000f, 9.000000f, 2.000000f, 0.043478f }, // 90 (0 6 3 7) + { 3.111111f, 8.444445f, 2.222222f, 0.046875f }, // 91 (0 6 4 6) + { 3.222222f, 7.888889f, 2.444444f, 0.051429f }, // 92 (0 6 5 5) + { 3.333333f, 7.333333f, 2.666667f, 0.057692f }, // 93 (0 6 6 4) + { 3.444444f, 6.777778f, 2.888889f, 0.066667f }, // 94 (0 6 7 3) + { 3.555556f, 6.222222f, 3.111111f, 0.080357f }, // 95 (0 6 8 2) + { 3.666667f, 5.666667f, 3.333333f, 0.103448f }, // 96 (0 6 9 1) + { 3.777778f, 5.111111f, 3.555556f, 0.150000f }, // 97 (0 6 10 0) + { 3.111111f, 9.777778f, 1.555556f, 0.035714f }, // 98 (0 7 0 9) + { 3.222222f, 9.222222f, 1.777778f, 0.037657f }, // 99 (0 7 1 8) + { 3.333333f, 8.666667f, 2.000000f, 0.040179f }, // 100 (0 7 2 7) + { 3.444444f, 8.111111f, 2.222222f, 0.043478f }, // 101 (0 7 3 6) + { 3.555556f, 7.555555f, 2.444444f, 0.047872f }, // 102 (0 7 4 5) + { 3.666667f, 7.000000f, 2.666667f, 0.053892f }, // 103 (0 7 5 4) + { 3.777778f, 6.444445f, 2.888889f, 0.062500f }, // 104 (0 7 6 3) + { 3.888889f, 5.888889f, 3.111111f, 0.075630f }, // 105 (0 7 7 2) + { 4.000000f, 5.333333f, 3.333333f, 0.097826f }, // 106 (0 7 8 1) + { 4.111111f, 4.777778f, 3.555556f, 0.142857f }, // 107 (0 7 9 0) + { 3.555556f, 8.888889f, 1.777778f, 0.035156f }, // 108 (0 8 0 8) + { 3.666667f, 8.333333f, 2.000000f, 0.037657f }, // 109 (0 8 1 7) + { 3.777778f, 7.777778f, 2.222222f, 0.040909f }, // 110 (0 8 2 6) + { 3.888889f, 7.222222f, 2.444444f, 0.045226f }, // 111 (0 8 3 5) + { 4.000000f, 6.666667f, 2.666667f, 0.051136f }, // 112 (0 8 4 4) + { 4.111111f, 6.111111f, 2.888889f, 0.059603f }, // 113 (0 8 5 3) + { 4.222222f, 5.555555f, 3.111111f, 0.072581f }, // 114 (0 8 6 2) + { 4.333333f, 5.000000f, 3.333333f, 0.094737f }, // 115 (0 8 7 1) + { 4.444445f, 4.444445f, 3.555556f, 0.140625f }, // 116 (0 8 8 0) + { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 117 (0 9 0 7) + { 4.111111f, 7.444445f, 2.222222f, 0.038961f }, // 118 (0 9 1 6) + { 4.222222f, 6.888889f, 2.444444f, 0.043269f }, // 119 (0 9 2 5) + { 4.333333f, 6.333333f, 2.666667f, 0.049180f }, // 120 (0 9 3 4) + { 4.444445f, 5.777778f, 2.888889f, 0.057692f }, // 121 (0 9 4 3) + { 4.555556f, 5.222222f, 3.111111f, 0.070866f }, // 122 (0 9 5 2) + { 4.666667f, 4.666667f, 3.333333f, 0.093750f }, // 123 (0 9 6 1) + { 4.777778f, 4.111111f, 3.555556f, 0.142857f }, // 124 (0 9 7 0) + { 4.444445f, 7.111111f, 2.222222f, 0.037500f }, // 125 (0 10 0 6) + { 4.555556f, 6.555555f, 2.444444f, 0.041860f }, // 126 (0 10 1 5) + { 4.666667f, 6.000000f, 2.666667f, 0.047872f }, // 127 (0 10 2 4) + { 4.777778f, 5.444445f, 2.888889f, 0.056604f }, // 128 (0 10 3 3) + { 4.888889f, 4.888889f, 3.111111f, 0.070313f }, // 129 (0 10 4 2) + { 5.000000f, 4.333333f, 3.333333f, 0.094737f }, // 130 (0 10 5 1) + { 5.111111f, 3.777778f, 3.555556f, 0.150000f }, // 131 (0 10 6 0) + { 
4.888889f, 6.222222f, 2.444444f, 0.040909f }, // 132 (0 11 0 5) + { 5.000000f, 5.666667f, 2.666667f, 0.047120f }, // 133 (0 11 1 4) + { 5.111111f, 5.111111f, 2.888889f, 0.056250f }, // 134 (0 11 2 3) + { 5.222222f, 4.555555f, 3.111111f, 0.070866f }, // 135 (0 11 3 2) + { 5.333333f, 4.000000f, 3.333333f, 0.097826f }, // 136 (0 11 4 1) + { 5.444445f, 3.444444f, 3.555556f, 0.163636f }, // 137 (0 11 5 0) + { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 138 (0 12 0 4) + { 5.444445f, 4.777778f, 2.888889f, 0.056604f }, // 139 (0 12 1 3) + { 5.555556f, 4.222222f, 3.111111f, 0.072581f }, // 140 (0 12 2 2) + { 5.666667f, 3.666667f, 3.333333f, 0.103448f }, // 141 (0 12 3 1) + { 5.777778f, 3.111111f, 3.555556f, 0.187500f }, // 142 (0 12 4 0) + { 5.777778f, 4.444445f, 2.888889f, 0.057692f }, // 143 (0 13 0 3) + { 5.888889f, 3.888889f, 3.111111f, 0.075630f }, // 144 (0 13 1 2) + { 6.000000f, 3.333333f, 3.333333f, 0.112500f }, // 145 (0 13 2 1) + { 6.111111f, 2.777778f, 3.555556f, 0.230769f }, // 146 (0 13 3 0) + { 6.222222f, 3.555556f, 3.111111f, 0.080357f }, // 147 (0 14 0 2) + { 6.333333f, 3.000000f, 3.333333f, 0.126761f }, // 148 (0 14 1 1) + { 6.444445f, 2.444444f, 3.555556f, 0.321429f }, // 149 (0 14 2 0) + { 6.666667f, 2.666667f, 3.333333f, 0.150000f }, // 150 (0 15 0 1) + { 6.777778f, 2.111111f, 3.555556f, 0.600000f }, // 151 (0 15 1 0) + { 7.111111f, 1.777778f, 3.555556f, FLT_MAX }, // 152 (0 16 0 0) + { 1.000000f, 15.000000f, 0.000000f, 0.066667f }, // 153 (1 0 0 15) + { 1.111111f, 14.444445f, 0.222222f, 0.062500f }, // 154 (1 0 1 14) + { 1.222222f, 13.888889f, 0.444444f, 0.059603f }, // 155 (1 0 2 13) + { 1.333333f, 13.333333f, 0.666667f, 0.057692f }, // 156 (1 0 3 12) + { 1.444444f, 12.777778f, 0.888889f, 0.056604f }, // 157 (1 0 4 11) + { 1.555556f, 12.222222f, 1.111111f, 0.056250f }, // 158 (1 0 5 10) + { 1.666667f, 11.666667f, 1.333333f, 0.056604f }, // 159 (1 0 6 9) + { 1.777778f, 11.111111f, 1.555556f, 0.057692f }, // 160 (1 0 7 8) + { 1.888889f, 10.555555f, 1.777778f, 0.059603f }, // 161 (1 0 8 7) + { 2.000000f, 10.000000f, 2.000000f, 0.062500f }, // 162 (1 0 9 6) + { 2.111111f, 9.444445f, 2.222222f, 0.066667f }, // 163 (1 0 10 5) + { 2.222222f, 8.888889f, 2.444444f, 0.072581f }, // 164 (1 0 11 4) + { 2.333333f, 8.333333f, 2.666667f, 0.081081f }, // 165 (1 0 12 3) + { 2.444444f, 7.777778f, 2.888889f, 0.093750f }, // 166 (1 0 13 2) + { 2.555556f, 7.222222f, 3.111111f, 0.113924f }, // 167 (1 0 14 1) + { 2.666667f, 6.666667f, 3.333333f, 0.150000f }, // 168 (1 0 15 0) + { 1.444444f, 14.111111f, 0.222222f, 0.049180f }, // 169 (1 1 0 14) + { 1.555556f, 13.555555f, 0.444444f, 0.047872f }, // 170 (1 1 1 13) + { 1.666667f, 13.000000f, 0.666667f, 0.047120f }, // 171 (1 1 2 12) + { 1.777778f, 12.444445f, 0.888889f, 0.046875f }, // 172 (1 1 3 11) + { 1.888889f, 11.888889f, 1.111111f, 0.047120f }, // 173 (1 1 4 10) + { 2.000000f, 11.333333f, 1.333333f, 0.047872f }, // 174 (1 1 5 9) + { 2.111111f, 10.777778f, 1.555556f, 0.049180f }, // 175 (1 1 6 8) + { 2.222222f, 10.222222f, 1.777778f, 0.051136f }, // 176 (1 1 7 7) + { 2.333333f, 9.666667f, 2.000000f, 0.053892f }, // 177 (1 1 8 6) + { 2.444444f, 9.111111f, 2.222222f, 0.057692f }, // 178 (1 1 9 5) + { 2.555556f, 8.555555f, 2.444444f, 0.062937f }, // 179 (1 1 10 4) + { 2.666667f, 8.000000f, 2.666667f, 0.070313f }, // 180 (1 1 11 3) + { 2.777778f, 7.444445f, 2.888889f, 0.081081f }, // 181 (1 1 12 2) + { 2.888889f, 6.888889f, 3.111111f, 0.097826f }, // 182 (1 1 13 1) + { 3.000000f, 6.333333f, 3.333333f, 0.126761f }, // 183 (1 1 14 0) + { 1.888889f, 
13.222222f, 0.444444f, 0.040359f }, // 184 (1 2 0 13) + { 2.000000f, 12.666667f, 0.666667f, 0.040179f }, // 185 (1 2 1 12) + { 2.111111f, 12.111111f, 0.888889f, 0.040359f }, // 186 (1 2 2 11) + { 2.222222f, 11.555555f, 1.111111f, 0.040909f }, // 187 (1 2 3 10) + { 2.333333f, 11.000000f, 1.333333f, 0.041860f }, // 188 (1 2 4 9) + { 2.444444f, 10.444445f, 1.555556f, 0.043269f }, // 189 (1 2 5 8) + { 2.555556f, 9.888889f, 1.777778f, 0.045226f }, // 190 (1 2 6 7) + { 2.666667f, 9.333333f, 2.000000f, 0.047872f }, // 191 (1 2 7 6) + { 2.777778f, 8.777778f, 2.222222f, 0.051429f }, // 192 (1 2 8 5) + { 2.888889f, 8.222222f, 2.444444f, 0.056250f }, // 193 (1 2 9 4) + { 3.000000f, 7.666667f, 2.666667f, 0.062937f }, // 194 (1 2 10 3) + { 3.111111f, 7.111111f, 2.888889f, 0.072581f }, // 195 (1 2 11 2) + { 3.222222f, 6.555556f, 3.111111f, 0.087379f }, // 196 (1 2 12 1) + { 3.333333f, 6.000000f, 3.333333f, 0.112500f }, // 197 (1 2 13 0) + { 2.333333f, 12.333333f, 0.666667f, 0.035294f }, // 198 (1 3 0 12) + { 2.444444f, 11.777778f, 0.888889f, 0.035714f }, // 199 (1 3 1 11) + { 2.555556f, 11.222222f, 1.111111f, 0.036437f }, // 200 (1 3 2 10) + { 2.666667f, 10.666667f, 1.333333f, 0.037500f }, // 201 (1 3 3 9) + { 2.777778f, 10.111111f, 1.555556f, 0.038961f }, // 202 (1 3 4 8) + { 2.888889f, 9.555555f, 1.777778f, 0.040909f }, // 203 (1 3 5 7) + { 3.000000f, 9.000000f, 2.000000f, 0.043478f }, // 204 (1 3 6 6) + { 3.111111f, 8.444445f, 2.222222f, 0.046875f }, // 205 (1 3 7 5) + { 3.222222f, 7.888889f, 2.444444f, 0.051429f }, // 206 (1 3 8 4) + { 3.333333f, 7.333333f, 2.666667f, 0.057692f }, // 207 (1 3 9 3) + { 3.444444f, 6.777778f, 2.888889f, 0.066667f }, // 208 (1 3 10 2) + { 3.555556f, 6.222222f, 3.111111f, 0.080357f }, // 209 (1 3 11 1) + { 3.666667f, 5.666667f, 3.333333f, 0.103448f }, // 210 (1 3 12 0) + { 2.777778f, 11.444445f, 0.888889f, 0.032258f }, // 211 (1 4 0 11) + { 2.888889f, 10.888889f, 1.111111f, 0.033088f }, // 212 (1 4 1 10) + { 3.000000f, 10.333333f, 1.333333f, 0.034221f }, // 213 (1 4 2 9) + { 3.111111f, 9.777778f, 1.555556f, 0.035714f }, // 214 (1 4 3 8) + { 3.222222f, 9.222222f, 1.777778f, 0.037657f }, // 215 (1 4 4 7) + { 3.333333f, 8.666667f, 2.000000f, 0.040179f }, // 216 (1 4 5 6) + { 3.444444f, 8.111111f, 2.222222f, 0.043478f }, // 217 (1 4 6 5) + { 3.555556f, 7.555555f, 2.444444f, 0.047872f }, // 218 (1 4 7 4) + { 3.666667f, 7.000000f, 2.666667f, 0.053892f }, // 219 (1 4 8 3) + { 3.777778f, 6.444445f, 2.888889f, 0.062500f }, // 220 (1 4 9 2) + { 3.888889f, 5.888889f, 3.111111f, 0.075630f }, // 221 (1 4 10 1) + { 4.000000f, 5.333333f, 3.333333f, 0.097826f }, // 222 (1 4 11 0) + { 3.222222f, 10.555555f, 1.111111f, 0.030508f }, // 223 (1 5 0 10) + { 3.333333f, 10.000000f, 1.333333f, 0.031690f }, // 224 (1 5 1 9) + { 3.444444f, 9.444445f, 1.555556f, 0.033210f }, // 225 (1 5 2 8) + { 3.555556f, 8.888889f, 1.777778f, 0.035156f }, // 226 (1 5 3 7) + { 3.666667f, 8.333333f, 2.000000f, 0.037657f }, // 227 (1 5 4 6) + { 3.777778f, 7.777778f, 2.222222f, 0.040909f }, // 228 (1 5 5 5) + { 3.888889f, 7.222222f, 2.444444f, 0.045226f }, // 229 (1 5 6 4) + { 4.000000f, 6.666667f, 2.666667f, 0.051136f }, // 230 (1 5 7 3) + { 4.111111f, 6.111111f, 2.888889f, 0.059603f }, // 231 (1 5 8 2) + { 4.222222f, 5.555556f, 3.111111f, 0.072581f }, // 232 (1 5 9 1) + { 4.333333f, 5.000000f, 3.333333f, 0.094737f }, // 233 (1 5 10 0) + { 3.666667f, 9.666667f, 1.333333f, 0.029703f }, // 234 (1 6 0 9) + { 3.777778f, 9.111111f, 1.555556f, 0.031250f }, // 235 (1 6 1 8) + { 3.888889f, 8.555555f, 1.777778f, 0.033210f }, 
// 236 (1 6 2 7) + { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 237 (1 6 3 6) + { 4.111111f, 7.444445f, 2.222222f, 0.038961f }, // 238 (1 6 4 5) + { 4.222222f, 6.888889f, 2.444444f, 0.043269f }, // 239 (1 6 5 4) + { 4.333333f, 6.333333f, 2.666667f, 0.049180f }, // 240 (1 6 6 3) + { 4.444445f, 5.777778f, 2.888889f, 0.057692f }, // 241 (1 6 7 2) + { 4.555555f, 5.222222f, 3.111111f, 0.070866f }, // 242 (1 6 8 1) + { 4.666667f, 4.666667f, 3.333333f, 0.093750f }, // 243 (1 6 9 0) + { 4.111111f, 8.777778f, 1.555556f, 0.029703f }, // 244 (1 7 0 8) + { 4.222222f, 8.222222f, 1.777778f, 0.031690f }, // 245 (1 7 1 7) + { 4.333333f, 7.666667f, 2.000000f, 0.034221f }, // 246 (1 7 2 6) + { 4.444445f, 7.111111f, 2.222222f, 0.037500f }, // 247 (1 7 3 5) + { 4.555555f, 6.555555f, 2.444444f, 0.041860f }, // 248 (1 7 4 4) + { 4.666667f, 6.000000f, 2.666667f, 0.047872f }, // 249 (1 7 5 3) + { 4.777778f, 5.444445f, 2.888889f, 0.056604f }, // 250 (1 7 6 2) + { 4.888889f, 4.888889f, 3.111111f, 0.070313f }, // 251 (1 7 7 1) + { 5.000000f, 4.333333f, 3.333333f, 0.094737f }, // 252 (1 7 8 0) + { 4.555555f, 7.888889f, 1.777778f, 0.030508f }, // 253 (1 8 0 7) + { 4.666667f, 7.333333f, 2.000000f, 0.033088f }, // 254 (1 8 1 6) + { 4.777778f, 6.777778f, 2.222222f, 0.036437f }, // 255 (1 8 2 5) + { 4.888889f, 6.222222f, 2.444444f, 0.040909f }, // 256 (1 8 3 4) + { 5.000000f, 5.666667f, 2.666667f, 0.047120f }, // 257 (1 8 4 3) + { 5.111111f, 5.111111f, 2.888889f, 0.056250f }, // 258 (1 8 5 2) + { 5.222222f, 4.555555f, 3.111111f, 0.070866f }, // 259 (1 8 6 1) + { 5.333333f, 4.000000f, 3.333333f, 0.097826f }, // 260 (1 8 7 0) + { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 261 (1 9 0 6) + { 5.111111f, 6.444445f, 2.222222f, 0.035714f }, // 262 (1 9 1 5) + { 5.222222f, 5.888889f, 2.444444f, 0.040359f }, // 263 (1 9 2 4) + { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 264 (1 9 3 3) + { 5.444445f, 4.777778f, 2.888889f, 0.056604f }, // 265 (1 9 4 2) + { 5.555556f, 4.222222f, 3.111111f, 0.072581f }, // 266 (1 9 5 1) + { 5.666667f, 3.666667f, 3.333333f, 0.103448f }, // 267 (1 9 6 0) + { 5.444445f, 6.111111f, 2.222222f, 0.035294f }, // 268 (1 10 0 5) + { 5.555556f, 5.555555f, 2.444444f, 0.040179f }, // 269 (1 10 1 4) + { 5.666667f, 5.000000f, 2.666667f, 0.047120f }, // 270 (1 10 2 3) + { 5.777778f, 4.444445f, 2.888889f, 0.057692f }, // 271 (1 10 3 2) + { 5.888889f, 3.888889f, 3.111111f, 0.075630f }, // 272 (1 10 4 1) + { 6.000000f, 3.333333f, 3.333333f, 0.112500f }, // 273 (1 10 5 0) + { 5.888889f, 5.222222f, 2.444444f, 0.040359f }, // 274 (1 11 0 4) + { 6.000000f, 4.666667f, 2.666667f, 0.047872f }, // 275 (1 11 1 3) + { 6.111111f, 4.111111f, 2.888889f, 0.059603f }, // 276 (1 11 2 2) + { 6.222222f, 3.555556f, 3.111111f, 0.080357f }, // 277 (1 11 3 1) + { 6.333333f, 3.000000f, 3.333333f, 0.126761f }, // 278 (1 11 4 0) + { 6.333333f, 4.333333f, 2.666667f, 0.049180f }, // 279 (1 12 0 3) + { 6.444445f, 3.777778f, 2.888889f, 0.062500f }, // 280 (1 12 1 2) + { 6.555556f, 3.222222f, 3.111111f, 0.087379f }, // 281 (1 12 2 1) + { 6.666667f, 2.666667f, 3.333333f, 0.150000f }, // 282 (1 12 3 0) + { 6.777778f, 3.444444f, 2.888889f, 0.066667f }, // 283 (1 13 0 2) + { 6.888889f, 2.888889f, 3.111111f, 0.097826f }, // 284 (1 13 1 1) + { 7.000000f, 2.333333f, 3.333333f, 0.191489f }, // 285 (1 13 2 0) + { 7.222222f, 2.555556f, 3.111111f, 0.113924f }, // 286 (1 14 0 1) + { 7.333333f, 2.000000f, 3.333333f, 0.281250f }, // 287 (1 14 1 0) + { 7.666667f, 1.666667f, 3.333333f, 0.600000f }, // 288 (1 15 0 0) + { 2.000000f, 14.000000f, 
0.000000f, 0.035714f }, // 289 (2 0 0 14) + { 2.111111f, 13.444445f, 0.222222f, 0.035294f }, // 290 (2 0 1 13) + { 2.222222f, 12.888889f, 0.444444f, 0.035156f }, // 291 (2 0 2 12) + { 2.333333f, 12.333333f, 0.666667f, 0.035294f }, // 292 (2 0 3 11) + { 2.444444f, 11.777778f, 0.888889f, 0.035714f }, // 293 (2 0 4 10) + { 2.555556f, 11.222222f, 1.111111f, 0.036437f }, // 294 (2 0 5 9) + { 2.666667f, 10.666667f, 1.333333f, 0.037500f }, // 295 (2 0 6 8) + { 2.777778f, 10.111111f, 1.555556f, 0.038961f }, // 296 (2 0 7 7) + { 2.888889f, 9.555555f, 1.777778f, 0.040909f }, // 297 (2 0 8 6) + { 3.000000f, 9.000000f, 2.000000f, 0.043478f }, // 298 (2 0 9 5) + { 3.111111f, 8.444445f, 2.222222f, 0.046875f }, // 299 (2 0 10 4) + { 3.222222f, 7.888889f, 2.444444f, 0.051429f }, // 300 (2 0 11 3) + { 3.333333f, 7.333333f, 2.666667f, 0.057692f }, // 301 (2 0 12 2) + { 3.444444f, 6.777778f, 2.888889f, 0.066667f }, // 302 (2 0 13 1) + { 3.555556f, 6.222222f, 3.111111f, 0.080357f }, // 303 (2 0 14 0) + { 2.444444f, 13.111111f, 0.222222f, 0.031250f }, // 304 (2 1 0 13) + { 2.555556f, 12.555555f, 0.444444f, 0.031359f }, // 305 (2 1 1 12) + { 2.666667f, 12.000000f, 0.666667f, 0.031690f }, // 306 (2 1 2 11) + { 2.777778f, 11.444445f, 0.888889f, 0.032258f }, // 307 (2 1 3 10) + { 2.888889f, 10.888889f, 1.111111f, 0.033088f }, // 308 (2 1 4 9) + { 3.000000f, 10.333333f, 1.333333f, 0.034221f }, // 309 (2 1 5 8) + { 3.111111f, 9.777778f, 1.555556f, 0.035714f }, // 310 (2 1 6 7) + { 3.222222f, 9.222222f, 1.777778f, 0.037657f }, // 311 (2 1 7 6) + { 3.333333f, 8.666667f, 2.000000f, 0.040179f }, // 312 (2 1 8 5) + { 3.444444f, 8.111111f, 2.222222f, 0.043478f }, // 313 (2 1 9 4) + { 3.555556f, 7.555556f, 2.444444f, 0.047872f }, // 314 (2 1 10 3) + { 3.666667f, 7.000000f, 2.666667f, 0.053892f }, // 315 (2 1 11 2) + { 3.777778f, 6.444445f, 2.888889f, 0.062500f }, // 316 (2 1 12 1) + { 3.888889f, 5.888889f, 3.111111f, 0.075630f }, // 317 (2 1 13 0) + { 2.888889f, 12.222222f, 0.444444f, 0.028481f }, // 318 (2 2 0 12) + { 3.000000f, 11.666667f, 0.666667f, 0.028939f }, // 319 (2 2 1 11) + { 3.111111f, 11.111111f, 0.888889f, 0.029605f }, // 320 (2 2 2 10) + { 3.222222f, 10.555555f, 1.111111f, 0.030508f }, // 321 (2 2 3 9) + { 3.333333f, 10.000000f, 1.333333f, 0.031690f }, // 322 (2 2 4 8) + { 3.444444f, 9.444445f, 1.555556f, 0.033210f }, // 323 (2 2 5 7) + { 3.555556f, 8.888889f, 1.777778f, 0.035156f }, // 324 (2 2 6 6) + { 3.666667f, 8.333333f, 2.000000f, 0.037657f }, // 325 (2 2 7 5) + { 3.777778f, 7.777778f, 2.222222f, 0.040909f }, // 326 (2 2 8 4) + { 3.888889f, 7.222222f, 2.444444f, 0.045226f }, // 327 (2 2 9 3) + { 4.000000f, 6.666667f, 2.666667f, 0.051136f }, // 328 (2 2 10 2) + { 4.111111f, 6.111111f, 2.888889f, 0.059603f }, // 329 (2 2 11 1) + { 4.222222f, 5.555556f, 3.111111f, 0.072581f }, // 330 (2 2 12 0) + { 3.333333f, 11.333333f, 0.666667f, 0.026786f }, // 331 (2 3 0 11) + { 3.444444f, 10.777778f, 0.888889f, 0.027523f }, // 332 (2 3 1 10) + { 3.555556f, 10.222222f, 1.111111f, 0.028481f }, // 333 (2 3 2 9) + { 3.666667f, 9.666667f, 1.333333f, 0.029703f }, // 334 (2 3 3 8) + { 3.777778f, 9.111111f, 1.555556f, 0.031250f }, // 335 (2 3 4 7) + { 3.888889f, 8.555555f, 1.777778f, 0.033210f }, // 336 (2 3 5 6) + { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 337 (2 3 6 5) + { 4.111111f, 7.444445f, 2.222222f, 0.038961f }, // 338 (2 3 7 4) + { 4.222222f, 6.888889f, 2.444444f, 0.043269f }, // 339 (2 3 8 3) + { 4.333333f, 6.333333f, 2.666667f, 0.049180f }, // 340 (2 3 9 2) + { 4.444445f, 5.777778f, 2.888889f, 0.057692f }, 
// 341 (2 3 10 1) + { 4.555555f, 5.222222f, 3.111111f, 0.070866f }, // 342 (2 3 11 0) + { 3.777778f, 10.444445f, 0.888889f, 0.025862f }, // 343 (2 4 0 10) + { 3.888889f, 9.888889f, 1.111111f, 0.026866f }, // 344 (2 4 1 9) + { 4.000000f, 9.333333f, 1.333333f, 0.028125f }, // 345 (2 4 2 8) + { 4.111111f, 8.777778f, 1.555556f, 0.029703f }, // 346 (2 4 3 7) + { 4.222222f, 8.222222f, 1.777778f, 0.031690f }, // 347 (2 4 4 6) + { 4.333333f, 7.666667f, 2.000000f, 0.034221f }, // 348 (2 4 5 5) + { 4.444445f, 7.111111f, 2.222222f, 0.037500f }, // 349 (2 4 6 4) + { 4.555555f, 6.555555f, 2.444444f, 0.041860f }, // 350 (2 4 7 3) + { 4.666667f, 6.000000f, 2.666667f, 0.047872f }, // 351 (2 4 8 2) + { 4.777778f, 5.444445f, 2.888889f, 0.056604f }, // 352 (2 4 9 1) + { 4.888889f, 4.888889f, 3.111111f, 0.070313f }, // 353 (2 4 10 0) + { 4.222222f, 9.555555f, 1.111111f, 0.025568f }, // 354 (2 5 0 9) + { 4.333333f, 9.000000f, 1.333333f, 0.026866f }, // 355 (2 5 1 8) + { 4.444445f, 8.444445f, 1.555556f, 0.028481f }, // 356 (2 5 2 7) + { 4.555555f, 7.888889f, 1.777778f, 0.030508f }, // 357 (2 5 3 6) + { 4.666667f, 7.333333f, 2.000000f, 0.033088f }, // 358 (2 5 4 5) + { 4.777778f, 6.777778f, 2.222222f, 0.036437f }, // 359 (2 5 5 4) + { 4.888889f, 6.222222f, 2.444444f, 0.040909f }, // 360 (2 5 6 3) + { 5.000000f, 5.666667f, 2.666667f, 0.047120f }, // 361 (2 5 7 2) + { 5.111111f, 5.111111f, 2.888889f, 0.056250f }, // 362 (2 5 8 1) + { 5.222222f, 4.555556f, 3.111111f, 0.070866f }, // 363 (2 5 9 0) + { 4.666667f, 8.666667f, 1.333333f, 0.025862f }, // 364 (2 6 0 8) + { 4.777778f, 8.111111f, 1.555556f, 0.027523f }, // 365 (2 6 1 7) + { 4.888889f, 7.555555f, 1.777778f, 0.029605f }, // 366 (2 6 2 6) + { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 367 (2 6 3 5) + { 5.111111f, 6.444445f, 2.222222f, 0.035714f }, // 368 (2 6 4 4) + { 5.222222f, 5.888889f, 2.444444f, 0.040359f }, // 369 (2 6 5 3) + { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 370 (2 6 6 2) + { 5.444445f, 4.777778f, 2.888889f, 0.056604f }, // 371 (2 6 7 1) + { 5.555555f, 4.222222f, 3.111111f, 0.072581f }, // 372 (2 6 8 0) + { 5.111111f, 7.777778f, 1.555556f, 0.026786f }, // 373 (2 7 0 7) + { 5.222222f, 7.222222f, 1.777778f, 0.028939f }, // 374 (2 7 1 6) + { 5.333333f, 6.666667f, 2.000000f, 0.031690f }, // 375 (2 7 2 5) + { 5.444445f, 6.111111f, 2.222222f, 0.035294f }, // 376 (2 7 3 4) + { 5.555555f, 5.555555f, 2.444444f, 0.040179f }, // 377 (2 7 4 3) + { 5.666667f, 5.000000f, 2.666667f, 0.047120f }, // 378 (2 7 5 2) + { 5.777778f, 4.444445f, 2.888889f, 0.057692f }, // 379 (2 7 6 1) + { 5.888889f, 3.888889f, 3.111111f, 0.075630f }, // 380 (2 7 7 0) + { 5.555555f, 6.888889f, 1.777778f, 0.028481f }, // 381 (2 8 0 6) + { 5.666667f, 6.333333f, 2.000000f, 0.031359f }, // 382 (2 8 1 5) + { 5.777778f, 5.777778f, 2.222222f, 0.035156f }, // 383 (2 8 2 4) + { 5.888889f, 5.222222f, 2.444444f, 0.040359f }, // 384 (2 8 3 3) + { 6.000000f, 4.666667f, 2.666667f, 0.047872f }, // 385 (2 8 4 2) + { 6.111111f, 4.111111f, 2.888889f, 0.059603f }, // 386 (2 8 5 1) + { 6.222222f, 3.555556f, 3.111111f, 0.080357f }, // 387 (2 8 6 0) + { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 388 (2 9 0 5) + { 6.111111f, 5.444445f, 2.222222f, 0.035294f }, // 389 (2 9 1 4) + { 6.222222f, 4.888889f, 2.444444f, 0.040909f }, // 390 (2 9 2 3) + { 6.333333f, 4.333333f, 2.666667f, 0.049180f }, // 391 (2 9 3 2) + { 6.444445f, 3.777778f, 2.888889f, 0.062500f }, // 392 (2 9 4 1) + { 6.555556f, 3.222222f, 3.111111f, 0.087379f }, // 393 (2 9 5 0) + { 6.444445f, 5.111111f, 2.222222f, 0.035714f 
}, // 394 (2 10 0 4) + { 6.555556f, 4.555555f, 2.444444f, 0.041860f }, // 395 (2 10 1 3) + { 6.666667f, 4.000000f, 2.666667f, 0.051136f }, // 396 (2 10 2 2) + { 6.777778f, 3.444444f, 2.888889f, 0.066667f }, // 397 (2 10 3 1) + { 6.888889f, 2.888889f, 3.111111f, 0.097826f }, // 398 (2 10 4 0) + { 6.888889f, 4.222222f, 2.444444f, 0.043269f }, // 399 (2 11 0 3) + { 7.000000f, 3.666667f, 2.666667f, 0.053892f }, // 400 (2 11 1 2) + { 7.111111f, 3.111111f, 2.888889f, 0.072581f }, // 401 (2 11 2 1) + { 7.222222f, 2.555556f, 3.111111f, 0.113924f }, // 402 (2 11 3 0) + { 7.333333f, 3.333333f, 2.666667f, 0.057692f }, // 403 (2 12 0 2) + { 7.444445f, 2.777778f, 2.888889f, 0.081081f }, // 404 (2 12 1 1) + { 7.555556f, 2.222222f, 3.111111f, 0.140625f }, // 405 (2 12 2 0) + { 7.777778f, 2.444444f, 2.888889f, 0.093750f }, // 406 (2 13 0 1) + { 7.888889f, 1.888889f, 3.111111f, 0.191489f }, // 407 (2 13 1 0) + { 8.222222f, 1.555556f, 3.111111f, 0.321429f }, // 408 (2 14 0 0) + { 3.000000f, 13.000000f, 0.000000f, 0.025641f }, // 409 (3 0 0 13) + { 3.111111f, 12.444445f, 0.222222f, 0.025862f }, // 410 (3 0 1 12) + { 3.222222f, 11.888889f, 0.444444f, 0.026239f }, // 411 (3 0 2 11) + { 3.333333f, 11.333333f, 0.666667f, 0.026786f }, // 412 (3 0 3 10) + { 3.444444f, 10.777778f, 0.888889f, 0.027523f }, // 413 (3 0 4 9) + { 3.555556f, 10.222222f, 1.111111f, 0.028481f }, // 414 (3 0 5 8) + { 3.666667f, 9.666667f, 1.333333f, 0.029703f }, // 415 (3 0 6 7) + { 3.777778f, 9.111111f, 1.555556f, 0.031250f }, // 416 (3 0 7 6) + { 3.888889f, 8.555555f, 1.777778f, 0.033210f }, // 417 (3 0 8 5) + { 4.000000f, 8.000000f, 2.000000f, 0.035714f }, // 418 (3 0 9 4) + { 4.111111f, 7.444445f, 2.222222f, 0.038961f }, // 419 (3 0 10 3) + { 4.222222f, 6.888889f, 2.444444f, 0.043269f }, // 420 (3 0 11 2) + { 4.333333f, 6.333333f, 2.666667f, 0.049180f }, // 421 (3 0 12 1) + { 4.444445f, 5.777778f, 2.888889f, 0.057692f }, // 422 (3 0 13 0) + { 3.444444f, 12.111111f, 0.222222f, 0.024000f }, // 423 (3 1 0 12) + { 3.555556f, 11.555555f, 0.444444f, 0.024457f }, // 424 (3 1 1 11) + { 3.666667f, 11.000000f, 0.666667f, 0.025070f }, // 425 (3 1 2 10) + { 3.777778f, 10.444445f, 0.888889f, 0.025862f }, // 426 (3 1 3 9) + { 3.888889f, 9.888889f, 1.111111f, 0.026866f }, // 427 (3 1 4 8) + { 4.000000f, 9.333333f, 1.333333f, 0.028125f }, // 428 (3 1 5 7) + { 4.111111f, 8.777778f, 1.555556f, 0.029703f }, // 429 (3 1 6 6) + { 4.222222f, 8.222222f, 1.777778f, 0.031690f }, // 430 (3 1 7 5) + { 4.333333f, 7.666667f, 2.000000f, 0.034221f }, // 431 (3 1 8 4) + { 4.444445f, 7.111111f, 2.222222f, 0.037500f }, // 432 (3 1 9 3) + { 4.555555f, 6.555556f, 2.444444f, 0.041860f }, // 433 (3 1 10 2) + { 4.666667f, 6.000000f, 2.666667f, 0.047872f }, // 434 (3 1 11 1) + { 4.777778f, 5.444445f, 2.888889f, 0.056604f }, // 435 (3 1 12 0) + { 3.888889f, 11.222222f, 0.444444f, 0.023018f }, // 436 (3 2 0 11) + { 4.000000f, 10.666667f, 0.666667f, 0.023684f }, // 437 (3 2 1 10) + { 4.111111f, 10.111111f, 0.888889f, 0.024523f }, // 438 (3 2 2 9) + { 4.222222f, 9.555555f, 1.111111f, 0.025568f }, // 439 (3 2 3 8) + { 4.333333f, 9.000000f, 1.333333f, 0.026866f }, // 440 (3 2 4 7) + { 4.444445f, 8.444445f, 1.555556f, 0.028481f }, // 441 (3 2 5 6) + { 4.555555f, 7.888889f, 1.777778f, 0.030508f }, // 442 (3 2 6 5) + { 4.666667f, 7.333333f, 2.000000f, 0.033088f }, // 443 (3 2 7 4) + { 4.777778f, 6.777778f, 2.222222f, 0.036437f }, // 444 (3 2 8 3) + { 4.888889f, 6.222222f, 2.444444f, 0.040909f }, // 445 (3 2 9 2) + { 5.000000f, 5.666667f, 2.666667f, 0.047120f }, // 446 (3 2 10 1) + { 
5.111111f, 5.111111f, 2.888889f, 0.056250f }, // 447 (3 2 11 0) + { 4.333333f, 10.333333f, 0.666667f, 0.022556f }, // 448 (3 3 0 10) + { 4.444445f, 9.777778f, 0.888889f, 0.023438f }, // 449 (3 3 1 9) + { 4.555555f, 9.222222f, 1.111111f, 0.024523f }, // 450 (3 3 2 8) + { 4.666667f, 8.666667f, 1.333333f, 0.025862f }, // 451 (3 3 3 7) + { 4.777778f, 8.111111f, 1.555556f, 0.027523f }, // 452 (3 3 4 6) + { 4.888889f, 7.555555f, 1.777778f, 0.029605f }, // 453 (3 3 5 5) + { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 454 (3 3 6 4) + { 5.111111f, 6.444445f, 2.222222f, 0.035714f }, // 455 (3 3 7 3) + { 5.222222f, 5.888889f, 2.444444f, 0.040359f }, // 456 (3 3 8 2) + { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 457 (3 3 9 1) + { 5.444445f, 4.777778f, 2.888889f, 0.056604f }, // 458 (3 3 10 0) + { 4.777778f, 9.444445f, 0.888889f, 0.022556f }, // 459 (3 4 0 9) + { 4.888889f, 8.888889f, 1.111111f, 0.023684f }, // 460 (3 4 1 8) + { 5.000000f, 8.333333f, 1.333333f, 0.025070f }, // 461 (3 4 2 7) + { 5.111111f, 7.777778f, 1.555556f, 0.026786f }, // 462 (3 4 3 6) + { 5.222222f, 7.222222f, 1.777778f, 0.028939f }, // 463 (3 4 4 5) + { 5.333333f, 6.666667f, 2.000000f, 0.031690f }, // 464 (3 4 5 4) + { 5.444445f, 6.111111f, 2.222222f, 0.035294f }, // 465 (3 4 6 3) + { 5.555555f, 5.555555f, 2.444444f, 0.040179f }, // 466 (3 4 7 2) + { 5.666667f, 5.000000f, 2.666667f, 0.047120f }, // 467 (3 4 8 1) + { 5.777778f, 4.444445f, 2.888889f, 0.057692f }, // 468 (3 4 9 0) + { 5.222222f, 8.555555f, 1.111111f, 0.023018f }, // 469 (3 5 0 8) + { 5.333333f, 8.000000f, 1.333333f, 0.024457f }, // 470 (3 5 1 7) + { 5.444445f, 7.444445f, 1.555556f, 0.026239f }, // 471 (3 5 2 6) + { 5.555555f, 6.888889f, 1.777778f, 0.028481f }, // 472 (3 5 3 5) + { 5.666667f, 6.333333f, 2.000000f, 0.031359f }, // 473 (3 5 4 4) + { 5.777778f, 5.777778f, 2.222222f, 0.035156f }, // 474 (3 5 5 3) + { 5.888889f, 5.222222f, 2.444444f, 0.040359f }, // 475 (3 5 6 2) + { 6.000000f, 4.666667f, 2.666667f, 0.047872f }, // 476 (3 5 7 1) + { 6.111111f, 4.111111f, 2.888889f, 0.059603f }, // 477 (3 5 8 0) + { 5.666667f, 7.666667f, 1.333333f, 0.024000f }, // 478 (3 6 0 7) + { 5.777778f, 7.111111f, 1.555556f, 0.025862f }, // 479 (3 6 1 6) + { 5.888889f, 6.555555f, 1.777778f, 0.028213f }, // 480 (3 6 2 5) + { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 481 (3 6 3 4) + { 6.111111f, 5.444445f, 2.222222f, 0.035294f }, // 482 (3 6 4 3) + { 6.222222f, 4.888889f, 2.444444f, 0.040909f }, // 483 (3 6 5 2) + { 6.333333f, 4.333333f, 2.666667f, 0.049180f }, // 484 (3 6 6 1) + { 6.444445f, 3.777778f, 2.888889f, 0.062500f }, // 485 (3 6 7 0) + { 6.111111f, 6.777778f, 1.555556f, 0.025641f }, // 486 (3 7 0 6) + { 6.222222f, 6.222222f, 1.777778f, 0.028125f }, // 487 (3 7 1 5) + { 6.333333f, 5.666667f, 2.000000f, 0.031359f }, // 488 (3 7 2 4) + { 6.444445f, 5.111111f, 2.222222f, 0.035714f }, // 489 (3 7 3 3) + { 6.555555f, 4.555555f, 2.444444f, 0.041860f }, // 490 (3 7 4 2) + { 6.666667f, 4.000000f, 2.666667f, 0.051136f }, // 491 (3 7 5 1) + { 6.777778f, 3.444444f, 2.888889f, 0.066667f }, // 492 (3 7 6 0) + { 6.555555f, 5.888889f, 1.777778f, 0.028213f }, // 493 (3 8 0 5) + { 6.666667f, 5.333333f, 2.000000f, 0.031690f }, // 494 (3 8 1 4) + { 6.777778f, 4.777778f, 2.222222f, 0.036437f }, // 495 (3 8 2 3) + { 6.888889f, 4.222222f, 2.444444f, 0.043269f }, // 496 (3 8 3 2) + { 7.000000f, 3.666667f, 2.666667f, 0.053892f }, // 497 (3 8 4 1) + { 7.111111f, 3.111111f, 2.888889f, 0.072581f }, // 498 (3 8 5 0) + { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 499 (3 9 0 4) + { 
7.111111f, 4.444445f, 2.222222f, 0.037500f }, // 500 (3 9 1 3) + { 7.222222f, 3.888889f, 2.444444f, 0.045226f }, // 501 (3 9 2 2) + { 7.333333f, 3.333333f, 2.666667f, 0.057692f }, // 502 (3 9 3 1) + { 7.444445f, 2.777778f, 2.888889f, 0.081081f }, // 503 (3 9 4 0) + { 7.444445f, 4.111111f, 2.222222f, 0.038961f }, // 504 (3 10 0 3) + { 7.555556f, 3.555556f, 2.444444f, 0.047872f }, // 505 (3 10 1 2) + { 7.666667f, 3.000000f, 2.666667f, 0.062937f }, // 506 (3 10 2 1) + { 7.777778f, 2.444444f, 2.888889f, 0.093750f }, // 507 (3 10 3 0) + { 7.888889f, 3.222222f, 2.444444f, 0.051429f }, // 508 (3 11 0 2) + { 8.000000f, 2.666667f, 2.666667f, 0.070313f }, // 509 (3 11 1 1) + { 8.111111f, 2.111111f, 2.888889f, 0.113924f }, // 510 (3 11 2 0) + { 8.333333f, 2.333333f, 2.666667f, 0.081081f }, // 511 (3 12 0 1) + { 8.444445f, 1.777778f, 2.888889f, 0.150000f }, // 512 (3 12 1 0) + { 8.777778f, 1.444444f, 2.888889f, 0.230769f }, // 513 (3 13 0 0) + { 4.000000f, 12.000000f, 0.000000f, 0.020833f }, // 514 (4 0 0 12) + { 4.111111f, 11.444445f, 0.222222f, 0.021277f }, // 515 (4 0 1 11) + { 4.222222f, 10.888889f, 0.444444f, 0.021845f }, // 516 (4 0 2 10) + { 4.333333f, 10.333333f, 0.666667f, 0.022556f }, // 517 (4 0 3 9) + { 4.444445f, 9.777778f, 0.888889f, 0.023438f }, // 518 (4 0 4 8) + { 4.555555f, 9.222222f, 1.111111f, 0.024523f }, // 519 (4 0 5 7) + { 4.666667f, 8.666667f, 1.333333f, 0.025862f }, // 520 (4 0 6 6) + { 4.777778f, 8.111111f, 1.555556f, 0.027523f }, // 521 (4 0 7 5) + { 4.888889f, 7.555555f, 1.777778f, 0.029605f }, // 522 (4 0 8 4) + { 5.000000f, 7.000000f, 2.000000f, 0.032258f }, // 523 (4 0 9 3) + { 5.111111f, 6.444445f, 2.222222f, 0.035714f }, // 524 (4 0 10 2) + { 5.222222f, 5.888889f, 2.444444f, 0.040359f }, // 525 (4 0 11 1) + { 5.333333f, 5.333333f, 2.666667f, 0.046875f }, // 526 (4 0 12 0) + { 4.444445f, 11.111111f, 0.222222f, 0.020270f }, // 527 (4 1 0 11) + { 4.555555f, 10.555555f, 0.444444f, 0.020882f }, // 528 (4 1 1 10) + { 4.666667f, 10.000000f, 0.666667f, 0.021635f }, // 529 (4 1 2 9) + { 4.777778f, 9.444445f, 0.888889f, 0.022556f }, // 530 (4 1 3 8) + { 4.888889f, 8.888889f, 1.111111f, 0.023684f }, // 531 (4 1 4 7) + { 5.000000f, 8.333333f, 1.333333f, 0.025070f }, // 532 (4 1 5 6) + { 5.111111f, 7.777778f, 1.555556f, 0.026786f }, // 533 (4 1 6 5) + { 5.222222f, 7.222222f, 1.777778f, 0.028939f }, // 534 (4 1 7 4) + { 5.333333f, 6.666667f, 2.000000f, 0.031690f }, // 535 (4 1 8 3) + { 5.444445f, 6.111111f, 2.222222f, 0.035294f }, // 536 (4 1 9 2) + { 5.555555f, 5.555556f, 2.444444f, 0.040179f }, // 537 (4 1 10 1) + { 5.666667f, 5.000000f, 2.666667f, 0.047120f }, // 538 (4 1 11 0) + { 4.888889f, 10.222222f, 0.444444f, 0.020089f }, // 539 (4 2 0 10) + { 5.000000f, 9.666667f, 0.666667f, 0.020882f }, // 540 (4 2 1 9) + { 5.111111f, 9.111111f, 0.888889f, 0.021845f }, // 541 (4 2 2 8) + { 5.222222f, 8.555555f, 1.111111f, 0.023018f }, // 542 (4 2 3 7) + { 5.333333f, 8.000000f, 1.333333f, 0.024457f }, // 543 (4 2 4 6) + { 5.444445f, 7.444445f, 1.555556f, 0.026239f }, // 544 (4 2 5 5) + { 5.555555f, 6.888889f, 1.777778f, 0.028481f }, // 545 (4 2 6 4) + { 5.666667f, 6.333333f, 2.000000f, 0.031359f }, // 546 (4 2 7 3) + { 5.777778f, 5.777778f, 2.222222f, 0.035156f }, // 547 (4 2 8 2) + { 5.888889f, 5.222222f, 2.444444f, 0.040359f }, // 548 (4 2 9 1) + { 6.000000f, 4.666667f, 2.666667f, 0.047872f }, // 549 (4 2 10 0) + { 5.333333f, 9.333333f, 0.666667f, 0.020270f }, // 550 (4 3 0 9) + { 5.444445f, 8.777778f, 0.888889f, 0.021277f }, // 551 (4 3 1 8) + { 5.555555f, 8.222222f, 1.111111f, 
0.022500f }, // 552 (4 3 2 7) + { 5.666667f, 7.666667f, 1.333333f, 0.024000f }, // 553 (4 3 3 6) + { 5.777778f, 7.111111f, 1.555556f, 0.025862f }, // 554 (4 3 4 5) + { 5.888889f, 6.555555f, 1.777778f, 0.028213f }, // 555 (4 3 5 4) + { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 556 (4 3 6 3) + { 6.111111f, 5.444445f, 2.222222f, 0.035294f }, // 557 (4 3 7 2) + { 6.222222f, 4.888889f, 2.444444f, 0.040909f }, // 558 (4 3 8 1) + { 6.333333f, 4.333333f, 2.666667f, 0.049180f }, // 559 (4 3 9 0) + { 5.777778f, 8.444445f, 0.888889f, 0.020833f }, // 560 (4 4 0 8) + { 5.888889f, 7.888889f, 1.111111f, 0.022113f }, // 561 (4 4 1 7) + { 6.000000f, 7.333333f, 1.333333f, 0.023684f }, // 562 (4 4 2 6) + { 6.111111f, 6.777778f, 1.555556f, 0.025641f }, // 563 (4 4 3 5) + { 6.222222f, 6.222222f, 1.777778f, 0.028125f }, // 564 (4 4 4 4) + { 6.333333f, 5.666667f, 2.000000f, 0.031359f }, // 565 (4 4 5 3) + { 6.444445f, 5.111111f, 2.222222f, 0.035714f }, // 566 (4 4 6 2) + { 6.555555f, 4.555555f, 2.444444f, 0.041860f }, // 567 (4 4 7 1) + { 6.666667f, 4.000000f, 2.666667f, 0.051136f }, // 568 (4 4 8 0) + { 6.222222f, 7.555555f, 1.111111f, 0.021845f }, // 569 (4 5 0 7) + { 6.333333f, 7.000000f, 1.333333f, 0.023499f }, // 570 (4 5 1 6) + { 6.444445f, 6.444445f, 1.555556f, 0.025568f }, // 571 (4 5 2 5) + { 6.555555f, 5.888889f, 1.777778f, 0.028213f }, // 572 (4 5 3 4) + { 6.666667f, 5.333333f, 2.000000f, 0.031690f }, // 573 (4 5 4 3) + { 6.777778f, 4.777778f, 2.222222f, 0.036437f }, // 574 (4 5 5 2) + { 6.888889f, 4.222222f, 2.444444f, 0.043269f }, // 575 (4 5 6 1) + { 7.000000f, 3.666667f, 2.666667f, 0.053892f }, // 576 (4 5 7 0) + { 6.666667f, 6.666667f, 1.333333f, 0.023438f }, // 577 (4 6 0 6) + { 6.777778f, 6.111111f, 1.555556f, 0.025641f }, // 578 (4 6 1 5) + { 6.888889f, 5.555555f, 1.777778f, 0.028481f }, // 579 (4 6 2 4) + { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 580 (4 6 3 3) + { 7.111111f, 4.444445f, 2.222222f, 0.037500f }, // 581 (4 6 4 2) + { 7.222222f, 3.888889f, 2.444444f, 0.045226f }, // 582 (4 6 5 1) + { 7.333333f, 3.333333f, 2.666667f, 0.057692f }, // 583 (4 6 6 0) + { 7.111111f, 5.777778f, 1.555556f, 0.025862f }, // 584 (4 7 0 5) + { 7.222222f, 5.222222f, 1.777778f, 0.028939f }, // 585 (4 7 1 4) + { 7.333333f, 4.666667f, 2.000000f, 0.033088f }, // 586 (4 7 2 3) + { 7.444445f, 4.111111f, 2.222222f, 0.038961f }, // 587 (4 7 3 2) + { 7.555555f, 3.555556f, 2.444444f, 0.047872f }, // 588 (4 7 4 1) + { 7.666667f, 3.000000f, 2.666667f, 0.062937f }, // 589 (4 7 5 0) + { 7.555555f, 4.888889f, 1.777778f, 0.029605f }, // 590 (4 8 0 4) + { 7.666667f, 4.333333f, 2.000000f, 0.034221f }, // 591 (4 8 1 3) + { 7.777778f, 3.777778f, 2.222222f, 0.040909f }, // 592 (4 8 2 2) + { 7.888889f, 3.222222f, 2.444444f, 0.051429f }, // 593 (4 8 3 1) + { 8.000000f, 2.666667f, 2.666667f, 0.070313f }, // 594 (4 8 4 0) + { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 595 (4 9 0 3) + { 8.111111f, 3.444444f, 2.222222f, 0.043478f }, // 596 (4 9 1 2) + { 8.222222f, 2.888889f, 2.444444f, 0.056250f }, // 597 (4 9 2 1) + { 8.333333f, 2.333333f, 2.666667f, 0.081081f }, // 598 (4 9 3 0) + { 8.444445f, 3.111111f, 2.222222f, 0.046875f }, // 599 (4 10 0 2) + { 8.555555f, 2.555556f, 2.444444f, 0.062937f }, // 600 (4 10 1 1) + { 8.666667f, 2.000000f, 2.666667f, 0.097826f }, // 601 (4 10 2 0) + { 8.888889f, 2.222222f, 2.444444f, 0.072581f }, // 602 (4 11 0 1) + { 9.000000f, 1.666667f, 2.666667f, 0.126761f }, // 603 (4 11 1 0) + { 9.333333f, 1.333333f, 2.666667f, 0.187500f }, // 604 (4 12 0 0) + { 5.000000f, 11.000000f, 
0.000000f, 0.018182f }, // 605 (5 0 0 11) + { 5.111111f, 10.444445f, 0.222222f, 0.018750f }, // 606 (5 0 1 10) + { 5.222222f, 9.888889f, 0.444444f, 0.019438f }, // 607 (5 0 2 9) + { 5.333333f, 9.333333f, 0.666667f, 0.020270f }, // 608 (5 0 3 8) + { 5.444445f, 8.777778f, 0.888889f, 0.021277f }, // 609 (5 0 4 7) + { 5.555555f, 8.222222f, 1.111111f, 0.022500f }, // 610 (5 0 5 6) + { 5.666667f, 7.666667f, 1.333333f, 0.024000f }, // 611 (5 0 6 5) + { 5.777778f, 7.111111f, 1.555556f, 0.025862f }, // 612 (5 0 7 4) + { 5.888889f, 6.555555f, 1.777778f, 0.028213f }, // 613 (5 0 8 3) + { 6.000000f, 6.000000f, 2.000000f, 0.031250f }, // 614 (5 0 9 2) + { 6.111111f, 5.444445f, 2.222222f, 0.035294f }, // 615 (5 0 10 1) + { 6.222222f, 4.888889f, 2.444444f, 0.040909f }, // 616 (5 0 11 0) + { 5.444445f, 10.111111f, 0.222222f, 0.018182f }, // 617 (5 1 0 10) + { 5.555555f, 9.555555f, 0.444444f, 0.018908f }, // 618 (5 1 1 9) + { 5.666667f, 9.000000f, 0.666667f, 0.019780f }, // 619 (5 1 2 8) + { 5.777778f, 8.444445f, 0.888889f, 0.020833f }, // 620 (5 1 3 7) + { 5.888889f, 7.888889f, 1.111111f, 0.022113f }, // 621 (5 1 4 6) + { 6.000000f, 7.333333f, 1.333333f, 0.023684f }, // 622 (5 1 5 5) + { 6.111111f, 6.777778f, 1.555556f, 0.025641f }, // 623 (5 1 6 4) + { 6.222222f, 6.222222f, 1.777778f, 0.028125f }, // 624 (5 1 7 3) + { 6.333333f, 5.666667f, 2.000000f, 0.031359f }, // 625 (5 1 8 2) + { 6.444445f, 5.111111f, 2.222222f, 0.035714f }, // 626 (5 1 9 1) + { 6.555555f, 4.555556f, 2.444444f, 0.041860f }, // 627 (5 1 10 0) + { 5.888889f, 9.222222f, 0.444444f, 0.018480f }, // 628 (5 2 0 9) + { 6.000000f, 8.666667f, 0.666667f, 0.019397f }, // 629 (5 2 1 8) + { 6.111111f, 8.111111f, 0.888889f, 0.020501f }, // 630 (5 2 2 7) + { 6.222222f, 7.555555f, 1.111111f, 0.021845f }, // 631 (5 2 3 6) + { 6.333333f, 7.000000f, 1.333333f, 0.023499f }, // 632 (5 2 4 5) + { 6.444445f, 6.444445f, 1.555556f, 0.025568f }, // 633 (5 2 5 4) + { 6.555555f, 5.888889f, 1.777778f, 0.028213f }, // 634 (5 2 6 3) + { 6.666667f, 5.333333f, 2.000000f, 0.031690f }, // 635 (5 2 7 2) + { 6.777778f, 4.777778f, 2.222222f, 0.036437f }, // 636 (5 2 8 1) + { 6.888889f, 4.222222f, 2.444444f, 0.043269f }, // 637 (5 2 9 0) + { 6.333333f, 8.333333f, 0.666667f, 0.019108f }, // 638 (5 3 0 8) + { 6.444445f, 7.777778f, 0.888889f, 0.020270f }, // 639 (5 3 1 7) + { 6.555555f, 7.222222f, 1.111111f, 0.021687f }, // 640 (5 3 2 6) + { 6.666667f, 6.666667f, 1.333333f, 0.023438f }, // 641 (5 3 3 5) + { 6.777778f, 6.111111f, 1.555556f, 0.025641f }, // 642 (5 3 4 4) + { 6.888889f, 5.555555f, 1.777778f, 0.028481f }, // 643 (5 3 5 3) + { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 644 (5 3 6 2) + { 7.111111f, 4.444445f, 2.222222f, 0.037500f }, // 645 (5 3 7 1) + { 7.222222f, 3.888889f, 2.444444f, 0.045226f }, // 646 (5 3 8 0) + { 6.777778f, 7.444445f, 0.888889f, 0.020134f }, // 647 (5 4 0 7) + { 6.888889f, 6.888889f, 1.111111f, 0.021635f }, // 648 (5 4 1 6) + { 7.000000f, 6.333333f, 1.333333f, 0.023499f }, // 649 (5 4 2 5) + { 7.111111f, 5.777778f, 1.555556f, 0.025862f }, // 650 (5 4 3 4) + { 7.222222f, 5.222222f, 1.777778f, 0.028939f }, // 651 (5 4 4 3) + { 7.333333f, 4.666667f, 2.000000f, 0.033088f }, // 652 (5 4 5 2) + { 7.444445f, 4.111111f, 2.222222f, 0.038961f }, // 653 (5 4 6 1) + { 7.555555f, 3.555556f, 2.444444f, 0.047872f }, // 654 (5 4 7 0) + { 7.222222f, 6.555555f, 1.111111f, 0.021687f }, // 655 (5 5 0 6) + { 7.333333f, 6.000000f, 1.333333f, 0.023684f }, // 656 (5 5 1 5) + { 7.444445f, 5.444445f, 1.555556f, 0.026239f }, // 657 (5 5 2 4) + { 7.555555f, 
4.888889f, 1.777778f, 0.029605f }, // 658 (5 5 3 3) + { 7.666667f, 4.333333f, 2.000000f, 0.034221f }, // 659 (5 5 4 2) + { 7.777778f, 3.777778f, 2.222222f, 0.040909f }, // 660 (5 5 5 1) + { 7.888889f, 3.222222f, 2.444444f, 0.051429f }, // 661 (5 5 6 0) + { 7.666667f, 5.666667f, 1.333333f, 0.024000f }, // 662 (5 6 0 5) + { 7.777778f, 5.111111f, 1.555556f, 0.026786f }, // 663 (5 6 1 4) + { 7.888889f, 4.555555f, 1.777778f, 0.030508f }, // 664 (5 6 2 3) + { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 665 (5 6 3 2) + { 8.111111f, 3.444444f, 2.222222f, 0.043478f }, // 666 (5 6 4 1) + { 8.222222f, 2.888889f, 2.444444f, 0.056250f }, // 667 (5 6 5 0) + { 8.111111f, 4.777778f, 1.555556f, 0.027523f }, // 668 (5 7 0 4) + { 8.222222f, 4.222222f, 1.777778f, 0.031690f }, // 669 (5 7 1 3) + { 8.333333f, 3.666667f, 2.000000f, 0.037657f }, // 670 (5 7 2 2) + { 8.444445f, 3.111111f, 2.222222f, 0.046875f }, // 671 (5 7 3 1) + { 8.555555f, 2.555556f, 2.444444f, 0.062937f }, // 672 (5 7 4 0) + { 8.555555f, 3.888889f, 1.777778f, 0.033210f }, // 673 (5 8 0 3) + { 8.666667f, 3.333333f, 2.000000f, 0.040179f }, // 674 (5 8 1 2) + { 8.777778f, 2.777778f, 2.222222f, 0.051429f }, // 675 (5 8 2 1) + { 8.888889f, 2.222222f, 2.444444f, 0.072581f }, // 676 (5 8 3 0) + { 9.000000f, 3.000000f, 2.000000f, 0.043478f }, // 677 (5 9 0 2) + { 9.111111f, 2.444444f, 2.222222f, 0.057692f }, // 678 (5 9 1 1) + { 9.222222f, 1.888889f, 2.444444f, 0.087379f }, // 679 (5 9 2 0) + { 9.444445f, 2.111111f, 2.222222f, 0.066667f }, // 680 (5 10 0 1) + { 9.555555f, 1.555556f, 2.444444f, 0.112500f }, // 681 (5 10 1 0) + { 9.888889f, 1.222222f, 2.444444f, 0.163636f }, // 682 (5 11 0 0) + { 6.000000f, 10.000000f, 0.000000f, 0.016667f }, // 683 (6 0 0 10) + { 6.111111f, 9.444445f, 0.222222f, 0.017341f }, // 684 (6 0 1 9) + { 6.222222f, 8.888889f, 0.444444f, 0.018145f }, // 685 (6 0 2 8) + { 6.333333f, 8.333333f, 0.666667f, 0.019108f }, // 686 (6 0 3 7) + { 6.444445f, 7.777778f, 0.888889f, 0.020270f }, // 687 (6 0 4 6) + { 6.555555f, 7.222222f, 1.111111f, 0.021687f }, // 688 (6 0 5 5) + { 6.666667f, 6.666667f, 1.333333f, 0.023438f }, // 689 (6 0 6 4) + { 6.777778f, 6.111111f, 1.555556f, 0.025641f }, // 690 (6 0 7 3) + { 6.888889f, 5.555555f, 1.777778f, 0.028481f }, // 691 (6 0 8 2) + { 7.000000f, 5.000000f, 2.000000f, 0.032258f }, // 692 (6 0 9 1) + { 7.111111f, 4.444445f, 2.222222f, 0.037500f }, // 693 (6 0 10 0) + { 6.444445f, 9.111111f, 0.222222f, 0.017045f }, // 694 (6 1 0 9) + { 6.555555f, 8.555555f, 0.444444f, 0.017893f }, // 695 (6 1 1 8) + { 6.666667f, 8.000000f, 0.666667f, 0.018908f }, // 696 (6 1 2 7) + { 6.777778f, 7.444445f, 0.888889f, 0.020134f }, // 697 (6 1 3 6) + { 6.888889f, 6.888889f, 1.111111f, 0.021635f }, // 698 (6 1 4 5) + { 7.000000f, 6.333333f, 1.333333f, 0.023499f }, // 699 (6 1 5 4) + { 7.111111f, 5.777778f, 1.555556f, 0.025862f }, // 700 (6 1 6 3) + { 7.222222f, 5.222222f, 1.777778f, 0.028939f }, // 701 (6 1 7 2) + { 7.333333f, 4.666667f, 2.000000f, 0.033088f }, // 702 (6 1 8 1) + { 7.444445f, 4.111111f, 2.222222f, 0.038961f }, // 703 (6 1 9 0) + { 6.888889f, 8.222222f, 0.444444f, 0.017717f }, // 704 (6 2 0 8) + { 7.000000f, 7.666667f, 0.666667f, 0.018789f }, // 705 (6 2 1 7) + { 7.111111f, 7.111111f, 0.888889f, 0.020089f }, // 706 (6 2 2 6) + { 7.222222f, 6.555555f, 1.111111f, 0.021687f }, // 707 (6 2 3 5) + { 7.333333f, 6.000000f, 1.333333f, 0.023684f }, // 708 (6 2 4 4) + { 7.444445f, 5.444445f, 1.555556f, 0.026239f }, // 709 (6 2 5 3) + { 7.555555f, 4.888889f, 1.777778f, 0.029605f }, // 710 (6 2 6 2) + { 
7.666667f, 4.333333f, 2.000000f, 0.034221f }, // 711 (6 2 7 1) + { 7.777778f, 3.777778f, 2.222222f, 0.040909f }, // 712 (6 2 8 0) + { 7.333333f, 7.333333f, 0.666667f, 0.018750f }, // 713 (6 3 0 7) + { 7.444445f, 6.777778f, 0.888889f, 0.020134f }, // 714 (6 3 1 6) + { 7.555555f, 6.222222f, 1.111111f, 0.021845f }, // 715 (6 3 2 5) + { 7.666667f, 5.666667f, 1.333333f, 0.024000f }, // 716 (6 3 3 4) + { 7.777778f, 5.111111f, 1.555556f, 0.026786f }, // 717 (6 3 4 3) + { 7.888889f, 4.555555f, 1.777778f, 0.030508f }, // 718 (6 3 5 2) + { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 719 (6 3 6 1) + { 8.111111f, 3.444444f, 2.222222f, 0.043478f }, // 720 (6 3 7 0) + { 7.777778f, 6.444445f, 0.888889f, 0.020270f }, // 721 (6 4 0 6) + { 7.888889f, 5.888889f, 1.111111f, 0.022113f }, // 722 (6 4 1 5) + { 8.000000f, 5.333333f, 1.333333f, 0.024457f }, // 723 (6 4 2 4) + { 8.111111f, 4.777778f, 1.555556f, 0.027523f }, // 724 (6 4 3 3) + { 8.222222f, 4.222222f, 1.777778f, 0.031690f }, // 725 (6 4 4 2) + { 8.333333f, 3.666667f, 2.000000f, 0.037657f }, // 726 (6 4 5 1) + { 8.444445f, 3.111111f, 2.222222f, 0.046875f }, // 727 (6 4 6 0) + { 8.222222f, 5.555555f, 1.111111f, 0.022500f }, // 728 (6 5 0 5) + { 8.333333f, 5.000000f, 1.333333f, 0.025070f }, // 729 (6 5 1 4) + { 8.444445f, 4.444445f, 1.555556f, 0.028481f }, // 730 (6 5 2 3) + { 8.555555f, 3.888889f, 1.777778f, 0.033210f }, // 731 (6 5 3 2) + { 8.666667f, 3.333333f, 2.000000f, 0.040179f }, // 732 (6 5 4 1) + { 8.777778f, 2.777778f, 2.222222f, 0.051429f }, // 733 (6 5 5 0) + { 8.666667f, 4.666667f, 1.333333f, 0.025862f }, // 734 (6 6 0 4) + { 8.777778f, 4.111111f, 1.555556f, 0.029703f }, // 735 (6 6 1 3) + { 8.888889f, 3.555556f, 1.777778f, 0.035156f }, // 736 (6 6 2 2) + { 9.000000f, 3.000000f, 2.000000f, 0.043478f }, // 737 (6 6 3 1) + { 9.111111f, 2.444444f, 2.222222f, 0.057692f }, // 738 (6 6 4 0) + { 9.111111f, 3.777778f, 1.555556f, 0.031250f }, // 739 (6 7 0 3) + { 9.222222f, 3.222222f, 1.777778f, 0.037657f }, // 740 (6 7 1 2) + { 9.333333f, 2.666667f, 2.000000f, 0.047872f }, // 741 (6 7 2 1) + { 9.444445f, 2.111111f, 2.222222f, 0.066667f }, // 742 (6 7 3 0) + { 9.555555f, 2.888889f, 1.777778f, 0.040909f }, // 743 (6 8 0 2) + { 9.666667f, 2.333333f, 2.000000f, 0.053892f }, // 744 (6 8 1 1) + { 9.777778f, 1.777778f, 2.222222f, 0.080357f }, // 745 (6 8 2 0) + { 10.000000f, 2.000000f, 2.000000f, 0.062500f }, // 746 (6 9 0 1) + { 10.111111f, 1.444444f, 2.222222f, 0.103448f }, // 747 (6 9 1 0) + { 10.444445f, 1.111111f, 2.222222f, 0.150000f }, // 748 (6 10 0 0) + { 7.000000f, 9.000000f, 0.000000f, 0.015873f }, // 749 (7 0 0 9) + { 7.111111f, 8.444445f, 0.222222f, 0.016667f }, // 750 (7 0 1 8) + { 7.222222f, 7.888889f, 0.444444f, 0.017613f }, // 751 (7 0 2 7) + { 7.333333f, 7.333333f, 0.666667f, 0.018750f }, // 752 (7 0 3 6) + { 7.444445f, 6.777778f, 0.888889f, 0.020134f }, // 753 (7 0 4 5) + { 7.555555f, 6.222222f, 1.111111f, 0.021845f }, // 754 (7 0 5 4) + { 7.666667f, 5.666667f, 1.333333f, 0.024000f }, // 755 (7 0 6 3) + { 7.777778f, 5.111111f, 1.555556f, 0.026786f }, // 756 (7 0 7 2) + { 7.888889f, 4.555555f, 1.777778f, 0.030508f }, // 757 (7 0 8 1) + { 8.000000f, 4.000000f, 2.000000f, 0.035714f }, // 758 (7 0 9 0) + { 7.444445f, 8.111111f, 0.222222f, 0.016575f }, // 759 (7 1 0 8) + { 7.555555f, 7.555555f, 0.444444f, 0.017578f }, // 760 (7 1 1 7) + { 7.666667f, 7.000000f, 0.666667f, 0.018789f }, // 761 (7 1 2 6) + { 7.777778f, 6.444445f, 0.888889f, 0.020270f }, // 762 (7 1 3 5) + { 7.888889f, 5.888889f, 1.111111f, 0.022113f }, // 763 (7 1 4 4) + { 
8.000000f, 5.333333f, 1.333333f, 0.024457f }, // 764 (7 1 5 3) + { 8.111111f, 4.777778f, 1.555556f, 0.027523f }, // 765 (7 1 6 2) + { 8.222222f, 4.222222f, 1.777778f, 0.031690f }, // 766 (7 1 7 1) + { 8.333333f, 3.666667f, 2.000000f, 0.037657f }, // 767 (7 1 8 0) + { 7.888889f, 7.222222f, 0.444444f, 0.017613f }, // 768 (7 2 0 7) + { 8.000000f, 6.666667f, 0.666667f, 0.018908f }, // 769 (7 2 1 6) + { 8.111111f, 6.111111f, 0.888889f, 0.020501f }, // 770 (7 2 2 5) + { 8.222222f, 5.555555f, 1.111111f, 0.022500f }, // 771 (7 2 3 4) + { 8.333333f, 5.000000f, 1.333333f, 0.025070f }, // 772 (7 2 4 3) + { 8.444445f, 4.444445f, 1.555556f, 0.028481f }, // 773 (7 2 5 2) + { 8.555555f, 3.888889f, 1.777778f, 0.033210f }, // 774 (7 2 6 1) + { 8.666667f, 3.333333f, 2.000000f, 0.040179f }, // 775 (7 2 7 0) + { 8.333333f, 6.333333f, 0.666667f, 0.019108f }, // 776 (7 3 0 6) + { 8.444445f, 5.777778f, 0.888889f, 0.020833f }, // 777 (7 3 1 5) + { 8.555555f, 5.222222f, 1.111111f, 0.023018f }, // 778 (7 3 2 4) + { 8.666667f, 4.666667f, 1.333333f, 0.025862f }, // 779 (7 3 3 3) + { 8.777778f, 4.111111f, 1.555556f, 0.029703f }, // 780 (7 3 4 2) + { 8.888889f, 3.555556f, 1.777778f, 0.035156f }, // 781 (7 3 5 1) + { 9.000000f, 3.000000f, 2.000000f, 0.043478f }, // 782 (7 3 6 0) + { 8.777778f, 5.444445f, 0.888889f, 0.021277f }, // 783 (7 4 0 5) + { 8.888889f, 4.888889f, 1.111111f, 0.023684f }, // 784 (7 4 1 4) + { 9.000000f, 4.333333f, 1.333333f, 0.026866f }, // 785 (7 4 2 3) + { 9.111111f, 3.777778f, 1.555556f, 0.031250f }, // 786 (7 4 3 2) + { 9.222222f, 3.222222f, 1.777778f, 0.037657f }, // 787 (7 4 4 1) + { 9.333333f, 2.666667f, 2.000000f, 0.047872f }, // 788 (7 4 5 0) + { 9.222222f, 4.555555f, 1.111111f, 0.024523f }, // 789 (7 5 0 4) + { 9.333333f, 4.000000f, 1.333333f, 0.028125f }, // 790 (7 5 1 3) + { 9.444445f, 3.444444f, 1.555556f, 0.033210f }, // 791 (7 5 2 2) + { 9.555555f, 2.888889f, 1.777778f, 0.040909f }, // 792 (7 5 3 1) + { 9.666667f, 2.333333f, 2.000000f, 0.053892f }, // 793 (7 5 4 0) + { 9.666667f, 3.666667f, 1.333333f, 0.029703f }, // 794 (7 6 0 3) + { 9.777778f, 3.111111f, 1.555556f, 0.035714f }, // 795 (7 6 1 2) + { 9.888889f, 2.555556f, 1.777778f, 0.045226f }, // 796 (7 6 2 1) + { 10.000000f, 2.000000f, 2.000000f, 0.062500f }, // 797 (7 6 3 0) + { 10.111111f, 2.777778f, 1.555556f, 0.038961f }, // 798 (7 7 0 2) + { 10.222222f, 2.222222f, 1.777778f, 0.051136f }, // 799 (7 7 1 1) + { 10.333333f, 1.666667f, 2.000000f, 0.075630f }, // 800 (7 7 2 0) + { 10.555555f, 1.888889f, 1.777778f, 0.059603f }, // 801 (7 8 0 1) + { 10.666667f, 1.333333f, 2.000000f, 0.097826f }, // 802 (7 8 1 0) + { 11.000000f, 1.000000f, 2.000000f, 0.142857f }, // 803 (7 9 0 0) + { 8.000000f, 8.000000f, 0.000000f, 0.015625f }, // 804 (8 0 0 8) + { 8.111111f, 7.444445f, 0.222222f, 0.016575f }, // 805 (8 0 1 7) + { 8.222222f, 6.888889f, 0.444444f, 0.017717f }, // 806 (8 0 2 6) + { 8.333333f, 6.333333f, 0.666667f, 0.019108f }, // 807 (8 0 3 5) + { 8.444445f, 5.777778f, 0.888889f, 0.020833f }, // 808 (8 0 4 4) + { 8.555555f, 5.222222f, 1.111111f, 0.023018f }, // 809 (8 0 5 3) + { 8.666667f, 4.666667f, 1.333333f, 0.025862f }, // 810 (8 0 6 2) + { 8.777778f, 4.111111f, 1.555556f, 0.029703f }, // 811 (8 0 7 1) + { 8.888889f, 3.555556f, 1.777778f, 0.035156f }, // 812 (8 0 8 0) + { 8.444445f, 7.111111f, 0.222222f, 0.016667f }, // 813 (8 1 0 7) + { 8.555555f, 6.555555f, 0.444444f, 0.017893f }, // 814 (8 1 1 6) + { 8.666667f, 6.000000f, 0.666667f, 0.019397f }, // 815 (8 1 2 5) + { 8.777778f, 5.444445f, 0.888889f, 0.021277f }, // 816 (8 1 3 4) 
+ { 8.888889f, 4.888889f, 1.111111f, 0.023684f }, // 817 (8 1 4 3) + { 9.000000f, 4.333333f, 1.333333f, 0.026866f }, // 818 (8 1 5 2) + { 9.111111f, 3.777778f, 1.555556f, 0.031250f }, // 819 (8 1 6 1) + { 9.222222f, 3.222222f, 1.777778f, 0.037657f }, // 820 (8 1 7 0) + { 8.888889f, 6.222222f, 0.444444f, 0.018145f }, // 821 (8 2 0 6) + { 9.000000f, 5.666667f, 0.666667f, 0.019780f }, // 822 (8 2 1 5) + { 9.111111f, 5.111111f, 0.888889f, 0.021845f }, // 823 (8 2 2 4) + { 9.222222f, 4.555555f, 1.111111f, 0.024523f }, // 824 (8 2 3 3) + { 9.333333f, 4.000000f, 1.333333f, 0.028125f }, // 825 (8 2 4 2) + { 9.444445f, 3.444444f, 1.555556f, 0.033210f }, // 826 (8 2 5 1) + { 9.555555f, 2.888889f, 1.777778f, 0.040909f }, // 827 (8 2 6 0) + { 9.333333f, 5.333333f, 0.666667f, 0.020270f }, // 828 (8 3 0 5) + { 9.444445f, 4.777778f, 0.888889f, 0.022556f }, // 829 (8 3 1 4) + { 9.555555f, 4.222222f, 1.111111f, 0.025568f }, // 830 (8 3 2 3) + { 9.666667f, 3.666667f, 1.333333f, 0.029703f }, // 831 (8 3 3 2) + { 9.777778f, 3.111111f, 1.555556f, 0.035714f }, // 832 (8 3 4 1) + { 9.888889f, 2.555556f, 1.777778f, 0.045226f }, // 833 (8 3 5 0) + { 9.777778f, 4.444445f, 0.888889f, 0.023438f }, // 834 (8 4 0 4) + { 9.888889f, 3.888889f, 1.111111f, 0.026866f }, // 835 (8 4 1 3) + { 10.000000f, 3.333333f, 1.333333f, 0.031690f }, // 836 (8 4 2 2) + { 10.111111f, 2.777778f, 1.555556f, 0.038961f }, // 837 (8 4 3 1) + { 10.222222f, 2.222222f, 1.777778f, 0.051136f }, // 838 (8 4 4 0) + { 10.222222f, 3.555556f, 1.111111f, 0.028481f }, // 839 (8 5 0 3) + { 10.333333f, 3.000000f, 1.333333f, 0.034221f }, // 840 (8 5 1 2) + { 10.444445f, 2.444444f, 1.555556f, 0.043269f }, // 841 (8 5 2 1) + { 10.555555f, 1.888889f, 1.777778f, 0.059603f }, // 842 (8 5 3 0) + { 10.666667f, 2.666667f, 1.333333f, 0.037500f }, // 843 (8 6 0 2) + { 10.777778f, 2.111111f, 1.555556f, 0.049180f }, // 844 (8 6 1 1) + { 10.888889f, 1.555556f, 1.777778f, 0.072581f }, // 845 (8 6 2 0) + { 11.111111f, 1.777778f, 1.555556f, 0.057692f }, // 846 (8 7 0 1) + { 11.222222f, 1.222222f, 1.777778f, 0.094737f }, // 847 (8 7 1 0) + { 11.555555f, 0.888889f, 1.777778f, 0.140625f }, // 848 (8 8 0 0) + { 9.000000f, 7.000000f, 0.000000f, 0.015873f }, // 849 (9 0 0 7) + { 9.111111f, 6.444445f, 0.222222f, 0.017045f }, // 850 (9 0 1 6) + { 9.222222f, 5.888889f, 0.444444f, 0.018480f }, // 851 (9 0 2 5) + { 9.333333f, 5.333333f, 0.666667f, 0.020270f }, // 852 (9 0 3 4) + { 9.444445f, 4.777778f, 0.888889f, 0.022556f }, // 853 (9 0 4 3) + { 9.555555f, 4.222222f, 1.111111f, 0.025568f }, // 854 (9 0 5 2) + { 9.666667f, 3.666667f, 1.333333f, 0.029703f }, // 855 (9 0 6 1) + { 9.777778f, 3.111111f, 1.555556f, 0.035714f }, // 856 (9 0 7 0) + { 9.444445f, 6.111111f, 0.222222f, 0.017341f }, // 857 (9 1 0 6) + { 9.555555f, 5.555555f, 0.444444f, 0.018908f }, // 858 (9 1 1 5) + { 9.666667f, 5.000000f, 0.666667f, 0.020882f }, // 859 (9 1 2 4) + { 9.777778f, 4.444445f, 0.888889f, 0.023438f }, // 860 (9 1 3 3) + { 9.888889f, 3.888889f, 1.111111f, 0.026866f }, // 861 (9 1 4 2) + { 10.000000f, 3.333333f, 1.333333f, 0.031690f }, // 862 (9 1 5 1) + { 10.111111f, 2.777778f, 1.555556f, 0.038961f }, // 863 (9 1 6 0) + { 9.888889f, 5.222222f, 0.444444f, 0.019438f }, // 864 (9 2 0 5) + { 10.000000f, 4.666667f, 0.666667f, 0.021635f }, // 865 (9 2 1 4) + { 10.111111f, 4.111111f, 0.888889f, 0.024523f }, // 866 (9 2 2 3) + { 10.222222f, 3.555556f, 1.111111f, 0.028481f }, // 867 (9 2 3 2) + { 10.333333f, 3.000000f, 1.333333f, 0.034221f }, // 868 (9 2 4 1) + { 10.444445f, 2.444444f, 1.555556f, 0.043269f }, 
// 869 (9 2 5 0) + { 10.333333f, 4.333333f, 0.666667f, 0.022556f }, // 870 (9 3 0 4) + { 10.444445f, 3.777778f, 0.888889f, 0.025862f }, // 871 (9 3 1 3) + { 10.555555f, 3.222222f, 1.111111f, 0.030508f }, // 872 (9 3 2 2) + { 10.666667f, 2.666667f, 1.333333f, 0.037500f }, // 873 (9 3 3 1) + { 10.777778f, 2.111111f, 1.555556f, 0.049180f }, // 874 (9 3 4 0) + { 10.777778f, 3.444444f, 0.888889f, 0.027523f }, // 875 (9 4 0 3) + { 10.888889f, 2.888889f, 1.111111f, 0.033088f }, // 876 (9 4 1 2) + { 11.000000f, 2.333333f, 1.333333f, 0.041860f }, // 877 (9 4 2 1) + { 11.111111f, 1.777778f, 1.555556f, 0.057692f }, // 878 (9 4 3 0) + { 11.222222f, 2.555556f, 1.111111f, 0.036437f }, // 879 (9 5 0 2) + { 11.333333f, 2.000000f, 1.333333f, 0.047872f }, // 880 (9 5 1 1) + { 11.444445f, 1.444444f, 1.555556f, 0.070866f }, // 881 (9 5 2 0) + { 11.666667f, 1.666667f, 1.333333f, 0.056604f }, // 882 (9 6 0 1) + { 11.777778f, 1.111111f, 1.555556f, 0.093750f }, // 883 (9 6 1 0) + { 12.111111f, 0.777778f, 1.555556f, 0.142857f }, // 884 (9 7 0 0) + { 10.000000f, 6.000000f, 0.000000f, 0.016667f }, // 885 (10 0 0 6) + { 10.111111f, 5.444445f, 0.222222f, 0.018182f }, // 886 (10 0 1 5) + { 10.222222f, 4.888889f, 0.444444f, 0.020089f }, // 887 (10 0 2 4) + { 10.333333f, 4.333333f, 0.666667f, 0.022556f }, // 888 (10 0 3 3) + { 10.444445f, 3.777778f, 0.888889f, 0.025862f }, // 889 (10 0 4 2) + { 10.555555f, 3.222222f, 1.111111f, 0.030508f }, // 890 (10 0 5 1) + { 10.666667f, 2.666667f, 1.333333f, 0.037500f }, // 891 (10 0 6 0) + { 10.444445f, 5.111111f, 0.222222f, 0.018750f }, // 892 (10 1 0 5) + { 10.555555f, 4.555555f, 0.444444f, 0.020882f }, // 893 (10 1 1 4) + { 10.666667f, 4.000000f, 0.666667f, 0.023684f }, // 894 (10 1 2 3) + { 10.777778f, 3.444444f, 0.888889f, 0.027523f }, // 895 (10 1 3 2) + { 10.888889f, 2.888889f, 1.111111f, 0.033088f }, // 896 (10 1 4 1) + { 11.000000f, 2.333333f, 1.333333f, 0.041860f }, // 897 (10 1 5 0) + { 10.888889f, 4.222222f, 0.444444f, 0.021845f }, // 898 (10 2 0 4) + { 11.000000f, 3.666667f, 0.666667f, 0.025070f }, // 899 (10 2 1 3) + { 11.111111f, 3.111111f, 0.888889f, 0.029605f }, // 900 (10 2 2 2) + { 11.222222f, 2.555556f, 1.111111f, 0.036437f }, // 901 (10 2 3 1) + { 11.333333f, 2.000000f, 1.333333f, 0.047872f }, // 902 (10 2 4 0) + { 11.333333f, 3.333333f, 0.666667f, 0.026786f }, // 903 (10 3 0 3) + { 11.444445f, 2.777778f, 0.888889f, 0.032258f }, // 904 (10 3 1 2) + { 11.555555f, 2.222222f, 1.111111f, 0.040909f }, // 905 (10 3 2 1) + { 11.666667f, 1.666667f, 1.333333f, 0.056604f }, // 906 (10 3 3 0) + { 11.777778f, 2.444444f, 0.888889f, 0.035714f }, // 907 (10 4 0 2) + { 11.888889f, 1.888889f, 1.111111f, 0.047120f }, // 908 (10 4 1 1) + { 12.000000f, 1.333333f, 1.333333f, 0.070313f }, // 909 (10 4 2 0) + { 12.222222f, 1.555556f, 1.111111f, 0.056250f }, // 910 (10 5 0 1) + { 12.333333f, 1.000000f, 1.333333f, 0.094737f }, // 911 (10 5 1 0) + { 12.666667f, 0.666667f, 1.333333f, 0.150000f }, // 912 (10 6 0 0) + { 11.000000f, 5.000000f, 0.000000f, 0.018182f }, // 913 (11 0 0 5) + { 11.111111f, 4.444445f, 0.222222f, 0.020270f }, // 914 (11 0 1 4) + { 11.222222f, 3.888889f, 0.444444f, 0.023018f }, // 915 (11 0 2 3) + { 11.333333f, 3.333333f, 0.666667f, 0.026786f }, // 916 (11 0 3 2) + { 11.444445f, 2.777778f, 0.888889f, 0.032258f }, // 917 (11 0 4 1) + { 11.555555f, 2.222222f, 1.111111f, 0.040909f }, // 918 (11 0 5 0) + { 11.444445f, 4.111111f, 0.222222f, 0.021277f }, // 919 (11 1 0 4) + { 11.555555f, 3.555556f, 0.444444f, 0.024457f }, // 920 (11 1 1 3) + { 11.666667f, 3.000000f, 
0.666667f, 0.028939f }, // 921 (11 1 2 2) + { 11.777778f, 2.444444f, 0.888889f, 0.035714f }, // 922 (11 1 3 1) + { 11.888889f, 1.888889f, 1.111111f, 0.047120f }, // 923 (11 1 4 0) + { 11.888889f, 3.222222f, 0.444444f, 0.026239f }, // 924 (11 2 0 3) + { 12.000000f, 2.666667f, 0.666667f, 0.031690f }, // 925 (11 2 1 2) + { 12.111111f, 2.111111f, 0.888889f, 0.040359f }, // 926 (11 2 2 1) + { 12.222222f, 1.555556f, 1.111111f, 0.056250f }, // 927 (11 2 3 0) + { 12.333333f, 2.333333f, 0.666667f, 0.035294f }, // 928 (11 3 0 2) + { 12.444445f, 1.777778f, 0.888889f, 0.046875f }, // 929 (11 3 1 1) + { 12.555555f, 1.222222f, 1.111111f, 0.070866f }, // 930 (11 3 2 0) + { 12.777778f, 1.444444f, 0.888889f, 0.056604f }, // 931 (11 4 0 1) + { 12.888889f, 0.888889f, 1.111111f, 0.097826f }, // 932 (11 4 1 0) + { 13.222222f, 0.555556f, 1.111111f, 0.163636f }, // 933 (11 5 0 0) + { 12.000000f, 4.000000f, 0.000000f, 0.020833f }, // 934 (12 0 0 4) + { 12.111111f, 3.444444f, 0.222222f, 0.024000f }, // 935 (12 0 1 3) + { 12.222222f, 2.888889f, 0.444444f, 0.028481f }, // 936 (12 0 2 2) + { 12.333333f, 2.333333f, 0.666667f, 0.035294f }, // 937 (12 0 3 1) + { 12.444445f, 1.777778f, 0.888889f, 0.046875f }, // 938 (12 0 4 0) + { 12.444445f, 3.111111f, 0.222222f, 0.025862f }, // 939 (12 1 0 3) + { 12.555555f, 2.555556f, 0.444444f, 0.031359f }, // 940 (12 1 1 2) + { 12.666667f, 2.000000f, 0.666667f, 0.040179f }, // 941 (12 1 2 1) + { 12.777778f, 1.444444f, 0.888889f, 0.056604f }, // 942 (12 1 3 0) + { 12.888889f, 2.222222f, 0.444444f, 0.035156f }, // 943 (12 2 0 2) + { 13.000000f, 1.666667f, 0.666667f, 0.047120f }, // 944 (12 2 1 1) + { 13.111111f, 1.111111f, 0.888889f, 0.072581f }, // 945 (12 2 2 0) + { 13.333333f, 1.333333f, 0.666667f, 0.057692f }, // 946 (12 3 0 1) + { 13.444445f, 0.777778f, 0.888889f, 0.103448f }, // 947 (12 3 1 0) + { 13.777778f, 0.444444f, 0.888889f, 0.187500f }, // 948 (12 4 0 0) + { 13.000000f, 3.000000f, 0.000000f, 0.025641f }, // 949 (13 0 0 3) + { 13.111111f, 2.444444f, 0.222222f, 0.031250f }, // 950 (13 0 1 2) + { 13.222222f, 1.888889f, 0.444444f, 0.040359f }, // 951 (13 0 2 1) + { 13.333333f, 1.333333f, 0.666667f, 0.057692f }, // 952 (13 0 3 0) + { 13.444445f, 2.111111f, 0.222222f, 0.035294f }, // 953 (13 1 0 2) + { 13.555555f, 1.555556f, 0.444444f, 0.047872f }, // 954 (13 1 1 1) + { 13.666667f, 1.000000f, 0.666667f, 0.075630f }, // 955 (13 1 2 0) + { 13.888889f, 1.222222f, 0.444444f, 0.059603f }, // 956 (13 2 0 1) + { 14.000000f, 0.666667f, 0.666667f, 0.112500f }, // 957 (13 2 1 0) + { 14.333333f, 0.333333f, 0.666667f, 0.230769f }, // 958 (13 3 0 0) + { 14.000000f, 2.000000f, 0.000000f, 0.035714f }, // 959 (14 0 0 2) + { 14.111111f, 1.444444f, 0.222222f, 0.049180f }, // 960 (14 0 1 1) + { 14.222222f, 0.888889f, 0.444444f, 0.080357f }, // 961 (14 0 2 0) + { 14.444445f, 1.111111f, 0.222222f, 0.062500f }, // 962 (14 1 0 1) + { 14.555555f, 0.555556f, 0.444444f, 0.126761f }, // 963 (14 1 1 0) + { 14.888889f, 0.222222f, 0.444444f, 0.321429f }, // 964 (14 2 0 0) + { 15.000000f, 1.000000f, 0.000000f, 0.066667f }, // 965 (15 0 0 1) + { 15.111111f, 0.444444f, 0.222222f, 0.150000f }, // 966 (15 0 1 0) + { 15.444445f, 0.111111f, 0.222222f, 0.600000f }, // 967 (15 1 0 0) + { 16.000000f, 0.000000f, 0.000000f, FLT_MAX }, // 968 (16 0 0 0) +}; // 969 four cluster elements + Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/maths.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/maths.h +++ 
ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/maths.h @@ -23,14 +23,22 @@ -------------------------------------------------------------------------- */ -#ifndef SQUISH_MATHS_H -#define SQUISH_MATHS_H +#ifndef NV_SQUISH_MATHS_H +#define NV_SQUISH_MATHS_H + +#if NV_USE_ALTIVEC +#undef vector +#endif #include #include #include "config.h" -namespace squish { +#if NV_USE_ALTIVEC +#define vector __vector +#endif + +namespace nvsquish { class Vec3 { @@ -234,6 +242,6 @@ Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights, Vec3::Arg metric ); Vec3 ComputePrincipleComponent( Sym3x3 const& matrix ); -} // namespace squish +} // namespace nvsquish #endif // ndef SQUISH_MATHS_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/maths.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/maths.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/maths.cpp @@ -27,7 +27,7 @@ #include "simd.h" #include -namespace squish { +namespace nvsquish { Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights, Vec3::Arg metric ) { @@ -134,4 +134,4 @@ #endif -} // namespace squish +} // namespace nvsquish Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/simd.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/simd.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/simd.h @@ -23,8 +23,8 @@ -------------------------------------------------------------------------- */ -#ifndef SQUISH_SIMD_H -#define SQUISH_SIMD_H +#ifndef NV_SQUISH_SIMD_H +#define NV_SQUISH_SIMD_H #include "maths.h" Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/simd_sse.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/simd_sse.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/simd_sse.h @@ -23,8 +23,8 @@ -------------------------------------------------------------------------- */ -#ifndef SQUISH_SIMD_SSE_H -#define SQUISH_SIMD_SSE_H +#ifndef NV_SQUISH_SIMD_SSE_H +#define NV_SQUISH_SIMD_SSE_H #include #if ( SQUISH_USE_SSE > 1 ) @@ -35,7 +35,7 @@ #define SQUISH_SSE_SPLAT( a ) \ ( ( a ) | ( ( a ) << 2 ) | ( ( a ) << 4 ) | ( ( a ) << 6 ) ) -namespace squish { +namespace nvsquish { #define VEC4_CONST( X ) Vec4( _mm_set1_ps( X ) ) @@ -72,6 +72,13 @@ _mm_store_ps( c, m_v ); return Vec3( c[0], c[1], c[2] ); } + + float GetX() const + { + SQUISH_ALIGN_16 float f; + _mm_store_ss(&f, m_v); + return f; + } Vec4 SplatX() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 0 ) ) ); } Vec4 SplatY() const { return Vec4( _mm_shuffle_ps( m_v, m_v, SQUISH_SSE_SPLAT( 1 ) ) ); } Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/simd_ve.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/simd_ve.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/simd_ve.h @@ -1,6 +1,7 @@ /* ----------------------------------------------------------------------------- Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + Copyright (c) 2016 Raptor Engineering, LLC Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -26,12 +27,14 @@ #ifndef SQUISH_SIMD_VE_H #define SQUISH_SIMD_VE_H +#ifndef __APPLE_ALTIVEC__ #include #undef bool +#endif -namespace squish { +namespace nvsquish { -#define 
VEC4_CONST( X ) Vec4( ( vector float )( X ) ) +#define VEC4_CONST( X ) Vec4( vec_splats( (float)X ) ) class Vec4 { @@ -76,7 +79,14 @@ u.v = m_v; return Vec3( u.c[0], u.c[1], u.c[2] ); } - + + float GetX() const + { + union { vector float v; float c[4]; } u; + u.v = m_v; + return u.c[0]; + } + Vec4 SplatX() const { return Vec4( vec_splat( m_v, 0 ) ); } Vec4 SplatY() const { return Vec4( vec_splat( m_v, 1 ) ); } Vec4 SplatZ() const { return Vec4( vec_splat( m_v, 2 ) ); } @@ -96,7 +106,7 @@ Vec4& operator*=( Arg v ) { - m_v = vec_madd( m_v, v.m_v, ( vector float )( -0.0f ) ); + m_v = vec_madd( m_v, v.m_v, vec_splats( -0.0f ) ); return *this; } @@ -112,7 +122,7 @@ friend Vec4 operator*( Vec4::Arg left, Vec4::Arg right ) { - return Vec4( vec_madd( left.m_v, right.m_v, ( vector float )( -0.0f ) ) ); + return Vec4( vec_madd( left.m_v, right.m_v, vec_splats( -0.0f ) ) ); } //! Returns a*b + c @@ -133,7 +143,7 @@ vector float estimate = vec_re( v.m_v ); // one round of Newton-Rhaphson refinement - vector float diff = vec_nmsub( estimate, v.m_v, ( vector float )( 1.0f ) ); + vector float diff = vec_nmsub( estimate, v.m_v, vec_splats( 1.0f ) ); return Vec4( vec_madd( diff, estimate, estimate ) ); } Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/squish.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/squish.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/squish.h @@ -23,11 +23,11 @@ -------------------------------------------------------------------------- */ -#ifndef SQUISH_H -#define SQUISH_H +#ifndef NV_SQUISH_H +#define NV_SQUISH_H //! All squish API functions live in this namespace. -namespace squish { +namespace nvsquish { // ----------------------------------------------------------------------------- Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/squish.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/squish.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/squish.cpp @@ -23,7 +23,7 @@ -------------------------------------------------------------------------- */ -#include +#include "squish.h" #include "colourset.h" #include "maths.h" #include "rangefit.h" Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/squish.xcodeproj/project.pbxproj =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/squish.xcodeproj/project.pbxproj +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/squish.xcodeproj/project.pbxproj @@ -1,531 +0,0 @@ -// !$*UTF8*$! 
-{ - archiveVersion = 1; - classes = { - }; - objectVersion = 42; - objects = { - -/* Begin PBXBuildFile section */ - 133FA0DC096A7B8E0050752E /* alpha.h in Headers */ = {isa = PBXBuildFile; fileRef = 133FA0DA096A7B8E0050752E /* alpha.h */; }; - 133FA0DD096A7B8E0050752E /* alpha.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 133FA0DB096A7B8E0050752E /* alpha.cpp */; }; - 1342B4160999DF1900152915 /* libsquish.a in Frameworks */ = {isa = PBXBuildFile; fileRef = D2AAC046055464E500DB518D /* libsquish.a */; }; - 1342B41A0999DF7000152915 /* squishpng.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1342B4190999DF7000152915 /* squishpng.cpp */; }; - 1342B43F0999E0CC00152915 /* squishtest.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1342B43E0999E0CC00152915 /* squishtest.cpp */; }; - 1342B4420999E0EC00152915 /* libsquish.a in Frameworks */ = {isa = PBXBuildFile; fileRef = D2AAC046055464E500DB518D /* libsquish.a */; }; - 1350D71A092AA858005EE038 /* clusterfit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1350D70B092AA857005EE038 /* clusterfit.cpp */; }; - 1350D71B092AA858005EE038 /* clusterfit.h in Headers */ = {isa = PBXBuildFile; fileRef = 1350D70C092AA858005EE038 /* clusterfit.h */; }; - 1350D71E092AA858005EE038 /* colourblock.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1350D70F092AA858005EE038 /* colourblock.cpp */; }; - 1350D71F092AA858005EE038 /* colourblock.h in Headers */ = {isa = PBXBuildFile; fileRef = 1350D710092AA858005EE038 /* colourblock.h */; }; - 1350D720092AA858005EE038 /* config.h in Headers */ = {isa = PBXBuildFile; fileRef = 1350D711092AA858005EE038 /* config.h */; }; - 1350D721092AA858005EE038 /* maths.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1350D712092AA858005EE038 /* maths.cpp */; }; - 1350D722092AA858005EE038 /* maths.h in Headers */ = {isa = PBXBuildFile; fileRef = 1350D713092AA858005EE038 /* maths.h */; }; - 1350D725092AA858005EE038 /* rangefit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1350D716092AA858005EE038 /* rangefit.cpp */; }; - 1350D726092AA858005EE038 /* rangefit.h in Headers */ = {isa = PBXBuildFile; fileRef = 1350D717092AA858005EE038 /* rangefit.h */; }; - 1350D727092AA858005EE038 /* squish.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1350D718092AA858005EE038 /* squish.cpp */; }; - 1350D728092AA858005EE038 /* squish.h in Headers */ = {isa = PBXBuildFile; fileRef = 1350D719092AA858005EE038 /* squish.h */; settings = {ATTRIBUTES = (Public, ); }; }; - 139C21CF09ADAB0800A2500D /* squishgen.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 139C21CE09ADAB0800A2500D /* squishgen.cpp */; }; - 139C234F09B0602700A2500D /* singlecolourfit.h in Headers */ = {isa = PBXBuildFile; fileRef = 139C234D09B0602700A2500D /* singlecolourfit.h */; }; - 139C235009B0602700A2500D /* singlecolourfit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 139C234E09B0602700A2500D /* singlecolourfit.cpp */; }; - 13A7CCA40952BE63001C963A /* colourfit.h in Headers */ = {isa = PBXBuildFile; fileRef = 13A7CCA20952BE63001C963A /* colourfit.h */; }; - 13A7CCA50952BE63001C963A /* colourfit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 13A7CCA30952BE63001C963A /* colourfit.cpp */; }; - 13C4C7AD0941C18000AC5B89 /* colourset.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 13C4C7AB0941C18000AC5B89 /* colourset.cpp */; }; - 13C4C7AE0941C18000AC5B89 /* colourset.h in Headers */ = {isa = PBXBuildFile; fileRef = 13C4C7AC0941C18000AC5B89 /* colourset.h */; }; - 13CD64C2092BCF8A00488C97 /* simd.h in Headers */ = {isa = PBXBuildFile; fileRef = 
13CD64C0092BCF8A00488C97 /* simd.h */; }; - 13D0DC910931F93A00909807 /* simd_ve.h in Headers */ = {isa = PBXBuildFile; fileRef = 13D0DC900931F93A00909807 /* simd_ve.h */; }; - 13D0DC970931F9D600909807 /* simd_sse.h in Headers */ = {isa = PBXBuildFile; fileRef = 13D0DC960931F9D600909807 /* simd_sse.h */; }; -/* End PBXBuildFile section */ - -/* Begin PBXContainerItemProxy section */ - 1342B52B099BF72F00152915 /* PBXContainerItemProxy */ = { - isa = PBXContainerItemProxy; - containerPortal = 08FB7793FE84155DC02AAC07 /* Project object */; - proxyType = 1; - remoteGlobalIDString = D2AAC045055464E500DB518D; - remoteInfo = squish; - }; - 1342B58E099BF93D00152915 /* PBXContainerItemProxy */ = { - isa = PBXContainerItemProxy; - containerPortal = 08FB7793FE84155DC02AAC07 /* Project object */; - proxyType = 1; - remoteGlobalIDString = D2AAC045055464E500DB518D; - remoteInfo = squish; - }; -/* End PBXContainerItemProxy section */ - -/* Begin PBXFileReference section */ - 133FA0DA096A7B8E0050752E /* alpha.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alpha.h; sourceTree = ""; }; - 133FA0DB096A7B8E0050752E /* alpha.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alpha.cpp; sourceTree = ""; }; - 1342B4110999DE7F00152915 /* squishpng */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = squishpng; sourceTree = BUILT_PRODUCTS_DIR; }; - 1342B4190999DF7000152915 /* squishpng.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; name = squishpng.cpp; path = extra/squishpng.cpp; sourceTree = ""; }; - 1342B4370999E07C00152915 /* squishtest */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = squishtest; sourceTree = BUILT_PRODUCTS_DIR; }; - 1342B43E0999E0CC00152915 /* squishtest.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = squishtest.cpp; path = extra/squishtest.cpp; sourceTree = ""; }; - 1350D70B092AA857005EE038 /* clusterfit.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = clusterfit.cpp; sourceTree = ""; }; - 1350D70C092AA858005EE038 /* clusterfit.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = clusterfit.h; sourceTree = ""; }; - 1350D70F092AA858005EE038 /* colourblock.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = colourblock.cpp; sourceTree = ""; }; - 1350D710092AA858005EE038 /* colourblock.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = colourblock.h; sourceTree = ""; }; - 1350D711092AA858005EE038 /* config.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = config.h; sourceTree = ""; }; - 1350D712092AA858005EE038 /* maths.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = maths.cpp; sourceTree = ""; }; - 1350D713092AA858005EE038 /* maths.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = maths.h; sourceTree = ""; }; - 1350D716092AA858005EE038 /* rangefit.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = rangefit.cpp; sourceTree = ""; }; - 1350D717092AA858005EE038 /* rangefit.h */ = {isa = PBXFileReference; fileEncoding = 30; 
lastKnownFileType = sourcecode.c.h; path = rangefit.h; sourceTree = ""; }; - 1350D718092AA858005EE038 /* squish.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = squish.cpp; sourceTree = ""; }; - 1350D719092AA858005EE038 /* squish.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = squish.h; sourceTree = ""; }; - 13906CE3096938880000A6A7 /* texture_compression_s3tc.txt */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = text; path = texture_compression_s3tc.txt; sourceTree = ""; }; - 139C21C409ADAA7000A2500D /* squishgen */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = squishgen; sourceTree = BUILT_PRODUCTS_DIR; }; - 139C21CE09ADAB0800A2500D /* squishgen.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = squishgen.cpp; path = extra/squishgen.cpp; sourceTree = ""; }; - 139C234D09B0602700A2500D /* singlecolourfit.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = singlecolourfit.h; sourceTree = ""; }; - 139C234E09B0602700A2500D /* singlecolourfit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = singlecolourfit.cpp; sourceTree = ""; }; - 139C236D09B060A900A2500D /* singlecolourlookup.inl */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = text; path = singlecolourlookup.inl; sourceTree = ""; }; - 13A7CCA20952BE63001C963A /* colourfit.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = colourfit.h; sourceTree = ""; }; - 13A7CCA30952BE63001C963A /* colourfit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = colourfit.cpp; sourceTree = ""; }; - 13C4C7AB0941C18000AC5B89 /* colourset.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = colourset.cpp; sourceTree = ""; }; - 13C4C7AC0941C18000AC5B89 /* colourset.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = colourset.h; sourceTree = ""; }; - 13CD64C0092BCF8A00488C97 /* simd.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = simd.h; sourceTree = ""; }; - 13D0DC900931F93A00909807 /* simd_ve.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = simd_ve.h; sourceTree = ""; }; - 13D0DC960931F9D600909807 /* simd_sse.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = simd_sse.h; sourceTree = ""; }; - D2AAC046055464E500DB518D /* libsquish.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libsquish.a; sourceTree = BUILT_PRODUCTS_DIR; }; -/* End PBXFileReference section */ - -/* Begin PBXFrameworksBuildPhase section */ - 1342B40F0999DE7F00152915 /* Frameworks */ = { - isa = PBXFrameworksBuildPhase; - buildActionMask = 2147483647; - files = ( - 1342B4160999DF1900152915 /* libsquish.a in Frameworks */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; - 1342B4350999E07C00152915 /* Frameworks */ = { - isa = PBXFrameworksBuildPhase; - buildActionMask = 2147483647; - files = ( - 1342B4420999E0EC00152915 /* libsquish.a in Frameworks */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; - 139C21C209ADAA7000A2500D /* Frameworks */ = { - isa = PBXFrameworksBuildPhase; - buildActionMask = 2147483647; - files 
= ( - ); - runOnlyForDeploymentPostprocessing = 0; - }; - D289987405E68DCB004EDB86 /* Frameworks */ = { - isa = PBXFrameworksBuildPhase; - buildActionMask = 2147483647; - files = ( - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXFrameworksBuildPhase section */ - -/* Begin PBXGroup section */ - 08FB7794FE84155DC02AAC07 /* squish */ = { - isa = PBXGroup; - children = ( - 08FB7795FE84155DC02AAC07 /* Source */, - C6A0FF2B0290797F04C91782 /* Documentation */, - 1AB674ADFE9D54B511CA2CBB /* Products */, - ); - name = squish; - sourceTree = ""; - }; - 08FB7795FE84155DC02AAC07 /* Source */ = { - isa = PBXGroup; - children = ( - 133FA0DB096A7B8E0050752E /* alpha.cpp */, - 133FA0DA096A7B8E0050752E /* alpha.h */, - 1350D70B092AA857005EE038 /* clusterfit.cpp */, - 1350D70C092AA858005EE038 /* clusterfit.h */, - 13A7CCA30952BE63001C963A /* colourfit.cpp */, - 13A7CCA20952BE63001C963A /* colourfit.h */, - 13C4C7AB0941C18000AC5B89 /* colourset.cpp */, - 13C4C7AC0941C18000AC5B89 /* colourset.h */, - 1350D70F092AA858005EE038 /* colourblock.cpp */, - 1350D710092AA858005EE038 /* colourblock.h */, - 13906CE3096938880000A6A7 /* texture_compression_s3tc.txt */, - 1350D711092AA858005EE038 /* config.h */, - 1350D712092AA858005EE038 /* maths.cpp */, - 1350D713092AA858005EE038 /* maths.h */, - 1350D716092AA858005EE038 /* rangefit.cpp */, - 1350D717092AA858005EE038 /* rangefit.h */, - 13CD64C0092BCF8A00488C97 /* simd.h */, - 13D0DC960931F9D600909807 /* simd_sse.h */, - 13D0DC900931F93A00909807 /* simd_ve.h */, - 139C234E09B0602700A2500D /* singlecolourfit.cpp */, - 139C234D09B0602700A2500D /* singlecolourfit.h */, - 139C236D09B060A900A2500D /* singlecolourlookup.inl */, - 1350D718092AA858005EE038 /* squish.cpp */, - 1350D719092AA858005EE038 /* squish.h */, - 139C21CE09ADAB0800A2500D /* squishgen.cpp */, - 1342B4190999DF7000152915 /* squishpng.cpp */, - 1342B43E0999E0CC00152915 /* squishtest.cpp */, - ); - name = Source; - sourceTree = ""; - }; - 1AB674ADFE9D54B511CA2CBB /* Products */ = { - isa = PBXGroup; - children = ( - D2AAC046055464E500DB518D /* libsquish.a */, - 1342B4110999DE7F00152915 /* squishpng */, - 1342B4370999E07C00152915 /* squishtest */, - 139C21C409ADAA7000A2500D /* squishgen */, - ); - name = Products; - sourceTree = ""; - }; - C6A0FF2B0290797F04C91782 /* Documentation */ = { - isa = PBXGroup; - children = ( - ); - name = Documentation; - sourceTree = ""; - }; -/* End PBXGroup section */ - -/* Begin PBXHeadersBuildPhase section */ - D2AAC043055464E500DB518D /* Headers */ = { - isa = PBXHeadersBuildPhase; - buildActionMask = 2147483647; - files = ( - 1350D71B092AA858005EE038 /* clusterfit.h in Headers */, - 1350D71F092AA858005EE038 /* colourblock.h in Headers */, - 1350D720092AA858005EE038 /* config.h in Headers */, - 1350D722092AA858005EE038 /* maths.h in Headers */, - 1350D726092AA858005EE038 /* rangefit.h in Headers */, - 1350D728092AA858005EE038 /* squish.h in Headers */, - 13CD64C2092BCF8A00488C97 /* simd.h in Headers */, - 13D0DC910931F93A00909807 /* simd_ve.h in Headers */, - 13D0DC970931F9D600909807 /* simd_sse.h in Headers */, - 13C4C7AE0941C18000AC5B89 /* colourset.h in Headers */, - 13A7CCA40952BE63001C963A /* colourfit.h in Headers */, - 133FA0DC096A7B8E0050752E /* alpha.h in Headers */, - 139C234F09B0602700A2500D /* singlecolourfit.h in Headers */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXHeadersBuildPhase section */ - -/* Begin PBXNativeTarget section */ - 1342B4100999DE7F00152915 /* squishpng */ = { - isa = PBXNativeTarget; - buildConfigurationList = 
1342B4130999DE9F00152915 /* Build configuration list for PBXNativeTarget "squishpng" */; - buildPhases = ( - 1342B40E0999DE7F00152915 /* Sources */, - 1342B40F0999DE7F00152915 /* Frameworks */, - ); - buildRules = ( - ); - dependencies = ( - 1342B58F099BF93D00152915 /* PBXTargetDependency */, - ); - name = squishpng; - productName = squishpng; - productReference = 1342B4110999DE7F00152915 /* squishpng */; - productType = "com.apple.product-type.tool"; - }; - 1342B4360999E07C00152915 /* squishtest */ = { - isa = PBXNativeTarget; - buildConfigurationList = 1342B43B0999E0C000152915 /* Build configuration list for PBXNativeTarget "squishtest" */; - buildPhases = ( - 1342B4340999E07C00152915 /* Sources */, - 1342B4350999E07C00152915 /* Frameworks */, - ); - buildRules = ( - ); - dependencies = ( - 1342B52C099BF72F00152915 /* PBXTargetDependency */, - ); - name = squishtest; - productName = squishtest; - productReference = 1342B4370999E07C00152915 /* squishtest */; - productType = "com.apple.product-type.tool"; - }; - 139C21C309ADAA7000A2500D /* squishgen */ = { - isa = PBXNativeTarget; - buildConfigurationList = 139C21CB09ADAB0300A2500D /* Build configuration list for PBXNativeTarget "squishgen" */; - buildPhases = ( - 139C21C109ADAA7000A2500D /* Sources */, - 139C21C209ADAA7000A2500D /* Frameworks */, - ); - buildRules = ( - ); - dependencies = ( - ); - name = squishgen; - productName = squishgen; - productReference = 139C21C409ADAA7000A2500D /* squishgen */; - productType = "com.apple.product-type.tool"; - }; - D2AAC045055464E500DB518D /* squish */ = { - isa = PBXNativeTarget; - buildConfigurationList = 1DEB91EB08733DB70010E9CD /* Build configuration list for PBXNativeTarget "squish" */; - buildPhases = ( - D2AAC043055464E500DB518D /* Headers */, - D2AAC044055464E500DB518D /* Sources */, - D289987405E68DCB004EDB86 /* Frameworks */, - ); - buildRules = ( - ); - dependencies = ( - ); - name = squish; - productName = squish; - productReference = D2AAC046055464E500DB518D /* libsquish.a */; - productType = "com.apple.product-type.library.static"; - }; -/* End PBXNativeTarget section */ - -/* Begin PBXProject section */ - 08FB7793FE84155DC02AAC07 /* Project object */ = { - isa = PBXProject; - buildConfigurationList = 1DEB91EF08733DB70010E9CD /* Build configuration list for PBXProject "squish" */; - hasScannedForEncodings = 1; - mainGroup = 08FB7794FE84155DC02AAC07 /* squish */; - projectDirPath = ""; - targets = ( - D2AAC045055464E500DB518D /* squish */, - 1342B4100999DE7F00152915 /* squishpng */, - 1342B4360999E07C00152915 /* squishtest */, - 139C21C309ADAA7000A2500D /* squishgen */, - ); - }; -/* End PBXProject section */ - -/* Begin PBXSourcesBuildPhase section */ - 1342B40E0999DE7F00152915 /* Sources */ = { - isa = PBXSourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - 1342B41A0999DF7000152915 /* squishpng.cpp in Sources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; - 1342B4340999E07C00152915 /* Sources */ = { - isa = PBXSourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - 1342B43F0999E0CC00152915 /* squishtest.cpp in Sources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; - 139C21C109ADAA7000A2500D /* Sources */ = { - isa = PBXSourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - 139C21CF09ADAB0800A2500D /* squishgen.cpp in Sources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; - D2AAC044055464E500DB518D /* Sources */ = { - isa = PBXSourcesBuildPhase; - buildActionMask = 2147483647; - files = ( - 1350D71A092AA858005EE038 /* 
clusterfit.cpp in Sources */, - 1350D71E092AA858005EE038 /* colourblock.cpp in Sources */, - 1350D721092AA858005EE038 /* maths.cpp in Sources */, - 1350D725092AA858005EE038 /* rangefit.cpp in Sources */, - 1350D727092AA858005EE038 /* squish.cpp in Sources */, - 13C4C7AD0941C18000AC5B89 /* colourset.cpp in Sources */, - 13A7CCA50952BE63001C963A /* colourfit.cpp in Sources */, - 133FA0DD096A7B8E0050752E /* alpha.cpp in Sources */, - 139C235009B0602700A2500D /* singlecolourfit.cpp in Sources */, - ); - runOnlyForDeploymentPostprocessing = 0; - }; -/* End PBXSourcesBuildPhase section */ - -/* Begin PBXTargetDependency section */ - 1342B52C099BF72F00152915 /* PBXTargetDependency */ = { - isa = PBXTargetDependency; - target = D2AAC045055464E500DB518D /* squish */; - targetProxy = 1342B52B099BF72F00152915 /* PBXContainerItemProxy */; - }; - 1342B58F099BF93D00152915 /* PBXTargetDependency */ = { - isa = PBXTargetDependency; - target = D2AAC045055464E500DB518D /* squish */; - targetProxy = 1342B58E099BF93D00152915 /* PBXContainerItemProxy */; - }; -/* End PBXTargetDependency section */ - -/* Begin XCBuildConfiguration section */ - 1342B4140999DE9F00152915 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - HEADER_SEARCH_PATHS = ( - .., - /sw/include, - ); - INSTALL_PATH = "$(HOME)/bin"; - LIBRARY_SEARCH_PATHS = /sw/lib; - OTHER_LDFLAGS = "-lpng"; - PRODUCT_NAME = squishpng; - }; - name = Debug; - }; - 1342B4150999DE9F00152915 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - HEADER_SEARCH_PATHS = ( - .., - /sw/include, - ); - INSTALL_PATH = "$(HOME)/bin"; - LIBRARY_SEARCH_PATHS = /sw/lib; - OTHER_LDFLAGS = "-lpng"; - PRODUCT_NAME = squishpng; - }; - name = Release; - }; - 1342B43C0999E0C000152915 /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - HEADER_SEARCH_PATHS = ..; - INSTALL_PATH = "$(HOME)/bin"; - PRODUCT_NAME = squishtest; - }; - name = Debug; - }; - 1342B43D0999E0C000152915 /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - HEADER_SEARCH_PATHS = ..; - INSTALL_PATH = "$(HOME)/bin"; - PRODUCT_NAME = squishtest; - }; - name = Release; - }; - 139C21CC09ADAB0300A2500D /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - HEADER_SEARCH_PATHS = ..; - INSTALL_PATH = "$(HOME)/bin"; - PRODUCT_NAME = squishgen; - }; - name = Debug; - }; - 139C21CD09ADAB0300A2500D /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - HEADER_SEARCH_PATHS = ..; - INSTALL_PATH = "$(HOME)/bin"; - PRODUCT_NAME = squishgen; - }; - name = Release; - }; - 1DEB91EC08733DB70010E9CD /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - COPY_PHASE_STRIP = NO; - GCC_PREPROCESSOR_DEFINITIONS = "SQUISH_USE_ALTIVEC=1"; - INSTALL_PATH = /usr/local/lib; - OTHER_CFLAGS = "-maltivec"; - PRODUCT_NAME = squish; - STRIP_INSTALLED_PRODUCT = NO; - }; - name = Debug; - }; - 1DEB91ED08733DB70010E9CD /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - GCC_PREPROCESSOR_DEFINITIONS = "SQUISH_USE_ALTIVEC=1"; - INSTALL_PATH = /usr/local/lib; - OTHER_CFLAGS = "-maltivec"; - PRODUCT_NAME = squish; - STRIP_INSTALLED_PRODUCT = YES; - }; - name = Release; - }; - 1DEB91F008733DB70010E9CD /* Debug */ = { - isa = XCBuildConfiguration; - buildSettings = { - GCC_DYNAMIC_NO_PIC = YES; - GCC_OPTIMIZATION_LEVEL = 0; - GCC_TREAT_WARNINGS_AS_ERRORS = YES; - GCC_WARN_ABOUT_MISSING_NEWLINE = YES; - GCC_WARN_ABOUT_RETURN_TYPE = YES; - GCC_WARN_PEDANTIC = YES; - GCC_WARN_SHADOW = YES; - GCC_WARN_SIGN_COMPARE = YES; - 
GCC_WARN_UNUSED_PARAMETER = YES; - GCC_WARN_UNUSED_VALUE = YES; - GCC_WARN_UNUSED_VARIABLE = YES; - PREBINDING = NO; - SDKROOT = /Developer/SDKs/MacOSX10.4u.sdk; - }; - name = Debug; - }; - 1DEB91F108733DB70010E9CD /* Release */ = { - isa = XCBuildConfiguration; - buildSettings = { - GCC_DYNAMIC_NO_PIC = YES; - GCC_OPTIMIZATION_LEVEL = 3; - GCC_TREAT_WARNINGS_AS_ERRORS = YES; - GCC_UNROLL_LOOPS = YES; - GCC_WARN_ABOUT_MISSING_NEWLINE = YES; - GCC_WARN_ABOUT_RETURN_TYPE = YES; - GCC_WARN_PEDANTIC = YES; - GCC_WARN_SHADOW = YES; - GCC_WARN_SIGN_COMPARE = YES; - GCC_WARN_UNUSED_PARAMETER = YES; - GCC_WARN_UNUSED_VALUE = YES; - GCC_WARN_UNUSED_VARIABLE = YES; - PREBINDING = NO; - SDKROOT = /Developer/SDKs/MacOSX10.4u.sdk; - }; - name = Release; - }; -/* End XCBuildConfiguration section */ - -/* Begin XCConfigurationList section */ - 1342B4130999DE9F00152915 /* Build configuration list for PBXNativeTarget "squishpng" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - 1342B4140999DE9F00152915 /* Debug */, - 1342B4150999DE9F00152915 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; - 1342B43B0999E0C000152915 /* Build configuration list for PBXNativeTarget "squishtest" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - 1342B43C0999E0C000152915 /* Debug */, - 1342B43D0999E0C000152915 /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; - 139C21CB09ADAB0300A2500D /* Build configuration list for PBXNativeTarget "squishgen" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - 139C21CC09ADAB0300A2500D /* Debug */, - 139C21CD09ADAB0300A2500D /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; - 1DEB91EB08733DB70010E9CD /* Build configuration list for PBXNativeTarget "squish" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - 1DEB91EC08733DB70010E9CD /* Debug */, - 1DEB91ED08733DB70010E9CD /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; - 1DEB91EF08733DB70010E9CD /* Build configuration list for PBXProject "squish" */ = { - isa = XCConfigurationList; - buildConfigurations = ( - 1DEB91F008733DB70010E9CD /* Debug */, - 1DEB91F108733DB70010E9CD /* Release */, - ); - defaultConfigurationIsVisible = 0; - defaultConfigurationName = Release; - }; -/* End XCConfigurationList section */ - }; - rootObject = 08FB7793FE84155DC02AAC07 /* Project object */; -} Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/texture_compression_s3tc.txt =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/texture_compression_s3tc.txt +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/texture_compression_s3tc.txt @@ -1,508 +0,0 @@ -Name - - EXT_texture_compression_s3tc - -Name Strings - - GL_EXT_texture_compression_s3tc - -Contact - - Pat Brown, NVIDIA Corporation (pbrown 'at' nvidia.com) - -Status - - FINAL - -Version - - 1.1, 16 November 2001 (containing only clarifications relative to - version 1.0, dated 7 July 2000) - -Number - - 198 - -Dependencies - - OpenGL 1.1 is required. - - GL_ARB_texture_compression is required. - - This extension is written against the OpenGL 1.2.1 Specification. 
- -Overview - - This extension provides additional texture compression functionality - specific to S3's S3TC format (called DXTC in Microsoft's DirectX API), - subject to all the requirements and limitations described by the extension - GL_ARB_texture_compression. - - This extension supports DXT1, DXT3, and DXT5 texture compression formats. - For the DXT1 image format, this specification supports an RGB-only mode - and a special RGBA mode with single-bit "transparent" alpha. - -IP Status - - Contact S3 Incorporated (http://www.s3.com) regarding any intellectual - property issues associated with implementing this extension. - - WARNING: Vendors able to support S3TC texture compression in Direct3D - drivers do not necessarily have the right to use the same functionality in - OpenGL. - -Issues - - (1) Should DXT2 and DXT4 (premultiplied alpha) formats be supported? - - RESOLVED: No -- insufficient interest. Supporting DXT2 and DXT4 - would require some rework to the TexEnv definition (maybe add a new - base internal format RGBA_PREMULTIPLIED_ALPHA) for these formats. - Note that the EXT_texture_env_combine extension (which extends normal - TexEnv modes) can be used to support textures with premultipled alpha. - - (2) Should generic "RGB_S3TC_EXT" and "RGBA_S3TC_EXT" enums be supported - or should we use only the DXT enums? - - RESOLVED: No. A generic RGBA_S3TC_EXT is problematic because DXT3 - and DXT5 are both nominally RGBA (and DXT1 with the 1-bit alpha is - also) yet one format must be chosen up front. - - (3) Should TexSubImage support all block-aligned edits or just the minimal - functionality required by the ARB_texture_compression extension? - - RESOLVED: Allow all valid block-aligned edits. - - (4) A pre-compressed image with a DXT1 format can be used as either an - RGB_S3TC_DXT1 or an RGBA_S3TC_DXT1 image. If the image has - transparent texels, how are they treated in each format? - - RESOLVED: The renderer has to make sure that an RGB_S3TC_DXT1 format - is decoded as RGB (where alpha is effectively one for all texels), - while RGBA_S3TC_DXT1 is decoded as RGBA (where alpha is zero for all - texels with "transparent" encodings). Otherwise, the formats are - identical. - - (5) Is the encoding of the RGB components for DXT1 formats correct in this - spec? MSDN documentation does not specify an RGB color for the - "transparent" encoding. Is it really black? - - RESOLVED: Yes. The specification for the DXT1 format initially - required black, but later changed that requirement to a - recommendation. All vendors involved in the definition of this - specification support black. In addition, specifying black has a - useful behavior. - - When blending multiple texels (GL_LINEAR filtering), mixing opaque and - transparent samples is problematic. Defining a black color on - transparent texels achieves a sensible result that works like a - texture with premultiplied alpha. For example, if three opaque white - and one transparent sample is being averaged, the result would be a - 75% intensity gray (with an alpha of 75%). This is the same result on - the color channels as would be obtained using a white color, 75% - alpha, and a SRC_ALPHA blend factor. - - (6) Is the encoding of the RGB components for DXT3 and DXT5 formats - correct in this spec? MSDN documentation suggests that the RGB blocks - for DXT3 and DXT5 are decoded as described by the DXT1 format. - - RESOLVED: Yes -- this appears to be a bug in the MSDN documentation. 
- The specification for the DXT2-DXT5 formats require decoding using the - opaque block encoding, regardless of the relative values of "color0" - and "color1". - -New Procedures and Functions - - None. - -New Tokens - - Accepted by the parameter of TexImage2D, CopyTexImage2D, - and CompressedTexImage2DARB and the parameter of - CompressedTexSubImage2DARB: - - COMPRESSED_RGB_S3TC_DXT1_EXT 0x83F0 - COMPRESSED_RGBA_S3TC_DXT1_EXT 0x83F1 - COMPRESSED_RGBA_S3TC_DXT3_EXT 0x83F2 - COMPRESSED_RGBA_S3TC_DXT5_EXT 0x83F3 - -Additions to Chapter 2 of the OpenGL 1.2.1 Specification (OpenGL Operation) - - None. - -Additions to Chapter 3 of the OpenGL 1.2.1 Specification (Rasterization) - - Add to Table 3.16.1: Specific Compressed Internal Formats - - Compressed Internal Format Base Internal Format - ========================== ==================== - COMPRESSED_RGB_S3TC_DXT1_EXT RGB - COMPRESSED_RGBA_S3TC_DXT1_EXT RGBA - COMPRESSED_RGBA_S3TC_DXT3_EXT RGBA - COMPRESSED_RGBA_S3TC_DXT5_EXT RGBA - - - Modify Section 3.8.2, Alternate Image Specification - - (add to end of TexSubImage discussion, p.123 -- after edit from the - ARB_texture_compression spec) - - If the internal format of the texture image being modified is - COMPRESSED_RGB_S3TC_DXT1_EXT, COMPRESSED_RGBA_S3TC_DXT1_EXT, - COMPRESSED_RGBA_S3TC_DXT3_EXT, or COMPRESSED_RGBA_S3TC_DXT5_EXT, the - texture is stored using one of the several S3TC compressed texture image - formats. Such images are easily edited along 4x4 texel boundaries, so the - limitations on TexSubImage2D or CopyTexSubImage2D parameters are relaxed. - TexSubImage2D and CopyTexSubImage2D will result in an INVALID_OPERATION - error only if one of the following conditions occurs: - - * is not a multiple of four or equal to TEXTURE_WIDTH, - unless and are both zero. - * is not a multiple of four or equal to TEXTURE_HEIGHT, - unless and are both zero. - * or is not a multiple of four. - - The contents of any 4x4 block of texels of an S3TC compressed texture - image that does not intersect the area being modified are preserved during - valid TexSubImage2D and CopyTexSubImage2D calls. - - - Add to Section 3.8.2, Alternate Image Specification (adding to the end of - the CompressedTexImage section introduced by the ARB_texture_compression - spec) - - If is COMPRESSED_RGB_S3TC_DXT1_EXT, - COMPRESSED_RGBA_S3TC_DXT1_EXT, COMPRESSED_RGBA_S3TC_DXT3_EXT, or - COMPRESSED_RGBA_S3TC_DXT5_EXT, the compressed texture is stored using one - of several S3TC compressed texture image formats. The S3TC texture - compression algorithm supports only 2D images without borders. - CompressedTexImage1DARB and CompressedTexImage3DARB produce an - INVALID_ENUM error if is an S3TC format. - CompressedTexImage2DARB will produce an INVALID_OPERATION error if - is non-zero. - - - Add to Section 3.8.2, Alternate Image Specification (adding to the end of - the CompressedTexSubImage section introduced by the - ARB_texture_compression spec) - - If the internal format of the texture image being modified is - COMPRESSED_RGB_S3TC_DXT1_EXT, COMPRESSED_RGBA_S3TC_DXT1_EXT, - COMPRESSED_RGBA_S3TC_DXT3_EXT, or COMPRESSED_RGBA_S3TC_DXT5_EXT, the - texture is stored using one of the several S3TC compressed texture image - formats. Since the S3TC texture compression algorithm supports only 2D - images, CompressedTexSubImage1DARB and CompressedTexSubImage3DARB produce - an INVALID_ENUM error if is an S3TC format. Since S3TC images - are easily edited along 4x4 texel boundaries, the limitations on - CompressedTexSubImage2D are relaxed. 
CompressedTexSubImage2D will result - in an INVALID_OPERATION error only if one of the following conditions - occurs: - - * is not a multiple of four or equal to TEXTURE_WIDTH. - * is not a multiple of four or equal to TEXTURE_HEIGHT. - * or is not a multiple of four. - - The contents of any 4x4 block of texels of an S3TC compressed texture - image that does not intersect the area being modified are preserved during - valid TexSubImage2D and CopyTexSubImage2D calls. - -Additions to Chapter 4 of the OpenGL 1.2.1 Specification (Per-Fragment -Operations and the Frame Buffer) - - None. - -Additions to Chapter 5 of the OpenGL 1.2.1 Specification (Special Functions) - - None. - -Additions to Chapter 6 of the OpenGL 1.2.1 Specification (State and -State Requests) - - None. - -Additions to Appendix A of the OpenGL 1.2.1 Specification (Invariance) - - None. - -Additions to the AGL/GLX/WGL Specifications - - None. - -GLX Protocol - - None. - -Errors - - INVALID_ENUM is generated by CompressedTexImage1DARB or - CompressedTexImage3DARB if is - COMPRESSED_RGB_S3TC_DXT1_EXT, COMPRESSED_RGBA_S3TC_DXT1_EXT, - COMPRESSED_RGBA_S3TC_DXT3_EXT, or COMPRESSED_RGBA_S3TC_DXT5_EXT. - - INVALID_OPERATION is generated by CompressedTexImage2DARB if - is COMPRESSED_RGB_S3TC_DXT1_EXT, - COMPRESSED_RGBA_S3TC_DXT1_EXT, COMPRESSED_RGBA_S3TC_DXT3_EXT, or - COMPRESSED_RGBA_S3TC_DXT5_EXT and is not equal to zero. - - INVALID_ENUM is generated by CompressedTexSubImage1DARB or - CompressedTexSubImage3DARB if is COMPRESSED_RGB_S3TC_DXT1_EXT, - COMPRESSED_RGBA_S3TC_DXT1_EXT, COMPRESSED_RGBA_S3TC_DXT3_EXT, or - COMPRESSED_RGBA_S3TC_DXT5_EXT. - - INVALID_OPERATION is generated by TexSubImage2D CopyTexSubImage2D, or - CompressedTexSubImage2D if TEXTURE_INTERNAL_FORMAT is - COMPRESSED_RGB_S3TC_DXT1_EXT, COMPRESSED_RGBA_S3TC_DXT1_EXT, - COMPRESSED_RGBA_S3TC_DXT3_EXT, or COMPRESSED_RGBA_S3TC_DXT5_EXT and any of - the following apply: is not a multiple of four or equal to - TEXTURE_WIDTH; is not a multiple of four or equal to - TEXTURE_HEIGHT; or is not a multiple of four. - - - The following restrictions from the ARB_texture_compression specification - do not apply to S3TC texture formats, since subimage modification is - straightforward as long as the subimage is properly aligned. - - DELETE: INVALID_OPERATION is generated by TexSubImage1D, TexSubImage2D, - DELETE: TexSubImage3D, CopyTexSubImage1D, CopyTexSubImage2D, or - DELETE: CopyTexSubImage3D if the internal format of the texture image is - DELETE: compressed and , , or does not equal - DELETE: -b, where b is value of TEXTURE_BORDER. - - DELETE: INVALID_VALUE is generated by CompressedTexSubImage1DARB, - DELETE: CompressedTexSubImage2DARB, or CompressedTexSubImage3DARB if the - DELETE: entire texture image is not being edited: if , - DELETE: , or is greater than -b, + is - DELETE: less than w+b, + is less than h+b, or - DELETE: + is less than d+b, where b is the value of - DELETE: TEXTURE_BORDER, w is the value of TEXTURE_WIDTH, h is the value of - DELETE: TEXTURE_HEIGHT, and d is the value of TEXTURE_DEPTH. - - See also errors in the GL_ARB_texture_compression specification. - -New State - - In the "Textures" state table, increment the TEXTURE_INTERNAL_FORMAT - subscript for Z by 4 in the "Type" row. 
- -New Implementation Dependent State - - None - -Appendix - - S3TC Compressed Texture Image Formats - - Compressed texture images stored using the S3TC compressed image formats - are represented as a collection of 4x4 texel blocks, where each block - contains 64 or 128 bits of texel data. The image is encoded as a normal - 2D raster image in which each 4x4 block is treated as a single pixel. If - an S3TC image has a width or height less than four, the data corresponding - to texels outside the image are irrelevant and undefined. - - When an S3TC image with a width of , height of , and block size of - (8 or 16 bytes) is decoded, the corresponding image size (in - bytes) is: - - ceil(/4) * ceil(/4) * blocksize. - - When decoding an S3TC image, the block containing the texel at offset - (, ) begins at an offset (in bytes) relative to the base of the - image of: - - blocksize * (ceil(/4) * floor(/4) + floor(/4)). - - The data corresponding to a specific texel (, ) are extracted from a - 4x4 texel block using a relative (x,y) value of - - ( modulo 4, modulo 4). - - There are four distinct S3TC image formats: - - COMPRESSED_RGB_S3TC_DXT1_EXT: Each 4x4 block of texels consists of 64 - bits of RGB image data. - - Each RGB image data block is encoded as a sequence of 8 bytes, called (in - order of increasing address): - - c0_lo, c0_hi, c1_lo, c1_hi, bits_0, bits_1, bits_2, bits_3 - - The 8 bytes of the block are decoded into three quantities: - - color0 = c0_lo + c0_hi * 256 - color1 = c1_lo + c1_hi * 256 - bits = bits_0 + 256 * (bits_1 + 256 * (bits_2 + 256 * bits_3)) - - color0 and color1 are 16-bit unsigned integers that are unpacked to - RGB colors RGB0 and RGB1 as though they were 16-bit packed pixels with - a of RGB and a type of UNSIGNED_SHORT_5_6_5. - - bits is a 32-bit unsigned integer, from which a two-bit control code - is extracted for a texel at location (x,y) in the block using: - - code(x,y) = bits[2*(4*y+x)+1..2*(4*y+x)+0] - - where bit 31 is the most significant and bit 0 is the least - significant bit. - - The RGB color for a texel at location (x,y) in the block is given by: - - RGB0, if color0 > color1 and code(x,y) == 0 - RGB1, if color0 > color1 and code(x,y) == 1 - (2*RGB0+RGB1)/3, if color0 > color1 and code(x,y) == 2 - (RGB0+2*RGB1)/3, if color0 > color1 and code(x,y) == 3 - - RGB0, if color0 <= color1 and code(x,y) == 0 - RGB1, if color0 <= color1 and code(x,y) == 1 - (RGB0+RGB1)/2, if color0 <= color1 and code(x,y) == 2 - BLACK, if color0 <= color1 and code(x,y) == 3 - - Arithmetic operations are done per component, and BLACK refers to an - RGB color where red, green, and blue are all zero. - - Since this image has an RGB format, there is no alpha component and the - image is considered fully opaque. - - - COMPRESSED_RGBA_S3TC_DXT1_EXT: Each 4x4 block of texels consists of 64 - bits of RGB image data and minimal alpha information. The RGB components - of a texel are extracted in the same way as COMPRESSED_RGB_S3TC_DXT1_EXT. - - The alpha component for a texel at location (x,y) in the block is - given by: - - 0.0, if color0 <= color1 and code(x,y) == 3 - 1.0, otherwise - - IMPORTANT: When encoding an RGBA image into a format using 1-bit - alpha, any texels with an alpha component less than 0.5 end up with an - alpha of 0.0 and any texels with an alpha component greater than or - equal to 0.5 end up with an alpha of 1.0. 
When encoding an RGBA image - into the COMPRESSED_RGBA_S3TC_DXT1_EXT format, the resulting red, - green, and blue components of any texels with a final alpha of 0.0 - will automatically be zero (black). If this behavior is not desired - by an application, it should not use COMPRESSED_RGBA_S3TC_DXT1_EXT. - This format will never be used when a generic compressed internal - format (Table 3.16.2) is specified, although the nearly identical - format COMPRESSED_RGB_S3TC_DXT1_EXT (above) may be. - - - COMPRESSED_RGBA_S3TC_DXT3_EXT: Each 4x4 block of texels consists of 64 - bits of uncompressed alpha image data followed by 64 bits of RGB image - data. - - Each RGB image data block is encoded according to the - COMPRESSED_RGB_S3TC_DXT1_EXT format, with the exception that the two code - bits always use the non-transparent encodings. In other words, they are - treated as though color0 > color1, regardless of the actual values of - color0 and color1. - - Each alpha image data block is encoded as a sequence of 8 bytes, called - (in order of increasing address): - - a0, a1, a2, a3, a4, a5, a6, a7 - - The 8 bytes of the block are decoded into one 64-bit integer: - - alpha = a0 + 256 * (a1 + 256 * (a2 + 256 * (a3 + 256 * (a4 + - 256 * (a5 + 256 * (a6 + 256 * a7)))))) - - alpha is a 64-bit unsigned integer, from which a four-bit alpha value - is extracted for a texel at location (x,y) in the block using: - - alpha(x,y) = bits[4*(4*y+x)+3..4*(4*y+x)+0] - - where bit 63 is the most significant and bit 0 is the least - significant bit. - - The alpha component for a texel at location (x,y) in the block is - given by alpha(x,y) / 15. - - - COMPRESSED_RGBA_S3TC_DXT5_EXT: Each 4x4 block of texels consists of 64 - bits of compressed alpha image data followed by 64 bits of RGB image data. - - Each RGB image data block is encoded according to the - COMPRESSED_RGB_S3TC_DXT1_EXT format, with the exception that the two code - bits always use the non-transparent encodings. In other words, they are - treated as though color0 > color1, regardless of the actual values of - color0 and color1. - - Each alpha image data block is encoded as a sequence of 8 bytes, called - (in order of increasing address): - - alpha0, alpha1, bits_0, bits_1, bits_2, bits_3, bits_4, bits_5 - - The alpha0 and alpha1 are 8-bit unsigned bytes converted to alpha - components by multiplying by 1/255. - - The 6 "bits" bytes of the block are decoded into one 48-bit integer: - - bits = bits_0 + 256 * (bits_1 + 256 * (bits_2 + 256 * (bits_3 + - 256 * (bits_4 + 256 * bits_5)))) - - bits is a 48-bit unsigned integer, from which a three-bit control code - is extracted for a texel at location (x,y) in the block using: - - code(x,y) = bits[3*(4*y+x)+1..3*(4*y+x)+0] - - where bit 47 is the most significant and bit 0 is the least - significant bit. 
- - The alpha component for a texel at location (x,y) in the block is - given by: - - alpha0, code(x,y) == 0 - alpha1, code(x,y) == 1 - - (6*alpha0 + 1*alpha1)/7, alpha0 > alpha1 and code(x,y) == 2 - (5*alpha0 + 2*alpha1)/7, alpha0 > alpha1 and code(x,y) == 3 - (4*alpha0 + 3*alpha1)/7, alpha0 > alpha1 and code(x,y) == 4 - (3*alpha0 + 4*alpha1)/7, alpha0 > alpha1 and code(x,y) == 5 - (2*alpha0 + 5*alpha1)/7, alpha0 > alpha1 and code(x,y) == 6 - (1*alpha0 + 6*alpha1)/7, alpha0 > alpha1 and code(x,y) == 7 - - (4*alpha0 + 1*alpha1)/5, alpha0 <= alpha1 and code(x,y) == 2 - (3*alpha0 + 2*alpha1)/5, alpha0 <= alpha1 and code(x,y) == 3 - (2*alpha0 + 3*alpha1)/5, alpha0 <= alpha1 and code(x,y) == 4 - (1*alpha0 + 4*alpha1)/5, alpha0 <= alpha1 and code(x,y) == 5 - 0.0, alpha0 <= alpha1 and code(x,y) == 6 - 1.0, alpha0 <= alpha1 and code(x,y) == 7 - - -Revision History - - 1.1, 11/16/01 pbrown: Updated contact info, clarified where texels - fall within a single block. - - 1.0, 07/07/00 prbrown1: Published final version agreed to by working - group members. - - 0.9, 06/24/00 prbrown1: Documented that block-aligned TexSubImage calls - do not modify existing texels outside the - modified blocks. Added caveat to allow for a - (0,0)-anchored TexSubImage operation of - arbitrary size. - - 0.7, 04/11/00 prbrown1: Added issues on DXT1, DXT3, and DXT5 encodings - where the MSDN documentation doesn't match what - is really done. Added enum values from the - extension registry. - - 0.4, 03/28/00 prbrown1: Updated to reflect final version of the - ARB_texture_compression extension. Allowed - block-aligned TexSubImage calls. - - 0.3, 03/07/00 prbrown1: Resolved issues pertaining to the format of RGB - blocks in the DXT3 and DXT5 formats (they don't - ever use the "transparent" encoding). Fixed - decoding of DXT1 blocks. Pointed out issue of - "transparent" texels in DXT1 encodings having - different behaviors for RGB and RGBA internal - formats. - - 0.2, 02/23/00 prbrown1: Minor revisions; added several issues. - - 0.11, 02/17/00 prbrown1: Slight modification to error semantics - (INVALID_ENUM instead of INVALID_OPERATION). - - 0.1, 02/15/00 prbrown1: Initial revision. 
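For reference, the DXT1 decoding rules quoted in the removed spec text above can be condensed into a short C++ sketch. This is purely illustrative and not part of this commit nor of the NVTT/squish sources; `Unpack565` and `DecodeDXT1Block` are hypothetical helper names, and the 5:6:5 expansion uses the common bit-replication convention (the spec only requires decoding "as though" the value were an UNSIGNED_SHORT_5_6_5 packed pixel).

```cpp
#include <cstdint>

// Expand a packed 5:6:5 colour to 8-bit RGB components (bit replication).
static void Unpack565(uint16_t c, int rgb[3])
{
    int r = (c >> 11) & 0x1F;
    int g = (c >> 5) & 0x3F;
    int b = c & 0x1F;
    rgb[0] = (r << 3) | (r >> 2);
    rgb[1] = (g << 2) | (g >> 4);
    rgb[2] = (b << 3) | (b >> 2);
}

// Decode one 8-byte DXT1 block into 16 RGBA texels (row-major 4x4),
// following the COMPRESSED_RGB(A)_S3TC_DXT1_EXT rules quoted above:
//   color0 >  color1 -> opaque 4-colour mode,
//   color0 <= color1 -> 3-colour mode whose code 3 is transparent black.
void DecodeDXT1Block(const uint8_t block[8], uint8_t out[16][4])
{
    uint16_t color0 = uint16_t(block[0] | (block[1] << 8));
    uint16_t color1 = uint16_t(block[2] | (block[3] << 8));
    uint32_t bits = uint32_t(block[4]) | (uint32_t(block[5]) << 8) |
                    (uint32_t(block[6]) << 16) | (uint32_t(block[7]) << 24);

    int rgb0[3], rgb1[3];
    Unpack565(color0, rgb0);
    Unpack565(color1, rgb1);

    // Build the 4-entry palette (codes 0..3) plus per-entry alpha.
    int palette[4][4];
    for (int i = 0; i < 3; ++i)
    {
        palette[0][i] = rgb0[i];
        palette[1][i] = rgb1[i];
        if (color0 > color1)
        {
            palette[2][i] = (2 * rgb0[i] + rgb1[i]) / 3;
            palette[3][i] = (rgb0[i] + 2 * rgb1[i]) / 3;
        }
        else
        {
            palette[2][i] = (rgb0[i] + rgb1[i]) / 2;
            palette[3][i] = 0; // BLACK; transparent in the RGBA_DXT1 interpretation
        }
    }
    palette[0][3] = palette[1][3] = palette[2][3] = 255;
    palette[3][3] = (color0 > color1) ? 255 : 0;

    for (int texel = 0; texel < 16; ++texel)
    {
        // code(x,y) = bits[2*(4*y+x)+1 .. 2*(4*y+x)], with texel = 4*y + x.
        int code = int((bits >> (2 * texel)) & 0x3);
        for (int c = 0; c < 4; ++c)
            out[texel][c] = uint8_t(palette[code][c]);
    }
}
```

The two-mode palette (opaque four-colour vs. three colours plus transparent black) is the encoding that the squish cluster-fit code elsewhere in this patch searches endpoints for; the sketch is only meant to make the removed spec's formulas concrete.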
Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/vs7/squish.sln =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/vs7/squish.sln +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/vs7/squish.sln @@ -1,39 +0,0 @@ -Microsoft Visual Studio Solution File, Format Version 8.00 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "squish", "squish\squish.vcproj", "{6A8518C3-D81A-4428-BD7F-C37933088AC1}" - ProjectSection(ProjectDependencies) = postProject - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "squishpng", "squishpng\squishpng.vcproj", "{3BC7CF47-F1C8-4BDA-BE30-92F17B21D2C7}" - ProjectSection(ProjectDependencies) = postProject - {6A8518C3-D81A-4428-BD7F-C37933088AC1} = {6A8518C3-D81A-4428-BD7F-C37933088AC1} - EndProjectSection -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "squishtest", "squishtest\squishtest.vcproj", "{77A3F26C-A1D6-4535-9E37-7D3DF34E4B4B}" - ProjectSection(ProjectDependencies) = postProject - {6A8518C3-D81A-4428-BD7F-C37933088AC1} = {6A8518C3-D81A-4428-BD7F-C37933088AC1} - EndProjectSection -EndProject -Global - GlobalSection(SolutionConfiguration) = preSolution - Debug = Debug - Release = Release - EndGlobalSection - GlobalSection(ProjectConfiguration) = postSolution - {6A8518C3-D81A-4428-BD7F-C37933088AC1}.Debug.ActiveCfg = Debug|Win32 - {6A8518C3-D81A-4428-BD7F-C37933088AC1}.Debug.Build.0 = Debug|Win32 - {6A8518C3-D81A-4428-BD7F-C37933088AC1}.Release.ActiveCfg = Release|Win32 - {6A8518C3-D81A-4428-BD7F-C37933088AC1}.Release.Build.0 = Release|Win32 - {3BC7CF47-F1C8-4BDA-BE30-92F17B21D2C7}.Debug.ActiveCfg = Debug|Win32 - {3BC7CF47-F1C8-4BDA-BE30-92F17B21D2C7}.Debug.Build.0 = Debug|Win32 - {3BC7CF47-F1C8-4BDA-BE30-92F17B21D2C7}.Release.ActiveCfg = Release|Win32 - {3BC7CF47-F1C8-4BDA-BE30-92F17B21D2C7}.Release.Build.0 = Release|Win32 - {77A3F26C-A1D6-4535-9E37-7D3DF34E4B4B}.Debug.ActiveCfg = Debug|Win32 - {77A3F26C-A1D6-4535-9E37-7D3DF34E4B4B}.Debug.Build.0 = Debug|Win32 - {77A3F26C-A1D6-4535-9E37-7D3DF34E4B4B}.Release.ActiveCfg = Release|Win32 - {77A3F26C-A1D6-4535-9E37-7D3DF34E4B4B}.Release.Build.0 = Release|Win32 - EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution - EndGlobalSection - GlobalSection(ExtensibilityAddIns) = postSolution - EndGlobalSection -EndGlobal Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/vs7/squish/squish.vcproj =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/vs7/squish/squish.vcproj +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/vs7/squish/squish.vcproj @@ -1,198 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/vs7/squishpng/squishpng.vcproj =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/vs7/squishpng/squishpng.vcproj +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/vs7/squishpng/squishpng.vcproj @@ -1,140 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/vs7/squishtest/squishtest.vcproj =================================================================== --- 
ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/vs7/squishtest/squishtest.vcproj +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/vs7/squishtest/squishtest.vcproj @@ -1,138 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/weightedclusterfit.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/weightedclusterfit.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/weightedclusterfit.h @@ -23,16 +23,16 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------- */ - -#ifndef SQUISH_WEIGHTEDCLUSTERFIT_H -#define SQUISH_WEIGHTEDCLUSTERFIT_H + +#ifndef NV_SQUISH_WEIGHTEDCLUSTERFIT_H +#define NV_SQUISH_WEIGHTEDCLUSTERFIT_H #include "squish.h" #include "maths.h" #include "simd.h" #include "colourfit.h" -namespace squish { +namespace nvsquish { class WeightedClusterFit : public ColourFit { Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/weightedclusterfit.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/weightedclusterfit.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/squish/weightedclusterfit.cpp @@ -1,28 +1,28 @@ /* ----------------------------------------------------------------------------- -Copyright (c) 2006 Simon Brown si@sjbrown.co.uk -Copyright (c) 2006 Ignacio Castano icastano@nvidia.com + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + Copyright (c) 2006 Ignacio Castano icastano@nvidia.com -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be included -in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - --------------------------------------------------------------------------- */ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + -------------------------------------------------------------------------- */ #include "weightedclusterfit.h" #include "colourset.h" @@ -30,158 +30,277 @@ #include -namespace squish { +namespace nvsquish { - WeightedClusterFit::WeightedClusterFit() - { - } +WeightedClusterFit::WeightedClusterFit() +{ +} - void WeightedClusterFit::SetColourSet( ColourSet const* colours, int flags ) - { - ColourFit::SetColourSet( colours, flags ); +void WeightedClusterFit::SetColourSet( ColourSet const* colours, int flags ) +{ + ColourFit::SetColourSet( colours, flags ); - // initialise the best error + // initialise the best error #if SQUISH_USE_SIMD - m_besterror = VEC4_CONST( FLT_MAX ); - Vec3 metric = m_metric.GetVec3(); + m_besterror = VEC4_CONST( FLT_MAX ); + Vec3 metric = m_metric.GetVec3(); #else - m_besterror = FLT_MAX; - Vec3 metric = m_metric; + m_besterror = FLT_MAX; + Vec3 metric = m_metric; #endif - // cache some values - int const count = m_colours->GetCount(); - Vec3 const* values = m_colours->GetPoints(); - - // get the covariance matrix - Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights(), metric ); - - // compute the principle component - Vec3 principle = ComputePrincipleComponent( covariance ); - - // build the list of values - float dps[16]; - for( int i = 0; i < count; ++i ) - { - dps[i] = Dot( values[i], principle ); - m_order[i] = i; - } - - // stable sort - for( int i = 0; i < count; ++i ) + // cache some values + int const count = m_colours->GetCount(); + Vec3 const* values = m_colours->GetPoints(); + + // get the covariance matrix + Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights(), metric ); + + // compute the principle component + Vec3 principle = ComputePrincipleComponent( covariance ); + + // build the list of values + float dps[16]; + for( int i = 0; i < count; ++i ) + { + dps[i] = Dot( values[i], principle ); + m_order[i] = i; + } + + // stable sort + for( int i = 0; i < count; ++i ) + { + for( int j = i; j > 0 && dps[j] < dps[j - 1]; --j ) { - for( int j = i; j > 0 && dps[j] < dps[j - 1]; --j ) - { - std::swap( dps[j], dps[j - 1] ); - std::swap( m_order[j], m_order[j - 1] ); - } + std::swap( dps[j], dps[j - 1] ); + std::swap( m_order[j], m_order[j - 1] ); } - - // weight all the points + } + + // weight all the points #if SQUISH_USE_SIMD - Vec4 const* unweighted = m_colours->GetPointsSimd(); - Vec4 const* weights = m_colours->GetWeightsSimd(); - m_xxsum = VEC4_CONST( 0.0f ); - m_xsum = VEC4_CONST( 0.0f ); + Vec4 const* unweighted = m_colours->GetPointsSimd(); + Vec4 const* weights = m_colours->GetWeightsSimd(); + m_xxsum = VEC4_CONST( 0.0f ); + m_xsum = VEC4_CONST( 0.0f ); #else - Vec3 const* unweighted = m_colours->GetPoints(); - float const* weights = m_colours->GetWeights(); - m_xxsum = Vec3( 0.0f ); - m_xsum = Vec3( 0.0f ); - m_wsum = 0.0f; + Vec3 const* unweighted = m_colours->GetPoints(); + float const* weights = m_colours->GetWeights(); + m_xxsum = Vec3( 0.0f ); + m_xsum = Vec3( 0.0f ); + m_wsum = 0.0f; #endif - - for( int i = 
0; i < count; ++i ) - { - int p = m_order[i]; - m_weighted[i] = weights[p] * unweighted[p]; - m_xxsum += m_weighted[i] * m_weighted[i]; - m_xsum += m_weighted[i]; + + for( int i = 0; i < count; ++i ) + { + int p = m_order[i]; + m_weighted[i] = weights[p] * unweighted[p]; + m_xxsum += m_weighted[i] * m_weighted[i]; + m_xsum += m_weighted[i]; #if !SQUISH_USE_SIMD - m_weights[i] = weights[p]; - m_wsum += m_weights[i]; + m_weights[i] = weights[p]; + m_wsum += m_weights[i]; #endif - } } +} - void WeightedClusterFit::SetMetric(float r, float g, float b) - { +void WeightedClusterFit::SetMetric(float r, float g, float b) +{ #if SQUISH_USE_SIMD - m_metric = Vec4(r, g, b, 0); + m_metric = Vec4(r, g, b, 0); #else - m_metric = Vec3(r, g, b); + m_metric = Vec3(r, g, b); #endif - m_metricSqr = m_metric * m_metric; - } + m_metricSqr = m_metric * m_metric; +} - float WeightedClusterFit::GetBestError() const - { +float WeightedClusterFit::GetBestError() const +{ #if SQUISH_USE_SIMD - Vec4 x = m_xxsum * m_metricSqr; - Vec4 error = m_besterror + x.SplatX() + x.SplatY() + x.SplatZ(); - return error.GetVec3().X(); + Vec4 x = m_xxsum * m_metricSqr; + Vec4 error = m_besterror + x.SplatX() + x.SplatY() + x.SplatZ(); + return error.GetX(); #else - return m_besterror + Dot(m_xxsum, m_metricSqr); + return m_besterror + Dot(m_xxsum, m_metricSqr); #endif - } +} #if SQUISH_USE_SIMD - void WeightedClusterFit::Compress3( void* block ) +void WeightedClusterFit::Compress3( void* block ) +{ + int const count = m_colours->GetCount(); + Vec4 const one = VEC4_CONST(1.0f); + Vec4 const zero = VEC4_CONST(0.0f); + Vec4 const half(0.5f, 0.5f, 0.5f, 0.25f); + Vec4 const two = VEC4_CONST(2.0); + Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f ); + Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); + + // declare variables + Vec4 beststart = VEC4_CONST( 0.0f ); + Vec4 bestend = VEC4_CONST( 0.0f ); + Vec4 besterror = VEC4_CONST( FLT_MAX ); + + Vec4 x0 = zero; + + int b0 = 0, b1 = 0; + + // check all possible clusters for this total order + for( int c0 = 0; c0 <= count; c0++) + { + Vec4 x1 = zero; + + for( int c1 = 0; c1 <= count-c0; c1++) + { + Vec4 const x2 = m_xsum - x1 - x0; + + //Vec3 const alphax_sum = x0 + x1 * 0.5f; + //float const alpha2_sum = w0 + w1 * 0.25f; + Vec4 const alphax_sum = MultiplyAdd(x1, half, x0); // alphax_sum, alpha2_sum + Vec4 const alpha2_sum = alphax_sum.SplatW(); + + //Vec3 const betax_sum = x2 + x1 * 0.5f; + //float const beta2_sum = w2 + w1 * 0.25f; + Vec4 const betax_sum = MultiplyAdd(x1, half, x2); // betax_sum, beta2_sum + Vec4 const beta2_sum = betax_sum.SplatW(); + + //float const alphabeta_sum = w1 * 0.25f; + Vec4 const alphabeta_sum = (x1 * half).SplatW(); // alphabeta_sum + + // float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + Vec4 const factor = Reciprocal( NegativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) ); + + Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor; + Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor; + + // clamp to the grid + a = Min( one, Max( zero, a ) ); + b = Min( one, Max( zero, b ) ); + a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp; + b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp; + + // compute the error (we skip the constant xxsum) + Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); + Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum ); + Vec4 e3 = NegativeMultiplySubtract( 
b, betax_sum, e2 ); + Vec4 e4 = MultiplyAdd( two, e3, e1 ); + + // apply the metric to the error term + Vec4 e5 = e4 * m_metricSqr; + Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ(); + + // keep the solution if it wins + if( CompareAnyLessThan( error, besterror ) ) + { + besterror = error; + beststart = a; + bestend = b; + b0 = c0; + b1 = c1; + } + + x1 += m_weighted[c0+c1]; + } + + x0 += m_weighted[c0]; + } + + // save the block if necessary + if( CompareAnyLessThan( besterror, m_besterror ) ) { - int const count = m_colours->GetCount(); - Vec4 const one = VEC4_CONST(1.0f); - Vec4 const zero = VEC4_CONST(0.0f); - Vec4 const half(0.5f, 0.5f, 0.5f, 0.25f); - Vec4 const two = VEC4_CONST(2.0); - Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f ); - Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); - - // declare variables - Vec4 beststart = VEC4_CONST( 0.0f ); - Vec4 bestend = VEC4_CONST( 0.0f ); - Vec4 besterror = VEC4_CONST( FLT_MAX ); + // compute indices from cluster sizes. + u8 bestindices[16]; + { + int i = 0; + for(; i < b0; i++) { + bestindices[i] = 0; + } + for(; i < b0+b1; i++) { + bestindices[i] = 2; + } + for(; i < count; i++) { + bestindices[i] = 1; + } + } + + // remap the indices + u8 ordered[16]; + for( int i = 0; i < count; ++i ) + ordered[m_order[i]] = bestindices[i]; + + m_colours->RemapIndices( ordered, bestindices ); - Vec4 x0 = zero; - int b0 = 0, b1 = 0; + // save the block + WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), bestindices, block ); + + // save the error + m_besterror = besterror; + } +} - // check all possible clusters for this total order - for( int c0 = 0; c0 <= count; c0++) +void WeightedClusterFit::Compress4( void* block ) +{ + int const count = m_colours->GetCount(); + Vec4 const one = VEC4_CONST(1.0f); + Vec4 const zero = VEC4_CONST(0.0f); + Vec4 const half = VEC4_CONST(0.5f); + Vec4 const two = VEC4_CONST(2.0); + Vec4 const onethird( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f ); + Vec4 const twothirds( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f ); + Vec4 const twonineths = VEC4_CONST( 2.0f/9.0f ); + Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f ); + Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); + + // declare variables + Vec4 beststart = VEC4_CONST( 0.0f ); + Vec4 bestend = VEC4_CONST( 0.0f ); + Vec4 besterror = VEC4_CONST( FLT_MAX ); + + Vec4 x0 = zero; + int b0 = 0, b1 = 0, b2 = 0; + + // check all possible clusters for this total order + for( int c0 = 0; c0 <= count; c0++) + { + Vec4 x1 = zero; + + for( int c1 = 0; c1 <= count-c0; c1++) { - Vec4 x1 = zero; - - for( int c1 = 0; c1 <= count-c0; c1++) + Vec4 x2 = zero; + + for( int c2 = 0; c2 <= count-c0-c1; c2++) { - Vec4 const x2 = m_xsum - x1 - x0; - - //Vec3 const alphax_sum = x0 + x1 * 0.5f; - //float const alpha2_sum = w0 + w1 * 0.25f; - Vec4 const alphax_sum = MultiplyAdd(x1, half, x0); // alphax_sum, alpha2_sum + Vec4 const x3 = m_xsum - x2 - x1 - x0; + + //Vec3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f); + //float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f); + Vec4 const alphax_sum = MultiplyAdd(x2, onethird, MultiplyAdd(x1, twothirds, x0)); // alphax_sum, alpha2_sum Vec4 const alpha2_sum = alphax_sum.SplatW(); - - //Vec3 const betax_sum = x2 + x1 * 0.5f; - //float const beta2_sum = w2 + w1 * 0.25f; - Vec4 const betax_sum = MultiplyAdd(x1, half, x2); // betax_sum, beta2_sum + + //Vec3 const betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f); + //float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f); 
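// Aside -- an illustrative sketch, not part of this patch: the commented-out
// scalar lines above are the normal equations of the weighted least-squares
// problem that cluster fit solves for every candidate partition. Each pixel
// carries an interpolation weight alpha in {1, 2/3, 1/3, 0} (beta = 1 - alpha),
// the error  sum_i w_i * |alpha_i*a + beta_i*b - x_i|^2  is minimised over the
// two endpoints a and b, and the resulting 2x2 system is solved in closed
// form. A self-contained scalar version of that solve:
struct FitEndpoints { float a[3], b[3]; };

static FitEndpoints SolveEndpoints(float alpha2_sum, float beta2_sum, float alphabeta_sum,
                                   const float alphax_sum[3], const float betax_sum[3])
{
    // Normal equations:
    //   alpha2_sum    * a + alphabeta_sum * b = alphax_sum
    //   alphabeta_sum * a + beta2_sum     * b = betax_sum
    const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);

    FitEndpoints e;
    for (int c = 0; c < 3; ++c)
    {
        e.a[c] = (alphax_sum[c] * beta2_sum - betax_sum[c] * alphabeta_sum) * factor;
        e.b[c] = (betax_sum[c] * alpha2_sum - alphax_sum[c] * alphabeta_sum) * factor;
    }
    // The code below additionally clamps a and b to [0,1] and snaps them to
    // the RGB565 grid before evaluating the metric-weighted error.
    return e;
}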
+ Vec4 const betax_sum = MultiplyAdd(x2, twothirds, MultiplyAdd(x1, onethird, x3)); // betax_sum, beta2_sum Vec4 const beta2_sum = betax_sum.SplatW(); - - //float const alphabeta_sum = w1 * 0.25f; - Vec4 const alphabeta_sum = (x1 * half).SplatW(); // alphabeta_sum - + + //float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f); + Vec4 const alphabeta_sum = twonineths*( x1 + x2 ).SplatW(); // alphabeta_sum + // float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); Vec4 const factor = Reciprocal( NegativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) ); - + Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor; Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor; - + // clamp to the grid a = Min( one, Max( zero, a ) ); b = Min( one, Max( zero, b ) ); a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp; b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp; - + // compute the error (we skip the constant xxsum) Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum ); @@ -191,7 +310,7 @@ // apply the metric to the error term Vec4 e5 = e4 * m_metricSqr; Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ(); - + // keep the solution if it wins if( CompareAnyLessThan( error, besterror ) ) { @@ -200,228 +319,216 @@ bestend = b; b0 = c0; b1 = c1; + b2 = c2; } - - x1 += m_weighted[c0+c1]; + + x2 += m_weighted[c0+c1+c2]; } - - x0 += m_weighted[c0]; + + x1 += m_weighted[c0+c1]; } + + x0 += m_weighted[c0]; + } - // save the block if necessary - if( CompareAnyLessThan( besterror, m_besterror ) ) + // save the block if necessary + if( CompareAnyLessThan( besterror, m_besterror ) ) + { + // compute indices from cluster sizes. + u8 bestindices[16]; { - // compute indices from cluster sizes. 
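// Aside -- illustrative, not part of this patch: in the 4-colour BC1 mode the
// two endpoints use palette indices 0 and 1 and the interpolated colours use
// indices 2 and 3, so the winning cluster sizes translate into indices along
// the sorted order: the first b0 pixels map to endpoint a (index 0), the next
// b1 to 2/3*a + 1/3*b (index 2), the next b2 to 1/3*a + 2/3*b (index 3), and
// the rest to endpoint b (index 1). The ordered[] pass afterwards scatters the
// indices from sorted positions back to original pixel positions via m_order.
// Worked example with count = 6, b0 = 2, b1 = 1, b2 = 2:
//   indices in sorted order : 0 0 2 3 3 1
//   m_order (sorted -> orig): 4 1 5 0 2 3
//   ordered[] (per pixel)   : 3 0 3 1 0 2
static void BuildSortedIndices(int count, int b0, int b1, int b2, unsigned char indices[16])
{
    int i = 0;
    for (; i < b0; ++i)           indices[i] = 0; // endpoint a
    for (; i < b0 + b1; ++i)      indices[i] = 2; // 2/3*a + 1/3*b
    for (; i < b0 + b1 + b2; ++i) indices[i] = 3; // 1/3*a + 2/3*b
    for (; i < count; ++i)        indices[i] = 1; // endpoint b
}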
- u8 bestindices[16]; - { - int i = 0; - for(; i < b0; i++) { - bestindices[i] = 0; - } - for(; i < b0+b1; i++) { - bestindices[i] = 2; - } - for(; i < count; i++) { - bestindices[i] = 1; - } + int i = 0; + for(; i < b0; i++) { + bestindices[i] = 0; + } + for(; i < b0+b1; i++) { + bestindices[i] = 2; + } + for(; i < b0+b1+b2; i++) { + bestindices[i] = 3; + } + for(; i < count; i++) { + bestindices[i] = 1; } - - // remap the indices - u8 ordered[16]; - for( int i = 0; i < count; ++i ) - ordered[m_order[i]] = bestindices[i]; - - m_colours->RemapIndices( ordered, bestindices ); - - - // save the block - WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), bestindices, block ); - - // save the error - m_besterror = besterror; } + + // remap the indices + u8 ordered[16]; + for( int i = 0; i < count; ++i ) + ordered[m_order[i]] = bestindices[i]; + + m_colours->RemapIndices( ordered, bestindices ); + + // save the block + WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), bestindices, block ); + + // save the error + m_besterror = besterror; } +} - void WeightedClusterFit::Compress4( void* block ) - { - int const count = m_colours->GetCount(); - Vec4 const one = VEC4_CONST(1.0f); - Vec4 const zero = VEC4_CONST(0.0f); - Vec4 const half = VEC4_CONST(0.5f); - Vec4 const two = VEC4_CONST(2.0); - Vec4 const onethird( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f ); - Vec4 const twothirds( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f ); - Vec4 const twonineths = VEC4_CONST( 2.0f/9.0f ); - Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f ); - Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); - - // declare variables - Vec4 beststart = VEC4_CONST( 0.0f ); - Vec4 bestend = VEC4_CONST( 0.0f ); - Vec4 besterror = VEC4_CONST( FLT_MAX ); - - Vec4 x0 = zero; - int b0 = 0, b1 = 0, b2 = 0; +#else - // check all possible clusters for this total order - for( int c0 = 0; c0 <= count; c0++) +void WeightedClusterFit::Compress3( void* block ) +{ + int const count = m_colours->GetCount(); + Vec3 const one( 1.0f ); + Vec3 const zero( 0.0f ); + Vec3 const half( 0.5f ); + Vec3 const grid( 31.0f, 63.0f, 31.0f ); + Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f ); + + // declare variables + Vec3 beststart( 0.0f ); + Vec3 bestend( 0.0f ); + float besterror = FLT_MAX; + + Vec3 x0(0.0f); + float w0 = 0.0f; + + int b0 = 0, b1 = 0; + + // check all possible clusters for this total order + for( int c0 = 0; c0 <= count; c0++) + { + Vec3 x1(0.0f); + float w1 = 0.0f; + + for( int c1 = 0; c1 <= count-c0; c1++) { - Vec4 x1 = zero; - - for( int c1 = 0; c1 <= count-c0; c1++) - { - Vec4 x2 = zero; - - for( int c2 = 0; c2 <= count-c0-c1; c2++) - { - Vec4 const x3 = m_xsum - x2 - x1 - x0; - - //Vec3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f); - //float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f); - Vec4 const alphax_sum = MultiplyAdd(x2, onethird, MultiplyAdd(x1, twothirds, x0)); // alphax_sum, alpha2_sum - Vec4 const alpha2_sum = alphax_sum.SplatW(); - - //Vec3 const betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f); - //float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f); - Vec4 const betax_sum = MultiplyAdd(x2, twothirds, MultiplyAdd(x1, onethird, x3)); // betax_sum, beta2_sum - Vec4 const beta2_sum = betax_sum.SplatW(); - - //float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f); - Vec4 const alphabeta_sum = twonineths*( x1 + x2 ).SplatW(); // alphabeta_sum - - // float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - Vec4 
const factor = Reciprocal( NegativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) ); - - Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor; - Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor; - - // clamp to the grid - a = Min( one, Max( zero, a ) ); - b = Min( one, Max( zero, b ) ); - a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp; - b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp; - - // compute the error (we skip the constant xxsum) - Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); - Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum ); - Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 ); - Vec4 e4 = MultiplyAdd( two, e3, e1 ); - - // apply the metric to the error term - Vec4 e5 = e4 * m_metricSqr; - Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ(); - - // keep the solution if it wins - if( CompareAnyLessThan( error, besterror ) ) - { - besterror = error; - beststart = a; - bestend = b; - b0 = c0; - b1 = c1; - b2 = c2; - } - - x2 += m_weighted[c0+c1+c2]; - } - - x1 += m_weighted[c0+c1]; + float w2 = m_wsum - w0 - w1; + + // These factors could be entirely precomputed. + float const alpha2_sum = w0 + w1 * 0.25f; + float const beta2_sum = w2 + w1 * 0.25f; + float const alphabeta_sum = w1 * 0.25f; + float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + + Vec3 const alphax_sum = x0 + x1 * 0.5f; + Vec3 const betax_sum = m_xsum - alphax_sum; + + Vec3 a = (alphax_sum*beta2_sum - betax_sum*alphabeta_sum) * factor; + Vec3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor; + + // clamp to the grid + a = Min( one, Max( zero, a ) ); + b = Min( one, Max( zero, b ) ); + a = Floor( grid*a + half )*gridrcp; + b = Floor( grid*b + half )*gridrcp; + + // compute the error + Vec3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum ); + + // apply the metric to the error term + float error = Dot( e1, m_metricSqr ); + + // keep the solution if it wins + if( error < besterror ) + { + besterror = error; + beststart = a; + bestend = b; + b0 = c0; + b1 = c1; } + + x1 += m_weighted[c0+c1]; + w1 += m_weights[c0+c1]; + } + + x0 += m_weighted[c0]; + w0 += m_weights[c0]; + } - x0 += m_weighted[c0]; - } - - // save the block if necessary - if( CompareAnyLessThan( besterror, m_besterror ) ) + // save the block if necessary + if( besterror < m_besterror ) + { + // compute indices from cluster sizes. + u8 bestindices[16]; { - // compute indices from cluster sizes. 
- u8 bestindices[16]; - { - int i = 0; - for(; i < b0; i++) { - bestindices[i] = 0; - } - for(; i < b0+b1; i++) { - bestindices[i] = 2; - } - for(; i < b0+b1+b2; i++) { - bestindices[i] = 3; - } - for(; i < count; i++) { - bestindices[i] = 1; - } + int i = 0; + for(; i < b0; i++) { + bestindices[i] = 0; + } + for(; i < b0+b1; i++) { + bestindices[i] = 2; + } + for(; i < count; i++) { + bestindices[i] = 1; } - - // remap the indices - u8 ordered[16]; - for( int i = 0; i < count; ++i ) - ordered[m_order[i]] = bestindices[i]; - - m_colours->RemapIndices( ordered, bestindices ); - - // save the block - WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), bestindices, block ); - - // save the error - m_besterror = besterror; } + + // remap the indices + u8 ordered[16]; + for( int i = 0; i < count; ++i ) + ordered[m_order[i]] = bestindices[i]; + + m_colours->RemapIndices( ordered, bestindices ); + + // save the block + WriteColourBlock3( beststart, bestend, bestindices, block ); + + // save the error + m_besterror = besterror; } +} -#else - - void WeightedClusterFit::Compress3( void* block ) - { - int const count = m_colours->GetCount(); - Vec3 const one( 1.0f ); - Vec3 const zero( 0.0f ); - Vec3 const half( 0.5f ); - Vec3 const grid( 31.0f, 63.0f, 31.0f ); - Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f ); - - // declare variables - Vec3 beststart( 0.0f ); - Vec3 bestend( 0.0f ); - float besterror = FLT_MAX; - - Vec3 x0(0.0f); - float w0 = 0.0f; - - int b0 = 0, b1 = 0; - - // check all possible clusters for this total order - for( int c0 = 0; c0 <= count; c0++) +void WeightedClusterFit::Compress4( void* block ) +{ + int const count = m_colours->GetCount(); + Vec3 const one( 1.0f ); + Vec3 const zero( 0.0f ); + Vec3 const half( 0.5f ); + Vec3 const grid( 31.0f, 63.0f, 31.0f ); + Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f ); + + // declare variables + Vec3 beststart( 0.0f ); + Vec3 bestend( 0.0f ); + float besterror = FLT_MAX; + + Vec3 x0(0.0f); + float w0 = 0.0f; + int b0 = 0, b1 = 0, b2 = 0; + + // check all possible clusters for this total order + for( int c0 = 0; c0 <= count; c0++) + { + Vec3 x1(0.0f); + float w1 = 0.0f; + + for( int c1 = 0; c1 <= count-c0; c1++) { - Vec3 x1(0.0f); - float w1 = 0.0f; - - for( int c1 = 0; c1 <= count-c0; c1++) - { - float w2 = m_wsum - w0 - w1; - - // These factors could be entirely precomputed. 
- float const alpha2_sum = w0 + w1 * 0.25f; - float const beta2_sum = w2 + w1 * 0.25f; - float const alphabeta_sum = w1 * 0.25f; + Vec3 x2(0.0f); + float w2 = 0.0f; + + for( int c2 = 0; c2 <= count-c0-c1; c2++) + { + float w3 = m_wsum - w0 - w1 - w2; + + float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f); + float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f); + float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f); float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - - Vec3 const alphax_sum = x0 + x1 * 0.5f; + + Vec3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f); Vec3 const betax_sum = m_xsum - alphax_sum; - - Vec3 a = (alphax_sum*beta2_sum - betax_sum*alphabeta_sum) * factor; - Vec3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor; - + + Vec3 a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor; + Vec3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor; + // clamp to the grid a = Min( one, Max( zero, a ) ); b = Min( one, Max( zero, b ) ); a = Floor( grid*a + half )*gridrcp; b = Floor( grid*b + half )*gridrcp; - + // compute the error Vec3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum ); - + // apply the metric to the error term float error = Dot( e1, m_metricSqr ); - + // keep the solution if it wins if( error < besterror ) { @@ -430,163 +537,56 @@ bestend = b; b0 = c0; b1 = c1; + b2 = c2; } - - x1 += m_weighted[c0+c1]; - w1 += m_weights[c0+c1]; - } - - x0 += m_weighted[c0]; - w0 += m_weights[c0]; - } - - // save the block if necessary - if( besterror < m_besterror ) - { - // compute indices from cluster sizes. - u8 bestindices[16]; - { - int i = 0; - for(; i < b0; i++) { - bestindices[i] = 0; - } - for(; i < b0+b1; i++) { - bestindices[i] = 2; - } - for(; i < count; i++) { - bestindices[i] = 1; - } + + x2 += m_weighted[c0+c1+c2]; + w2 += m_weights[c0+c1+c2]; } - - // remap the indices - u8 ordered[16]; - for( int i = 0; i < count; ++i ) - ordered[m_order[i]] = bestindices[i]; - - m_colours->RemapIndices( ordered, bestindices ); - - // save the block - WriteColourBlock3( beststart, bestend, bestindices, block ); - - // save the error - m_besterror = besterror; - } + + x1 += m_weighted[c0+c1]; + w1 += m_weights[c0+c1]; + } + + x0 += m_weighted[c0]; + w0 += m_weights[c0]; } - void WeightedClusterFit::Compress4( void* block ) + // save the block if necessary + if( besterror < m_besterror ) { - int const count = m_colours->GetCount(); - Vec3 const one( 1.0f ); - Vec3 const zero( 0.0f ); - Vec3 const half( 0.5f ); - Vec3 const grid( 31.0f, 63.0f, 31.0f ); - Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f ); - - // declare variables - Vec3 beststart( 0.0f ); - Vec3 bestend( 0.0f ); - float besterror = FLT_MAX; - - Vec3 x0(0.0f); - float w0 = 0.0f; - int b0 = 0, b1 = 0, b2 = 0; - - // check all possible clusters for this total order - for( int c0 = 0; c0 <= count; c0++) - { - Vec3 x1(0.0f); - float w1 = 0.0f; - - for( int c1 = 0; c1 <= count-c0; c1++) - { - Vec3 x2(0.0f); - float w2 = 0.0f; - - for( int c2 = 0; c2 <= count-c0-c1; c2++) - { - float w3 = m_wsum - w0 - w1 - w2; - - float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f); - float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f); - float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f); - float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - - Vec3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f); - Vec3 
const betax_sum = m_xsum - alphax_sum; - - Vec3 a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor; - Vec3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor; - - // clamp to the grid - a = Min( one, Max( zero, a ) ); - b = Min( one, Max( zero, b ) ); - a = Floor( grid*a + half )*gridrcp; - b = Floor( grid*b + half )*gridrcp; - - // compute the error - Vec3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum ); - - // apply the metric to the error term - float error = Dot( e1, m_metricSqr ); - - // keep the solution if it wins - if( error < besterror ) - { - besterror = error; - beststart = a; - bestend = b; - b0 = c0; - b1 = c1; - b2 = c2; - } - - x2 += m_weighted[c0+c1+c2]; - w2 += m_weights[c0+c1+c2]; - } - - x1 += m_weighted[c0+c1]; - w1 += m_weights[c0+c1]; - } - - x0 += m_weighted[c0]; - w0 += m_weights[c0]; - } - - // save the block if necessary - if( besterror < m_besterror ) + // compute indices from cluster sizes. + u8 bestindices[16]; { - // compute indices from cluster sizes. - u8 bestindices[16]; - { - int i = 0; - for(; i < b0; i++) { - bestindices[i] = 0; - } - for(; i < b0+b1; i++) { - bestindices[i] = 2; - } - for(; i < b0+b1+b2; i++) { - bestindices[i] = 3; - } - for(; i < count; i++) { - bestindices[i] = 1; - } + int i = 0; + for(; i < b0; i++) { + bestindices[i] = 0; } + for(; i < b0+b1; i++) { + bestindices[i] = 2; + } + for(; i < b0+b1+b2; i++) { + bestindices[i] = 3; + } + for(; i < count; i++) { + bestindices[i] = 1; + } + } + + // remap the indices + u8 ordered[16]; + for( int i = 0; i < count; ++i ) + ordered[m_order[i]] = bestindices[i]; - // remap the indices - u8 ordered[16]; - for( int i = 0; i < count; ++i ) - ordered[m_order[i]] = bestindices[i]; - - m_colours->RemapIndices( ordered, bestindices ); - - // save the block - WriteColourBlock4( beststart, bestend, bestindices, block ); + m_colours->RemapIndices( ordered, bestindices ); + + // save the block + WriteColourBlock4( beststart, bestend, bestindices, block ); - // save the error - m_besterror = besterror; - } + // save the error + m_besterror = besterror; } +} #endif Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/tests/ctest.c =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/tests/ctest.c +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/tests/ctest.c @@ -1,35 +0,0 @@ - -#include - -#include - - -int main(void) -{ - NvttInputOptions inputOptions = 0; - NvttOutputOptions outputOptions = 0; - NvttCompressionOptions compressionOptions = 0; - - const unsigned int img[16*16]; - - memset(img, 0, sizeof(unsigned int) * 16 * 16); - - inputOptions = nvttCreateInputOptions(); - nvttSetInputOptionsTextureLayout(inputOptions, NVTT_TextureType_2D, 16, 16, 1); - nvttSetInputOptionsMipmapData(inputOptions, img, 16, 16, 1, 0, 0); - - outputOptions = nvttCreateOutputOptions(); - nvttSetOutputOptionsFileName(outputOptions, "output.dds"); - - compressionOptions = nvttCreateCompressionOptions(); - nvttSetCompressionOptionsFormat(compressionOptions, NVTT_Format_BC1); - - nvttCompress(inputOptions, outputOptions, compressionOptions); - - nvttDestroyCompressionOptions(compressionOptions); - nvttDestroyOutputOptions(outputOptions); - nvttDestroyInputOptions(inputOptions); - - return 0; -} - Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/tests/filtertest.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/tests/filtertest.cpp 
+++ ps/trunk/libraries/source/nvtt/src/src/nvtt/tests/filtertest.cpp @@ -1,80 +0,0 @@ - -#include -#include "../tools/cmdline.h" - -#include - -using namespace nv; - -int main(void) -{ -// MyAssertHandler assertHandler; - MyMessageHandler messageHandler; - - BoxFilter box1(0.5); - Kernel1 k1(box1, 2); - k1.debugPrint(); nvDebug("\n"); - - BoxFilter box2(1); - Kernel1 k2(box2, 2); - k2.debugPrint(); nvDebug("\n"); - - BoxFilter boxr3(1); - Kernel1 k3(boxr3, 2); - k3.debugPrint(); nvDebug("\n"); - - KaiserFilter kai4(5); - kai4.setParameters(4, 2); - Kernel1 k4(kai4, 2); - k4.debugPrint(); nvDebug("\n"); - -/* Kernel1 k3(3); - Kernel1 k4(9); - Kernel1 k5(10); - -// k3.initFilter(Filter::Box); -// k4.initFilter(Filter::Box); -// k5.initFilter(Filter::Box); - -// nvDebug("Box Filter:\n"); -// k3.debugPrint(); nvDebug("\n"); -// k4.debugPrint(); nvDebug("\n"); -// k5.debugPrint(); nvDebug("\n"); - - k3.initSinc(0.75); - k4.initSinc(0.75); - k5.initSinc(0.75); - - nvDebug("Sinc Filter:\n"); - k3.debugPrint(); nvDebug("\n"); - k4.debugPrint(); nvDebug("\n"); - k5.debugPrint(); nvDebug("\n"); - - k3.initKaiser(4, 1, 100); - k4.initKaiser(4, 1, 100); - k5.initKaiser(4, 1, 100); - - nvDebug("Kaiser Filter:\n"); - k3.debugPrint(); nvDebug("\n"); - k4.debugPrint(); nvDebug("\n"); - k5.debugPrint(); nvDebug("\n"); - - k3.initKaiser(4, 1, 10); - k4.initKaiser(4, 1, 10); - k5.initKaiser(4, 1, 10); - - nvDebug("Kaiser Filter 2:\n"); - k3.debugPrint(); nvDebug("\n"); - k4.debugPrint(); nvDebug("\n"); - k5.debugPrint(); nvDebug("\n"); -*/ - int l_start = 4; - int l_end = 2; - - BoxFilter filter; - PolyphaseKernel kp(kai4, l_start, l_end); - - kp.debugPrint(); - - return 0; -} Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/assemble.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/assemble.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/assemble.cpp @@ -1,189 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include -#include - -#include - -#include -#include -#include - -#include "cmdline.h" - -// @@ Add decent error messages. -// @@ Add option to resize images. -// @@ Add support for reading DDS files with 2D images and possibly mipmaps. 
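// Aside -- an illustrative sketch, not part of this patch: the nvassemble tool
// deleted here packs six same-sized face images into a single uncompressed
// cube-map DDS by writing a cube-map DDSHeader followed by the raw 32-bit BGRA
// pixels of each face, as its main() below shows. A condensed outline using
// the same nv:: classes (the include paths are assumptions, since the original
// #include lines were lost in this copy of the patch):
#include <nvcore/StdStream.h>           // nv::StdOutputStream (assumed path)
#include <nvimage/Image.h>              // nv::Image           (assumed path)
#include <nvimage/DirectDrawSurface.h>  // nv::DDSHeader       (assumed path)

static bool AssembleCube(nv::Image faces[6], const char * outName)
{
    const uint w = faces[0].width();
    const uint h = faces[0].height();

    nv::StdOutputStream stream(outName);
    if (stream.isError()) return false;

    nv::DDSHeader header;
    header.setWidth(w);
    header.setHeight(h);
    header.setTextureCube();
    header.setPitch(4 * w);                                   // always 32 bpp
    header.setPixelFormat(32, 0xFF0000, 0xFF00, 0xFF, 0xFF000000);
    stream << header;

    for (int f = 0; f < 6; f++)
    {
        for (uint p = 0; p < w * h; p++)
        {
            nv::Color32 c = faces[f].pixel(p);
            uint8 r = c.r, g = c.g, b = c.b, a = c.a;
            stream << b << g << r << a;                       // B, G, R, A byte order
        }
    }
    return true;
}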
- -int main(int argc, char *argv[]) -{ - MyAssertHandler assertHandler; - MyMessageHandler messageHandler; - - bool assembleCubeMap = true; - bool assembleVolume = false; - bool assembleTextureArray = false; - - nv::Array files; - nv::Path output = "output.dds"; - - // Parse arguments. - for (int i = 1; i < argc; i++) - { - // Input options. - if (strcmp("-cube", argv[i]) == 0) - { - assembleCubeMap = true; - assembleVolume = false; - assembleTextureArray = false; - } - /*if (strcmp("-volume", argv[i]) == 0) - { - assembleCubeMap = false; - assembleVolume = true; - assembleTextureArray = false; - } - if (strcmp("-array", argv[i]) == 0) - { - assembleCubeMap = false; - assembleVolume = false; - assembleTextureArray = true; - }*/ - else if (strcmp("-o", argv[i]) == 0) - { - i++; - if (i < argc && argv[i][0] != '-') - { - output = argv[i]; - } - } - else if (argv[i][0] != '-') - { - files.append(argv[i]); - } - } - - if (files.count() == 0) - { - printf("NVIDIA Texture Tools - Copyright NVIDIA Corporation 2007\n\n"); - printf("usage: nvassemble [-cube|-volume|-array] 'file0' 'file1' ...\n\n"); - return 1; - } - - if (nv::strCaseCmp(output.extension(), ".dds") != 0) - { - //output.stripExtension(); - output.append(".dds"); - } - - if (assembleCubeMap && files.count() != 6) - { - printf("*** error, 6 files expected, but got %d\n", files.count()); - return 1; - } - - // Load all files. - nv::Array images; - - uint w = 0, h = 0; - bool hasAlpha = false; - - const uint imageCount = files.count(); - images.resize(imageCount); - - for (uint i = 0; i < imageCount; i++) - { - if (!images[i].load(files[i])) - { - printf("*** error loading file\n"); - return 1; - } - - if (i == 0) - { - w = images[i].width(); - h = images[i].height(); - } - else if (images[i].width() != w || images[i].height() != h) - { - printf("*** error, size of image '%s' does not match\n", files[i].str()); - return 1; - } - - if (images[i].format() == nv::Image::Format_ARGB) - { - hasAlpha = true; - } - } - - - nv::StdOutputStream stream(output); - if (stream.isError()) { - printf("Error opening '%s' for writting\n", output.str()); - return 1; - } - - // Output DDS header. - nv::DDSHeader header; - header.setWidth(w); - header.setHeight(h); - - if (assembleCubeMap) - { - header.setTextureCube(); - } - else if (assembleVolume) - { - header.setTexture3D(); - header.setDepth(imageCount); - } - else if (assembleTextureArray) - { - //header.setTextureArray(imageCount); - } - - // @@ It always outputs 32 bpp. - header.setPitch(4 * w); - header.setPixelFormat(32, 0xFF0000, 0xFF00, 0xFF, hasAlpha ? 0xFF000000 : 0); - - stream << header; - - // Output images. 
- for (uint i = 0; i < imageCount; i++) - { - const uint pixelCount = w * h; - for (uint p = 0; p < pixelCount; p++) - { - nv::Color32 c = images[i].pixel(p); - uint8 r = c.r; - uint8 g = c.g; - uint8 b = c.b; - uint8 a = c.a; - stream << b << g << r << a; - } - } - - return 0; -} - Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/benchmark.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/benchmark.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/benchmark.cpp @@ -1,374 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include -#include - -#include -#include - -#include - -#include "cmdline.h" - -#include // clock - - -struct MyErrorHandler : public nvtt::ErrorHandler -{ - virtual void error(nvtt::Error e) - { - nvDebugBreak(); - } -}; - - -// Set color to normal map conversion options. -void setColorToNormalMap(nvtt::InputOptions & inputOptions) -{ - inputOptions.setNormalMap(false); - inputOptions.setConvertToNormalMap(true); - inputOptions.setHeightEvaluation(1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 0.0f); - //inputOptions.setNormalFilter(1.0f, 0, 0, 0); - //inputOptions.setNormalFilter(0.0f, 0, 0, 1); - inputOptions.setGamma(1.0f, 1.0f); - inputOptions.setNormalizeMipmaps(true); -} - -// Set options for normal maps. -void setNormalMap(nvtt::InputOptions & inputOptions) -{ - inputOptions.setNormalMap(true); - inputOptions.setConvertToNormalMap(false); - inputOptions.setGamma(1.0f, 1.0f); - inputOptions.setNormalizeMipmaps(true); -} - -// Set options for color maps. -void setColorMap(nvtt::InputOptions & inputOptions) -{ - inputOptions.setNormalMap(false); - inputOptions.setConvertToNormalMap(false); - inputOptions.setGamma(2.2f, 2.2f); - inputOptions.setNormalizeMipmaps(false); -} - - - -int main(int argc, char *argv[]) -{ - MyAssertHandler assertHandler; - MyMessageHandler messageHandler; - - bool normal = false; - bool color2normal = false; - bool wrapRepeat = false; - bool noMipmaps = false; - bool fast = false; - bool nocuda = false; - bool silent = false; - nvtt::Format format = nvtt::Format_BC1; - - const char * externalCompressor = NULL; - - nv::Path input; - nv::Path output; - - - // Parse arguments. - for (int i = 1; i < argc; i++) - { - // Input options. 
- if (strcmp("-color", argv[i]) == 0) - { - } - else if (strcmp("-normal", argv[i]) == 0) - { - normal = true; - } - else if (strcmp("-tonormal", argv[i]) == 0) - { - color2normal = true; - } - else if (strcmp("-clamp", argv[i]) == 0) - { - } - else if (strcmp("-repeat", argv[i]) == 0) - { - wrapRepeat = true; - } - else if (strcmp("-nomips", argv[i]) == 0) - { - noMipmaps = true; - } - - // Compression options. - else if (strcmp("-fast", argv[i]) == 0) - { - fast = true; - } - else if (strcmp("-nocuda", argv[i]) == 0) - { - nocuda = true; - } - else if (strcmp("-rgb", argv[i]) == 0) - { - format = nvtt::Format_RGB; - } - else if (strcmp("-bc1", argv[i]) == 0) - { - format = nvtt::Format_BC1; - } - else if (strcmp("-bc1a", argv[i]) == 0) - { - format = nvtt::Format_BC1a; - } - else if (strcmp("-bc2", argv[i]) == 0) - { - format = nvtt::Format_BC2; - } - else if (strcmp("-bc3", argv[i]) == 0) - { - format = nvtt::Format_BC3; - } - else if (strcmp("-bc3n", argv[i]) == 0) - { - format = nvtt::Format_BC3n; - } - else if (strcmp("-bc4", argv[i]) == 0) - { - format = nvtt::Format_BC4; - } - else if (strcmp("-bc5", argv[i]) == 0) - { - format = nvtt::Format_BC5; - } - - // Undocumented option. Mainly used for testing. - else if (strcmp("-ext", argv[i]) == 0) - { - if (i+1 < argc && argv[i+1][0] != '-') { - externalCompressor = argv[i+1]; - i++; - } - } - - // Misc options - else if (strcmp("-silent", argv[i]) == 0) - { - silent = true; - } - - else if (argv[i][0] != '-') - { - input = argv[i]; - - if (i+1 < argc && argv[i+1][0] != '-') { - output = argv[i+1]; - } - else - { - output.copy(input.str()); - output.stripExtension(); - output.append(".dds"); - } - - break; - } - } - - printf("NVIDIA Texture Tools - Copyright NVIDIA Corporation 2007\n\n"); - - if (input.isNull()) - { - printf("usage: nvttbenchmark [options] infile [outfile]\n\n"); - - printf("Input options:\n"); - printf(" -color \tThe input image is a color map (default).\n"); - printf(" -normal \tThe input image is a normal map.\n"); - printf(" -tonormal\tConvert input to normal map.\n"); - printf(" -clamp \tClamp wrapping mode (default).\n"); - printf(" -repeat \tRepeat wrapping mode.\n"); - printf(" -nomips \tDisable mipmap generation.\n\n"); - - printf("Compression options:\n"); - printf(" -fast \tFast compression.\n"); - printf(" -nocuda \tDo not use cuda compressor.\n"); - printf(" -rgb \tRGBA format\n"); - printf(" -bc1 \tBC1 format (DXT1)\n"); - printf(" -bc1a \tBC1 format with binary alpha (DXT1a)\n"); - printf(" -bc2 \tBC2 format (DXT3)\n"); - printf(" -bc3 \tBC3 format (DXT5)\n"); - printf(" -bc3n \tBC3 normal map format (DXT5nm)\n"); - printf(" -bc4 \tBC4 format (ATI1)\n"); - printf(" -bc5 \tBC5 format (3Dc/ATI2)\n\n"); - - return 1; - } - - // @@ Make sure input file exists. - - // Set input options. - nvtt::InputOptions inputOptions; - - if (nv::strCaseCmp(input.extension(), ".dds") == 0) - { - // Load surface. 
- nv::DirectDrawSurface dds(input); - if (!dds.isValid()) - { - fprintf(stderr, "The file '%s' is not a valid DDS file.\n", input.str()); - return 1; - } - - if (!dds.isSupported() || dds.isTexture3D()) - { - fprintf(stderr, "The file '%s' is not a supported DDS file.\n", input.str()); - return 1; - } - - uint faceCount; - if (dds.isTexture2D()) - { - inputOptions.setTextureLayout(nvtt::TextureType_2D, dds.width(), dds.height()); - faceCount = 1; - } - else - { - nvDebugCheck(dds.isTextureCube()); - inputOptions.setTextureLayout(nvtt::TextureType_Cube, dds.width(), dds.height()); - faceCount = 6; - } - - uint mipmapCount = dds.mipmapCount(); - - nv::Image mipmap; - - for (uint f = 0; f < faceCount; f++) - { - for (uint m = 0; m <= mipmapCount; m++) - { - dds.mipmap(&mipmap, f, m); - - inputOptions.setMipmapData(mipmap.pixels(), mipmap.width(), mipmap.height(), 1, f, m); - } - } - } - else - { - // Regular image. - nv::Image image; - if (!image.load(input)) - { - fprintf(stderr, "The file '%s' is not a supported image type.\n", input.str()); - return 1; - } - - inputOptions.setTextureLayout(nvtt::TextureType_2D, image.width(), image.height()); - inputOptions.setMipmapData(image.pixels(), image.width(), image.height()); - } - - if (fast) - { - inputOptions.setMipmapping(true, nvtt::MipmapFilter_Box); - } - else - { - inputOptions.setMipmapping(true, nvtt::MipmapFilter_Box); - //inputOptions.setMipmapping(true, nvtt::MipmapFilter_Kaiser); - } - - if (wrapRepeat) - { - inputOptions.setWrapMode(nvtt::WrapMode_Repeat); - } - else - { - inputOptions.setWrapMode(nvtt::WrapMode_Clamp); - } - - if (normal) - { - setNormalMap(inputOptions); - } - else if (color2normal) - { - setColorToNormalMap(inputOptions); - } - else - { - setColorMap(inputOptions); - } - - if (noMipmaps) - { - inputOptions.setMipmapping(false); - } - - - nvtt::CompressionOptions compressionOptions; - compressionOptions.setFormat(format); - if (fast) - { - compressionOptions.setQuality(nvtt::Quality_Fastest); - } - else - { - compressionOptions.setQuality(nvtt::Quality_Normal); - //compressionOptions.setQuality(nvtt::Quality_Production, 0.5f); - //compressionOptions.setQuality(nvtt::Quality_Highest); - } - compressionOptions.enableHardwareCompression(!nocuda); - compressionOptions.setColorWeights(1, 1, 1); - - if (externalCompressor != NULL) - { - compressionOptions.setExternalCompressor(externalCompressor); - } - - - MyErrorHandler errorHandler; - nvtt::OutputOptions outputOptions(NULL, &errorHandler); - -// printf("Press ENTER.\n"); -// fflush(stdout); -// getchar(); - - clock_t start = clock(); - - const int iterationCount = 20; - for (int i = 0; i < iterationCount; i++) - { - nvtt::compress(inputOptions, outputOptions, compressionOptions); - } - - clock_t end = clock(); - - float seconds = float(end-start) / CLOCKS_PER_SEC - printf("total time taken: %.3f seconds\n", seconds); - printf("time taken per texture: %.3f seconds\n", seconds / iterationCount); - printf("textures per second: %.3f T/s\n", iterationCount / seconds); - - return 0; -} - Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/cmdline.h =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/cmdline.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/cmdline.h @@ -1,68 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the 
"Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#ifndef CMDLINE_H -#define CMDLINE_H - -#include - -#include // stderr -#include // exit -#include // va_list - - -struct MyMessageHandler : public nv::MessageHandler { - MyMessageHandler() { - nv::debug::setMessageHandler( this ); - } - ~MyMessageHandler() { - nv::debug::resetMessageHandler(); - } - - virtual void log( const char * str, va_list arg ) { - va_list val; - va_copy(val, arg); - vfprintf(stderr, str, arg); - va_end(val); - } -}; - - -struct MyAssertHandler : public nv::AssertHandler { - MyAssertHandler() { - nv::debug::setAssertHandler( this ); - } - ~MyAssertHandler() { - nv::debug::resetAssertHandler(); - } - - // Handler method, note that func might be NULL! - virtual int assert( const char *exp, const char *file, int line, const char *func ) { - fprintf(stderr, "Assertion failed: %s\nIn %s:%d\n", exp, file, line); - nv::debug::dumpInfo(); - exit(1); - } -}; - - -#endif // CMDLINE_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/compress.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/compress.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/compress.cpp @@ -1,468 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
- -#include -#include - -#include -#include - -#include - -#include "cmdline.h" - -#include // clock - -//#define WINDOWS_LEAN_AND_MEAN -//#include // TIMER - - -struct MyOutputHandler : public nvtt::OutputHandler -{ - MyOutputHandler(const char * name) : total(0), progress(0), percentage(0), stream(new nv::StdOutputStream(name)) {} - virtual ~MyOutputHandler() { delete stream; } - - void setTotal(int64 t) - { - total = t + 128; - } - void setDisplayProgress(bool b) - { - verbose = b; - } - - virtual void beginImage(int size, int width, int height, int depth, int face, int miplevel) - { - // ignore. - } - - // Output data. - virtual bool writeData(const void * data, int size) - { - nvDebugCheck(stream != NULL); - stream->serialize(const_cast(data), size); - - progress += size; - int p = int((100 * progress) / total); - if (verbose && p != percentage) - { - nvCheck(p >= 0); - - percentage = p; - printf("\r%d%%", percentage); - fflush(stdout); - } - - return true; - } - - int64 total; - int64 progress; - int percentage; - bool verbose; - nv::StdOutputStream * stream; -}; - -struct MyErrorHandler : public nvtt::ErrorHandler -{ - virtual void error(nvtt::Error e) - { -#if _DEBUG - nvDebugBreak(); -#endif - printf("Error: '%s'\n", nvtt::errorString(e)); - } -}; - - - - -// Set color to normal map conversion options. -void setColorToNormalMap(nvtt::InputOptions & inputOptions) -{ - inputOptions.setNormalMap(false); - inputOptions.setConvertToNormalMap(true); - inputOptions.setHeightEvaluation(1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 0.0f); - //inputOptions.setNormalFilter(1.0f, 0, 0, 0); - //inputOptions.setNormalFilter(0.0f, 0, 0, 1); - inputOptions.setGamma(1.0f, 1.0f); - inputOptions.setNormalizeMipmaps(true); -} - -// Set options for normal maps. -void setNormalMap(nvtt::InputOptions & inputOptions) -{ - inputOptions.setNormalMap(true); - inputOptions.setConvertToNormalMap(false); - inputOptions.setGamma(1.0f, 1.0f); - inputOptions.setNormalizeMipmaps(true); -} - -// Set options for color maps. -void setColorMap(nvtt::InputOptions & inputOptions) -{ - inputOptions.setNormalMap(false); - inputOptions.setConvertToNormalMap(false); - inputOptions.setGamma(2.2f, 2.2f); - inputOptions.setNormalizeMipmaps(false); -} - - - -int main(int argc, char *argv[]) -{ - MyAssertHandler assertHandler; - MyMessageHandler messageHandler; - - bool alpha = false; - bool normal = false; - bool color2normal = false; - bool wrapRepeat = false; - bool noMipmaps = false; - bool fast = false; - bool nocuda = false; - bool silent = false; - bool bc1n = false; - nvtt::Format format = nvtt::Format_BC1; - - const char * externalCompressor = NULL; - - nv::Path input; - nv::Path output; - - - // Parse arguments. - for (int i = 1; i < argc; i++) - { - // Input options. - if (strcmp("-color", argv[i]) == 0) - { - } - else if (strcmp("-alpha", argv[i]) == 0) - { - alpha = true; - } - else if (strcmp("-normal", argv[i]) == 0) - { - normal = true; - } - else if (strcmp("-tonormal", argv[i]) == 0) - { - color2normal = true; - } - else if (strcmp("-clamp", argv[i]) == 0) - { - } - else if (strcmp("-repeat", argv[i]) == 0) - { - wrapRepeat = true; - } - else if (strcmp("-nomips", argv[i]) == 0) - { - noMipmaps = true; - } - - // Compression options. 
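// Aside -- an illustrative sketch, not part of this patch: stripped of its
// option parsing, the nvcompress tool being deleted here drives NVTT in four
// steps: describe the input layout and pixels, choose a format and quality,
// point the output at a file or handler, then run the compressor. A minimal
// version of that flow, using only calls that appear in this file (plus
// OutputOptions::setFileName, which the original keeps commented out);
// NVTT expects the pixels as 32-bit BGRA by default:
#include <nvtt/nvtt.h>  // restored; the original #include lines were lost in this copy

static bool CompressToBC1(const void * bgraPixels, int width, int height, const char * ddsName)
{
    nvtt::InputOptions inputOptions;
    inputOptions.setTextureLayout(nvtt::TextureType_2D, width, height);
    inputOptions.setMipmapData(bgraPixels, width, height);
    inputOptions.setWrapMode(nvtt::WrapMode_Clamp);
    inputOptions.setAlphaMode(nvtt::AlphaMode_None);

    nvtt::CompressionOptions compressionOptions;
    compressionOptions.setFormat(nvtt::Format_BC1);
    compressionOptions.setQuality(nvtt::Quality_Normal);

    nvtt::OutputOptions outputOptions;
    outputOptions.setFileName(ddsName);

    nvtt::Compressor compressor;
    return compressor.process(inputOptions, compressionOptions, outputOptions);
}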
- else if (strcmp("-fast", argv[i]) == 0) - { - fast = true; - } - else if (strcmp("-nocuda", argv[i]) == 0) - { - nocuda = true; - } - else if (strcmp("-rgb", argv[i]) == 0) - { - format = nvtt::Format_RGB; - } - else if (strcmp("-bc1", argv[i]) == 0) - { - format = nvtt::Format_BC1; - } - else if (strcmp("-bc1n", argv[i]) == 0) - { - format = nvtt::Format_BC1; - bc1n = true; - } - else if (strcmp("-bc1a", argv[i]) == 0) - { - format = nvtt::Format_BC1a; - } - else if (strcmp("-bc2", argv[i]) == 0) - { - format = nvtt::Format_BC2; - } - else if (strcmp("-bc3", argv[i]) == 0) - { - format = nvtt::Format_BC3; - } - else if (strcmp("-bc3n", argv[i]) == 0) - { - format = nvtt::Format_BC3n; - } - else if (strcmp("-bc4", argv[i]) == 0) - { - format = nvtt::Format_BC4; - } - else if (strcmp("-bc5", argv[i]) == 0) - { - format = nvtt::Format_BC5; - } - - // Undocumented option. Mainly used for testing. - else if (strcmp("-ext", argv[i]) == 0) - { - if (i+1 < argc && argv[i+1][0] != '-') { - externalCompressor = argv[i+1]; - i++; - } - } - - // Misc options - else if (strcmp("-silent", argv[i]) == 0) - { - silent = true; - } - - else if (argv[i][0] != '-') - { - input = argv[i]; - - if (i+1 < argc && argv[i+1][0] != '-') { - output = argv[i+1]; - } - else - { - output.copy(input.str()); - output.stripExtension(); - output.append(".dds"); - } - - break; - } - } - - const uint version = nvtt::version(); - const uint major = version / 100; - const uint minor = version % 100; - - - printf("NVIDIA Texture Tools %u.%u - Copyright NVIDIA Corporation 2007\n\n", major, minor); - - if (input.isNull()) - { - printf("usage: nvcompress [options] infile [outfile]\n\n"); - - printf("Input options:\n"); - printf(" -color \tThe input image is a color map (default).\n"); - printf(" -alpha \tThe input image has an alpha channel used for transparency.\n"); - printf(" -normal \tThe input image is a normal map.\n"); - printf(" -tonormal\tConvert input to normal map.\n"); - printf(" -clamp \tClamp wrapping mode (default).\n"); - printf(" -repeat \tRepeat wrapping mode.\n"); - printf(" -nomips \tDisable mipmap generation.\n\n"); - - printf("Compression options:\n"); - printf(" -fast \tFast compression.\n"); - printf(" -nocuda \tDo not use cuda compressor.\n"); - printf(" -rgb \tRGBA format\n"); - printf(" -bc1 \tBC1 format (DXT1)\n"); - printf(" -bc1n \tBC1 normal map format (DXT1nm)\n"); - printf(" -bc1a \tBC1 format with binary alpha (DXT1a)\n"); - printf(" -bc2 \tBC2 format (DXT3)\n"); - printf(" -bc3 \tBC3 format (DXT5)\n"); - printf(" -bc3n \tBC3 normal map format (DXT5nm)\n"); - printf(" -bc4 \tBC4 format (ATI1)\n"); - printf(" -bc5 \tBC5 format (3Dc/ATI2)\n\n"); - - return EXIT_FAILURE; - } - - // @@ Make sure input file exists. - - // Set input options. - nvtt::InputOptions inputOptions; - - if (nv::strCaseCmp(input.extension(), ".dds") == 0) - { - // Load surface. 
- nv::DirectDrawSurface dds(input); - if (!dds.isValid()) - { - fprintf(stderr, "The file '%s' is not a valid DDS file.\n", input.str()); - return EXIT_FAILURE; - } - - if (!dds.isSupported() || dds.isTexture3D()) - { - fprintf(stderr, "The file '%s' is not a supported DDS file.\n", input.str()); - return EXIT_FAILURE; - } - - uint faceCount; - if (dds.isTexture2D()) - { - inputOptions.setTextureLayout(nvtt::TextureType_2D, dds.width(), dds.height()); - faceCount = 1; - } - else - { - nvDebugCheck(dds.isTextureCube()); - inputOptions.setTextureLayout(nvtt::TextureType_Cube, dds.width(), dds.height()); - faceCount = 6; - } - - uint mipmapCount = dds.mipmapCount(); - - nv::Image mipmap; - - for (uint f = 0; f < faceCount; f++) - { - for (uint m = 0; m < mipmapCount; m++) - { - dds.mipmap(&mipmap, f, m); - - inputOptions.setMipmapData(mipmap.pixels(), mipmap.width(), mipmap.height(), 1, f, m); - } - } - } - else - { - // Regular image. - nv::Image image; - if (!image.load(input)) - { - fprintf(stderr, "The file '%s' is not a supported image type.\n", input.str()); - return EXIT_FAILURE; - } - - inputOptions.setTextureLayout(nvtt::TextureType_2D, image.width(), image.height()); - inputOptions.setMipmapData(image.pixels(), image.width(), image.height()); - } - - if (wrapRepeat) - { - inputOptions.setWrapMode(nvtt::WrapMode_Repeat); - } - else - { - inputOptions.setWrapMode(nvtt::WrapMode_Clamp); - } - - if (alpha) - { - inputOptions.setAlphaMode(nvtt::AlphaMode_Transparency); - } - else - { - inputOptions.setAlphaMode(nvtt::AlphaMode_None); - } - - if (normal) - { - setNormalMap(inputOptions); - } - else if (color2normal) - { - setColorToNormalMap(inputOptions); - } - else - { - setColorMap(inputOptions); - } - - if (noMipmaps) - { - inputOptions.setMipmapGeneration(false); - } - - nvtt::CompressionOptions compressionOptions; - compressionOptions.setFormat(format); - if (fast) - { - compressionOptions.setQuality(nvtt::Quality_Fastest); - } - else - { - compressionOptions.setQuality(nvtt::Quality_Normal); - //compressionOptions.setQuality(nvtt::Quality_Production); - //compressionOptions.setQuality(nvtt::Quality_Highest); - } - - if (bc1n) - { - compressionOptions.setColorWeights(1, 1, 0); - } - - if (externalCompressor != NULL) - { - compressionOptions.setExternalCompressor(externalCompressor); - } - - - MyErrorHandler errorHandler; - MyOutputHandler outputHandler(output); - if (outputHandler.stream->isError()) - { - fprintf(stderr, "Error opening '%s' for writting\n", output.str()); - return EXIT_FAILURE; - } - - nvtt::Compressor compressor; - compressor.enableCudaAcceleration(!nocuda); - - printf("CUDA acceleration "); - if (compressor.isCudaAccelerationEnabled()) - { - printf("ENABLED\n\n"); - } - else - { - printf("DISABLED\n\n"); - } - - outputHandler.setTotal(compressor.estimateSize(inputOptions, compressionOptions)); - outputHandler.setDisplayProgress(!silent); - - nvtt::OutputOptions outputOptions; - //outputOptions.setFileName(output); - outputOptions.setOutputHandler(&outputHandler); - outputOptions.setErrorHandler(&errorHandler); - -// printf("Press ENTER.\n"); -// fflush(stdout); -// getchar(); - - clock_t start = clock(); - - if (!compressor.process(inputOptions, compressionOptions, outputOptions)) - { - return EXIT_FAILURE; - } - - clock_t end = clock(); - printf("\rtime taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC); - - return EXIT_SUCCESS; -} - Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/configdialog.h 
=================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/configdialog.h +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/configdialog.h @@ -1,69 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#ifndef CONFIGDIALOG_H -#define CONFIGDIALOG_H - -#include - -#include "ui_configdialog.h" - -#include - - -class ConfigDialog : public QDialog -{ - Q_OBJECT -public: - ConfigDialog(QWidget *parent = 0); - ConfigDialog(const char * fileName, QWidget *parent = 0); - -protected slots: - - void openClicked(); - void generateMipmapsChanged(int state); - void mipmapFilterChanged(QString name); - - void colorWeightChanged(); - void uniformWeightToggled(bool checked); - void luminanceWeightToggled(bool checked); - - void normalMapModeChanged(bool checked); - - bool open(QString fileName); - -private: - - void init(); - -private: - Ui::ConfigDialog ui; - - nvtt::InputOptions inputOptions; - nvtt::CompressionOptions compressionOptions; - nvtt::OutputOptions outputOptions; - -}; - - -#endif // CONFIGDIALOG_H Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/configdialog.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/configdialog.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/configdialog.cpp @@ -1,170 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include "configdialog.h" - -#include - -#include - - -ConfigDialog::ConfigDialog(QWidget *parent/*=0*/) : QDialog(parent) -{ - init(); -} - -ConfigDialog::ConfigDialog(const char * fileName, QWidget *parent/*=0*/) : QDialog(parent) -{ - init(); - - open(fileName); -} - -void ConfigDialog::init() -{ - ui.setupUi(this); - - connect(ui.openButton, SIGNAL(clicked()), this, SLOT(openClicked())); - connect(ui.generateMipmapsCheckBox, SIGNAL(stateChanged(int)), this, SLOT(generateMipmapsChanged(int))); - connect(ui.mipmapFilterComboBox, SIGNAL(activated(QString)), this, SLOT(mipmapFilterChanged(QString))); - //connect(ui.mipmapFilterSettings, SIGNAL(clicked()), this, SLOT(mipmapFilterSettingsShow())); - - connect(ui.redSpinBox, SIGNAL(valueChanged(double)), this, SLOT(colorWeightChanged())); - connect(ui.greenSpinBox, SIGNAL(valueChanged(double)), this, SLOT(colorWeightChanged())); - connect(ui.blueSpinBox, SIGNAL(valueChanged(double)), this, SLOT(colorWeightChanged())); - connect(ui.uniformButton, SIGNAL(toggled(bool)), this, SLOT(uniformWeightToggled(bool))); - connect(ui.luminanceButton, SIGNAL(toggled(bool)), this, SLOT(luminanceWeightToggled(bool))); - - //connect(ui.rgbMapRadioButton, SIGNAL(toggled(bool)), this, SLOT(colorModeChanged())); - connect(ui.normalMapRadioButton, SIGNAL(toggled(bool)), this, SLOT(normalMapModeChanged(bool))); -} - - -void ConfigDialog::openClicked() -{ - // @@ Open file dialog. - - QString fileName; - - open(fileName); -} - -void ConfigDialog::generateMipmapsChanged(int state) -{ - Q_UNUSED(state); - - bool generateMipmapEnabled = ui.generateMipmapsCheckBox->isChecked(); - - ui.mipmapFilterLabel->setEnabled(generateMipmapEnabled); - ui.mipmapFilterComboBox->setEnabled(generateMipmapEnabled); - ui.limitMipmapsCheckBox->setEnabled(generateMipmapEnabled); - - bool enableFilterSettings = (ui.mipmapFilterComboBox->currentText() == "Kaiser"); - ui.mipmapFilterSettings->setEnabled(generateMipmapEnabled && enableFilterSettings); - - bool enableMaxLevel = ui.limitMipmapsCheckBox->isChecked(); - ui.maxLevelLabel->setEnabled(generateMipmapEnabled && enableMaxLevel); - ui.maxLevelSpinBox->setEnabled(generateMipmapEnabled && enableMaxLevel); -} - -void ConfigDialog::mipmapFilterChanged(QString name) -{ - bool enableFilterSettings = (name == "Kaiser"); - ui.mipmapFilterSettings->setEnabled(enableFilterSettings); -} - - -void ConfigDialog::colorWeightChanged() -{ - double r = ui.redSpinBox->value(); - double g = ui.greenSpinBox->value(); - double b = ui.blueSpinBox->value(); - - bool uniform = (r == 1.0 && g == 1.0 && b == 1.0); - bool luminance = (r == 0.3 && g == 0.59 && b == 0.11); - - ui.uniformButton->setChecked(uniform); - ui.luminanceButton->setChecked(luminance); -} - -void ConfigDialog::uniformWeightToggled(bool checked) -{ - if (checked) - { - ui.redSpinBox->setValue(1.0); - ui.greenSpinBox->setValue(1.0); - ui.blueSpinBox->setValue(1.0); - } -} - -void ConfigDialog::luminanceWeightToggled(bool checked) -{ - if (checked) - { - ui.redSpinBox->setValue(0.3); - ui.greenSpinBox->setValue(0.59); - ui.blueSpinBox->setValue(0.11); - } -} - -void ConfigDialog::normalMapModeChanged(bool checked) -{ - ui.alphaModeGroupBox->setEnabled(!checked); - ui.inputGammaSpinBox->setEnabled(!checked); - 
ui.inputGammaLabel->setEnabled(!checked); - ui.outputGammaSpinBox->setEnabled(!checked); - ui.outputGammaLabel->setEnabled(!checked); -} - - -bool ConfigDialog::open(QString fileName) -{ - // @@ Load image. - QImage image; - - // @@ If success. - { - ui.imagePathLineEdit->setText(fileName); - - // @@ Set image in graphics view. - - // @@ Set image description. - - // @@ Provide image to nvtt. - - int w = image.width(); - int h = image.height(); - void * data = NULL; - - inputOptions.setTextureLayout(nvtt::TextureType_2D, w, h); - inputOptions.setMipmapData(data, w, h); - - return true; - } - - return false; -} - - - - Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/configdialog.ui =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/configdialog.ui +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/configdialog.ui @@ -1,1046 +0,0 @@ - - ConfigDialog - - - - 0 - 0 - 674 - 475 - - - - NVIDIA Texture Tools - - - - - - true - - - - - - - - - - - 64 - 0 - - - - - 128 - 16777215 - - - - Qt::ScrollBarAlwaysOff - - - QListView::Static - - - QListView::TopToBottom - - - false - - - QListView::Adjust - - - QListView::ListMode - - - - Input Options - - - - - Compression Options - - - - - Output Options - - - - - Preview - - - ../../../../../../castano-stuff/qshaderedit/src/images/colorpicker.png - - - - - - - - - - - 0 - 0 - - - - 3 - - - - - -1 - - - 0 - - - 0 - - - 0 - - - 0 - - - - - 0 - - - Qt::ElideNone - - - false - - - - File Path - - - - - - - - - - - - 0 - 0 - - - - Open - - - - - - - - - - - - - Image Settings - - - - - - - - - 0 - 0 - - - - Color Mode - - - Qt::AlignHCenter - - - - 0 - - - - - RGB - - - true - - - - - - - Normal Map - - - - - - - - - - - 0 - 0 - - - - Alpha Mode - - - Qt::AlignHCenter - - - - 0 - - - - - None - - - false - - - - - - - Transparency - - - true - - - - - - - Premultiplied - - - - - - - - - - - - 0 - - - - - - 0 - 0 - - - - Wrap Mode: - - - mipmapFilterComboBox - - - - - - - - 16777215 - 26 - - - - - Mirror - - - - - Repeat - - - - - Clamp - - - - - - - - - - - - - 0 - 0 - - - - Input Gamma: - - - inputGammaSpinBox - - - - - - - - 0 - 0 - - - - QAbstractSpinBox::UpDownArrows - - - 0.050000000000000 - - - 4.000000000000000 - - - 0.050000000000000 - - - 2.200000000000000 - - - - - - - - - - - - 0 - 0 - - - - Output Gamma: - - - inputGammaSpinBox - - - - - - - - 0 - 0 - - - - QAbstractSpinBox::UpDownArrows - - - 0.050000000000000 - - - 4.000000000000000 - - - 0.050000000000000 - - - 2.200000000000000 - - - - - - - - - Qt::Vertical - - - - 433 - 16 - - - - - - - - - Mipmaps - - - - - - Generate mipmaps - - - true - - - - - - - 1 - - - - - - 0 - 0 - - - - Mipmap filter: - - - mipmapFilterComboBox - - - - - - - - 16777215 - 26 - - - - - Box - - - - - Triangle - - - - - Kaiser - - - - - - - - false - - - - 0 - 0 - - - - - 16777215 - 24 - - - - false - - - ... 
- - - Qt::ToolButtonTextOnly - - - - - - - - - - - - 0 - 0 - - - - Limit Mipmaps - - - - - - - false - - - - 0 - 0 - - - - Max Level: - - - - - - - false - - - - 0 - 0 - - - - - 80 - 16777215 - - - - - - - - - - Qt::Vertical - - - - 204 - 71 - - - - - - - - - Normal Map - - - - - - - - - - 0 - - - 0 - - - 0 - - - 0 - - - - - - - - 0 - 0 - - - - Format: - - - Qt::PlainText - - - Qt::NoTextInteraction - - - formatComboBox - - - - - - - - Uncompressed - - - - - BC1 (DXT1) - - - - - BC1a (DXT1a) - - - - - BC2 (DXT3) - - - - - BC3 (DXT5) - - - - - BC4 - - - - - BC5 - - - - - - - - - - - - - 0 - 0 - - - - Quality: - - - Qt::PlainText - - - Qt::NoTextInteraction - - - formatComboBox - - - - - - - 1 - - - - Fastest - - - - - Normal - - - - - Production - - - - - Highest - - - - - - - - - - - - Qt::Horizontal - - - - 40 - 20 - - - - - - - - - 0 - 0 - - - - Color Weights - - - Qt::AlignHCenter - - - - - - - - Red - - - redSpinBox - - - - - - - 1.000000000000000 - - - 0.050000000000000 - - - 1.000000000000000 - - - - - - - - - - - Green - - - greenSpinBox - - - - - - - 1.000000000000000 - - - 0.050000000000000 - - - 1.000000000000000 - - - - - - - - - - - Blue - - - blueSpinBox - - - - - - - 1.000000000000000 - - - 0.050000000000000 - - - 1.000000000000000 - - - - - - - - - - - - 16777215 - 22 - - - - Uniform Weights - - - true - - - true - - - - - - - - 16777215 - 22 - - - - Luminance Weights - - - true - - - - - - - - - - - - Qt::Horizontal - - - - 40 - 20 - - - - - - - - - - Qt::Vertical - - - - 484 - 31 - - - - - - - - - - - 0 - - - 0 - - - 0 - - - 0 - - - - - - - - - - - - - - Bilinear Filter - - - true - - - - - - - View difference - - - - - - - - - - - - - - - Qt::Horizontal - - - - - - - - - Default - - - - - - - true - - - 0 - - - true - - - Qt::Horizontal - - - false - - - - - - - Quit - - - - - - - - - - - listWidget - currentRowChanged(int) - stackedWidget - setCurrentIndex(int) - - - 118 - 193 - - - 154 - 220 - - - - - pushButton - clicked() - ConfigDialog - accept() - - - 565 - 491 - - - 582 - 506 - - - - - limitMipmapsCheckBox - clicked(bool) - maxLevelSpinBox - setEnabled(bool) - - - 451 - 120 - - - 524 - 120 - - - - - limitMipmapsCheckBox - clicked(bool) - maxLevelLabel - setEnabled(bool) - - - 337 - 120 - - - 482 - 124 - - - - - Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/ddsinfo.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/ddsinfo.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/ddsinfo.cpp @@ -1,57 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include -#include - -#include -#include - -#include "cmdline.h" - - -int main(int argc, char *argv[]) -{ - MyAssertHandler assertHandler; - MyMessageHandler messageHandler; - - if (argc != 2) - { - printf("NVIDIA Texture Tools - Copyright NVIDIA Corporation 2007\n\n"); - printf("usage: nvddsinfo ddsfile\n\n"); - return 1; - } - - // Load surface. - nv::DirectDrawSurface dds(argv[1]); - if (!dds.isValid()) - { - printf("The file '%s' is not a valid DDS file.\n", argv[1]); - return 1; - } - - dds.printInfo(); - - return 0; -} - Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/decompress.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/decompress.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/decompress.cpp @@ -1,71 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include -#include - -#include -#include - -#include - -#include "cmdline.h" - -int main(int argc, char *argv[]) -{ - MyAssertHandler assertHandler; - MyMessageHandler messageHandler; - - if (argc != 2) - { - printf("NVIDIA Texture Tools - Copyright NVIDIA Corporation 2007\n\n"); - printf("usage: nvdecompress 'ddsfile'\n\n"); - return 1; - } - - // Load surface. - nv::DirectDrawSurface dds(argv[1]); - if (!dds.isValid()) - { - printf("The file '%s' is not a valid DDS file.\n", argv[1]); - return 1; - } - - nv::Path name(argv[1]); - name.stripExtension(); - name.append(".tga"); - - nv::StdOutputStream stream(name.str()); - if (stream.isError()) { - printf("Error opening '%s' for writting\n", name.str()); - return 1; - } - - // @@ TODO: Add command line options to output mipmaps, cubemap faces, etc. 
- nv::Image img; - dds.mipmap(&img, 0, 0); // get first image - nv::ImageIO::saveTGA(stream, &img); - - return 0; -} - Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/imgdiff.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/imgdiff.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/imgdiff.cpp @@ -1,296 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include -#include -#include - -#include -#include - -#include -#include - -#include - -#include "cmdline.h" - -static bool loadImage(nv::Image & image, const char * fileName) -{ - if (nv::strCaseCmp(nv::Path::extension(fileName), ".dds") == 0) - { - nv::DirectDrawSurface dds(fileName); - if (!dds.isValid()) - { - printf("The file '%s' is not a valid DDS file.\n", fileName); - return false; - } - - dds.mipmap(&image, 0, 0); // get first image - } - else - { - // Regular image. - if (!image.load(fileName)) - { - printf("The file '%s' is not a supported image type.\n", fileName); - return false; - } - } - - return true; -} - -// @@ Compute per-tile errors. -struct Error -{ - Error() - { - samples = 0; - mabse = 0.0f; - maxabse = 0.0f; - mse = 0.0f; - } - - void addSample(float e) - { - samples++; - mabse += fabsf(e); - maxabse = nv::max(maxabse, fabsf(e)); - mse += e * e; - } - - void done() - { - mabse /= samples; - mse /= samples; - rmse = sqrtf(mse); - psnr = (rmse == 0) ? 999.0f : 20.0f * log10(255.0f / rmse); - } - - void print() - { - printf(" Mean absolute error: %f\n", mabse); - printf(" Max absolute error: %f\n", maxabse); - printf(" Root mean squared error: %f\n", rmse); - printf(" Peak signal to noise ratio in dB: %f\n", psnr); - } - - int samples; - float mabse; - float maxabse; - float mse; - float rmse; - float psnr; -}; - -struct NormalError -{ - NormalError() - { - samples = 0; - ade = 0.0f; - mse = 0.0f; - } - - void addSample(nv::Color32 o, nv::Color32 c) - { - nv::Vector3 vo = nv::Vector3(o.r, o.g, o.b); - nv::Vector3 vc = nv::Vector3(c.r, c.g, c.b); - - // Unpack and normalize. - vo = nv::normalize(2.0f * (vo / 255.0f) - 1.0f); - vc = nv::normalize(2.0f * (vc / 255.0f) - 1.0f); - - ade += acosf(nv::clamp(dot(vo, vc), -1.0f, 1.0f)); - mse += length_squared((vo - vc) * (255 / 2.0f)); - - samples++; - } - - void done() - { - if (samples) - { - ade /= samples; - mse /= samples * 3; - rmse = sqrtf(mse); - psnr = (rmse == 0) ? 
999.0f : 20.0f * log10(255.0f / rmse); - } - } - - void print() - { - printf(" Angular deviation error: %f\n", ade); - printf(" Root mean squared error: %f\n", rmse); - printf(" Peak signal to noise ratio in dB: %f\n", psnr); - } - - int samples; - float ade; - float mse; - float rmse; - float psnr; -}; - - -int main(int argc, char *argv[]) -{ - MyAssertHandler assertHandler; - MyMessageHandler messageHandler; - - bool compareNormal = false; - bool compareAlpha = false; - - nv::Path input0; - nv::Path input1; - nv::Path output; - - // Parse arguments. - for (int i = 1; i < argc; i++) - { - // Input options. - if (strcmp("-normal", argv[i]) == 0) - { - compareNormal = true; - } - if (strcmp("-alpha", argv[i]) == 0) - { - compareAlpha = true; - } - - else if (argv[i][0] != '-') - { - input0 = argv[i]; - - if (i+1 < argc && argv[i+1][0] != '-') { - input1 = argv[i+1]; - } - - break; - } - } - - if (input0.isNull() || input1.isNull()) - { - printf("NVIDIA Texture Tools - Copyright NVIDIA Corporation 2007\n\n"); - - printf("usage: nvimgdiff [options] original_file updated_file [output]\n\n"); - - printf("Diff options:\n"); - printf(" -normal \tCompare images as if they were normal maps.\n"); - printf(" -alpha \tCompare alpha weighted images.\n"); - - return 1; - } - - nv::Image image0, image1; - if (!loadImage(image0, input0)) return 0; - if (!loadImage(image1, input1)) return 0; - - const uint w0 = image0.width(); - const uint h0 = image0.height(); - const uint w1 = image1.width(); - const uint h1 = image1.height(); - const uint w = nv::min(w0, w1); - const uint h = nv::min(h0, h1); - - // Compute errors. - Error error_r; - Error error_g; - Error error_b; - Error error_a; - Error error_total; - NormalError error_normal; - - for (uint i = 0; i < h; i++) - { - for (uint e = 0; e < w; e++) - { - const nv::Color32 c0(image0.pixel(e, i)); - const nv::Color32 c1(image1.pixel(e, i)); - - float r = float(c0.r - c1.r); - float g = float(c0.g - c1.g); - float b = float(c0.b - c1.b); - float a = float(c0.a - c1.a); - - error_r.addSample(r); - error_g.addSample(g); - error_b.addSample(b); - error_a.addSample(a); - - if (compareNormal) - { - error_normal.addSample(c0, c1); - } - - if (compareAlpha) - { - error_total.addSample(r * c0.a / 255.0f); - error_total.addSample(g * c0.a / 255.0f); - error_total.addSample(b * c0.a / 255.0f); - } - else - { - error_total.addSample(r); - error_total.addSample(g); - error_total.addSample(b); - } - } - } - - error_r.done(); - error_g.done(); - error_b.done(); - error_a.done(); - error_total.done(); - error_normal.done(); - - - printf("Image size compared: %dx%d\n", w, h); - if (w != w0 || w != w1 || h != h0 || h != h1) { - printf("--- NOTE: only the overlap between the 2 images (%d,%d) and (%d,%d) was compared\n", w0, h0, w1, h1); - } - printf("Total pixels: %d\n", w*h); - - printf("Color:\n"); - error_total.print(); - - if (compareNormal) - { - printf("Normal:\n"); - error_normal.print(); - } - - if (compareAlpha) - { - printf("Alpha:\n"); - error_a.print(); - } - - // @@ Write image difference. 
- - return 0; -} - Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/main.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/main.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/main.cpp @@ -1,34 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -#include -#include "configdialog.h" - -int main(int argc, char *argv[]) -{ - QApplication app(argc, argv); - ConfigDialog dialog; - return dialog.exec(); -} - - Index: ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/resize.cpp =================================================================== --- ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/resize.cpp +++ ps/trunk/libraries/source/nvtt/src/src/nvtt/tools/resize.cpp @@ -1,183 +0,0 @@ -// Copyright NVIDIA Corporation 2007 -- Ignacio Castano -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. 
- -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include "cmdline.h" - -static bool loadImage(nv::Image & image, const char * fileName) -{ - if (nv::strCaseCmp(nv::Path::extension(fileName), ".dds") == 0) - { - nv::DirectDrawSurface dds(fileName); - if (!dds.isValid()) - { - printf("The file '%s' is not a valid DDS file.\n", fileName); - return false; - } - - dds.mipmap(&image, 0, 0); // get first image - } - else - { - // Regular image. - if (!image.load(fileName)) - { - printf("The file '%s' is not a supported image type.\n", fileName); - return false; - } - } - - return true; -} - - -int main(int argc, char *argv[]) -{ - //MyAssertHandler assertHandler; - MyMessageHandler messageHandler; - - float scale = 0.5f; - float gamma = 2.2f; - nv::AutoPtr filter; - nv::Path input; - nv::Path output; - - nv::FloatImage::WrapMode wrapMode = nv::FloatImage::WrapMode_Mirror; - - // Parse arguments. - for (int i = 1; i < argc; i++) - { - // Input options. - if (strcmp("-s", argv[i]) == 0) - { - if (i+1 < argc && argv[i+1][0] != '-') { - scale = (float)atof(argv[i+1]); - i++; - } - } - else if (strcmp("-g", argv[i]) == 0) - { - if (i+1 < argc && argv[i+1][0] != '-') { - gamma = (float)atof(argv[i+1]); - i++; - } - } - else if (strcmp("-f", argv[i]) == 0) - { - if (i+1 == argc) break; - i++; - - if (strcmp("box", argv[i]) == 0) filter = new nv::BoxFilter(); - else if (strcmp("triangle", argv[i]) == 0) filter = new nv::TriangleFilter(); - else if (strcmp("quadratic", argv[i]) == 0) filter = new nv::QuadraticFilter(); - else if (strcmp("bspline", argv[i]) == 0) filter = new nv::BSplineFilter(); - else if (strcmp("mitchell", argv[i]) == 0) filter = new nv::MitchellFilter(); - else if (strcmp("lanczos", argv[i]) == 0) filter = new nv::LanczosFilter(); - else if (strcmp("kaiser", argv[i]) == 0) { - filter = new nv::KaiserFilter(3); - ((nv::KaiserFilter *)filter.ptr())->setParameters(4.0f, 1.0f); - } - } - else if (strcmp("-w", argv[i]) == 0) - { - if (i+1 == argc) break; - i++; - - if (strcmp("mirror", argv[i]) == 0) wrapMode = nv::FloatImage::WrapMode_Mirror; - else if (strcmp("repeat", argv[i]) == 0) wrapMode = nv::FloatImage::WrapMode_Repeat; - else if (strcmp("clamp", argv[i]) == 0) wrapMode = nv::FloatImage::WrapMode_Clamp; - } - else if (argv[i][0] != '-') - { - input = argv[i]; - - if (i+1 < argc && argv[i+1][0] != '-') { - output = argv[i+1]; - } - - break; - } - } - - if (input.isNull() || output.isNull()) - { - printf("NVIDIA Texture Tools - Copyright NVIDIA Corporation 2007\n\n"); - - printf("usage: nvzoom [options] input [output]\n\n"); - - printf("Options:\n"); - printf(" -s scale Scale factor (default = 0.5)\n"); - printf(" -g gamma Gamma correction (default = 2.2)\n"); - printf(" -f filter One of the following: (default = 'box')\n"); - printf(" * box\n"); - printf(" * triangle\n"); - printf(" * quadratic\n"); - printf(" * bspline\n"); - printf(" * mitchell\n"); - printf(" * lanczos\n"); - printf(" * kaiser\n"); - printf(" -w mode One of the following: (default = 'mirror')\n"); - printf(" * mirror\n"); - printf(" * repeat\n"); - printf(" * clamp\n"); - - return 1; - } - - if (filter == NULL) - { - filter = new nv::BoxFilter(); - } - - nv::Image image; - if (!loadImage(image, input)) return 0; - - nv::FloatImage fimage(&image); - fimage.toLinear(0, 3, gamma); - - nv::AutoPtr fresult(fimage.resize(*filter, uint(image.width() * scale), uint(image.height() * scale), wrapMode)); - - nv::AutoPtr 
result(fresult->createImageGammaCorrect(gamma)); - result->setFormat(nv::Image::Format_ARGB); - - nv::StdOutputStream stream(output); - nv::ImageIO::saveTGA(stream, result.ptr()); // @@ Add generic save function. Add support for png too. - - return 0; -} - Index: ps/trunk/source/graphics/TextureConverter.cpp =================================================================== --- ps/trunk/source/graphics/TextureConverter.cpp +++ ps/trunk/source/graphics/TextureConverter.cpp @@ -52,6 +52,10 @@ memcpy(&buffer[off], data, size); return true; } + + virtual void endImage() + { + } }; /** @@ -64,8 +68,6 @@ nvtt::InputOptions inputOptions; nvtt::CompressionOptions compressionOptions; nvtt::OutputOptions outputOptions; - bool isDXT1a; // see comment in RunThread - bool is8bpp; }; /** @@ -383,9 +385,6 @@ else request->inputOptions.setAlphaMode(nvtt::AlphaMode_None); - request->isDXT1a = false; - request->is8bpp = false; - if (settings.format == FMT_RGBA) { request->compressionOptions.setFormat(nvtt::Format_RGBA); @@ -396,7 +395,6 @@ { request->compressionOptions.setFormat(nvtt::Format_RGBA); request->compressionOptions.setPixelFormat(8, 0x00, 0x00, 0x00, 0xFF); - request->is8bpp = true; } else if (!hasAlpha) { @@ -406,7 +404,6 @@ else if (settings.format == FMT_DXT1) { request->compressionOptions.setFormat(nvtt::Format_DXT1a); - request->isDXT1a = true; } else if (settings.format == FMT_DXT3) { @@ -578,19 +575,6 @@ result->ret = compressor.process(request->inputOptions, request->compressionOptions, request->outputOptions); } - // Ugly hack: NVTT 2.0 doesn't set DDPF_ALPHAPIXELS for DXT1a, so we can't - // distinguish it from DXT1. (It's fixed in trunk by - // http://code.google.com/p/nvidia-texture-tools/source/detail?r=924&path=/trunk). - // Rather than using a trunk NVTT (unstable, makes packaging harder) - // or patching our copy (makes packaging harder), we'll just manually - // set the flag here. - if (request->isDXT1a && result->ret && result->output.buffer.size() > 80) - result->output.buffer[80] |= 1; // DDPF_ALPHAPIXELS in DDS_PIXELFORMAT.dwFlags - // Ugly hack: NVTT always sets DDPF_RGB, even if we're trying to output 8-bit - // alpha-only DDS with no RGB components. Unset that flag. - if (request->is8bpp) - result->output.buffer[80] &= ~0x40; // DDPF_RGB in DDS_PIXELFORMAT.dwFlags - // Push the result onto the queue std::lock_guard wait_lock(textureConverter->m_WorkerMutex); textureConverter->m_ResultQueue.push_back(result); Index: ps/trunk/source/lib/tex/tex_dds.cpp =================================================================== --- ps/trunk/source/lib/tex/tex_dds.cpp +++ ps/trunk/source/lib/tex/tex_dds.cpp @@ -310,10 +310,10 @@ // DDS_PIXELFORMAT.dwFlags // we've seen some DXT3 files that don't have this set (which is nonsense; -// any image lacking alpha should be stored as DXT1). it's authoritative -// if fourcc is DXT1 (there's no other way to tell DXT1 and DXT1a apart) -// and ignored otherwise. +// any image lacking alpha should be stored as DXT1). #define DDPF_ALPHAPIXELS 0x00000001 +// DDPF_ALPHA is used instead of DDPF_ALPHAPIXELS for DXT1a. +#define DDPF_ALPHA 0x00000002 #define DDPF_FOURCC 0x00000004 #define DDPF_RGB 0x00000040 @@ -326,7 +326,7 @@ u32 dwRBitMask; u32 dwGBitMask; u32 dwBBitMask; - u32 dwABitMask; // (DDPF_ALPHAPIXELS) + u32 dwABitMask; // (DDPF_ALPHA or DDPF_ALPHAPIXELS) }; @@ -435,7 +435,7 @@ RETURN_STATUS_IF_ERR(tex_validate_plain_format(bpp, (int)flags)); } // .. 
uncompressed 8bpp greyscale - else if(pf_flags & DDPF_ALPHAPIXELS) + else if(pf_flags & DDPF_ALPHA) { const size_t pf_bpp = (size_t)read_le32(&pf->dwRGBBitCount); const size_t pf_a_mask = (size_t)read_le32(&pf->dwABitMask); @@ -460,7 +460,7 @@ { case FOURCC('D','X','T','1'): bpp = 4; - if(pf_flags & DDPF_ALPHAPIXELS) + if(pf_flags & DDPF_ALPHA) flags |= DXT1A | TEX_ALPHA; else flags |= 1;