At OpenAI we're now starting to use mixed precision quite a bit: fp16 on the forward pass and fp32 on the backward pass. I had to replace many of the primitive TF ops, but this wasn't too much work. It would be nice if XLA also supported this. The basic idea is that you convert to float just after loading and convert back to fp16 just before storing, so I use wrappers like this:
#include <cuda.h>
#include <vector_types.h>
#include <cuda_fp16.h>
// Eigen::half comes from the Eigen headers that the TF kernel sources already pull in.

// Unpack two packed-fp16 registers (4 halves in a uint2) into a float4 via inline PTX.
__device__ __forceinline__ float4 half2floatV(uint2 v)
{
    float4 r;
    asm("{\n\t"
        ".reg .f16 a, b, c, d;\n\t"
        "mov.b32 {a, b}, %4;\n\t"   // split v.x into two f16 registers
        "mov.b32 {c, d}, %5;\n\t"   // split v.y into two f16 registers
        "cvt.f32.f16 %0, a;\n\t"
        "cvt.f32.f16 %1, b;\n\t"
        "cvt.f32.f16 %2, c;\n\t"
        "cvt.f32.f16 %3, d;\n\t"
        "}" : "=f"(r.x),"=f"(r.y),"=f"(r.z),"=f"(r.w) : "r"(v.x),"r"(v.y));
    return r;
}

// Pack a float4 into 4 fp16 values (round-to-nearest) stored in a uint2.
__device__ __forceinline__ uint2 float2halfV(float4 v)
{
    uint2 r;
    asm("{\n\t"
        ".reg .f16 a, b, c, d;\n\t"
        "cvt.rn.f16.f32 a, %2;\n\t"
        "cvt.rn.f16.f32 b, %3;\n\t"
        "cvt.rn.f16.f32 c, %4;\n\t"
        "cvt.rn.f16.f32 d, %5;\n\t"
        "mov.b32 %0, {a, b};\n\t"   // pack a,b into r.x
        "mov.b32 %1, {c, d};\n\t"   // pack c,d into r.y
        "}" : "=r"(r.x),"=r"(r.y) : "f"(v.x),"f"(v.y),"f"(v.z),"f"(v.w));
    return r;
}

// Predicated load: only the memory access is guarded by b; the fp16->fp32
// conversion always runs (see note below).
template <typename TO, typename TI> __device__ __forceinline__ void load(TO &out, const TI* __restrict__ in, int i, bool b);

template <> __device__ __forceinline__ void load<float ,float >(float &out, const float * __restrict__ in, int i, bool b)
{ if (b) out = in[i]; }

template <> __device__ __forceinline__ void load<float4,float4>(float4 &out, const float4* __restrict__ in, int i, bool b)
{ if (b) out = in[i]; }

template <> __device__ __forceinline__ void load<float ,Eigen::half>(float &out, const Eigen::half* __restrict__ in, int i, bool b)
{ Eigen::half v; v.x = 0; if (b) v = in[i]; out = __half2float((__half)v); }  // zeroed raw bits for out-of-bounds lanes

template <> __device__ __forceinline__ void load<float4,uint2>(float4 &out, const uint2* __restrict__ in, int i, bool b)
{ uint2 v = {0, 0}; if (b) v = in[i]; out = half2floatV(v); }

// Predicated store: convert unconditionally, guard only the memory write.
template <typename TO, typename TI> __device__ __forceinline__ void store(TO* out, TI val, int i, bool b);

template <> __device__ __forceinline__ void store<float ,float >(float * out, float v, int i, bool b)
{ if (b) out[i] = v; }

template <> __device__ __forceinline__ void store<float4,float4>(float4* out, float4 v, int i, bool b)
{ if (b) out[i] = v; }

template <> __device__ __forceinline__ void store<Eigen::half,float >(Eigen::half* out, float v, int i, bool b)
{ Eigen::half r(__float2half(v)); if (b) out[i] = r; }

template <> __device__ __forceinline__ void store<uint2,float4>(uint2* out, float4 v, int i, bool b)
{ uint2 r = float2halfV(v); if (b) out[i] = r; }
Note that I don't include the conversion in the conditional. This makes it easier for the CUDA compiler to batch the loads during unrolling. I'll be releasing this code soonish (just waiting on papers); it will include support for mixed-precision conv and gemm.
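To make the usage concrete, here's a minimal sketch of a kernel built on these wrappers; the kernel name, UNROLL factor, and shapes are made up for illustration, not the actual code I'll be releasing. Storage is packed fp16 (one uint2 = 4 halves), all math is fp32, and the loads/stores are predicated while the conversions run unconditionally inside the unrolled loop:

// Hypothetical example: y = relu(x) with fp16 storage and fp32 arithmetic.
// N4 is the length in vec4 units (element count / 4).
__global__ void relu_fp16_kernel(uint2* y, const uint2* __restrict__ x, int N4)
{
    const int UNROLL = 4;
    // Each block covers blockDim.x * UNROLL vec4 elements.
    int base = blockIdx.x * blockDim.x * UNROLL + threadIdx.x;

    float4 v[UNROLL];

    // Issue all loads up front; out-of-bounds lanes keep zeros, but the
    // fp16->fp32 conversion is unconditional so the compiler can batch
    // the memory ops during unrolling.
    #pragma unroll
    for (int j = 0; j < UNROLL; ++j)
    {
        int i = base + j * blockDim.x;
        load<float4, uint2>(v[j], x, i, i < N4);
    }

    #pragma unroll
    for (int j = 0; j < UNROLL; ++j)
    {
        // Compute in fp32.
        v[j].x = fmaxf(v[j].x, 0.0f);
        v[j].y = fmaxf(v[j].y, 0.0f);
        v[j].z = fmaxf(v[j].z, 0.0f);
        v[j].w = fmaxf(v[j].w, 0.0f);

        int i = base + j * blockDim.x;
        store<uint2, float4>(y, v[j], i, i < N4);  // fp32->fp16 + predicated write
    }
}

You'd launch it with enough blocks to cover N4 / (blockDim.x * UNROLL), rounded up.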
You might also include a tensor scale factor in the conversion to support integer types, and even fp16 can sometimes benefit from rescaling to avoid under/overflow. Then you probably want to insert reductions in the kernel to collect stats cheaply, which lets you predict subsequent scale factors during training.
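A rough sketch of that idea for the fp16 case (store_scaled, reduce_absmax, and g_absmax are hypothetical names, not part of the code above): the store wrapper applies a per-tensor scale before converting, each thread tracks the pre-scale absolute maximum, and a cheap warp reduction folds it into global memory so the host can pick the next scale factor.

// Hypothetical scaled store: scale before the fp32->fp16 conversion and
// track the pre-scale absolute maximum in a per-thread accumulator.
__device__ __forceinline__ void store_scaled(uint2* out, float4 v, int i, bool b,
                                             float scale, float &absmax)
{
    absmax = fmaxf(absmax, fmaxf(fmaxf(fabsf(v.x), fabsf(v.y)),
                                 fmaxf(fabsf(v.z), fabsf(v.w))));
    v.x *= scale; v.y *= scale; v.z *= scale; v.w *= scale;
    store<uint2, float4>(out, v, i, b);
}

// Cheap stat collection at the end of the kernel: reduce absmax within the
// warp with shuffles, then one atomic per warp into g_absmax (zeroed by the
// host before launch). Non-negative floats have the same ordering as their
// int bit patterns, so atomicMax on the int view is safe here.
__device__ __forceinline__ void reduce_absmax(float* g_absmax, float absmax)
{
    for (int offset = 16; offset > 0; offset >>= 1)
        absmax = fmaxf(absmax, __shfl_xor_sync(0xffffffff, absmax, offset));

    if ((threadIdx.x & 31) == 0)
        atomicMax(reinterpret_cast<int*>(g_absmax), __float_as_int(absmax));
}

The host then reads g_absmax after the kernel and uses it to choose the scale for the next iteration.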
-Scott