Ristretto does not actually quantize the original weights (i.e., it does not modify the .caffemodel). Before executing a convolution or fully-connected layer, it reduces the precision of the floating-point weights (and of the input data as well). The effect is that the layer is computed as if the weights and data had a reduced bit width.
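As a concrete example of the dynamic fixed point format used here (bit_width total bits, fl fractional bits): with bit_width = 8 and fl = 6, representable values are integer multiples of 2^-6 = 0.015625, spanning [-2^7 * 2^-6, (2^7 - 1) * 2^-6] = [-2.0, 1.984375]; a weight of 0.30 would be trimmed to 19 * 2^-6 = 0.296875. The function below is Trim2FixedPoint_cpu with the final scale-back removed and a placeholder loop added to store the resulting int8 values: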
template <typename Dtype>
void BaseRistrettoLayer<Dtype>::Trim2FixedPoint_cpu(Dtype* data, const int cnt,
      const int bit_width, const int rounding, int fl) {
  for (int index = 0; index < cnt; ++index) {
    // Saturate data to the representable fixed point range
    Dtype max_data = (pow(2, bit_width - 1) - 1) * pow(2, -fl);
    Dtype min_data = -pow(2, bit_width - 1) * pow(2, -fl);
    data[index] = std::max(std::min(data[index], max_data), min_data);
    // Round data to an integer number of quantization steps
    data[index] /= pow(2, -fl);
    switch (rounding) {
      case QuantizationParameter_Rounding_NEAREST:
        data[index] = round(data[index]);
        break;
      case QuantizationParameter_Rounding_STOCHASTIC:
        data[index] = floor(data[index] + RandUniform_cpu());
        break;
      default:
        break;
    }
    // data[index] *= pow(2, -fl); /* DO NOT SCALE BACK */
  }
  // TO IMPLEMENT: store the integers in your own model structure.
  // new_caffe_model[layer].weights is a placeholder for wherever you
  // keep the int8 copy of this layer's weights.
  for (int index = 0; index < cnt; ++index) {
    new_caffe_model[layer].weights[index] = (int8_t)data[index];
  }
}
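For reference, here is a minimal, standalone sketch of what this modified loop produces for a toy weight array. The local int8_t q[] array stands in for the hypothetical new_caffe_model[layer].weights storage, and bit_width = 8, fl = 6 are just example values (nearest rounding only):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const int bit_width = 8, fl = 6;            // example format: 8 bits, 6 fractional
  const float step = std::pow(2.0f, -fl);     // 2^-fl = 0.015625
  float w[4] = {0.30f, -0.07f, 1.10f, 2.50f}; // toy weights
  int8_t q[4];                                // stand-in for new_caffe_model[layer].weights

  for (int i = 0; i < 4; ++i) {
    // Saturate to the representable range, then round to the nearest step
    float max_w = (std::pow(2.0f, bit_width - 1) - 1) * step;
    float min_w = -std::pow(2.0f, bit_width - 1) * step;
    float v = std::max(std::min(w[i], max_w), min_w);
    q[i] = (int8_t)std::round(v / step);      // stored integer, no scale back
    printf("w=% .4f  ->  q=%4d  ->  dequant=% .6f\n", w[i], q[i], q[i] * step);
  }
  return 0;
}

This prints the original weight, the stored integer, and the value it maps back to; for instance, 2.50 saturates to q = 127, which maps back to 1.984375.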
Now you should be able to use it like this during inference:
template <typename Dtype>
void BaseRistrettoLayer<Dtype>::Trim2FixedPointWeights_cpu(Dtype* data, const int cnt,
      const int bit_width, const int rounding, int fl) {
  // Restore floating point values from the stored int8 weights by
  // applying the scale factor 2^-fl that was skipped above.
  for (int index = 0; index < cnt; ++index) {
    data[index] = (float)new_caffe_model[layer].weights[index] * pow(2, -fl);
  }
}
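The scale-back in Trim2FixedPointWeights_cpu recreates exactly the trimmed floating point values that stock Ristretto would use in its simulated forward pass (for bit_width <= 8), so the computed results are unchanged. If you want to go one step further and run the multiply-accumulate itself in integers, a common pattern (not part of Ristretto; a hedged sketch with hypothetical fractional lengths fl_w for the weights and fl_in for the input data) is to accumulate int8 products in an int32 and apply the combined scale once at the end:

#include <cmath>
#include <cstdint>

// Sketch: dot product of int8-quantized weights and inputs.
// fl_w and fl_in are the fractional lengths used when quantizing
// the weights and the input data, respectively (assumed values).
float fixed_point_dot(const int8_t* w, const int8_t* in, int cnt,
                      int fl_w, int fl_in) {
  int32_t acc = 0;                          // wide accumulator avoids overflow
  for (int i = 0; i < cnt; ++i) {
    acc += (int32_t)w[i] * (int32_t)in[i];  // pure integer multiply-accumulate
  }
  // One float rescale at the end: each product carries a factor 2^-(fl_w + fl_in)
  return acc * std::pow(2.0f, -(fl_w + fl_in));
}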