the-algorithm/twml/libtwml/src/ops/hashing_discretizer.cpp

261 lines
9.9 KiB
C++

#include <exception>
#include <mutex>
#include <string>

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/util/work_sharder.h"
#include <twml.h>
#include "tensorflow_utils.h"
using namespace tensorflow;
void ComputeHashingDiscretizer(
OpKernelContext*,
int64_t,
const twml::Map<int64_t, int64_t> &,
int64_t,
int64_t,
int64_t);
// Op interface for HashingDiscretizer: three inputs (feature ids, feature
// values, bin boundaries), five attributes, and two outputs. The text passed
// to .Doc() is the user-facing description of the op.
REGISTER_OP("HashingDiscretizer")
    .Attr("T: {float, double}")
    .Input("input_ids: int64")
    .Input("input_vals: T")
    .Input("bin_vals: T")
    .Attr("feature_ids: tensor = { dtype: DT_INT64 }")
    .Attr("n_bin: int")
    .Attr("output_bits: int")
    .Attr("cost_per_unit: int")
    .Attr("options: int")
    .Output("new_keys: int64")
    .Output("new_vals: T")
    .SetShapeFn(
        [](::tensorflow::shape_inference::InferenceContext* c) {
          // Each output inherits the static shape of its matching input:
          // new_keys <- input_ids, new_vals <- input_vals.
          c->set_output(0, c->input(0));
          c->set_output(1, c->input(1));
          return Status::OK();
        }
    )
    .Doc(R"doc(
This operation discretizes a tensor containing continuous features (if calibrated).
- note - choice of float or double should be consistent among inputs/output
Input
input_ids(int64): A tensor containing input feature ids (direct from data record).
input_vals(float/double): A tensor containing input values at corresponding feature ids.
- i.e. input_ids[i] <-> input_vals[i] for each i
bin_vals(float/double): A tensor containing the bin boundaries for values of a given feature.
- float or double, matching input_vals
feature_ids(int64 attr): 1D TensorProto of feature IDs seen during calibration
-> hint: look up make_tensor_proto:
proto_init = np.array(values, dtype=np.int64)
tensor_attr = tf.make_tensor_proto(proto_init)
n_bin(int): The number of bin boundary values per feature
-> hence, n_bin + 1 buckets for each feature
output_bits(int): The maximum number of bits to use for the output IDs.
cost_per_unit(int): An estimate of the number of CPU cycles (or nanoseconds
if not CPU-bound) to complete a unit of work. Overestimating creates too
many shards and CPU time will be dominated by per-shard overhead, such as
Context creation. Underestimating may not fully make use of the specified
parallelism.
options(int): selects behavior of the op.
0x00 in bits{1:0} for std::lower_bound bucket search.
0x01 in bits{1:0} for linear bucket search
0x02 in bits{1:0} for std::upper_bound bucket search
0x00 in bits{4:2} for integer_multiplicative_hashing
0x01 in bits{4:2} for integer64_multiplicative_hashing
higher bits/other values are reserved for future extensions
Outputs
new_keys(int64): The discretized feature ids with same shape and size as keys.
new_vals(float or double): The discretized values with the same shape and size as vals.
Operation
Note that the discretization operation maps observation vectors to higher dimensional
observation vectors. Here, we describe this mapping.
Let a calibrated feature observation be given by (F,x), where F is the ID of the
feature, and x is some real value (i.e., continuous feature). This kind of
representation is useful for the representation of sparse vectors, where there
are many zeros.
For example, for a dense feature vector [1.2, 2.4, 3.6], we might have
(0, 1.2) (1, 2.4) and (2, 3.6), with feature IDs indicating the 0th, 1st, and 2nd
elements of the vector.
The discretizer performs the following operation:
(F,x) -> (map(x|F),1).
Hence, we have that map(x|F) is a new feature ID, and the value observed for that
feature is 1. We might read map(x|F) as 'the map of x for feature F'.
For each feature F, we associate a (discrete, finite) set of new feature IDs, newIDs(F).
We will then have that map(x|F) is in the set newIDs(F) for any value of x. Each
set member of newIDs(F) is associated with a 'bin', as defined by the bin
boundaries given in the bin_vals input array. For any two different feature IDs F
and G, we would ideally have that INTERSECT(newIDs(F),newIDs(G)) is the empty set.
However, this is not guaranteed for this discretizer.
In the case of this hashing discretizer, map(x|F) can actually be written as follows:
let bucket = bucket(x|F) be the bucket index for x, according to the
calibration on F. (This is an integer value in [0,n_bin], inclusive)
F is an integer ID. Here, we have that map(x|F) = hash_fn(F,bucket). This has
the desirable property that the new ID depends only on the calibration data
supplied for feature F, and not on any other features in the dataset (e.g.,
number of other features present in the calibration data, or order of features
in the dataset). Note that PercentileDiscretizer does NOT have this property.
This comes at the expense of the possibility of output ID collisions, which
we try to minimize through the design of hash_fn.
Example - consider input vector with a single element, i.e. [x].
Let's Discretize to one of 2 values, as follows:
Let F=0 for the ID of the single feature in the vector.
Let the bin boundary of feature F=0 be BNDRY(F) = BNDRY(0) since F=0
bucket = bucket(x|F=0) = 0 if x<=BNDRY(0) else 1
Let map(x|F) = hash_fn(F=0,bucket=0) if x<=BNDRY(0) else hash_fn(F=0,bucket=1)
If we had another element y in the vector, i.e. [x, y], then we might additionally
Let F=1 for element y.
Let the bin boundary be BNDRY(F) = BNDRY(1) since F=1
bucket = bucket(y|F=1) = 0 if y<=BNDRY(1) else 1
Let map(y|F) = hash_fn(F=1,bucket=0) if y<=BNDRY(1) else hash_fn(F=1,bucket=1)
Note how the construction of map(y|F=1) does not depend on whether map(x|F=0)
was constructed.
)doc");
/**
 * CPU kernel for the HashingDiscretizer op, templated on the value type T
 * (registered for float and double).
 *
 * Construction reads and validates the op attributes and builds a lookup table
 * from each calibrated feature ID to its position in the `feature_ids` attr.
 * Compute() forwards the cached attributes and the op context to
 * ComputeHashingDiscretizer().
 */
template<typename T>
class HashingDiscretizer : public OpKernel {
 public:
  /**
   * Reads `n_bin`, `output_bits`, `cost_per_unit`, `options` and `feature_ids`
   * from the kernel construction context.
   *
   * Fails construction with InvalidArgument when n_bin <= 0, output_bits <= 0,
   * cost_per_unit < 0, or `feature_ids` is not a 1-D int64 tensor.
   */
  explicit HashingDiscretizer(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context,
                   context->GetAttr("n_bin", &n_bin_));
    OP_REQUIRES(context,
                n_bin_ > 0,
                errors::InvalidArgument("Must have n_bin_ > 0."));

    OP_REQUIRES_OK(context,
                   context->GetAttr("output_bits", &output_bits_));
    OP_REQUIRES(context,
                output_bits_ > 0,
                errors::InvalidArgument("Must have output_bits_ > 0."));

    OP_REQUIRES_OK(context,
                   context->GetAttr("cost_per_unit", &cost_per_unit_));
    OP_REQUIRES(context,
                cost_per_unit_ >= 0,
                errors::InvalidArgument("Must have cost_per_unit >= 0."));

    OP_REQUIRES_OK(context,
                   context->GetAttr("options", &options_));

    // Extract the tensor attr holding the calibrated feature IDs.
    Tensor feature_IDs;
    OP_REQUIRES_OK(context,
                   context->GetAttr("feature_ids", &feature_IDs));

    // Validate dtype and rank BEFORE taking a typed view: Tensor::flat<T>()
    // aborts the process on a dtype mismatch, whereas OP_REQUIRES reports a
    // recoverable InvalidArgument status to the caller.
    OP_REQUIRES(context,
                feature_IDs.dtype() == DT_INT64,
                errors::InvalidArgument("feature_ids must be an int64 tensor."));
    OP_REQUIRES(context,
                feature_IDs.shape().dims() == 1,
                errors::InvalidArgument("feature_ids must be 1D."));

    // int64 data type is set in to_layer function of the calibrator objects in Python
    auto feature_IDs_flat = feature_IDs.flat<int64>();

    // Reserve space in the hash map and fill in the ID -> index entries.
    const int64_t num_features = feature_IDs.shape().dim_size(0);
#ifdef USE_DENSE_HASH
    // NOTE(review): 0 is used as the map's empty-key sentinel here, so a
    // feature ID of 0 presumably cannot be stored in this configuration - confirm.
    ID_to_index_.set_empty_key(0);
    ID_to_index_.resize(num_features);
#else
    ID_to_index_.reserve(num_features);
#endif  // USE_DENSE_HASH
    for (int64_t i = 0; i < num_features; i++) {
      // If the same ID appears more than once, the last position wins.
      ID_to_index_[feature_IDs_flat(i)] = i;
    }
  }

  /** Runs the discretization for one invocation of the op. */
  void Compute(OpKernelContext* context) override {
    ComputeHashingDiscretizer(
      context,
      output_bits_,
      ID_to_index_,
      n_bin_,
      cost_per_unit_,
      options_);
  }

 private:
  // Feature ID -> position of that ID in the `feature_ids` attr.
  twml::Map<int64_t, int64_t> ID_to_index_;
  // Attribute values; default-initialized so an early return from the
  // constructor on a failed check never leaves them indeterminate.
  int n_bin_ = 0;
  int output_bits_ = 0;
  int cost_per_unit_ = 0;
  int options_ = 0;
};
// Registers the CPU kernel HashingDiscretizer<Type> for the "HashingDiscretizer"
// op, constrained to element type `Type` on attr "T".
//
// The final line of the macro body must NOT end with a backslash: a trailing
// line continuation splices the next source line (the first REGISTER(...)
// invocation) into the macro definition, so that invocation never runs and
// every later expansion drags an unexpanded REGISTER(...) call along with it.
// The terminating semicolon is supplied at each call site.
#define REGISTER(Type)                \
  REGISTER_KERNEL_BUILDER(            \
      Name("HashingDiscretizer")      \
          .Device(DEVICE_CPU)         \
          .TypeConstraint<Type>("T"), \
      HashingDiscretizer<Type>)

REGISTER(float);
REGISTER(double);
/**
 * Element-type-independent body of the HashingDiscretizer kernel.
 *
 * Reads op inputs 0..2 (feature ids, feature values, bin boundaries), allocates
 * both outputs as 1-D tensors whose length is the first dimension of the ids
 * input, and lets TensorFlow's Shard() split the index range [0, output_size)
 * across the device's CPU worker threads, calling twml::hashDiscretizerInfer
 * once per sub-range.
 *
 * @param context        op kernel context providing inputs, outputs and threads
 * @param output_bits    forwarded to twml::hashDiscretizerInfer
 * @param ID_to_index    feature ID -> calibration index lookup table
 * @param n_bin          forwarded to twml::hashDiscretizerInfer
 * @param cost_per_unit  per-element cost estimate handed to Shard()
 * @param options        forwarded to twml::hashDiscretizerInfer
 *
 * Any std::exception raised while converting tensors or while running a shard
 * is reported on `context` as an InvalidArgument failure.
 */
void ComputeHashingDiscretizer(
    OpKernelContext* context,
    int64_t output_bits,
    const twml::Map<int64_t, int64_t> &ID_to_index,
    int64_t n_bin,
    int64_t cost_per_unit,
    int64_t options) {
  const Tensor& keys = context->input(0);
  const Tensor& vals = context->input(1);
  const Tensor& bin_vals = context->input(2);

  // dim_size(0) below is only meaningful when the ids tensor has a first
  // dimension, so reject scalars up front with a regular op error.
  OP_REQUIRES(context,
              keys.dims() >= 1,
              errors::InvalidArgument("input_ids must have at least one dimension."));

  const int64 output_size = keys.dim_size(0);
  TensorShape output_shape;
  OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(&output_size, 1, &output_shape));

  Tensor* new_keys = nullptr;
  OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &new_keys));
  Tensor* new_vals = nullptr;
  OP_REQUIRES_OK(context, context->allocate_output(1, output_shape, &new_vals));

  try {
    twml::Tensor out_keys_ = TFTensor_to_twml_tensor(*new_keys);
    twml::Tensor out_vals_ = TFTensor_to_twml_tensor(*new_vals);

    const twml::Tensor in_keys_ = TFTensor_to_twml_tensor(keys);
    const twml::Tensor in_vals_ = TFTensor_to_twml_tensor(vals);
    const twml::Tensor bin_vals_ = TFTensor_to_twml_tensor(bin_vals);

    // retrieve the thread pool from the op context
    auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());

    // Shards may execute on pool threads, where the enclosing try/catch on the
    // calling thread cannot intercept an exception. Each shard therefore
    // catches locally and records the first failure for reporting afterwards.
    std::mutex error_mutex;
    bool shard_failed = false;
    std::string shard_error;

    // Definition of the computation thread
    auto task = [&](int64 start, int64 limit) {
      try {
        twml::hashDiscretizerInfer(out_keys_, out_vals_,
                                   in_keys_, in_vals_,
                                   n_bin,
                                   bin_vals_,
                                   output_bits,
                                   ID_to_index,
                                   start, limit,
                                   options);
      } catch (const std::exception &e) {
        std::lock_guard<std::mutex> lock(error_mutex);
        if (!shard_failed) {
          shard_failed = true;
          shard_error = e.what();
        }
      }
    };

    // let Tensorflow split up the work as it sees fit
    Shard(worker_threads.num_threads,
          worker_threads.workers,
          output_size,
          static_cast<int64>(cost_per_unit),
          task);

    // Shard() returns only after every sub-range has been processed, so the
    // recorded error state is complete at this point.
    if (shard_failed) {
      context->CtxFailureWithWarning(errors::InvalidArgument(shard_error));
    }
  } catch (const std::exception &e) {
    context->CtxFailureWithWarning(errors::InvalidArgument(e.what()));
  }
}