View source on GitHub |
One-stop utility for preprocessing and encoding structured data.
tf.keras.utils.FeatureSpace(
features,
output_mode='concat',
crosses=None,
crossing_dim=32,
hashing_dim=32,
num_discretization_bins=32
)
Available feature types:
Note that all features can be referred to by their string name,
e.g. "integer_categorical"
. When using the string name, the default
argument values are used.
# Plain float values.
FeatureSpace.float(name=None)
# Float values to be preprocessed via featurewise standardization
# (i.e. via a `keras.layers.Normalization` layer).
FeatureSpace.float_normalized(name=None)
# Float values to be preprocessed via linear rescaling
# (i.e. via a `keras.layers.Rescaling` layer).
FeatureSpace.float_rescaled(scale=1., offset=0., name=None)
# Float values to be discretized. By default, the discrete
# representation will then be one-hot encoded.
FeatureSpace.float_discretized(
num_bins, bin_boundaries=None, output_mode="one_hot", name=None)
# Integer values to be indexed. By default, the discrete
# representation will then be one-hot encoded.
FeatureSpace.integer_categorical(
max_tokens=None, num_oov_indices=1, output_mode="one_hot", name=None)
# String values to be indexed. By default, the discrete
# representation will then be one-hot encoded.
FeatureSpace.string_categorical(
max_tokens=None, num_oov_indices=1, output_mode="one_hot", name=None)
# Integer values to be hashed into a fixed number of bins.
# By default, the discrete representation will then be one-hot encoded.
FeatureSpace.integer_hashed(num_bins, output_mode="one_hot", name=None)
# String values to be hashed into a fixed number of bins.
# By default, the discrete representation will then be one-hot encoded.
FeatureSpace.string_hashed(num_bins, output_mode="one_hot", name=None)
Examples:
Basic usage with a dict of input data:
raw_data = {
"float_values": [0.0, 0.1, 0.2, 0.3],
"string_values": ["zero", "one", "two", "three"],
"int_values": [0, 1, 2, 3],
}
dataset = tf.data.Dataset.from_tensor_slices(raw_data)
feature_space = FeatureSpace(
features={
"float_values": "float_normalized",
"string_values": "string_categorical",
"int_values": "integer_categorical",
},
crosses=[("string_values", "int_values")],
output_mode="concat",
)
# Before you start using the FeatureSpace,
# you must `adapt()` it on some data.
feature_space.adapt(dataset)
# You can call the FeatureSpace on a dict of data (batched or unbatched).
output_vector = feature_space(raw_data)
Basic usage with tf.data
:
# Unlabeled data
preprocessed_ds = unlabeled_dataset.map(feature_space)
# Labeled data
preprocessed_ds = labeled_dataset.map(lambda x, y: (feature_space(x), y))
Basic usage with the Keras Functional API:
# Retrieve a dict Keras Input objects
inputs = feature_space.get_inputs()
# Retrieve the corresponding encoded Keras tensors
encoded_features = feature_space.get_encoded_features()
# Build a Functional model
outputs = keras.layers.Dense(1, activation="sigmoid")(encoded_features)
model = keras.Model(inputs, outputs)
Customizing each feature or feature cross:
feature_space = FeatureSpace(
features={
"float_values": FeatureSpace.float_normalized(),
"string_values": FeatureSpace.string_categorical(max_tokens=10),
"int_values": FeatureSpace.integer_categorical(max_tokens=10),
},
crosses=[
FeatureSpace.cross(("string_values", "int_values"), crossing_dim=32)
],
output_mode="concat",
)
Returning a dict of integer-encoded features:
feature_space = FeatureSpace(
features={
"string_values": FeatureSpace.string_categorical(output_mode="int"),
"int_values": FeatureSpace.integer_categorical(output_mode="int"),
},
crosses=[
FeatureSpace.cross(
feature_names=("string_values", "int_values"),
crossing_dim=32,
output_mode="int",
)
],
output_mode="dict",
)
Specifying your own Keras preprocessing layer:
# Let's say that one of the features is a short text paragraph that
# we want to encode as a vector (one vector per paragraph) via TF-IDF.
data = {
"text": ["1st string", "2nd string", "3rd string"],
}
# There's a Keras layer for this: TextVectorization.
custom_layer = layers.TextVectorization(output_mode="tf_idf")
# We can use FeatureSpace.feature to create a custom feature
# that will use our preprocessing layer.
feature_space = FeatureSpace(
features={
"text": FeatureSpace.feature(
preprocessor=custom_layer, dtype="string", output_mode="float"
),
},
output_mode="concat",
)
feature_space.adapt(tf.data.Dataset.from_tensor_slices(data))
output_vector = feature_space(data)
Retrieving the underlying Keras preprocessing layers:
# The preprocessing layer of each feature is available in `.preprocessors`.
preprocessing_layer = feature_space.preprocessors["feature1"]
# The crossing layer of each feature cross is available in `.crossers`.
# It's an instance of keras.layers.HashedCrossing.
crossing_layer = feature_space.crossers["feature1_X_feature2"]
Saving and reloading a FeatureSpace:
feature_space.save("myfeaturespace.keras")
reloaded_feature_space = keras.models.load_model("myfeaturespace.keras")
Methods
adapt
adapt(
dataset
)
cross
@classmethod
cross( feature_names, crossing_dim, output_mode='one_hot' )
feature
@classmethod
feature( dtype, preprocessor, output_mode )
float
@classmethod
float( name=None )
float_discretized
@classmethod
float_discretized( num_bins, bin_boundaries=None, output_mode='one_hot', name=None )
float_normalized
@classmethod
float_normalized( name=None )
float_rescaled
@classmethod
float_rescaled( scale=1.0, offset=0.0, name=None )
get_encoded_features
get_encoded_features()
get_inputs
get_inputs()
integer_categorical
@classmethod
integer_categorical( max_tokens=None, num_oov_indices=1, output_mode='one_hot', name=None )
integer_hashed
@classmethod
integer_hashed( num_bins, output_mode='one_hot', name=None )
save
save(
filepath
)
Save the FeatureSpace
instance to a .keras
file.
You can reload it via keras.models.load_model()
:
feature_space.save("myfeaturespace.keras")
reloaded_feature_space = keras.models.load_model("myfeaturespace.keras")
string_categorical
@classmethod
string_categorical( max_tokens=None, num_oov_indices=1, output_mode='one_hot', name=None )
string_hashed
@classmethod
string_hashed( num_bins, output_mode='one_hot', name=None )