VectorIndexer
This documentation is for an unreleased version of Apache Flink Machine Learning Library. We recommend you use the latest stable version.

VectorIndexer #

VectorIndexer is an algorithm that indexes the columns of input vectors. It maps each column of the input vector into either a continuous or a categorical feature. Whether a column is treated as continuous or categorical depends on the number of distinct values in that column. If the number of distinct values in a column is greater than a specified parameter (i.e., maxCategories), the corresponding output column is left unchanged. Otherwise, it is transformed into a categorical value. For categorical outputs, the indices are in [0, numDistinctValuesInThisColumn].

In the output model, the distinct values of each categorical column are assigned indices in ascending order, except that 0.0 is always mapped to index 0 (to preserve sparsity).

Input Columns #

Param name Type Default Description
inputCol Vector "input" Vectors to be indexed.

Output Columns #

Param name Type Default Description
outputCol Vector "output" Indexed vectors.

Parameters #

Below are the parameters required by VectorIndexerModel.

Key Default Type Required Description
inputCol "input" String no Input column name.
outputCol "output" String no Output column name.
handleInvalid "error" String no Strategy to handle invalid entries. Supported values: 'error', 'skip', 'keep'.

VectorIndexer needs the parameters above as well as the one below.

Key Default Type Required Description
maxCategories 20 Integer no Threshold for the number of values a categorical feature can take (>= 2). If a feature is found to have > maxCategories values, then it is declared continuous.

Examples #

import org.apache.flink.ml.common.param.HasHandleInvalid;
import org.apache.flink.ml.feature.vectorindexer.VectorIndexer;
import org.apache.flink.ml.feature.vectorindexer.VectorIndexerModel;
import org.apache.flink.ml.linalg.Vectors;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import org.apache.flink.util.CloseableIterator;

import java.util.Arrays;
import java.util.List;

/**
 * Simple program that creates a VectorIndexer instance and uses it for feature engineering.
 *
 * <p>The program fits a {@link VectorIndexer} on five two-dimensional training vectors, applies
 * the resulting {@link VectorIndexerModel} to three prediction vectors, and prints every input
 * vector next to its indexed output vector.
 */
public class VectorIndexerExample {
	/**
	 * Runs the example end to end: builds the tables, trains the model, transforms the prediction
	 * data and prints the results to standard output.
	 *
	 * @param args command-line arguments; not used
	 * @throws Exception if closing the result iterator fails
	 */
	public static void main(String[] args) throws Exception {
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		StreamTableEnvironment tEnv = StreamTableEnvironment.create(env);

		// Generates input data.
		// Training data: the first column takes five distinct values (1..5), the second
		// column takes three distinct values (-1, 0, 1).
		List<Row> trainInput =
			Arrays.asList(
				Row.of(Vectors.dense(1, 1)),
				Row.of(Vectors.dense(2, -1)),
				Row.of(Vectors.dense(3, 1)),
				Row.of(Vectors.dense(4, 0)),
				Row.of(Vectors.dense(5, 0)));

		// Prediction data: note that the value 2 in the second column does not occur in the
		// second column of the training data.
		List<Row> predictInput =
			Arrays.asList(
				Row.of(Vectors.dense(0, 2)),
				Row.of(Vectors.dense(0, 0)),
				Row.of(Vectors.dense(0, -1)));

		// Both tables expose their single column under the name "input".
		Table trainTable = tEnv.fromDataStream(env.fromCollection(trainInput)).as("input");
		Table predictTable = tEnv.fromDataStream(env.fromCollection(predictInput)).as("input");

		// Creates a VectorIndexer object and initializes its parameters.
		// With maxCategories = 3, the first training column (five distinct values) exceeds the
		// threshold while the second (three distinct values) does not.
		VectorIndexer vectorIndexer =
			new VectorIndexer()
				.setInputCol("input")
				.setOutputCol("output")
				.setHandleInvalid(HasHandleInvalid.KEEP_INVALID)
				.setMaxCategories(3);

		// Trains the VectorIndexer Model.
		VectorIndexerModel model = vectorIndexer.fit(trainTable);

		// Uses the VectorIndexer Model for predictions.
		Table outputTable = model.transform(predictTable)[0];

		// Extracts and displays the results. The iterator is AutoCloseable, so it is managed
		// with try-with-resources to make sure it is closed even if printing fails midway.
		try (CloseableIterator<Row> it = outputTable.execute().collect()) {
			while (it.hasNext()) {
				Row row = it.next();
				System.out.printf(
					"Input Value: %s \tOutput Value: %s\n",
					row.getField(vectorIndexer.getInputCol()),
					row.getField(vectorIndexer.getOutputCol()));
			}
		}
	}
}

# Example script: fits a VectorIndexer on a small set of dense vectors and then
# applies the fitted model to new vectors, printing each input/output pair.

from pyflink.common import Types
from pyflink.ml.linalg import Vectors, DenseVectorTypeInfo
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.ml.feature.vectorindexer import VectorIndexer
from pyflink.table import StreamTableEnvironment

# Sets up the stream execution environment and the table environment on top of it.
env = StreamExecutionEnvironment.get_execution_environment()
t_env = StreamTableEnvironment.create(env)


def _to_vector_table(vectors):
    """Builds a table with a single dense-vector column named 'input'."""
    rows = [(vector,) for vector in vectors]
    row_type = Types.ROW_NAMED(['input'], [DenseVectorTypeInfo()])
    stream = env.from_collection(rows, type_info=row_type)
    return t_env.from_data_stream(stream)


# Training data: five distinct values in the first column, three in the second.
train_table = _to_vector_table([
    Vectors.dense(1, 1),
    Vectors.dense(2, -1),
    Vectors.dense(3, 1),
    Vectors.dense(4, 0),
    Vectors.dense(5, 0),
])

# Prediction data to be indexed by the fitted model.
predict_table = _to_vector_table([
    Vectors.dense(0, 2),
    Vectors.dense(0, 0),
    Vectors.dense(0, -1),
])

# Configures the VectorIndexer estimator.
vector_indexer = (
    VectorIndexer()
    .set_input_col('input')
    .set_output_col('output')
    .set_handle_invalid('keep')
    .set_max_categories(3)
)

# Fits the estimator on the training table to obtain the model.
model = vector_indexer.fit(train_table)

# Applies the model; transform returns a sequence of tables and the first one
# holds the result.
output = model.transform(predict_table)[0]

# Looks up the positions of the input and output columns once, then prints
# every collected row.
field_names = output.get_schema().get_field_names()
input_pos = field_names.index(vector_indexer.get_input_col())
output_pos = field_names.index(vector_indexer.get_output_col())

for row in t_env.to_data_stream(output).execute_and_collect():
    print(f'Input Value: {row[input_pos]!s}\tOutput Value: {row[output_pos]!s}')