# Source code for pyflink.datastream.formats.orc
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
from typing import Optional, TYPE_CHECKING
from pyflink.common import Configuration
from pyflink.common.serialization import BulkWriterFactory, RowDataBulkWriterFactory
from pyflink.datastream.utils import create_hadoop_configuration, create_java_properties
from pyflink.java_gateway import get_gateway
from pyflink.util.java_utils import to_jarray

if TYPE_CHECKING:
    from pyflink.table.types import RowType

__all__ = [
    'OrcBulkWriters'
]


class OrcBulkWriters(object):
"""
Convenient builder to create a :class:`~pyflink.common.serialization.BulkWriterFactory` that
writes records with a predefined schema into Orc files in a batch fashion.
Example:
::
>>> row_type = DataTypes.ROW([
... DataTypes.FIELD('string', DataTypes.STRING()),
... DataTypes.FIELD('int_array', DataTypes.ARRAY(DataTypes.INT()))
... ])
>>> sink = FileSink.for_bulk_format(
... OUTPUT_DIR, OrcBulkWriters.for_row_type(
... row_type=row_type,
... writer_properties=Configuration(),
... hadoop_config=Configuration(),
... )
... ).build()
>>> ds.sink_to(sink)
.. versionadded:: 1.16.0
"""

    @staticmethod
    def for_row_type(row_type: 'RowType',
                     writer_properties: Optional[Configuration] = None,
                     hadoop_config: Optional[Configuration] = None) \
            -> BulkWriterFactory:
"""
Create a :class:`~pyflink.common.serialization.BulkWriterFactory` that writes records
with a predefined schema into Orc files in a batch fashion.
:param row_type: The RowType of records, it should match the RowTypeInfo of Row records.
:param writer_properties: Orc writer options.
:param hadoop_config: Hadoop configuration.
"""
        from pyflink.table.types import RowType
        if not isinstance(row_type, RowType):
            raise TypeError('row_type must be an instance of RowType')

        from pyflink.table.types import _to_java_data_type
        j_data_type = _to_java_data_type(row_type)
        jvm = get_gateway().jvm
        j_row_type = j_data_type.getLogicalType()
        # Collect the logical types of the individual fields; the vectorizer needs them
        # to convert each field of a RowData record into the corresponding ORC column.
        orc_types = to_jarray(
            jvm.org.apache.flink.table.types.logical.LogicalType,
            [i for i in j_row_type.getChildren()]
        )
        # Derive the ORC TypeDescription (schema) from the Flink row type.
        type_description = jvm.org.apache.flink.orc \
            .OrcSplitReaderUtil.logicalTypeToOrcType(j_row_type)
        if writer_properties is None:
            writer_properties = Configuration()
        if hadoop_config is None:
            hadoop_config = Configuration()
        # Wrap the Java OrcBulkWriterFactory in a RowDataBulkWriterFactory so that the
        # Python Row records are converted to RowData before reaching the ORC vectorizer.
        return RowDataBulkWriterFactory(
            jvm.org.apache.flink.orc.writer.OrcBulkWriterFactory(
                jvm.org.apache.flink.orc.vector.RowDataVectorizer(
                    type_description.toString(),
                    orc_types
                ),
                create_java_properties(writer_properties),
                create_hadoop_configuration(hadoop_config)
            ),
            row_type
        )
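

# --- Usage sketch (illustrative, not part of the upstream module) -------------
# A minimal, self-contained example of how OrcBulkWriters.for_row_type can be
# combined with a FileSink. The output path '/tmp/orc-output', the checkpoint
# interval, and the 'orc.compress' writer option are assumptions chosen for this
# sketch; running it also assumes the flink-orc (and Hadoop) dependencies are on
# the classpath, e.g. added via StreamExecutionEnvironment.add_jars().
if __name__ == '__main__':
    from pyflink.common import Row, Types
    from pyflink.datastream import StreamExecutionEnvironment
    from pyflink.datastream.connectors.file_system import FileSink
    from pyflink.table import DataTypes

    env = StreamExecutionEnvironment.get_execution_environment()
    # Bulk-encoded FileSink parts are rolled on checkpoints, so enable checkpointing.
    env.enable_checkpointing(10000)

    # The logical schema of the records to be written ...
    row_type = DataTypes.ROW([
        DataTypes.FIELD('name', DataTypes.STRING()),
        DataTypes.FIELD('num', DataTypes.INT())
    ])
    # ... and a matching RowTypeInfo for the Python Row elements of the stream.
    type_info = Types.ROW_NAMED(['name', 'num'], [Types.STRING(), Types.INT()])

    ds = env.from_collection(
        [Row('hello', 1), Row('world', 2)], type_info=type_info)

    # Optional ORC writer option (assumed key): use snappy compression.
    writer_properties = Configuration()
    writer_properties.set_string('orc.compress', 'SNAPPY')

    sink = FileSink.for_bulk_format(
        '/tmp/orc-output',  # assumed output directory
        OrcBulkWriters.for_row_type(
            row_type=row_type,
            writer_properties=writer_properties
        )
    ).build()

    ds.sink_to(sink)
    env.execute('orc_bulk_writer_sketch')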