Skip to content

predict() - pyspark IndexError on python 3.11.4 #215

@sonurdogan

Description

@sonurdogan

Python version: 3.11.4
pyspark version: 3.1.2

model.predict('I love NLU! <3')
sentence_detector_dl download started this may take some time.
Approximate size to download 354.6 KB
[OK!]

Warning::Spark Session already created, some configs may not take.
Traceback (most recent call last):
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/serializers.py", line 437, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/cloudpickle/cloudpickle_fast.py", line 72, in dumps
    cp.dump(obj)
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/cloudpickle/cloudpickle_fast.py", line 540, in dump
    return Pickler.dump(self, obj)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/cloudpickle/cloudpickle_fast.py", line 630, in reducer_override
    return self._function_reduce(obj)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/cloudpickle/cloudpickle_fast.py", line 503, in _function_reduce
    return self._dynamic_function_reduce(obj)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/cloudpickle/cloudpickle_fast.py", line 484, in _dynamic_function_reduce
    state = _function_getstate(func)
            ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/cloudpickle/cloudpickle_fast.py", line 156, in _function_getstate
    f_globals_ref = _extract_code_globals(func.__code__)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/cloudpickle/cloudpickle.py", line 236, in _extract_code_globals
    out_names = {names[oparg] for _, oparg in _walk_global_ops(co)}
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/cloudpickle/cloudpickle.py", line 236, in <setcomp>
    out_names = {names[oparg] for _, oparg in _walk_global_ops(co)}
                 ~~~~~^^^^^^^
IndexError: tuple index out of range
Traceback (most recent call last):
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/serializers.py", line 437, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/cloudpickle/cloudpickle_fast.py", line 72, in dumps
    cp.dump(obj)
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/cloudpickle/cloudpickle_fast.py", line 540, in dump
    return Pickler.dump(self, obj)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/cloudpickle/cloudpickle_fast.py", line 630, in reducer_override
    return self._function_reduce(obj)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/cloudpickle/cloudpickle_fast.py", line 503, in _function_reduce
    return self._dynamic_function_reduce(obj)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/cloudpickle/cloudpickle_fast.py", line 484, in _dynamic_function_reduce
    state = _function_getstate(func)
            ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/cloudpickle/cloudpickle_fast.py", line 156, in _function_getstate
    f_globals_ref = _extract_code_globals(func.__code__)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/cloudpickle/cloudpickle.py", line 236, in _extract_code_globals
    out_names = {names[oparg] for _, oparg in _walk_global_ops(co)}
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/cloudpickle/cloudpickle.py", line 236, in <setcomp>
    out_names = {names[oparg] for _, oparg in _walk_global_ops(co)}
                 ~~~~~^^^^^^^
IndexError: tuple index out of range

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/Users//nlu/nlu/pipe/pipeline.py", line 485, in predict
    return __predict__(self, data, output_level, positions, keep_stranger_features, metadata, multithread,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//nlu/nlu/pipe/utils/predict_helper.py", line 267, in __predict__
    pipe.fit()
  File "/Users//nlu/nlu/pipe/pipeline.py", line 204, in fit
    self.vanilla_transformer_pipe = self.spark_estimator_pipe.fit(self.get_sample_spark_dataframe())
                                                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//nlu/nlu/pipe/pipeline.py", line 103, in get_sample_spark_dataframe
    return sparknlp.start().createDataFrame(data=text_df)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/sql/session.py", line 673, in createDataFrame
    return super(SparkSession, self).createDataFrame(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/sql/pandas/conversion.py", line 300, in createDataFrame
    return self._create_dataframe(data, schema, samplingRatio, verifySchema)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/sql/session.py", line 701, in _create_dataframe
    jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
                                           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/rdd.py", line 2618, in _to_java_object_rdd
    return self.ctx._jvm.SerDeUtil.pythonToJava(rdd._jrdd, True)
                                                ^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/rdd.py", line 2949, in _jrdd
    wrapped_func = _wrap_function(self.ctx, self.func, self._prev_jrdd_deserializer,
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/rdd.py", line 2828, in _wrap_function
    pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
                                                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/rdd.py", line 2814, in _prepare_for_python_RDD
    pickled_command = ser.dumps(command)
                      ^^^^^^^^^^^^^^^^^^
  File "/Users//miniconda3/lib/python3.11/site-packages/pyspark/serializers.py", line 447, in dumps
    raise pickle.PicklingError(msg)
_pickle.PicklingError: Could not serialize object: IndexError: tuple index out of range

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions