-
Notifications
You must be signed in to change notification settings - Fork 159
/
Copy pathtransform_python.py
66 lines (55 loc) · 2.5 KB
/
transform_python.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# (C) Copyright IBM Corp. 2024.
# Licensed under the Apache License, Version 2.0 (the “License”);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an “AS IS” BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
import sys
import time
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.runtime.pure_python.runtime_configuration import (
PythonTransformRuntimeConfiguration,
)
from data_processing.utils import ParamsUtils, get_logger
from dpk_pdf2parquet.transform import Pdf2ParquetTransformConfiguration
# Module-level logger, named after this module.
logger = get_logger(__name__)
class Pdf2ParquetPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
    """
    Pure-Python runtime configuration for the PDF2PARQUET transform.

    An instance of this class is what gets handed to the PythonTransformLauncher.
    """

    def __init__(self):
        """
        Build the runtime configuration around a freshly created
        Pdf2ParquetTransformConfiguration. Takes no arguments.
        """
        transform_configuration = Pdf2ParquetTransformConfiguration()
        super().__init__(transform_config=transform_configuration)
# Class used by the notebooks to ingest binary files and create parquet files
class Pdf2Parquet:
    """
    Convenience wrapper for running the pdf2parquet transform from Python code
    (e.g. a notebook) instead of from the command line.

    Keyword arguments given to the constructor are collected as transform
    parameters; transform() converts them to command-line style arguments and
    runs the pure-Python launcher.
    """

    def __init__(self, **kwargs):
        """
        Collect the transform parameters.

        :param kwargs: parameters to pass to the launcher. When BOTH
            ``input_folder`` and ``output_folder`` are given, they are removed
            and replaced by a single ``data_local_config`` entry built with
            ParamsUtils.convert_to_ast. When only one of them (or neither) is
            given, the parameters are kept exactly as supplied.
        """
        self.params = dict(kwargs)
        folder_keys = ("input_folder", "output_folder")
        # Only an input/output *pair* is treated as a local data configuration.
        # The presence check is explicit (rather than a bare ``except``) so that
        # unrelated errors, e.g. a failure while building the config string,
        # are reported instead of being silently discarded.
        if all(key in self.params for key in folder_keys):
            local_conf = {key: self.params.pop(key) for key in folder_keys}
            self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf)

    def transform(self):
        """
        Run the transform with the collected parameters.

        Side effect: ``sys.argv`` is overwritten with the argument list produced
        by ParamsUtils.dict_to_req and is not restored afterwards.

        :return: the value returned by PythonTransformLauncher.launch()
        """
        # Presumably the launcher parses its options from sys.argv, hence the
        # parameters are injected there before it is created and launched.
        sys.argv = ParamsUtils.dict_to_req(d=self.params)
        launcher = PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration())
        return launcher.launch()
if __name__ == "__main__":
    # Script entry point: build the pdf2parquet runtime configuration, wrap it
    # in a pure-Python launcher, announce the start, then run the transform.
    runtime_configuration = Pdf2ParquetPythonTransformConfiguration()
    launcher = PythonTransformLauncher(runtime_configuration)
    logger.info("Launching pdf2parquet transform")
    launcher.launch()