diff --git a/.asf.yaml b/.asf.yaml
index c5d24103072d..dcab78f6fd9a 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -15,18 +15,20 @@
#
github:
- description: SeaTunnel is a distributed, high-performance data integration platform for the synchronization and transformation of massive data (offline & real-time).
+ description: SeaTunnel is a next-generation super high-performance, distributed, massive data integration tool.
homepage: https://seatunnel.apache.org/
labels:
- data-integration
+ - change-data-capture
+ - cdc
- high-performance
- offline
- real-time
- - data-pipeline
- - sql-engine
+ - batch
+ - streaming
+ - data-ingestion
- apache
- - seatunnel
- - etl-framework
+ - elt
enabled_merge_buttons:
squash: true
merge: false
diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml
index 13a4d4b52d9d..fc1cefae0519 100644
--- a/.github/workflows/backend.yml
+++ b/.github/workflows/backend.yml
@@ -20,11 +20,7 @@ on:
push:
pull_request:
branches:
- - dev
- paths-ignore:
- - 'docs/**'
- - '**/*.md'
- - 'seatunnel-ui/**'
+ - '**'
concurrency:
group: backend-${{ github.event.pull_request.number || github.ref }}
@@ -32,7 +28,7 @@ concurrency:
jobs:
license-header:
- if: github.repository == '${{github.actor}}/seatunnel'
+ if: github.repository == 'apache/seatunnel'
name: License header
runs-on: ubuntu-latest
timeout-minutes: 10
@@ -44,7 +40,7 @@ jobs:
uses: apache/skywalking-eyes@985866ce7e324454f61e22eb2db2e998db09d6f3
code-style:
- if: github.repository == '${{github.actor}}/seatunnel'
+ if: github.repository == 'apache/seatunnel'
name: Code style
runs-on: ubuntu-latest
timeout-minutes: 10
@@ -56,7 +52,7 @@ jobs:
run: ./mvnw --batch-mode --quiet --no-snapshot-updates clean spotless:check
dead-link:
- if: github.repository == '${{github.actor}}/seatunnel'
+ if: github.repository != 'apache/seatunnel'
name: Dead links
runs-on: ubuntu-latest
timeout-minutes: 30
@@ -69,7 +65,7 @@ jobs:
done
sanity-check:
- if: github.repository == '${{github.actor}}/seatunnel'
+ if: github.repository != 'apache/seatunnel'
name: Sanity check results
needs: [ license-header, code-style, dead-link ]
runs-on: ubuntu-latest
@@ -83,7 +79,7 @@ jobs:
changes:
runs-on: ubuntu-latest
- if: github.repository == '${{github.actor}}/seatunnel'
+ if: github.repository != 'apache/seatunnel'
timeout-minutes: 10
outputs:
api: ${{ steps.filter.outputs.api }}
@@ -235,7 +231,7 @@ jobs:
echo "modules=$build_modules" >> $GITHUB_OUTPUT
dependency-license:
- if: needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true'
+ if: github.repository != 'apache/seatunnel' && (needs.changes.outputs.api == 'true' || needs.changes.outputs.engine == 'true')
name: Dependency licenses
needs: [ changes, sanity-check ]
runs-on: ubuntu-latest
@@ -262,7 +258,7 @@ jobs:
unit-test:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true' || (needs.changes.outputs.api == 'false' && needs.changes.outputs.ut-modules != '')
+ if: github.repository != 'apache/seatunnel' && (needs.changes.outputs.api == 'true' || (needs.changes.outputs.api == 'false' && needs.changes.outputs.ut-modules != ''))
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -293,7 +289,7 @@ jobs:
updated-modules-integration-test-part-1:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != ''
+ if: github.repository != 'apache/seatunnel' && (needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != '')
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -311,14 +307,14 @@ jobs:
- name: run updated modules integration test (part-1)
if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != ''
run: |
- sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 4 0`
+ sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 7 0`
./mvnw -T 1C -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl $sub_modules -am -Pci
env:
MAVEN_OPTS: -Xmx2048m
updated-modules-integration-test-part-2:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != ''
+ if: github.repository != 'apache/seatunnel' && (needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != '')
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -336,7 +332,7 @@ jobs:
- name: run updated modules integration test (part-2)
if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != ''
run: |
- sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 4 1`
+ sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 7 1`
if [ ! -z $sub_modules ]; then
./mvnw -T 1C -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl $sub_modules -am -Pci
else
@@ -347,7 +343,7 @@ jobs:
updated-modules-integration-test-part-3:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != ''
+ if: github.repository != 'apache/seatunnel' && (needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != '')
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -365,7 +361,7 @@ jobs:
- name: run updated modules integration test (part-3)
if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != ''
run: |
- sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 4 2`
+ sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 7 2`
if [ ! -z $sub_modules ]; then
./mvnw -T 1C -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl $sub_modules -am -Pci
else
@@ -376,7 +372,7 @@ jobs:
updated-modules-integration-test-part-4:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != ''
+ if: github.repository != 'apache/seatunnel' && (needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != '')
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -394,7 +390,91 @@ jobs:
- name: run updated modules integration test (part-4)
if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != ''
run: |
- sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 4 3`
+ sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 7 3`
+ if [ ! -z $sub_modules ]; then
+ ./mvnw -T 1C -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl $sub_modules -am -Pci
+ else
+ echo "sub modules is empty, skipping"
+ fi
+ env:
+ MAVEN_OPTS: -Xmx2048m
+ updated-modules-integration-test-part-5:
+ needs: [ changes, sanity-check ]
+ if: github.repository != 'apache/seatunnel' && (needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != '')
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ java: [ '8', '11' ]
+ os: [ 'ubuntu-latest' ]
+ timeout-minutes: 90
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up JDK ${{ matrix.java }}
+ uses: actions/setup-java@v3
+ with:
+ java-version: ${{ matrix.java }}
+ distribution: 'temurin'
+ cache: 'maven'
+ - name: run updated modules integration test (part-5)
+ if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != ''
+ run: |
+ sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 7 4`
+ if [ ! -z $sub_modules ]; then
+ ./mvnw -T 1C -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl $sub_modules -am -Pci
+ else
+ echo "sub modules is empty, skipping"
+ fi
+ env:
+ MAVEN_OPTS: -Xmx2048m
+ updated-modules-integration-test-part-6:
+ needs: [ changes, sanity-check ]
+ if: github.repository != 'apache/seatunnel' && (needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != '')
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ java: [ '8', '11' ]
+ os: [ 'ubuntu-latest' ]
+ timeout-minutes: 90
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up JDK ${{ matrix.java }}
+ uses: actions/setup-java@v3
+ with:
+ java-version: ${{ matrix.java }}
+ distribution: 'temurin'
+ cache: 'maven'
+ - name: run updated modules integration test (part-6)
+ if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != ''
+ run: |
+ sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 7 5`
+ if [ ! -z $sub_modules ]; then
+ ./mvnw -T 1C -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl $sub_modules -am -Pci
+ else
+ echo "sub modules is empty, skipping"
+ fi
+ env:
+ MAVEN_OPTS: -Xmx2048m
+ updated-modules-integration-test-part-7:
+ needs: [ changes, sanity-check ]
+ if: github.repository != 'apache/seatunnel' && (needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != '')
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ java: [ '8', '11' ]
+ os: [ 'ubuntu-latest' ]
+ timeout-minutes: 90
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up JDK ${{ matrix.java }}
+ uses: actions/setup-java@v3
+ with:
+ java-version: ${{ matrix.java }}
+ distribution: 'temurin'
+ cache: 'maven'
+ - name: run updated modules integration test (part-7)
+ if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != ''
+ run: |
+ sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 7 6`
if [ ! -z $sub_modules ]; then
./mvnw -T 1C -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl $sub_modules -am -Pci
else
@@ -402,10 +482,9 @@ jobs:
fi
env:
MAVEN_OPTS: -Xmx2048m
-
engine-v2-it:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true'
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -429,7 +508,7 @@ jobs:
transform-v2-it-part-1:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true'
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -453,7 +532,7 @@ jobs:
transform-v2-it-part-2:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true'
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -477,7 +556,7 @@ jobs:
all-connectors-it-1:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true'
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -504,7 +583,7 @@ jobs:
all-connectors-it-2:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true'
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -531,7 +610,7 @@ jobs:
all-connectors-it-3:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true'
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -558,7 +637,7 @@ jobs:
all-connectors-it-4:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true'
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -585,7 +664,7 @@ jobs:
all-connectors-it-5:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true'
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -612,7 +691,7 @@ jobs:
all-connectors-it-6:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true'
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -639,7 +718,7 @@ jobs:
all-connectors-it-7:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true'
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -666,7 +745,7 @@ jobs:
jdbc-connectors-it-part-1:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true'
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -690,7 +769,7 @@ jobs:
jdbc-connectors-it-part-2:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true'
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -714,7 +793,7 @@ jobs:
jdbc-connectors-it-part-3:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true'
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -738,7 +817,7 @@ jobs:
jdbc-connectors-it-part-4:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true'
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -760,9 +839,33 @@ jobs:
env:
MAVEN_OPTS: -Xmx4096m
+ jdbc-connectors-it-part-5:
+ needs: [ changes, sanity-check ]
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ java: [ '8', '11' ]
+ os: [ 'ubuntu-latest' ]
+ timeout-minutes: 90
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up JDK ${{ matrix.java }}
+ uses: actions/setup-java@v3
+ with:
+ java-version: ${{ matrix.java }}
+ distribution: 'temurin'
+ cache: 'maven'
+ - name: run jdbc connectors integration test (part-5)
+ if: needs.changes.outputs.api == 'true'
+ run: |
+ ./mvnw -B -T 1C verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl :connector-jdbc-e2e-part-5 -am -Pci
+ env:
+ MAVEN_OPTS: -Xmx4096m
+
kafka-connector-it:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true'
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -786,7 +889,7 @@ jobs:
rocketmq-connector-it:
needs: [ changes, sanity-check ]
- if: needs.changes.outputs.api == 'true'
+ if: github.repository != 'apache/seatunnel' && needs.changes.outputs.api == 'true'
runs-on: ${{ matrix.os }}
strategy:
matrix:
@@ -806,4 +909,4 @@ jobs:
run: |
./mvnw -B -T 1C verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl :connector-rocketmq-e2e -am -Pci
env:
- MAVEN_OPTS: -Xmx4096m
\ No newline at end of file
+ MAVEN_OPTS: -Xmx4096m
diff --git a/.github/workflows/notify_test_workflow.yml b/.github/workflows/notify_test_workflow.yml
new file mode 100644
index 000000000000..84196ac888ef
--- /dev/null
+++ b/.github/workflows/notify_test_workflow.yml
@@ -0,0 +1,152 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# This workflow intentionally has a general name,
+# because the test status check created in GitHub Actions
+# currently randomly picks any associated workflow.
+# So, the name was chosen to make sense in that context too.
+# See also https://github.community/t/specify-check-suite-when-creating-a-checkrun/118380/10
+name: On pull request update
+on:
+ pull_request_target:
+ types: [opened, reopened, synchronize]
+
+jobs:
+ notify:
+ name: Notify test workflow
+ runs-on: ubuntu-20.04
+ permissions:
+ actions: read
+ checks: write
+ steps:
+ - name: "Notify test workflow"
+ uses: actions/github-script@v6
+ with:
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+ script: |
+            const endpoint = 'GET /repos/:owner/:repo/actions/workflows/:id/runs?&branch=:branch'
+            const check_run_endpoint = 'GET /repos/:owner/:repo/commits/:ref/check-runs'
+
+            // Both lookups target the PR's head (fork) repository; 'backend.yml' is the CI workflow file.
+            // TODO: Should use pull_request.user and pull_request.user.repos_url? If a different
+            // person creates a commit to another forked repo, it wouldn't be able to detect.
+            const params = {
+              owner: context.payload.pull_request.head.repo.owner.login,
+              repo: context.payload.pull_request.head.repo.name,
+              id: 'backend.yml',
+              branch: context.payload.pull_request.head.ref,
+            }
+            const check_run_params = {
+              owner: context.payload.pull_request.head.repo.owner.login,
+              repo: context.payload.pull_request.head.repo.name,
+              ref: context.payload.pull_request.head.ref,
+            }
+
+            console.log('Ref: ' + context.payload.pull_request.head.ref)
+            console.log('SHA: ' + context.payload.pull_request.head.sha)
+
+            // Wait 3 seconds to make sure the fork repository triggered a workflow.
+            await new Promise(r => setTimeout(r, 3000))
+
+            let runs
+            try {
+              runs = await github.request(endpoint, params)
+            } catch (error) {
+              console.error(error)
+              // Assume that runs were not found; handled by the `!runs` branch below.
+            }
+
+            const name = 'Build'
+            const head_sha = context.payload.pull_request.head.sha
+            let status = 'queued'
+
+            if (!runs || runs.data.workflow_runs.length === 0) {
+              status = 'completed'
+              const conclusion = 'action_required'
+              // Await the request so that a failed check creation fails this step instead of being dropped.
+              await github.rest.checks.create({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                name: name,
+                head_sha: head_sha,
+                status: status,
+                conclusion: conclusion,
+                output: {
+                  title: 'Workflow run detection failed',
+                  summary: `
+            Unable to detect the workflow run for testing the changes in your PR.
+
+            1. If you did not enable GitHub Actions in your forked repository, please enable it by clicking the button as shown in the image below. See also [Disabling or limiting GitHub Actions for a repository](https://docs.github.com/en/github/administering-a-repository/disabling-or-limiting-github-actions-for-a-repository) for more details.
+            2. It is possible your branch is based on an outdated \`dev\` branch of Apache SeaTunnel, please sync your branch to the latest \`dev\` branch. For example as below:
+            \`\`\`bash
+            git fetch upstream
+            git rebase upstream/dev
+            git push origin YOUR_BRANCH --force
+            \`\`\``,
+                  images: [
+                    {
+                      alt: 'enabling workflows button',
+                      image_url: 'https://raw.githubusercontent.com/apache/spark/master/.github/workflows/images/workflow-enable-button.png'
+                    }
+                  ]
+                }
+              })
+            } else {
+              const run_id = runs.data.workflow_runs[0].id
+
+              if (runs.data.workflow_runs[0].head_sha != context.payload.pull_request.head.sha) {
+                throw new Error('There was a new unsynced commit pushed. Please retrigger the workflow.');
+              }
+
+              // Here we get check run ID to provide Check run view instead of Actions view, see also SPARK-37879.
+              const check_runs = await github.request(check_run_endpoint, check_run_params)
+              const check_run_head = check_runs.data.check_runs.find(r => r.name === "Run / Check changes")
+
+              if (check_run_head && check_run_head.head_sha != context.payload.pull_request.head.sha) {
+                throw new Error('There was a new unsynced commit pushed. Please retrigger the workflow.');
+              }
+
+              const actions_url = 'https://github.com/'
+                + context.payload.pull_request.head.repo.full_name
+                + '/actions/runs/'
+                + run_id
+
+              // Fall back to the Actions run page when the check run is missing, instead of failing on undefined.
+              const check_run_url = check_run_head
+                ? 'https://github.com/' + context.payload.pull_request.head.repo.full_name + '/runs/' + check_run_head.id
+                : actions_url
+
+              await github.rest.checks.create({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                name: name,
+                head_sha: head_sha,
+                status: status,
+                output: {
+                  title: 'Test results',
+                  summary: '[See test results](' + check_run_url + ')',
+                  text: JSON.stringify({
+                    owner: context.payload.pull_request.head.repo.owner.login,
+                    repo: context.payload.pull_request.head.repo.name,
+                    run_id: run_id
+                  })
+                },
+                details_url: actions_url,
+              })
+            }
diff --git a/.github/workflows/update_build_status.yml b/.github/workflows/update_build_status.yml
new file mode 100644
index 000000000000..05cf4914a25c
--- /dev/null
+++ b/.github/workflows/update_build_status.yml
@@ -0,0 +1,108 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: Update build status workflow
+
+on:
+ schedule:
+ - cron: "*/15 * * * *"
+
+jobs:
+ update:
+ name: Update build status
+ runs-on: ubuntu-20.04
+ permissions:
+ actions: read
+ checks: write
+ steps:
+ - name: "Update build status"
+ uses: actions/github-script@v6
+ with:
+ github-token: ${{ secrets.GITHUB_TOKEN }}
+ script: |
+ const endpoint = 'GET /repos/:owner/:repo/pulls?state=:state'
+ const params = {
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ state: 'open'
+ }
+
+ // Merge states for which a PR is still synced; see https://docs.github.com/en/graphql/reference/enums#mergestatestatus
+ const maybeReady = ['behind', 'clean', 'draft', 'has_hooks', 'unknown', 'unstable'];
+
+ // Iterate open PRs and copy the state of each PR's workflow run onto its 'Build' check run
+ for await (const prs of github.paginate.iterator(endpoint,params)) {
+ // Each page of results
+ for await (const pr of prs.data) {
+ console.log('SHA: ' + pr.head.sha)
+ console.log(' Mergeable status: ' + pr.mergeable_state)
+ if (pr.mergeable_state == null || maybeReady.includes(pr.mergeable_state)) {
+ const checkRuns = await github.request('GET /repos/{owner}/{repo}/commits/{ref}/check-runs', {
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ ref: pr.head.sha
+ })
+
+ // Iterate over the GitHub check runs of the PR's head commit
+ for await (const cr of checkRuns.data.check_runs) {
+ if (cr.name == 'Build' && cr.conclusion != "action_required") {
+ // output.text holds, as JSON, the owner/repo/run_id parameters for the request below.
+ const params = JSON.parse(cr.output.text)
+
+ // Get the workflow run in the forked repository
+ let run
+ try {
+ run = await github.request('GET /repos/{owner}/{repo}/actions/runs/{run_id}', params)
+ } catch (error) {
+ console.error(error)
+ // Run not found. This can happen when the PR author removes GitHub Actions runs or
+ // disables GitHub Actions.
+ continue
+ }
+
+ // Keep syncing the status of the check; the conclusion is only sent once the run has completed
+ if (run.data.status == 'completed') {
+ console.log(' Run ' + cr.id + ': set status (' + run.data.status + ') and conclusion (' + run.data.conclusion + ')')
+ const response = await github.request('PATCH /repos/{owner}/{repo}/check-runs/{check_run_id}', {
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ check_run_id: cr.id,
+ output: cr.output,
+ status: run.data.status,
+ conclusion: run.data.conclusion,
+ details_url: run.data.details_url
+ })
+ } else {
+ console.log(' Run ' + cr.id + ': set status (' + run.data.status + ')')
+ const response = await github.request('PATCH /repos/{owner}/{repo}/check-runs/{check_run_id}', {
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ check_run_id: cr.id,
+ output: cr.output,
+ status: run.data.status,
+ details_url: run.data.details_url
+ })
+ }
+
+ break
+ }
+ }
+ }
+ }
+ }
diff --git a/.gitignore b/.gitignore
index 25977068e4fe..74311a0fa057 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,4 +48,5 @@ test.conf
spark-warehouse
*.flattened-pom.xml
-seatunnel-examples
\ No newline at end of file
+seatunnel-examples
+/lib/*
\ No newline at end of file
diff --git a/DISCLAIMER b/DISCLAIMER
index fac720f1f3eb..517e33ffafa9 100644
--- a/DISCLAIMER
+++ b/DISCLAIMER
@@ -1,4 +1,4 @@
-Apache SeaTunnel (incubating) is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator PMC.
+Apache SeaTunnel is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator PMC.
Incubation is required of all newly accepted projects until a further review indicates that the infrastructure,
communications, and decision making process have stabilized in a manner consistent with other successful ASF projects.
While incubation status is not necessarily a reflection of the completeness or stability of the code,
diff --git a/README.md b/README.md
index e7f898bd6594..0bce6778f0f1 100644
--- a/README.md
+++ b/README.md
@@ -126,7 +126,7 @@ Please follow this [document](docs/en/contribution/setup.md).
* Mail list: **dev@seatunnel.apache.org**. Mail to `dev-subscribe@seatunnel.apache.org`, follow the reply to subscribe
the mail list.
-* Slack: https://the-asf.slack.com/archives/C053HND1D6X
+* Slack: https://s.apache.org/seatunnel-slack
* Twitter: https://twitter.com/ASFSeaTunnel
* [Bilibili](https://space.bilibili.com/1542095008) (for Chinese users)
diff --git a/bin/install-plugin.cmd b/bin/install-plugin.cmd
new file mode 100644
index 000000000000..4df77b968caf
--- /dev/null
+++ b/bin/install-plugin.cmd
@@ -0,0 +1,60 @@
+@echo off
+REM Licensed to the Apache Software Foundation (ASF) under one or more
+REM contributor license agreements. See the NOTICE file distributed with
+REM this work for additional information regarding copyright ownership.
+REM The ASF licenses this file to You under the Apache License, Version 2.0
+REM (the "License"); you may not use this file except in compliance with
+REM the License. You may obtain a copy of the License at
+REM
+REM http://www.apache.org/licenses/LICENSE-2.0
+REM
+REM Unless required by applicable law or agreed to in writing, software
+REM distributed under the License is distributed on an "AS IS" BASIS,
+REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+REM See the License for the specific language governing permissions and
+REM limitations under the License.
+
+REM This script downloads the hadoop shade jar and the connector plug-ins needed at runtime.
+REM Every connector listed in config\plugin_config is installed; lines starting with "-" or "#" are skipped.
+REM To choose what you need, edit the plug-in names in config\plugin_config.
+
+REM Resolve SEATUNNEL_HOME as the parent of the directory that contains this script
+set "SEATUNNEL_HOME=%~dp0..\"
+echo Set SEATUNNEL_HOME to [%SEATUNNEL_HOME%]
+
+REM Connector default version is 2.3.3, you can also choose a custom version. eg: 2.1.2: install-plugin.cmd 2.1.2
+set "version=2.3.3"
+if not "%~1"=="" set "version=%~1"
+echo Install hadoop shade jar, usage version is %version%
+
+REM Create the lib directory, which receives the hadoop shade jar
+if not exist "%SEATUNNEL_HOME%\lib" (
+ mkdir "%SEATUNNEL_HOME%\lib"
+ echo create lib directory
+)
+
+call "%SEATUNNEL_HOME%\mvnw.cmd" dependency:get -DgroupId="org.apache.seatunnel" -Dclassifier="optional" -DartifactId="seatunnel-hadoop3-3.1.4-uber" -Dversion="%version%" -Ddest="%SEATUNNEL_HOME%\lib"
+
+echo Install SeaTunnel connectors plugins, usage version is %version%
+
+REM Create the connectors directory
+if not exist "%SEATUNNEL_HOME%\connectors" (
+ mkdir "%SEATUNNEL_HOME%\connectors"
+ echo create connectors directory
+)
+
+REM Create the seatunnel connectors directory (for v2)
+if not exist "%SEATUNNEL_HOME%\connectors\seatunnel" (
+ mkdir "%SEATUNNEL_HOME%\connectors\seatunnel"
+ echo create seatunnel connectors directory
+)
+
+for /f "usebackq delims=" %%a in ("%SEATUNNEL_HOME%\config\plugin_config") do (
+ set "line=%%a"
+ setlocal enabledelayedexpansion
+ if "!line:~0,1!" neq "-" if "!line:~0,1!" neq "#" (
+ echo install connector : !line!
+ call "%SEATUNNEL_HOME%\mvnw.cmd" dependency:get -DgroupId="org.apache.seatunnel" -DartifactId="!line!" -Dversion="%version%" -Ddest="%SEATUNNEL_HOME%\connectors\seatunnel"
+ )
+ endlocal
+)
diff --git a/bin/install-plugin.sh b/bin/install-plugin.sh
index cddf4ebf68a2..b8a1cca71492 100755
--- a/bin/install-plugin.sh
+++ b/bin/install-plugin.sh
@@ -23,8 +23,8 @@
# get seatunnel home
SEATUNNEL_HOME=$(cd $(dirname $0);cd ../;pwd)
-# connector default version is 2.3.1, you can also choose a custom version. eg: 2.1.2: sh install-plugin.sh 2.1.2
-version=2.3.1
+# connector default version is 2.3.3, you can also choose a custom version. eg: 2.1.2: sh install-plugin.sh 2.1.2
+version=2.3.3
if [ -n "$1" ]; then
version="$1"
diff --git a/config/plugin_config b/config/plugin_config
index 95b952b31bf1..0c7e119ecfc4 100644
--- a/config/plugin_config
+++ b/config/plugin_config
@@ -36,7 +36,7 @@ connector-file-ftp
connector-file-hadoop
connector-file-local
connector-file-oss
-connector-file-oss-jindo
+connector-file-jindo-oss
connector-file-s3
connector-file-sftp
connector-google-sheets
diff --git a/config/seatunnel-env.cmd b/config/seatunnel-env.cmd
new file mode 100644
index 000000000000..79c2d3c117c6
--- /dev/null
+++ b/config/seatunnel-env.cmd
@@ -0,0 +1,21 @@
+@echo off
+REM Licensed to the Apache Software Foundation (ASF) under one or more
+REM contributor license agreements. See the NOTICE file distributed with
+REM this work for additional information regarding copyright ownership.
+REM The ASF licenses this file to You under the Apache License, Version 2.0
+REM (the "License"); you may not use this file except in compliance with
+REM the License. You may obtain a copy of the License at
+REM
+REM http://www.apache.org/licenses/LICENSE-2.0
+REM
+REM Unless required by applicable law or agreed to in writing, software
+REM distributed under the License is distributed on an "AS IS" BASIS,
+REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+REM See the License for the specific language governing permissions and
+REM limitations under the License.
+
+REM Home directory of the Spark distribution; defaults to C:\Program Files\spark when SPARK_HOME is not set.
+if "%SPARK_HOME%" == "" set "SPARK_HOME=C:\Program Files\spark"
+
+REM Home directory of the Flink distribution; defaults to C:\Program Files\flink when FLINK_HOME is not set.
+if "%FLINK_HOME%" == "" set "FLINK_HOME=C:\Program Files\flink"
\ No newline at end of file
diff --git a/config/seatunnel.yaml b/config/seatunnel.yaml
index 7e496ca39adb..5961c839238b 100644
--- a/config/seatunnel.yaml
+++ b/config/seatunnel.yaml
@@ -17,6 +17,7 @@
seatunnel:
engine:
+ history-job-expire-minutes: 1440
backup-count: 1
queue-type: blockingqueue
print-execution-info-interval: 60
@@ -26,8 +27,6 @@ seatunnel:
checkpoint:
interval: 10000
timeout: 60000
- max-concurrent: 1
- tolerable-failure: 2
storage:
type: hdfs
max-retained: 3
diff --git a/docs/en/about.md b/docs/en/about.md
index d2e28693915a..57a800343b02 100644
--- a/docs/en/about.md
+++ b/docs/en/about.md
@@ -2,7 +2,7 @@
-[](https://the-asf.slack.com/archives/C053HND1D6X)
+[](https://s.apache.org/seatunnel-slack)
[](https://twitter.com/ASFSeaTunnel)
SeaTunnel is a very easy-to-use, ultra-high-performance, distributed data integration platform that supports real-time
diff --git a/docs/en/connector-v2/formats/cdc-compatible-debezium-json.md b/docs/en/connector-v2/formats/cdc-compatible-debezium-json.md
index 002bd0c3bec4..e0751a249272 100644
--- a/docs/en/connector-v2/formats/cdc-compatible-debezium-json.md
+++ b/docs/en/connector-v2/formats/cdc-compatible-debezium-json.md
@@ -19,7 +19,6 @@ source {
MySQL-CDC {
result_table_name = "table1"
- hostname = localhost
base-url="jdbc:mysql://localhost:3306/test"
"startup.mode"=INITIAL
catalog {
diff --git a/docs/en/connector-v2/formats/kafka-compatible-kafkaconnect-json.md b/docs/en/connector-v2/formats/kafka-compatible-kafkaconnect-json.md
new file mode 100644
index 000000000000..7de8a9e838b2
--- /dev/null
+++ b/docs/en/connector-v2/formats/kafka-compatible-kafkaconnect-json.md
@@ -0,0 +1,47 @@
+# Kafka source compatible kafka-connect-json
+
+The SeaTunnel Kafka connector supports parsing data extracted through Kafka Connect sources, in particular data extracted by Kafka Connect JDBC and Kafka Connect Debezium.
+
+# How to use
+
+## Kafka output to mysql
+
+```bash
+env {
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ Kafka {
+ bootstrap.servers = "localhost:9092"
+ topic = "jdbc_source_record"
+ result_table_name = "kafka_table"
+ start_mode = earliest
+ schema = {
+ fields {
+ id = "int"
+ name = "string"
+ description = "string"
+ weight = "string"
+ }
+ },
+ format = COMPATIBLE_KAFKA_CONNECT_JSON
+ }
+}
+
+
+sink {
+ Jdbc {
+ driver = com.mysql.cj.jdbc.Driver
+ url = "jdbc:mysql://localhost:3306/seatunnel"
+ user = st_user
+ password = seatunnel
+ generate_sink_sql = true
+ database = seatunnel
+ table = jdbc_sink
+ primary_keys = ["id"]
+ }
+}
+```
+
diff --git a/docs/en/connector-v2/sink/AmazonDynamoDB.md b/docs/en/connector-v2/sink/AmazonDynamoDB.md
index e8fe0b23afbe..6e880fb4af42 100644
--- a/docs/en/connector-v2/sink/AmazonDynamoDB.md
+++ b/docs/en/connector-v2/sink/AmazonDynamoDB.md
@@ -20,7 +20,6 @@ Write data to Amazon DynamoDB
| secret_access_key | string | yes | - |
| table | string | yes | - |
| batch_size | string | no | 25 |
-| batch_interval_ms | string | no | 1000 |
| common-options | | no | - |
### url [string]
diff --git a/docs/en/connector-v2/sink/Console.md b/docs/en/connector-v2/sink/Console.md
index fd7623d7d389..55df281b2752 100644
--- a/docs/en/connector-v2/sink/Console.md
+++ b/docs/en/connector-v2/sink/Console.md
@@ -14,14 +14,24 @@ Used to send data to Console. Both support streaming and batch mode.
## Options
-| name | type | required | default value |
-|----------------|------|----------|---------------|
-| common-options | | no | - |
+| name | type | required | default value |
+|--------------------|---------|----------|---------------|
+| common-options | | no | - |
+| log.print.data | boolean | no | true |
+| log.print.delay.ms | int | no | 0 |
### common options
Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details
+### log.print.data
+
+Flag to determine whether data should be printed in the logs. The default value is `true`.
+
+### log.print.delay.ms
+
+Delay in milliseconds between printing each data item to the logs. The default value is `0`.
+
## Example
simple:
diff --git a/docs/en/connector-v2/sink/DB2.md b/docs/en/connector-v2/sink/DB2.md
index 8f5a7285e35d..fc0aaca0943c 100644
--- a/docs/en/connector-v2/sink/DB2.md
+++ b/docs/en/connector-v2/sink/DB2.md
@@ -65,8 +65,7 @@ semantics (using XA transaction guarantee).
| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance |
| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. |
| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) |
-| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms` , the data will be flushed into the database |
-| batch_interval_ms | Int | No | 1000 | For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the database |
+| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval` , the data will be flushed into the database |
| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to set `xa_data_source_class_name`. |
| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to |
| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, DB2 is `com.db2.cj.jdbc.Db2XADataSource`, and please refer to appendix for other data sources |
diff --git a/docs/en/connector-v2/sink/Feishu.md b/docs/en/connector-v2/sink/Feishu.md
index bd45977ce809..5573086db3e4 100644
--- a/docs/en/connector-v2/sink/Feishu.md
+++ b/docs/en/connector-v2/sink/Feishu.md
@@ -2,41 +2,55 @@
> Feishu sink connector
-## Description
-
-Used to launch Feishu web hooks using data.
-
-> For example, if the data from upstream is [`age: 12, name: tyrantlucifer`], the body content is the following: `{"age": 12, "name": "tyrantlucifer"}`
+## Support Those Engines
-**Tips: Feishu sink only support `post json` webhook and the data from source will be treated as body content in web hook.**
+> Spark
+> Flink
+> SeaTunnel Zeta
-## Key features
+## Key Features
- [ ] [exactly-once](../../concept/connector-v2-features.md)
+- [ ] [cdc](../../concept/connector-v2-features.md)
-## Options
-
-| name | type | required | default value |
-|----------------|--------|----------|---------------|
-| url | String | Yes | - |
-| headers | Map | No | - |
-| common-options | | no | - |
-
-### url [string]
-
-Feishu webhook url
-
-### headers [Map]
-
-Http request headers
+## Description
-### common options
+Used to launch Feishu web hooks using data.
-Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details
+> For example, if the data from upstream is [`age: 12, name: tyrantlucifer`], the body content is the following: `{"age": 12, "name": "tyrantlucifer"}`
-## Example
+**Tips: Feishu sink only supports `post json` webhooks, and the data from the source will be treated as the body content of the webhook.**
-simple:
+## Data Type Mapping
+
+| Seatunnel Data type | Feishu Data type |
+|-----------------------------|------------------|
+| ROW MAP | Json |
+| NULL | null |
+| BOOLEAN | boolean |
+| TINYINT | byte |
+| SMALLINT | short |
+| INT | int |
+| BIGINT | long |
+| FLOAT | float |
+| DOUBLE | double |
+| DECIMAL | BigDecimal |
+| BYTES | byte[] |
+| STRING | String |
+| DATE TIMESTAMP TIME | String |
+| ARRAY | JsonArray |
+
+## Sink Options
+
+| Name | Type | Required | Default | Description |
+|----------------|--------|----------|---------|-----------------------------------------------------------------------------------------------------|
+| url | String | Yes | - | Feishu webhook url |
+| headers | Map | No | - | Http request headers |
+| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details |
+
+## Task Example
+
+### Simple:
```hocon
Feishu {
diff --git a/docs/en/connector-v2/sink/HdfsFile.md b/docs/en/connector-v2/sink/HdfsFile.md
index 34ce19714b4d..135c5115c2aa 100644
--- a/docs/en/connector-v2/sink/HdfsFile.md
+++ b/docs/en/connector-v2/sink/HdfsFile.md
@@ -1,20 +1,14 @@
# HdfsFile
-> HDFS file sink connector
+> HDFS File Sink Connector
-## Description
-
-Output data to hdfs file
-
-:::tip
-
-If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x.
-
-If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this.
+## Support Those Engines
-:::
+> Spark
+> Flink
+> SeaTunnel Zeta
-## Key features
+## Key Features
- [x] [exactly-once](../../concept/connector-v2-features.md)
@@ -30,183 +24,120 @@ By default, we use 2PC commit to ensure `exactly-once`
- [x] compress codec
- [x] lzo
-## Options
-
-| name | type | required | default value | remarks |
-|----------------------------------|---------|----------|--------------------------------------------|-----------------------------------------------------------|
-| fs.defaultFS | string | yes | - | |
-| path | string | yes | - | |
-| hdfs_site_path | string | no | - | |
-| custom_filename | boolean | no | false | Whether you need custom the filename |
-| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true |
-| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true |
-| file_format_type | string | no | "csv" | |
-| field_delimiter | string | no | '\001' | Only used when file_format_type is text |
-| row_delimiter | string | no | "\n" | Only used when file_format_type is text |
-| have_partition | boolean | no | false | Whether you need processing partitions. |
-| partition_by | array | no | - | Only used then have_partition is true |
-| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true |
-| is_partition_field_write_in_file | boolean | no | false | Only used then have_partition is true |
-| sink_columns | array | no | | When this parameter is empty, all fields are sink columns |
-| is_enable_transaction | boolean | no | true | |
-| batch_size | int | no | 1000000 | |
-| compress_codec | string | no | none | |
-| kerberos_principal | string | no | - |
-| kerberos_keytab_path | string | no | - | |
-| compress_codec | string | no | none | |
-| common-options | object | no | - | |
-| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. |
-| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. |
-
-### fs.defaultFS [string]
-
-The hadoop cluster address that start with `hdfs://`, for example: `hdfs://hadoopcluster`
-
-### path [string]
-
-The target dir path is required.
-
-### hdfs_site_path [string]
-
-The path of `hdfs-site.xml`, used to load ha configuration of namenodes
-
-### custom_filename [boolean]
-
-Whether custom the filename
-
-### file_name_expression [string]
-
-Only used when `custom_filename` is `true`
-
-`file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`,
-`${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`.
-
-Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file.
-
-### filename_time_format [string]
-
-Only used when `custom_filename` is `true`
-
-When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows:
-
-| Symbol | Description |
-|--------|--------------------|
-| y | Year |
-| M | Month |
-| d | Day of month |
-| H | Hour in day (0-23) |
-| m | Minute in hour |
-| s | Second in minute |
-
-### file_format_type [string]
-
-We supported as the following file types:
-
-`text` `json` `csv` `orc` `parquet` `excel`
-
-Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`.
-
-### field_delimiter [string]
-
-The separator between columns in a row of data. Only needed by `text` file format.
-
-### row_delimiter [string]
-
-The separator between rows in a file. Only needed by `text` file format.
-
-### have_partition [boolean]
-
-Whether you need processing partitions.
-
-### partition_by [array]
-
-Only used when `have_partition` is `true`.
-
-Partition data based on selected fields.
-
-### partition_dir_expression [string]
-
-Only used when `have_partition` is `true`.
-
-If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory.
-
-Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field.
-
-### is_partition_field_write_in_file [boolean]
-
-Only used when `have_partition` is `true`.
-
-If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file.
-
-For example, if you want to write a Hive Data File, Its value should be `false`.
-
-### sink_columns [array]
-
-Which columns need be write to file, default value is all of the columns get from `Transform` or `Source`.
-The order of the fields determines the order in which the file is actually written.
-
-### is_enable_transaction [boolean]
-
-If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory.
-
-Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file.
-
-Only support `true` now.
-
-### batch_size [int]
-
-The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger.
-
-### compress_codec [string]
-
-The compress codec of files and the details that supported as the following shown:
-
-- txt: `lzo` `none`
-- json: `lzo` `none`
-- csv: `lzo` `none`
-- orc: `lzo` `snappy` `lz4` `zlib` `none`
-- parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none`
-
-Tips: excel type does not support any compression format
-
-### kerberos_principal [string]
-
-The principal of kerberos
-
-### kerberos_keytab_path [string]
-
-The keytab path of kerberos
-
-### common options
-
-Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details
+## Description
-### max_rows_in_memory [int]
+Output data to hdfs file
-When File Format is Excel,The maximum number of data items that can be cached in the memory.
+## Supported DataSource Info
+
+| Datasource | Supported Versions |
+|------------|--------------------|
+| HdfsFile | hadoop 2.x and 3.x |
+
+## Sink Options
+
+| Name | Type | Required | Default | Description |
+|----------------------------------|---------|----------|--------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| fs.defaultFS | string | yes | - | The hadoop cluster address that start with `hdfs://`, for example: `hdfs://hadoopcluster` |
+| path | string | yes | - | The target dir path is required. |
+| hdfs_site_path | string | no | - | The path of `hdfs-site.xml`, used to load ha configuration of namenodes |
+| custom_filename | boolean | no | false | Whether you need custom the filename |
+| file_name_expression | string | no | "${transactionId}" | Only used when `custom_filename` is `true`. `file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`. `${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. Please note that, if `is_enable_transaction` is `true`, we will automatically add `${transactionId}_` at the head of the file name. |
+| filename_time_format | string | no | "yyyy.MM.dd" | Only used when `custom_filename` is `true`. When the format in the `file_name_expression` parameter is `xxxx-${now}`, `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd`. The commonly used time formats are listed as follows: [y: Year, M: Month, d: Day of month, H: Hour in day (0-23), m: Minute in hour, s: Second in minute]. |
+| file_format_type | string | no | "csv" | The following file types are supported: `text` `json` `csv` `orc` `parquet` `excel`. Please note that the final file name will end with the suffix of the `file_format_type`; the suffix of the text file is `txt`. |
+| field_delimiter | string | no | '\001' | Only used when `file_format_type` is `text`. The separator between columns in a row of data. |
+| row_delimiter | string | no | "\n" | Only used when `file_format_type` is `text`. The separator between rows in a file. |
+| have_partition | boolean | no | false | Whether you need processing partitions. |
+| partition_by | array | no | - | Only used when `have_partition` is `true`. Partition data based on selected fields. |
+| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used when `have_partition` is `true`. If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. |
+| is_partition_field_write_in_file | boolean | no | false | Only used when `have_partition` is `true`. If `is_partition_field_write_in_file` is `true`, the partition field and its value will be written into the data file. For example, if you want to write a Hive data file, its value should be `false`. |
+| sink_columns | array | no | | The columns that need to be written to the file. When this parameter is empty, all fields are sink columns, i.e. the default value is all of the columns obtained from `Transform` or `Source`. The order of the fields determines the order in which the file is actually written. |
+| is_enable_transaction | boolean | no | true | If `is_enable_transaction` is `true`, we will ensure that data will not be lost or duplicated when it is written to the target directory. Please note that, if `is_enable_transaction` is `true`, we will automatically add `${transactionId}_` at the head of the file name. Only `true` is supported now. |
+| batch_size | int | no | 1000000 | The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is jointly determined by `batch_size` and `checkpoint.interval`. If the value of `checkpoint.interval` is large enough, the sink writer will write rows into a file until the number of rows in the file exceeds `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint is triggered. |
+| compress_codec | string | no | none | The compression codec of files. The supported codecs per file format are: [txt: `lzo` `none`, json: `lzo` `none`, csv: `lzo` `none`, orc: `lzo` `snappy` `lz4` `zlib` `none`, parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none`]. Tips: the excel type does not support any compression format. |
+| kerberos_principal | string | no | - | The principal of kerberos |
+| kerberos_keytab_path | string | no | - | The keytab path of kerberos |
+| compress_codec | string | no | none | compress codec |
+| common-options | object | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details |
+| max_rows_in_memory | int | no | - | Only used when `file_format_type` is `excel`. The maximum number of data items that can be cached in memory. |
+| sheet_name | string | no | Sheet${Random number} | Only used when `file_format_type` is `excel`. The name of the sheet to write in the workbook. |
+
+### Tips
+
+> If you use Spark/Flink, you must ensure that your Spark/Flink cluster has already integrated Hadoop in order to use this connector. The tested Hadoop version is 2.x. If you use SeaTunnel Engine, the Hadoop jar is integrated automatically when you download and install SeaTunnel Engine. You can check the jar packages under `${SEATUNNEL_HOME}/lib` to confirm this.
+
+## Task Example
+
+### Simple:
+
+> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to Hdfs.
-### sheet_name [string]
+```
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
-Writer the sheet of the workbook
+source {
+ # This is a example source plugin **only for test and demonstrate the feature source plugin**
+ FakeSource {
+ parallelism = 1
+ result_table_name = "fake"
+ row.num = 16
+ schema = {
+ fields {
+ c_map = "map"
+ c_array = "array"
+ c_string = string
+ c_boolean = boolean
+ c_tinyint = tinyint
+ c_smallint = smallint
+ c_int = int
+ c_bigint = bigint
+ c_float = float
+ c_double = double
+ c_decimal = "decimal(30, 8)"
+ c_bytes = bytes
+ c_date = date
+ c_timestamp = timestamp
+ }
+ }
+ }
+ # If you would like to get more information about how to configure seatunnel and see full list of source plugins,
+ # please go to https://seatunnel.apache.org/docs/category/source-v2
+}
-## Example
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/category/transform-v2
+}
-For orc file format simple config
+sink {
+ HdfsFile {
+ fs.defaultFS = "hdfs://hadoopcluster"
+ path = "/tmp/hive/warehouse/test2"
+ file_format_type = "orc"
+ }
+ # If you would like to get more information about how to configure seatunnel and see full list of sink plugins,
+ # please go to https://seatunnel.apache.org/docs/category/sink-v2
+}
+```
-```bash
+### For orc file format simple config
+```
HdfsFile {
fs.defaultFS = "hdfs://hadoopcluster"
path = "/tmp/hive/warehouse/test2"
- file_format_type = "orc"
+ file_format_type = "orc"
}
-
```
-For text file format with `have_partition` and `custom_filename` and `sink_columns`
-
-```bash
+### For text file format with `have_partition` and `custom_filename` and `sink_columns`
+```
HdfsFile {
fs.defaultFS = "hdfs://hadoopcluster"
path = "/tmp/hive/warehouse/test2"
@@ -223,13 +154,11 @@ HdfsFile {
sink_columns = ["name","age"]
is_enable_transaction = true
}
-
```
-For parquet file format with `have_partition` and `custom_filename` and `sink_columns`
-
-```bash
+### For parquet file format with `have_partition` and `custom_filename` and `sink_columns`
+```
HdfsFile {
fs.defaultFS = "hdfs://hadoopcluster"
path = "/tmp/hive/warehouse/test2"
@@ -244,32 +173,27 @@ HdfsFile {
sink_columns = ["name","age"]
is_enable_transaction = true
}
-
```
-## Changelog
+### For kerberos simple config
-### 2.2.0-beta 2022-09-26
-
-- Add HDFS File Sink Connector
-
-### 2.3.0-beta 2022-10-20
-
-- [BugFix] Fix the bug of incorrect path in windows environment ([2980](https://github.com/apache/seatunnel/pull/2980))
-- [BugFix] Fix filesystem get error ([3117](https://github.com/apache/seatunnel/pull/3117))
-- [BugFix] Solved the bug of can not parse '\t' as delimiter from config file ([3083](https://github.com/apache/seatunnel/pull/3083))
-
-### 2.3.0 2022-12-30
-
-- [BugFix] Fixed the following bugs that failed to write data to files ([3258](https://github.com/apache/seatunnel/pull/3258))
- - When field from upstream is null it will throw NullPointerException
- - Sink columns mapping failed
- - When restore writer from states getting transaction directly failed
+```
+HdfsFile {
+ fs.defaultFS = "hdfs://hadoopcluster"
+ path = "/tmp/hive/warehouse/test2"
+ hdfs_site_path = "/path/to/your/hdfs_site_path"
+ kerberos_principal = "your_principal@EXAMPLE.COM"
+ kerberos_keytab_path = "/path/to/your/keytab/file.keytab"
+}
+```
-### Next version
+### For compress simple config
-- [Improve] Support setting batch size for every file ([3625](https://github.com/apache/seatunnel/pull/3625))
-- [Improve] Support lzo compression for text in file format ([3782](https://github.com/apache/seatunnel/pull/3782))
-- [Improve] Support kerberos authentication ([3840](https://github.com/apache/seatunnel/pull/3840))
-- [Improve] Support file compress ([3899](https://github.com/apache/seatunnel/pull/3899))
+```
+HdfsFile {
+ fs.defaultFS = "hdfs://hadoopcluster"
+ path = "/tmp/hive/warehouse/test2"
+ compress_codec = "lzo"
+}
+```
diff --git a/docs/en/connector-v2/sink/InfluxDB.md b/docs/en/connector-v2/sink/InfluxDB.md
index e824a41fe686..1dba1fbe4dc8 100644
--- a/docs/en/connector-v2/sink/InfluxDB.md
+++ b/docs/en/connector-v2/sink/InfluxDB.md
@@ -22,7 +22,6 @@ Write data to InfluxDB.
| key_time | string | no | processing time |
| key_tags | array | no | exclude `field` & `key_time` |
| batch_size | int | no | 1024 |
-| batch_interval_ms | int | no | - |
| max_retries | int | no | - |
| retry_backoff_multiplier_ms | int | no | - |
| connect_timeout_ms | long | no | 15000 |
@@ -63,11 +62,7 @@ If not specified, include all fields with `influxDB` measurement field
### batch_size [int]
-For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the influxDB
-
-### batch_interval_ms [int]
-
-For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the influxDB
+For batch writing, when the number of buffered records reaches `batch_size` or the time reaches `checkpoint.interval`, the data will be flushed into InfluxDB
### max_retries [int]
diff --git a/docs/en/connector-v2/sink/IoTDB.md b/docs/en/connector-v2/sink/IoTDB.md
index d60021719e80..554d0bfd06ed 100644
--- a/docs/en/connector-v2/sink/IoTDB.md
+++ b/docs/en/connector-v2/sink/IoTDB.md
@@ -2,193 +2,190 @@
> IoTDB sink connector
-## Description
+## Support Those Engines
-Used to write data to IoTDB.
+> Spark
+> Flink
+> SeaTunnel Zeta
-:::tip
-
-There is a conflict of thrift version between IoTDB and Spark.Therefore, you need to execute `rm -f $SPARK_HOME/jars/libthrift*` and `cp $IOTDB_HOME/lib/libthrift* $SPARK_HOME/jars/` to resolve it.
-
-:::
-
-## Key features
+## Key Features
- [x] [exactly-once](../../concept/connector-v2-features.md)
IoTDB supports the `exactly-once` feature through idempotent writing. If two pieces of data have
the same `key` and `timestamp`, the new data will overwrite the old one.
-## Options
-
-| name | type | required | default value |
-|-----------------------------|---------|----------|--------------------------------|
-| node_urls | list | yes | - |
-| username | string | yes | - |
-| password | string | yes | - |
-| key_device | string | yes | - |
-| key_timestamp | string | no | processing time |
-| key_measurement_fields | array | no | exclude `device` & `timestamp` |
-| storage_group | string | no | - |
-| batch_size | int | no | 1024 |
-| batch_interval_ms | int | no | - |
-| max_retries | int | no | - |
-| retry_backoff_multiplier_ms | int | no | - |
-| max_retry_backoff_ms | int | no | - |
-| default_thrift_buffer_size | int | no | - |
-| max_thrift_frame_size | int | no | - |
-| zone_id | string | no | - |
-| enable_rpc_compression | boolean | no | - |
-| connection_timeout_in_ms | int | no | - |
-| common-options | | no | - |
-
-### node_urls [list]
-
-`IoTDB` cluster address, the format is `["host:port", ...]`
-
-### username [string]
-
-`IoTDB` user username
-
-### password [string]
-
-`IoTDB` user password
-
-### key_device [string]
-
-Specify field name of the `IoTDB` deviceId in SeaTunnelRow
-
-### key_timestamp [string]
-
-Specify field-name of the `IoTDB` timestamp in SeaTunnelRow. If not specified, use processing-time as timestamp
-
-### key_measurement_fields [array]
-
-Specify field-name of the `IoTDB` measurement list in SeaTunnelRow. If not specified, include all fields but exclude `device` & `timestamp`
-
-### storage_group [string]
-
-Specify device storage group(path prefix)
-
-example: deviceId = ${storage_group} + "." + ${key_device}
-
-### batch_size [int]
-
-For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the IoTDB
-
-### batch_interval_ms [int]
-
-For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the IoTDB
-
-### max_retries [int]
-
-The number of retries to flush failed
-
-### retry_backoff_multiplier_ms [int]
-
-Using as a multiplier for generating the next delay for backoff
-
-### max_retry_backoff_ms [int]
-
-The amount of time to wait before attempting to retry a request to `IoTDB`
-
-### default_thrift_buffer_size [int]
+## Description
-Thrift init buffer size in `IoTDB` client
+Used to write data to IoTDB.
-### max_thrift_frame_size [int]
+:::tip
-Thrift max frame size in `IoTDB` client
+There is a conflict of thrift version between IoTDB and Spark.Therefore, you need to execute `rm -f $SPARK_HOME/jars/libthrift*` and `cp $IOTDB_HOME/lib/libthrift* $SPARK_HOME/jars/` to resolve it.
-### zone_id [string]
+:::
-java.time.ZoneId in `IoTDB` client
+## Supported DataSource Info
+
+| Datasource | Supported Versions | Url |
+|------------|--------------------|----------------|
+| IoTDB | `>= 0.13.0` | localhost:6667 |
+
+## Database Dependency
+
+## Data Type Mapping
+
+| IoTDB Data type | SeaTunnel Data type |
+|-----------------|---------------------|
+| BOOLEAN | BOOLEAN |
+| INT32 | TINYINT |
+| INT32 | SMALLINT |
+| INT32 | INT |
+| INT64 | BIGINT |
+| FLOAT | FLOAT |
+| DOUBLE | DOUBLE |
+| TEXT | STRING |
+
+## Sink Options
+
+| Name | Type | Required | Default | Description |
+|-----------------------------|---------|----------|--------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| node_urls | String | Yes | - | `IoTDB` cluster address, the format is `"host1:port"` or `"host1:port,host2:port"` |
+| username | String | Yes | - | `IoTDB` user username |
+| password | String | Yes | - | `IoTDB` user password |
+| key_device | String | Yes | - | Specify field name of the `IoTDB` deviceId in SeaTunnelRow |
+| key_timestamp | String | No | processing time | Specify field-name of the `IoTDB` timestamp in SeaTunnelRow. If not specified, use processing-time as timestamp |
+| key_measurement_fields | Array | No | exclude `device` & `timestamp` | Specify field-name of the `IoTDB` measurement list in SeaTunnelRow. If not specified, include all fields but exclude `device` & `timestamp` |
+| storage_group | String | No | - | Specify device storage group (path prefix). Example: deviceId = ${storage_group} + "." + ${key_device} |
+| batch_size | Integer | No | 1024 | For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `checkpoint.interval`, the data will be flushed into the IoTDB |
+| max_retries | Integer | No | - | The number of retries to flush failed |
+| retry_backoff_multiplier_ms | Integer | No | - | Using as a multiplier for generating the next delay for backoff |
+| max_retry_backoff_ms | Integer | No | - | The amount of time to wait before attempting to retry a request to `IoTDB` |
+| default_thrift_buffer_size | Integer | No | - | Thrift init buffer size in `IoTDB` client |
+| max_thrift_frame_size | Integer | No | - | Thrift max frame size in `IoTDB` client |
+| zone_id | String | No | - | java.time.ZoneId in `IoTDB` client |
+| enable_rpc_compression | Boolean | No | - | Enable rpc compression in `IoTDB` client |
+| connection_timeout_in_ms | Integer | No | - | The maximum time (in ms) to wait when connecting to `IoTDB` |
+| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details |
-### enable_rpc_compression [boolean]
+## Examples
-Enable rpc compression in `IoTDB` client
+```hocon
+env {
+ execution.parallelism = 2
+ job.mode = "BATCH"
+}
-### connection_timeout_in_ms [int]
+source {
+ FakeSource {
+ row.num = 16
+ bigint.template = [1664035200001]
+ schema = {
+ fields {
+ device_name = "string"
+ temperature = "float"
+ moisture = "int"
+ event_ts = "bigint"
+ c_string = "string"
+ c_boolean = "boolean"
+ c_tinyint = "tinyint"
+ c_smallint = "smallint"
+ c_int = "int"
+ c_bigint = "bigint"
+ c_float = "float"
+ c_double = "double"
+ }
+ }
+ }
+}
-The maximum time (in ms) to wait when connecting to `IoTDB`
+...
-### common options
+```
-Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details
+Upstream SeaTunnelRow data format is the following:
-## Examples
+| device_name | temperature | moisture | event_ts | c_string | c_boolean | c_tinyint | c_smallint | c_int | c_bigint | c_float | c_double |
+|--------------------------|-------------|----------|---------------|----------|-----------|-----------|------------|-------|------------|---------|----------|
+| root.test_group.device_a | 36.1 | 100 | 1664035200001 | abc1 | true | 1 | 1 | 1 | 2147483648 | 1.0 | 1.0 |
+| root.test_group.device_b | 36.2 | 101 | 1664035200001 | abc2 | false | 2 | 2 | 2 | 2147483649 | 2.0 | 2.0 |
+| root.test_group.device_c | 36.3 | 102 | 1664035200001 | abc3 | false | 3 | 3 | 3 | 2147483649 | 3.0 | 3.0 |
### Case1
-Common options:
+Only fill in the required config.
+Use the current processing time as the timestamp, and include all fields except `device` & `timestamp` as measurement fields.
```hocon
sink {
IoTDB {
- node_urls = ["localhost:6667"]
+ node_urls = "localhost:6667"
username = "root"
password = "root"
- batch_size = 1024
- batch_interval_ms = 1000
+ key_device = "device_name" # specify the `deviceId` use device_name field
}
}
```
-When you assign `key_device` is `device_name`, for example:
+Output to `IoTDB` data format is the following:
+
+```shell
+IoTDB> SELECT * FROM root.test_group.* align by device;
++------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+
+| Time| Device| temperature| moisture| event_ts| c_string| c_boolean| c_tinyint| c_smallint| c_int| c_bigint| c_float| c_double|
++------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+
+|2023-09-01T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1664035200001| abc1| true| 1| 1| 1| 2147483648| 1.0| 1.0|
+|2023-09-01T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 1664035200001| abc2| false| 2| 2| 2| 2147483649| 2.0| 2.0|
+|2023-09-01T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 1664035200001| abc3| false| 3| 3| 3| 2147483649| 3.0| 3.0|
++------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+
+```
+
+### Case2
+
+Use the source event's time as the timestamp.
```hocon
sink {
IoTDB {
- ...
- key_device = "device_name"
+ node_urls = "localhost:6667"
+ username = "root"
+ password = "root"
+ key_device = "device_name" # specify the `deviceId` use device_name field
+ key_timestamp = "event_ts" # specify the `timestamp` use event_ts field
}
}
```
-Upstream SeaTunnelRow data format is the following:
-
-| device_name | field_1 | field_2 |
-|--------------------------|---------|---------|
-| root.test_group.device_a | 1001 | 1002 |
-| root.test_group.device_b | 2001 | 2002 |
-| root.test_group.device_c | 3001 | 3002 |
-
Output to `IoTDB` data format is the following:
```shell
IoTDB> SELECT * FROM root.test_group.* align by device;
-+------------------------+------------------------+-----------+----------+
-| Time| Device| field_1| field_2|
-+------------------------+------------------------+----------+-----------+
-|2022-09-26T17:50:01.201Z|root.test_group.device_a| 1001| 1002|
-|2022-09-26T17:50:01.202Z|root.test_group.device_b| 2001| 2002|
-|2022-09-26T17:50:01.203Z|root.test_group.device_c| 3001| 3002|
-+------------------------+------------------------+----------+-----------+
++------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+
+| Time| Device| temperature| moisture| event_ts| c_string| c_boolean| c_tinyint| c_smallint| c_int| c_bigint| c_float| c_double|
++------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+
+|2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1664035200001| abc1| true| 1| 1| 1| 2147483648| 1.0| 1.0|
+|2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 1664035200001| abc2| false| 2| 2| 2| 2147483649| 2.0| 2.0|
+|2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 1664035200001| abc3| false| 3| 3| 3| 2147483649| 3.0| 3.0|
++------------------------+------------------------+--------------+-----------+--------------+---------+----------+----------+-----------+------+-----------+--------+---------+
```
-### Case2
+### Case3
-When you assign `key_device`、`key_timestamp`、`key_measurement_fields`, for example:
+Use the source event's time as the timestamp and limit the measurement fields.
```hocon
sink {
IoTDB {
- ...
+ node_urls = "localhost:6667"
+ username = "root"
+ password = "root"
key_device = "device_name"
- key_timestamp = "ts"
+ key_timestamp = "event_ts"
key_measurement_fields = ["temperature", "moisture"]
}
}
```
-Upstream SeaTunnelRow data format is the following:
-
-| ts | device_name | field_1 | field_2 | temperature | moisture |
-|---------------|--------------------------|---------|---------|-------------|----------|
-| 1664035200001 | root.test_group.device_a | 1001 | 1002 | 36.1 | 100 |
-| 1664035200001 | root.test_group.device_b | 2001 | 2002 | 36.2 | 101 |
-| 1664035200001 | root.test_group.device_c | 3001 | 3002 | 36.3 | 102 |
-
Output to `IoTDB` data format is the following:
```shell
diff --git a/docs/en/connector-v2/sink/Jdbc.md b/docs/en/connector-v2/sink/Jdbc.md
index 9d68278cf51e..394fadde8018 100644
--- a/docs/en/connector-v2/sink/Jdbc.md
+++ b/docs/en/connector-v2/sink/Jdbc.md
@@ -41,13 +41,13 @@ support `Xa transactions`. You can set `is_exactly_once=true` to enable it.
| connection_check_timeout_sec | Int | No | 30 |
| max_retries | Int | No | 0 |
| batch_size | Int | No | 1000 |
-| batch_interval_ms | Int | No | 1000 |
| is_exactly_once | Boolean | No | false |
| generate_sink_sql | Boolean | No | false |
| xa_data_source_class_name | String | No | - |
| max_commit_attempts | Int | No | 3 |
| transaction_timeout_sec | Int | No | -1 |
| auto_commit | Boolean | No | true |
+| field_ide | String | No | - |
| common-options | | no | - |
### driver [string]
@@ -107,12 +107,7 @@ The number of retries to submit failed (executeBatch)
### batch_size[int]
-For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms`
-, the data will be flushed into the database
-
-### batch_interval_ms[int]
-
-For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`
+For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database
### is_exactly_once[boolean]
@@ -142,6 +137,12 @@ exactly-once semantics
Automatic transaction commit is enabled by default
+### field_ide [String]
+
+The field "field_ide" is used to identify whether the field needs to be converted to uppercase or lowercase when
+synchronizing from the source to the sink. "ORIGINAL" indicates no conversion is needed, "UPPERCASE" indicates
+conversion to uppercase, and "LOWERCASE" indicates conversion to lowercase.
+
### common options
Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details
@@ -175,6 +176,7 @@ there are some reference value for params above.
| Redshift | com.amazon.redshift.jdbc42.Driver | jdbc:redshift://localhost:5439/testdb | com.amazon.redshift.xa.RedshiftXADataSource | https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42 |
| Snowflake | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | / | https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc |
| Vertica | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433 | / | https://repo1.maven.org/maven2/com/vertica/jdbc/vertica-jdbc/12.0.3-0/vertica-jdbc-12.0.3-0.jar |
+| Kingbase | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | / | https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar |
| OceanBase | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2881 | / | https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.3/oceanbase-client-2.4.3.jar |
## Example
diff --git a/docs/en/connector-v2/sink/Kingbase.md b/docs/en/connector-v2/sink/Kingbase.md
new file mode 100644
index 000000000000..b92b12fc4200
--- /dev/null
+++ b/docs/en/connector-v2/sink/Kingbase.md
@@ -0,0 +1,168 @@
+# Kingbase
+
+> JDBC Kingbase Sink Connector
+
+## Support Connector Version
+
+- 8.6
+
+## Support Those Engines
+
+> Spark
+> Flink
+> SeaTunnel Zeta
+
+## Key Features
+
+- [ ] [exactly-once](../../concept/connector-v2-features.md)
+- [ ] [cdc](../../concept/connector-v2-features.md)
+
+## Description
+
+> `Xa transactions` are used to ensure `exactly-once`, so `exactly-once` is only supported for databases that
+> support `Xa transactions`. You can set `is_exactly_once=true` to enable it. Kingbase currently does not support it.
+
+## Supported DataSource Info
+
+| Datasource | Supported versions | Driver | Url | Maven |
+|------------|--------------------|----------------------|------------------------------------------|------------------------------------------------------------------------------------------------|
+| Kingbase | 8.6 | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | [Download](https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar) |
+
+## Database Dependency
+
+> Please download the driver jar listed under 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/'
+> working directory
+> For example: cp kingbase8-8.6.0.jar $SEATUNNEL_HOME/plugins/jdbc/lib/
+
+## Data Type Mapping
+
+| Kingbase Data type | SeaTunnel Data type |
+|----------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------|
+| BOOL | BOOLEAN |
+| INT2 | SHORT |
+| SMALLSERIAL SERIAL INT4 | INT |
+| INT8 BIGSERIAL | BIGINT |
+| FLOAT4 | FLOAT |
+| FLOAT8 | DOUBLE |
+| NUMERIC | DECIMAL((Get the designated column's specified column size), (Gets the designated column's number of digits to right of the decimal point.)) |
+| BPCHAR CHARACTER VARCHAR TEXT | STRING |
+| TIMESTAMP | LOCALDATETIME |
+| TIME | LOCALTIME |
+| DATE | LOCALDATE |
+| Other data type | Not supported yet |
+
+## Sink Options
+
+| Name | Type | Required | Default | Description |
+|-------------------------------------------|---------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:kingbase8://localhost:54321/db_test |
+| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, if you use Kingbase the value is `com.kingbase8.Driver`. |
+| user | String | No | - | Connection instance user name |
+| password | String | No | - | Connection instance password |
+| query | String | No | - | Use this sql write upstream input datas to database. e.g `INSERT ...`,`query` have the higher priority |
+| database | String | No | - | Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database. This option is mutually exclusive with `query` and has a higher priority. |
+| table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database. This option is mutually exclusive with `query` and has a higher priority. |
+| primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. |
+| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance |
+| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. |
+| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) |
+| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval` , the data will be flushed into the database |
+| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to set `xa_data_source_class_name`. Kingbase currently does not support this option |
+| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to |
+| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver. Kingbase currently does not support this option |
+| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures |
+| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect exactly-once semantics |
+| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default |
+| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details |
+
+### Tips
+
+> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed
+> in parallel according to the concurrency of tasks.
+
+## Task Example
+
+### Simple:
+
+> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends
+> it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having 12 fields. The final target table test_table will also contain 16 rows of data.
+> Before
+> run this job, you need create database test and table test_table in your Kingbase. And if you have not yet installed and
+> deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md)
+> to
+> install and deploy SeaTunnel. And then follow the instructions
+> in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job.
+
+```
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ # This is a example source plugin **only for test and demonstrate the feature source plugin**
+ FakeSource {
+ parallelism = 1
+ result_table_name = "fake"
+ row.num = 16
+ schema = {
+ fields {
+ c_string = string
+ c_boolean = boolean
+ c_tinyint = tinyint
+ c_smallint = smallint
+ c_int = int
+ c_bigint = bigint
+ c_float = float
+ c_double = double
+ c_decimal = "decimal(30, 8)"
+ c_date = date
+ c_time = time
+ c_timestamp = timestamp
+ }
+ }
+ }
+ # If you would like to get more information about how to configure seatunnel and see full list of source plugins,
+ # please go to https://seatunnel.apache.org/docs/category/source-v2
+}
+
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/category/transform-v2
+}
+
+sink {
+ jdbc {
+ url = "jdbc:kingbase8://127.0.0.1:54321/dbname"
+ driver = "com.kingbase8.Driver"
+ user = "root"
+ password = "123456"
+ query = "insert into test_table(c_string,c_boolean,c_tinyint,c_smallint,c_int,c_bigint,c_float,c_double,c_decimal,c_date,c_time,c_timestamp) values(?,?,?,?,?,?,?,?,?,?,?,?)"
+ }
+ # If you would like to get more information about how to configure seatunnel and see full list of sink plugins,
+ # please go to https://seatunnel.apache.org/docs/category/sink-v2
+}
+```
+
+### Generate Sink SQL
+
+> This example does not require you to write complex sql statements; you can configure the database name and table name
+> and the insert statements will be generated for you automatically
+
+```
+sink {
+ jdbc {
+ url = "jdbc:kingbase8://127.0.0.1:54321/dbname"
+ driver = "com.kingbase8.Driver"
+ user = "root"
+ password = "123456"
+ # Automatically generate sql statements based on database table names
+ generate_sink_sql = true
+ database = test
+ table = test_table
+ }
+}
+```
+
diff --git a/docs/en/connector-v2/sink/Mysql.md b/docs/en/connector-v2/sink/Mysql.md
index 92254c1b54fa..860f071df0e0 100644
--- a/docs/en/connector-v2/sink/Mysql.md
+++ b/docs/en/connector-v2/sink/Mysql.md
@@ -2,6 +2,10 @@
> JDBC Mysql Sink Connector
+## Support Mysql Version
+
+- 5.5/5.6/5.7/8.0
+
## Support Those Engines
> Spark
@@ -67,14 +71,14 @@ semantics (using XA transaction guarantee).
| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance |
| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. |
| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) |
-| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms` , the data will be flushed into the database |
-| batch_interval_ms | Int | No | 1000 | For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the database |
+| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval` , the data will be flushed into the database |
| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to set `xa_data_source_class_name`. |
| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to |
| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, mysql is `com.mysql.cj.jdbc.MysqlXADataSource`, and please refer to appendix for other data sources |
| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures |
| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect exactly-once semantics |
| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default |
+| field_ide | String | No | - | Identify whether the field needs to be converted when synchronizing from the source to the sink. `ORIGINAL` indicates no conversion is needed;`UPPERCASE` indicates conversion to uppercase;`LOWERCASE` indicates conversion to lowercase. |
| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details |
### Tips
@@ -119,7 +123,7 @@ transform {
sink {
jdbc {
- url = "jdbc:mysql://localhost:3306/test"
+ url = "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true"
driver = "com.mysql.cj.jdbc.Driver"
user = "root"
password = "123456"
@@ -137,7 +141,7 @@ sink {
```
sink {
jdbc {
- url = "jdbc:mysql://localhost:3306/test"
+ url = "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true"
driver = "com.mysql.cj.jdbc.Driver"
user = "root"
password = "123456"
@@ -156,7 +160,7 @@ sink {
```
sink {
jdbc {
- url = "jdbc:mysql://localhost:3306/test"
+ url = "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true"
driver = "com.mysql.cj.jdbc.Driver"
max_retries = 0
@@ -178,7 +182,7 @@ sink {
```
sink {
jdbc {
- url = "jdbc:mysql://localhost:3306/test"
+ url = "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true"
driver = "com.mysql.cj.jdbc.Driver"
user = "root"
password = "123456"
@@ -188,6 +192,7 @@ sink {
database = test
table = sink_table
primary_keys = ["id","name"]
+ field_ide = UPPERCASE
}
}
```
diff --git a/docs/en/connector-v2/sink/OceanBase.md b/docs/en/connector-v2/sink/OceanBase.md
index ec87ce3d36d1..3cea0b5e6e6d 100644
--- a/docs/en/connector-v2/sink/OceanBase.md
+++ b/docs/en/connector-v2/sink/OceanBase.md
@@ -81,8 +81,7 @@ Write data through jdbc. Support Batch mode and Streaming mode, support concurre
| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance |
| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. |
| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) |
-| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms` , the data will be flushed into the database |
-| batch_interval_ms | Int | No | 1000 | For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the database |
+| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval` , the data will be flushed into the database |
| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to |
| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures |
| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect exactly-once semantics |
diff --git a/docs/en/connector-v2/sink/Oracle.md b/docs/en/connector-v2/sink/Oracle.md
new file mode 100644
index 000000000000..151243f318fb
--- /dev/null
+++ b/docs/en/connector-v2/sink/Oracle.md
@@ -0,0 +1,191 @@
+# Oracle
+
+> JDBC Oracle Sink Connector
+
+## Support Those Engines
+
+> Spark
+> Flink
+> SeaTunnel Zeta
+
+## Key Features
+
+- [x] [exactly-once](../../concept/connector-v2-features.md)
+- [x] [cdc](../../concept/connector-v2-features.md)
+
+> Use `Xa transactions` to ensure `exactly-once`, so `exactly-once` is only supported for databases that
+> support `Xa transactions`. You can set `is_exactly_once=true` to enable it.
+
+## Description
+
+Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once
+semantics (using XA transaction guarantee).
+
+## Supported DataSource Info
+
+| Datasource | Supported Versions | Driver | Url | Maven |
+|------------|----------------------------------------------------------|--------------------------|----------------------------------------|--------------------------------------------------------------------|
+| Oracle | Different dependency version has different driver class. | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@datasource01:1523:xe | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 |
+
+## Database Dependency
+
+> Please download the driver jar from the 'Maven' link above and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory.
+> For example, for the Oracle datasource: cp ojdbc8-xxxxxx.jar $SEATUNNEL_HOME/lib/
+> To support the i18n character set, copy the orai18n.jar to the $SEATUNNEL_HOME/lib/ directory.
+
+## Data Type Mapping
+
+| Oracle Data type | SeaTunnel Data type |
+|--------------------------------------------------------------------------------------|---------------------|
+| INTEGER | INT |
+| FLOAT | DECIMAL(38, 18) |
+| NUMBER(precision <= 9, scale == 0) | INT |
+| NUMBER(9 < precision <= 18, scale == 0) | BIGINT |
+| NUMBER(18 < precision, scale == 0) | DECIMAL(38, 0) |
+| NUMBER(scale != 0) | DECIMAL(38, 18) |
+| BINARY_DOUBLE | DOUBLE |
+| BINARY_FLOAT REAL | FLOAT |
+| CHAR NCHAR NVARCHAR2 VARCHAR2 LONG ROWID NCLOB CLOB | STRING |
+| DATE | DATE |
+| TIMESTAMP TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP |
+| BLOB RAW LONG RAW BFILE | BYTES |
+
+## Options
+
+| Name | Type | Required | Default | Description |
+|-------------------------------------------|---------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:oracle:thin:@datasource01:1523:xe |
+| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, if you use Oracle the value is `oracle.jdbc.OracleDriver`. |
+| user | String | No | - | Connection instance user name |
+| password | String | No | - | Connection instance password |
+| query | String | No | - | Use this sql write upstream input datas to database. e.g `INSERT ...`,`query` have the higher priority |
+| database | String | No | - | Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database. This option is mutually exclusive with `query` and has a higher priority. |
+| table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database. This option is mutually exclusive with `query` and has a higher priority. |
+| primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. |
+| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance |
+| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. |
+| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) |
+| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms` , the data will be flushed into the database |
+| batch_interval_ms | Int | No | 1000 | For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the database |
+| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to set `xa_data_source_class_name`. |
+| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to. |
+| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, Oracle is `oracle.jdbc.xa.client.OracleXADataSource`, and please refer to appendix for other data sources |
+| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures |
+| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect exactly-once semantics |
+| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default |
+| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details |
+
+### Tips
+
+> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks.
+
+## Task Example
+
+### Simple:
+
+> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table, test_table, will also contain 16 rows of data. Before running this job, you need to create the database test and the table test_table in your Oracle. If you have not yet installed and deployed SeaTunnel, follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel, and then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job.
+
+```
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ FakeSource {
+ parallelism = 1
+ result_table_name = "fake"
+ row.num = 16
+ schema = {
+ fields {
+ name = "string"
+ age = "int"
+ }
+ }
+ }
+ # If you would like to get more information about how to configure seatunnel and see full list of source plugins,
+ # please go to https://seatunnel.apache.org/docs/category/source-v2
+}
+
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/category/transform-v2
+}
+
+sink {
+ jdbc {
+ url = "jdbc:oracle:thin:@datasource01:1523:xe"
+ driver = "oracle.jdbc.OracleDriver"
+ user = root
+ password = 123456
+ query = "INSERT INTO TEST.TEST_TABLE(NAME,AGE) VALUES(?,?)"
+ }
+ # If you would like to get more information about how to configure seatunnel and see full list of sink plugins,
+ # please go to https://seatunnel.apache.org/docs/category/sink-v2
+}
+```
+
+### Generate Sink SQL
+
+> This example does not require writing complex sql statements; you can configure the database name and table name, and the insert statements will be generated automatically for you
+
+```
+sink {
+ Jdbc {
+ url = "jdbc:oracle:thin:@datasource01:1523:xe"
+ driver = "oracle.jdbc.OracleDriver"
+ user = root
+ password = 123456
+
+ generate_sink_sql = true
+ database = XE
+ table = "TEST.TEST_TABLE"
+ }
+}
+```
+
+### Exactly-once :
+
+> For scenarios that require precise writes, we guarantee exactly-once semantics
+
+```
+sink {
+ jdbc {
+ url = "jdbc:oracle:thin:@datasource01:1523:xe"
+ driver = "oracle.jdbc.OracleDriver"
+
+ max_retries = 0
+ user = root
+ password = 123456
+ query = "INSERT INTO TEST.TEST_TABLE(NAME,AGE) VALUES(?,?)"
+
+ is_exactly_once = "true"
+
+ xa_data_source_class_name = "oracle.jdbc.xa.client.OracleXADataSource"
+ }
+}
+```
+
+### CDC(Change Data Capture) Event
+
+> CDC change data is also supported. In this case, you need to configure database, table and primary_keys.
+
+```
+sink {
+ jdbc {
+ url = "jdbc:oracle:thin:@datasource01:1523:xe"
+ driver = "oracle.jdbc.OracleDriver"
+ user = root
+ password = 123456
+
+ generate_sink_sql = true
+ # You need to configure both database and table
+ database = XE
+ table = "TEST.TEST_TABLE"
+ primary_keys = ["ID"]
+ }
+}
+```
+
diff --git a/docs/en/connector-v2/sink/PostgreSql.md b/docs/en/connector-v2/sink/PostgreSql.md
index f7d6469b60fc..bcc5616f5ea1 100644
--- a/docs/en/connector-v2/sink/PostgreSql.md
+++ b/docs/en/connector-v2/sink/PostgreSql.md
@@ -36,34 +36,34 @@ semantics (using XA transaction guarantee).
## Data Type Mapping
-| PostgreSQL Data type | SeaTunnel Data type |
-|----------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
-| BOOL | BOOLEAN |
-| _BOOL | ARRAY<BOOLEAN> |
-| BYTEA | BYTES |
-| _BYTEA | ARRAY<TINYINT> |
-| INT2 SMALLSERIAL INT4 SERIAL | INT |
-| _INT2 _INT4 | ARRAY<INT> |
-| INT8 BIGSERIAL | BIGINT |
-| _INT8 | ARRAY<BIGINT> |
-| FLOAT4 | FLOAT |
-| _FLOAT4 | ARRAY<FLOAT> |
-| FLOAT8 | DOUBLE |
-| _FLOAT8 | ARRAY<DOUBLE> |
-| NUMERIC(Get the designated column's specified column size>0) | DECIMAL(Get the designated column's specified column size,Gets the number of digits in the specified column to the right of the decimal point) |
-| NUMERIC(Get the designated column's specified column size<0) | DECIMAL(38, 18) |
-| BPCHAR CHARACTER VARCHAR TEXT GEOMETRY GEOGRAPHY | STRING |
-| _BPCHAR _CHARACTER _VARCHAR _TEXT | ARRAY<STRING> |
-| TIMESTAMP | TIMESTAMP |
-| TIME | TIME |
-| DATE | DATE |
-| OTHER DATA TYPES | NOT SUPPORTED YET |
+| PostgreSQL Data type | SeaTunnel Data type |
+|-----------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
+| BOOL | BOOLEAN |
+| _BOOL | ARRAY<BOOLEAN> |
+| BYTEA | BYTES |
+| _BYTEA | ARRAY<TINYINT> |
+| INT2 SMALLSERIAL INT4 SERIAL | INT |
+| _INT2 _INT4 | ARRAY<INT> |
+| INT8 BIGSERIAL | BIGINT |
+| _INT8 | ARRAY<BIGINT> |
+| FLOAT4 | FLOAT |
+| _FLOAT4 | ARRAY<FLOAT> |
+| FLOAT8 | DOUBLE |
+| _FLOAT8 | ARRAY<DOUBLE> |
+| NUMERIC(Get the designated column's specified column size>0) | DECIMAL(Get the designated column's specified column size,Gets the number of digits in the specified column to the right of the decimal point) |
+| NUMERIC(Get the designated column's specified column size<0) | DECIMAL(38, 18) |
+| BPCHAR CHARACTER VARCHAR TEXT GEOMETRY GEOGRAPHY JSON JSONB | STRING |
+| _BPCHAR _CHARACTER _VARCHAR _TEXT | ARRAY<STRING> |
+| TIMESTAMP | TIMESTAMP |
+| TIME | TIME |
+| DATE | DATE |
+| OTHER DATA TYPES | NOT SUPPORTED YET |
## Options
| Name | Type | Required | Default | Description |
|-------------------------------------------|---------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:postgresql://localhost:5432/test |
+| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:postgresql://localhost:5432/test. If you need to insert json or jsonb types, add the `stringtype=unspecified` option to the JDBC URL. |
| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, if you use PostgreSQL the value is `org.postgresql.Driver`. |
| user | String | No | - | Connection instance user name |
| password | String | No | - | Connection instance password |
@@ -74,14 +74,14 @@ semantics (using XA transaction guarantee).
| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance |
| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. |
| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) |
-| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms` , the data will be flushed into the database |
-| batch_interval_ms | Int | No | 1000 | For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the database |
+| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval` , the data will be flushed into the database |
| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to set `xa_data_source_class_name`. |
| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to. |
| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, PostgreSQL is `org.postgresql.xa.PGXADataSource`, and please refer to appendix for other data sources |
| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures |
| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect exactly-once semantics |
| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default |
+| field_ide | String | No | - | Identify whether the field needs to be converted when synchronizing from the source to the sink. `ORIGINAL` indicates no conversion is needed; `UPPERCASE` indicates conversion to uppercase; `LOWERCASE` indicates conversion to lowercase. |
| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details |
### Tips
@@ -125,6 +125,7 @@ transform {
sink {
jdbc {
+ # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option
url = "jdbc:postgresql://localhost:5432/test"
driver = "org.postgresql.Driver"
user = root
@@ -143,6 +144,7 @@ sink {
```
sink {
Jdbc {
+ # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option
url = "jdbc:postgresql://localhost:5432/test"
driver = org.postgresql.Driver
user = root
@@ -162,6 +164,7 @@ sink {
```
sink {
jdbc {
+ # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option
url = "jdbc:postgresql://localhost:5432/test"
driver = "org.postgresql.Driver"
@@ -184,6 +187,7 @@ sink {
```
sink {
jdbc {
+ # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option
url = "jdbc:postgresql://localhost:5432/test"
driver = "org.postgresql.Driver"
user = root
@@ -194,6 +198,7 @@ sink {
database = test
table = sink_table
primary_keys = ["id","name"]
+ field_ide = UPPERCASE
}
}
```
diff --git a/docs/en/connector-v2/sink/Redis.md b/docs/en/connector-v2/sink/Redis.md
index fcface7da22a..7d2ef237e1ce 100644
--- a/docs/en/connector-v2/sink/Redis.md
+++ b/docs/en/connector-v2/sink/Redis.md
@@ -23,6 +23,7 @@ Used to write data to Redis.
| mode | string | no | single |
| nodes | list | yes when mode=cluster | - |
| format | string | no | json |
+| expire | long | no | -1 |
| common-options | | no | - |
### host [string]
@@ -120,6 +121,10 @@ Connector will generate data as the following and write it to redis:
```
+### expire [long]
+
+Sets the redis key expiration time, in seconds. The default value is -1, which means keys do not expire automatically.
+
### common options
Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details
diff --git a/docs/en/connector-v2/sink/S3File.md b/docs/en/connector-v2/sink/S3File.md
index 7841afdf04e3..4bb670ae38c8 100644
--- a/docs/en/connector-v2/sink/S3File.md
+++ b/docs/en/connector-v2/sink/S3File.md
@@ -1,24 +1,17 @@
# S3File
-> S3 file sink connector
+> S3 File Sink Connector
-## Description
-
-Output data to aws s3 file system.
-
-:::tip
+## Support Those Engines
-If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x.
+> Spark
+> Flink
+> SeaTunnel Zeta
-If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this.
-
-To use this connector you need put hadoop-aws-3.1.4.jar and aws-java-sdk-bundle-1.11.271.jar in ${SEATUNNEL_HOME}/lib dir.
-
-:::
-
-## Key features
+## Key Features
- [x] [exactly-once](../../concept/connector-v2-features.md)
+- [ ] [cdc](../../concept/connector-v2-features.md)
By default, we use 2PC commit to ensure `exactly-once`
@@ -30,59 +23,100 @@ By default, we use 2PC commit to ensure `exactly-once`
- [x] json
- [x] excel
-## Options
-
-| name | type | required | default value | remarks |
-|----------------------------------|---------|----------|-------------------------------------------------------|--------------------------------------------------------------------------------------------------------|
-| path | string | yes | - | |
-| bucket | string | yes | - | |
-| fs.s3a.endpoint | string | yes | - | |
-| fs.s3a.aws.credentials.provider | string | yes | com.amazonaws.auth.InstanceProfileCredentialsProvider | |
-| access_key | string | no | - | Only used when fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider |
-| access_secret | string | no | - | Only used when fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider |
-| custom_filename | boolean | no | false | Whether you need custom the filename |
-| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true |
-| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true |
-| file_format_type | string | no | "csv" | |
-| field_delimiter | string | no | '\001' | Only used when file_format_type is text |
-| row_delimiter | string | no | "\n" | Only used when file_format_type is text |
-| have_partition | boolean | no | false | Whether you need processing partitions. |
-| partition_by | array | no | - | Only used then have_partition is true |
-| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true |
-| is_partition_field_write_in_file | boolean | no | false | Only used then have_partition is true |
-| sink_columns | array | no | | When this parameter is empty, all fields are sink columns |
-| is_enable_transaction | boolean | no | true | |
-| batch_size | int | no | 1000000 | |
-| compress_codec | string | no | none | |
-| common-options | object | no | - | |
-| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. |
-| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. |
-
-### path [string]
-
-The target dir path is required.
-
-### bucket [string]
-
-The bucket address of s3 file system, for example: `s3n://seatunnel-test`, if you use `s3a` protocol, this parameter should be `s3a://seatunnel-test`.
-
-### fs.s3a.endpoint [string]
-
-fs s3a endpoint
-
-### fs.s3a.aws.credentials.provider [string]
-
-The way to authenticate s3a. We only support `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` now.
-
-More information about the credential provider you can see [Hadoop AWS Document](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Simple_name.2Fsecret_credentials_with_SimpleAWSCredentialsProvider.2A)
-
-### access_key [string]
-
-The access key of s3 file system. If this parameter is not set, please confirm that the credential provider chain can be authenticated correctly, you could check this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html)
+## Description
-### access_secret [string]
+Output data to aws s3 file system.
-The access secret of s3 file system. If this parameter is not set, please confirm that the credential provider chain can be authenticated correctly, you could check this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html)
+## Supported DataSource Info
+
+| Datasource | Supported Versions |
+|------------|--------------------|
+| S3 | current |
+
+## Database Dependency
+
+> If you use spark/flink, in order to use this connector, you must ensure your spark/flink cluster has already integrated hadoop. The tested hadoop version is 2.x.
+>
+> If you use SeaTunnel Engine, it automatically integrates the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under `${SEATUNNEL_HOME}/lib` to confirm this.
+> To use this connector you need to put `hadoop-aws-3.1.4.jar` and `aws-java-sdk-bundle-1.11.271.jar` in the `${SEATUNNEL_HOME}/lib` dir.
+
+## Data Type Mapping
+
+If you write to the `csv` or `text` file type, all columns will be written as strings.
+
+### Orc File Type
+
+| SeaTunnel Data type | Orc Data type |
+|----------------------|-----------------------|
+| STRING | STRING |
+| BOOLEAN | BOOLEAN |
+| TINYINT | BYTE |
+| SMALLINT | SHORT |
+| INT | INT |
+| BIGINT | LONG |
+| FLOAT | FLOAT |
+| FLOAT | FLOAT |
+| DOUBLE | DOUBLE |
+| DECIMAL | DECIMAL |
+| BYTES | BINARY |
+| DATE | DATE |
+| TIME TIMESTAMP | TIMESTAMP |
+| ROW | STRUCT |
+| NULL | UNSUPPORTED DATA TYPE |
+| ARRAY | LIST |
+| Map | Map |
+
+### Parquet File Type
+
+| SeaTunnel Data type | Parquet Data type |
+|----------------------|-----------------------|
+| STRING | STRING |
+| BOOLEAN | BOOLEAN |
+| TINYINT | INT_8 |
+| SMALLINT | INT_16 |
+| INT | INT32 |
+| BIGINT | INT64 |
+| FLOAT | FLOAT |
+| FLOAT | FLOAT |
+| DOUBLE | DOUBLE |
+| DECIMAL | DECIMAL |
+| BYTES | BINARY |
+| DATE | DATE |
+| TIME TIMESTAMP | TIMESTAMP_MILLIS |
+| ROW | GroupType |
+| NULL | UNSUPPORTED DATA TYPE |
+| ARRAY | LIST |
+| Map | Map |
+
+## Sink Options
+
+| name | type | required | default value | Description |
+|----------------------------------|---------|----------|-------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| path | string | yes | - | |
+| bucket | string | yes | - | |
+| fs.s3a.endpoint | string | yes | - | |
+| fs.s3a.aws.credentials.provider | string | yes | com.amazonaws.auth.InstanceProfileCredentialsProvider | The way to authenticate s3a. We only support `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` now. |
+| access_key | string | no | - | Only used when fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider |
+| access_secret | string | no | - | Only used when fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider |
+| custom_filename | boolean | no | false | Whether you need custom the filename |
+| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true |
+| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true |
+| file_format_type | string | no | "csv" | |
+| field_delimiter | string | no | '\001' | Only used when file_format_type is text |
+| row_delimiter | string | no | "\n" | Only used when file_format_type is text |
+| have_partition | boolean | no | false | Whether you need processing partitions. |
+| partition_by | array | no | - | Only used when have_partition is true |
+| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used when have_partition is true |
+| is_partition_field_write_in_file | boolean | no | false | Only used when have_partition is true |
+| sink_columns | array | no | | When this parameter is empty, all fields are sink columns |
+| is_enable_transaction | boolean | no | true | |
+| batch_size | int | no | 1000000 | |
+| compress_codec | string | no | none | |
+| common-options | object | no | - | |
+| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. |
+| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. |
+| hadoop_s3_properties | map | no | | If you need to add other options, you can add them here and refer to this [link](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) |
+
### hadoop_s3_properties [map]
@@ -208,6 +242,83 @@ Writer the sheet of the workbook
## Example
+### Simple:
+
+> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to S3File Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). A file will be created in the final target s3 dir and all of the data will be written into it.
+> Before running this job, you need to create the s3 path: /seatunnel/text. If you have not yet installed and deployed SeaTunnel, follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel, and then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job.
+
+```
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ # This is a example source plugin **only for test and demonstrate the feature source plugin**
+ FakeSource {
+ parallelism = 1
+ result_table_name = "fake"
+ row.num = 16
+ schema = {
+ fields {
+ c_map = "map<string, array<int>>"
+ c_array = "array<int>"
+ name = string
+ c_boolean = boolean
+ age = tinyint
+ c_smallint = smallint
+ c_int = int
+ c_bigint = bigint
+ c_float = float
+ c_double = double
+ c_decimal = "decimal(16, 1)"
+ c_null = "null"
+ c_bytes = bytes
+ c_date = date
+ c_timestamp = timestamp
+ }
+ }
+ }
+ # If you would like to get more information about how to configure seatunnel and see full list of source plugins,
+ # please go to https://seatunnel.apache.org/docs/category/source-v2
+}
+
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/category/transform-v2
+}
+
+sink {
+ S3File {
+ bucket = "s3a://seatunnel-test"
+ tmp_path = "/tmp/seatunnel"
+ path="/seatunnel/text"
+ fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn"
+ fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider"
+ file_format_type = "text"
+ field_delimiter = "\t"
+ row_delimiter = "\n"
+ have_partition = true
+ partition_by = ["age"]
+ partition_dir_expression = "${k0}=${v0}"
+ is_partition_field_write_in_file = true
+ custom_filename = true
+ file_name_expression = "${transactionId}_${now}"
+ filename_time_format = "yyyy.MM.dd"
+ sink_columns = ["name","age"]
+ is_enable_transaction=true
+ hadoop_s3_properties {
+ "fs.s3a.buffer.dir" = "/data/st_test/s3a"
+ "fs.s3a.fast.upload.buffer" = "disk"
+ }
+ }
+ # If you would like to get more information about how to configure seatunnel and see full list of sink plugins,
+ # please go to https://seatunnel.apache.org/docs/category/sink-v2
+}
+```
+
For text file format with `have_partition` and `custom_filename` and `sink_columns` and `com.amazonaws.auth.InstanceProfileCredentialsProvider`
```hocon
diff --git a/docs/en/connector-v2/sink/SelectDB-Cloud.md b/docs/en/connector-v2/sink/SelectDB-Cloud.md
index 24d22d5a2d04..6ad2997903bd 100644
--- a/docs/en/connector-v2/sink/SelectDB-Cloud.md
+++ b/docs/en/connector-v2/sink/SelectDB-Cloud.md
@@ -2,139 +2,169 @@
> SelectDB Cloud sink connector
-## Description
+## Support These Engines
-Used to send data to SelectDB Cloud. Both support streaming and batch mode.
-The internal implementation of SelectDB Cloud sink connector upload after batch caching and commit the CopyInto sql to load data into the table.
+> Spark
+> Flink
+> SeaTunnel Zeta
-:::tip
-
-Version Supported
-
-* supported `SelectDB Cloud version is >= 2.2.x`
-
-:::
-
-## Key features
+## Key Features
- [x] [exactly-once](../../concept/connector-v2-features.md)
- [x] [cdc](../../concept/connector-v2-features.md)
-## Options
-
-| name | type | required | default value |
-|--------------------|--------|----------|------------------------|
-| load-url | string | yes | - |
-| jdbc-url | string | yes | - |
-| cluster-name | string | yes | - |
-| username | string | yes | - |
-| password | string | yes | - |
-| table.identifier | string | yes | - |
-| sink.enable-delete | bool | no | false |
-| selectdb.config | map | yes | - |
-| sink.buffer-size | int | no | 10 * 1024 * 1024 (1MB) |
-| sink.buffer-count | int | no | 10000 |
-| sink.max-retries | int | no | 3 |
-
-### load-url [string]
-
-`SelectDB Cloud` warehouse http address, the format is `warehouse_ip:http_port`
-
-### jdbc-url [string]
-
-`SelectDB Cloud` warehouse jdbc address, the format is `warehouse_ip:mysql_port`
-
-### cluster-name [string]
-
-`SelectDB Cloud` cluster name
-
-### username [string]
-
-`SelectDB Cloud` user username
-
-### password [string]
-
-`SelectDB Cloud` user password
-
-### table.identifier [string]
-
-The name of `SelectDB Cloud` table, the format is `database.table`
+## Description
-### sink.enable-delete [string]
+Used to send data to SelectDB Cloud. Supports both streaming and batch mode.
+Internally, the SelectDB Cloud sink connector caches data in batches, uploads it, and then commits the CopyInto SQL to load the data into the table.
-Whether to enable deletion. This option requires SelectDB Cloud table to enable batch delete function, and only supports Unique model.
+## Supported DataSource Info
-`ALTER TABLE example_db.my_table ENABLE FEATURE "BATCH_DELETE";`
+:::tip
-### selectdb.config [map]
+Version Supported
-Write property configuration
+* supported `SelectDB Cloud version is >= 2.2.x`
-CSV Write:
+:::
-```
-selectdb.config {
- file.type="csv"
- file.column_separator=","
- file.line_delimiter="\n"
+## Sink Options
+
+| Name | Type | Required | Default | Description |
+|--------------------|--------|----------|------------------------|-------------------------------------------------------------------------------------------------------------------------------------------|
+| load-url | String | Yes | - | `SelectDB Cloud` warehouse http address, the format is `warehouse_ip:http_port` |
+| jdbc-url | String | Yes | - | `SelectDB Cloud` warehouse jdbc address, the format is `warehouse_ip:mysql_port` |
+| cluster-name | String | Yes | - | `SelectDB Cloud` cluster name |
+| username | String | Yes | - | `SelectDB Cloud` user username |
+| password | String | Yes | - | `SelectDB Cloud` user password |
+| table.identifier | String | Yes | - | The name of `SelectDB Cloud` table, the format is `database.table` |
+| sink.enable-delete | bool | No | false | Whether to enable deletion. This option requires SelectDB Cloud table to enable batch delete function, and only supports Unique model. |
+| sink.max-retries | int | No | 3 | the max retry times if writing records to database failed |
+| sink.buffer-size | int | No | 10 * 1024 * 1024 (10MB) | the buffer size to cache data for stream load. |
+| sink.buffer-count | int | No | 10000 | the buffer count to cache data for stream load. |
+| selectdb.config | map | Yes | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generating SQL, and to specify the supported import formats. |
+
+## Data Type Mapping
+
+| SelectDB Cloud Data type | SeaTunnel Data type |
+|--------------------------|-----------------------------------------|
+| BOOLEAN | BOOLEAN |
+| TINYINT | TINYINT |
+| SMALLINT | SMALLINT TINYINT |
+| INT | INT SMALLINT TINYINT |
+| BIGINT | BIGINT INT SMALLINT TINYINT |
+| LARGEINT | BIGINT INT SMALLINT TINYINT |
+| FLOAT | FLOAT |
+| DOUBLE | DOUBLE FLOAT |
+| DECIMAL | DECIMAL DOUBLE FLOAT |
+| DATE | DATE |
+| DATETIME | TIMESTAMP |
+| CHAR | STRING |
+| VARCHAR | STRING |
+| STRING | STRING |
+| ARRAY | ARRAY |
+| MAP | MAP |
+| JSON | STRING |
+| HLL | Not supported yet |
+| BITMAP | Not supported yet |
+| QUANTILE_STATE | Not supported yet |
+| STRUCT | Not supported yet |
+
+#### Supported import data formats
+
+The supported formats include CSV and JSON
+
+## Task Example
+
+### Simple:
+
+> The following example describes writing multiple data types to SelectDBCloud, and users need to create corresponding tables downstream
+
+```hocon
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+ checkpoint.interval = 10000
}
-```
-JSON Write:
+source {
+ FakeSource {
+ row.num = 10
+ map.size = 10
+ array.size = 10
+ bytes.length = 10
+ string.length = 10
+ schema = {
+ fields {
+ c_map = "map<string, array<int>>"
+ c_array = "array<int>"
+ c_string = string
+ c_boolean = boolean
+ c_tinyint = tinyint
+ c_smallint = smallint
+ c_int = int
+ c_bigint = bigint
+ c_float = float
+ c_double = double
+ c_decimal = "decimal(16, 1)"
+ c_null = "null"
+ c_bytes = bytes
+ c_date = date
+ c_timestamp = timestamp
+ }
+ }
+ }
+}
-```
-selectdb.config {
- file.type="json"
+sink {
+ SelectDBCloud {
+ load-url = "warehouse_ip:http_port"
+ jdbc-url = "warehouse_ip:mysql_port"
+ cluster-name = "Cluster"
+ table.identifier = "test.test"
+ username = "admin"
+ password = "******"
+ selectdb.config {
+ file.type = "json"
+ }
+ }
}
```
-### sink.buffer-size [string]
-
-The maximum capacity of the cache, in bytes, that is flushed to the object storage. The default is 10MB. it is not recommended to modify it.
-
-### sink.buffer-count [string]
-
-Maximum number of entries flushed to the object store. The default value is 10000. it is not recommended to modify.
-
-### sink.max-retries [string]
-
-The maximum number of retries in the Commit phase, the default is 3.
-
-## Example
-
-Use JSON format to import data
+### Use JSON format to import data
```
sink {
SelectDBCloud {
- load-url="warehouse_ip:http_port"
- jdbc-url="warehouse_ip:mysql_port"
- cluster-name="Cluster"
- table.identifier="test.test"
- username="admin"
- password="******"
+ load-url = "warehouse_ip:http_port"
+ jdbc-url = "warehouse_ip:mysql_port"
+ cluster-name = "Cluster"
+ table.identifier = "test.test"
+ username = "admin"
+ password = "******"
selectdb.config {
- file.type="json"
+ file.type = "json"
}
}
}
+
```
-Use CSV format to import data
+### Use CSV format to import data
```
sink {
SelectDBCloud {
- load-url="warehouse_ip:http_port"
- jdbc-url="warehouse_ip:mysql_port"
- cluster-name="Cluster"
- table.identifier="test.test"
- username="admin"
- password="******"
+ load-url = "warehouse_ip:http_port"
+ jdbc-url = "warehouse_ip:mysql_port"
+ cluster-name = "Cluster"
+ table.identifier = "test.test"
+ username = "admin"
+ password = "******"
selectdb.config {
- file.type="csv"
- file.column_separator=","
- file.line_delimiter="\n"
+ file.type = "csv"
+ file.column_separator = ","
+ file.line_delimiter = "\n"
}
}
}
diff --git a/docs/en/connector-v2/sink/Slack.md b/docs/en/connector-v2/sink/Slack.md
index 27ba01c32b0f..7ed87d2022c3 100644
--- a/docs/en/connector-v2/sink/Slack.md
+++ b/docs/en/connector-v2/sink/Slack.md
@@ -2,42 +2,39 @@
> Slack sink connector
-## Description
-
-Used to send data to Slack Channel. Both support streaming and batch mode.
+## Support These Engines
-> For example, if the data from upstream is [`age: 12, name: huan`], the content send to socket server is the following: `{"name":"huan","age":17}`
+> Spark
+> Flink
+> SeaTunnel Zeta
## Key features
- [ ] [exactly-once](../../concept/connector-v2-features.md)
+- [ ] [cdc](../../concept/connector-v2-features.md)
-## Options
-
-| name | type | required | default value |
-|----------------|--------|----------|---------------|
-| webhooks_url | String | Yes | - |
-| oauth_token | String | Yes | - |
-| slack_channel | String | Yes | - |
-| common-options | | no | - |
-
-### webhooks_url [string]
+## Description
-Slack webhook url
+Used to send data to Slack Channel. Both support streaming and batch mode.
-### oauth_token [string]
+> For example, if the data from upstream is [`age: 12, name: huan`], the content sent to the Slack channel is the following: `{"name":"huan","age":12}`
-Slack oauth token used for the actual authentication
+## Data Type Mapping
-### slack_channel [string]
+All data types are mapped to string.
-slack channel for data write
+## Options
-### common options
+| Name | Type | Required | Default | Description |
+|----------------|--------|----------|---------|-----------------------------------------------------------------------------------------------------|
+| webhooks_url | String | Yes | - | Slack webhook url |
+| oauth_token | String | Yes | - | Slack oauth token used for the actual authentication |
+| slack_channel | String | Yes | - | slack channel for data write |
+| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details |
-Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details
+## Task Example
-## Example
+### Simple:
```hocon
sink {
diff --git a/docs/en/connector-v2/sink/Snowflake.md b/docs/en/connector-v2/sink/Snowflake.md
index 21bfb175ef7e..1dfff5e09c74 100644
--- a/docs/en/connector-v2/sink/Snowflake.md
+++ b/docs/en/connector-v2/sink/Snowflake.md
@@ -61,8 +61,7 @@ Write data through jdbc. Support Batch mode and Streaming mode, support concurre
| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance |
| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. |
| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) |
-| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms` , the data will be flushed into the database |
-| batch_interval_ms | Int | No | 1000 | For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the database |
+| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval` , the data will be flushed into the database |
| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures |
| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect exactly-once semantics |
| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default |
diff --git a/docs/en/connector-v2/sink/StarRocks.md b/docs/en/connector-v2/sink/StarRocks.md
index 7c6491fb591e..38893a429ef7 100644
--- a/docs/en/connector-v2/sink/StarRocks.md
+++ b/docs/en/connector-v2/sink/StarRocks.md
@@ -2,94 +2,43 @@
> StarRocks sink connector
-## Description
+## Support These Engines
-Used to send data to StarRocks. Both support streaming and batch mode.
-The internal implementation of StarRocks sink connector is cached and imported by stream load in batches.
+> Spark
+> Flink
+> SeaTunnel Zeta
-## Key features
+## Key Features
- [ ] [exactly-once](../../concept/connector-v2-features.md)
- [x] [cdc](../../concept/connector-v2-features.md)
-## Options
-
-| name | type | required | default value |
-|-----------------------------|---------|----------|-----------------|
-| nodeUrls | list | yes | - |
-| base-url | string | yes | - |
-| username | string | yes | - |
-| password | string | yes | - |
-| database | string | yes | - |
-| table | string | no | - |
-| labelPrefix | string | no | - |
-| batch_max_rows | long | no | 1024 |
-| batch_max_bytes | int | no | 5 * 1024 * 1024 |
-| batch_interval_ms | int | no | - |
-| max_retries | int | no | - |
-| retry_backoff_multiplier_ms | int | no | - |
-| max_retry_backoff_ms | int | no | - |
-| enable_upsert_delete | boolean | no | false |
-| save_mode_create_template | string | no | see below |
-| starrocks.config | map | no | - |
-
-### nodeUrls [list]
-
-`StarRocks` cluster address, the format is `["fe_ip:fe_http_port", ...]`
-
-### base-url [string]
-
-The JDBC URL like `jdbc:mysql://localhost:9030/` or `jdbc:mysql://localhost:9030` or `jdbc:mysql://localhost:9030/db`
-
-### username [string]
-
-`StarRocks` user username
-
-### password [string]
-
-`StarRocks` user password
-
-### database [string]
-
-The name of StarRocks database
-
-### table [string]
-
-The name of StarRocks table, If not set, the table name will be the name of the upstream table
-
-### labelPrefix [string]
-
-The prefix of StarRocks stream load label
-
-### batch_max_rows [long]
-
-For batch writing, when the number of buffers reaches the number of `batch_max_rows` or the byte size of `batch_max_bytes` or the time reaches `batch_interval_ms`, the data will be flushed into the StarRocks
-
-### batch_max_bytes [int]
-
-For batch writing, when the number of buffers reaches the number of `batch_max_rows` or the byte size of `batch_max_bytes` or the time reaches `batch_interval_ms`, the data will be flushed into the StarRocks
-
-### batch_interval_ms [int]
-
-For batch writing, when the number of buffers reaches the number of `batch_max_rows` or the byte size of `batch_max_bytes` or the time reaches `batch_interval_ms`, the data will be flushed into the StarRocks
-
-### max_retries [int]
-
-The number of retries to flush failed
-
-### retry_backoff_multiplier_ms [int]
-
-Using as a multiplier for generating the next delay for backoff
-
-### max_retry_backoff_ms [int]
-
-The amount of time to wait before attempting to retry a request to `StarRocks`
-
-### enable_upsert_delete [boolean]
+## Description
-Whether to enable upsert/delete, only supports PrimaryKey model.
+Used to send data to StarRocks. Supports both streaming and batch mode.
+Internally, the StarRocks sink connector caches data and imports it in batches via stream load.
-### save_mode_create_template [string]
+## Sink Options
+
+| Name | Type | Required | Default | Description |
+|-----------------------------|---------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| nodeUrls | list | yes | - | `StarRocks` cluster address, the format is `["fe_ip:fe_http_port", ...]` |
+| base-url | string | yes | - | The JDBC URL like `jdbc:mysql://localhost:9030/` or `jdbc:mysql://localhost:9030` or `jdbc:mysql://localhost:9030/db` |
+| username | string | yes | - | `StarRocks` user username |
+| password | string | yes | - | `StarRocks` user password |
+| database | string | yes | - | The name of StarRocks database |
+| table | string | no | - | The name of StarRocks table, If not set, the table name will be the name of the upstream table |
+| labelPrefix | string | no | - | The prefix of StarRocks stream load label |
+| batch_max_rows | long | no | 1024 | For batch writing, when the number of buffered rows reaches `batch_max_rows`, or the buffered byte size reaches `batch_max_bytes`, or the time reaches `checkpoint.interval`, the data will be flushed into StarRocks |
+| batch_max_bytes | int | no | 5 * 1024 * 1024 | For batch writing, when the number of buffered rows reaches `batch_max_rows`, or the buffered byte size reaches `batch_max_bytes`, or the time reaches `checkpoint.interval`, the data will be flushed into StarRocks |
+| max_retries | int | no | - | The number of retries to flush failed |
+| retry_backoff_multiplier_ms | int | no | - | Using as a multiplier for generating the next delay for backoff |
+| max_retry_backoff_ms | int | no | - | The amount of time to wait before attempting to retry a request to `StarRocks` |
+| enable_upsert_delete | boolean | no | false | Whether to enable upsert/delete, only supports PrimaryKey model. |
+| save_mode_create_template | string | no | see below | see below |
+| starrocks.config | map | no | - | The parameter of the stream load `data_desc` |
+
+### save_mode_create_template
We use templates to automatically create starrocks tables,
which will create corresponding table creation statements based on the type of upstream data and schema type,
@@ -131,19 +80,72 @@ You can use the following placeholders
description of StarRocks
- rowtype_primary_key: Used to get the primary key in the upstream schema (maybe a list)
-### starrocks.config [map]
-
-The parameter of the stream load `data_desc`
+## Data Type Mapping
+
+| StarRocks Data type | SeaTunnel Data type |
+|---------------------|---------------------|
+| BOOLEAN | BOOLEAN |
+| TINYINT | TINYINT |
+| SMALLINT | SMALLINT |
+| INT | INT |
+| BIGINT | BIGINT |
+| FLOAT | FLOAT |
+| DOUBLE | DOUBLE |
+| DECIMAL | DECIMAL |
+| DATE | STRING |
+| TIME | STRING |
+| DATETIME | STRING |
+| STRING | STRING |
+| ARRAY | STRING |
+| MAP | STRING |
+| BYTES | STRING |
#### Supported import data formats
-The supported formats include CSV and JSON. Default value: JSON
+The supported formats include CSV and JSON
-## Example
+## Task Example
-Use JSON format to import data
+### Simple:
+
+> The following example describes writing multiple data types to StarRocks, and users need to create corresponding tables downstream
```hocon
+env {
+ parallelism = 1
+ job.mode = "BATCH"
+ checkpoint.interval = 10000
+}
+
+source {
+ FakeSource {
+ row.num = 10
+ map.size = 10
+ array.size = 10
+ bytes.length = 10
+ string.length = 10
+ schema = {
+ fields {
+ c_map = "map<string, array<int>>"
+ c_array = "array<int>"
+ c_string = string
+ c_boolean = boolean
+ c_tinyint = tinyint
+ c_smallint = smallint
+ c_int = int
+ c_bigint = bigint
+ c_float = float
+ c_double = double
+ c_decimal = "decimal(16, 1)"
+ c_null = "null"
+ c_bytes = bytes
+ c_date = date
+ c_timestamp = timestamp
+ }
+ }
+ }
+}
+
sink {
StarRocks {
nodeUrls = ["e2e_starRocksdb:8030"]
@@ -158,12 +160,29 @@ sink {
}
}
}
-
```
-Use CSV format to import data
+### Support write cdc changelog event(INSERT/UPDATE/DELETE)
```hocon
+sink {
+ StarRocks {
+ nodeUrls = ["e2e_starRocksdb:8030"]
+ username = root
+ password = ""
+ database = "test"
+ table = "e2e_table_sink"
+ ...
+
+ // Support upsert/delete event synchronization (enable_upsert_delete=true), only supports PrimaryKey model.
+ enable_upsert_delete = true
+ }
+}
+```
+
+### Use JSON format to import data
+
+```
sink {
StarRocks {
nodeUrls = ["e2e_starRocksdb:8030"]
@@ -173,17 +192,17 @@ sink {
table = "e2e_table_sink"
batch_max_rows = 10
starrocks.config = {
- format = "CSV"
- column_separator = "\\x01"
- row_delimiter = "\\x02"
+ format = "JSON"
+ strip_outer_array = true
}
}
}
+
```
-Support write cdc changelog event(INSERT/UPDATE/DELETE)
+### Use CSV format to import data
-```hocon
+```
sink {
StarRocks {
nodeUrls = ["e2e_starRocksdb:8030"]
@@ -191,10 +210,12 @@ sink {
password = ""
database = "test"
table = "e2e_table_sink"
- ...
-
- // Support upsert/delete event synchronization (enable_upsert_delete=true), only supports PrimaryKey model.
- enable_upsert_delete = true
+ batch_max_rows = 10
+ starrocks.config = {
+ format = "CSV"
+ column_separator = "\\x01"
+ row_delimiter = "\\x02"
+ }
}
}
```
diff --git a/docs/en/connector-v2/sink/Tablestore.md b/docs/en/connector-v2/sink/Tablestore.md
index ed59895c65f1..8f161ad25f6e 100644
--- a/docs/en/connector-v2/sink/Tablestore.md
+++ b/docs/en/connector-v2/sink/Tablestore.md
@@ -21,7 +21,6 @@ Write data to `Tablestore`
| table | string | yes | - |
| primary_keys | array | yes | - |
| batch_size | string | no | 25 |
-| batch_interval_ms | string | no | 1000 |
| common-options | config | no | - |
### end_point [string]
diff --git a/docs/en/connector-v2/sink/Vertica.md b/docs/en/connector-v2/sink/Vertica.md
index 0db8571d55f2..9a6244076828 100644
--- a/docs/en/connector-v2/sink/Vertica.md
+++ b/docs/en/connector-v2/sink/Vertica.md
@@ -67,8 +67,7 @@ semantics (using XA transaction guarantee).
| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance |
| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. |
| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) |
-| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms` , the data will be flushed into the database |
-| batch_interval_ms | Int | No | 1000 | For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the database |
+| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval` , the data will be flushed into the database |
| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to set `xa_data_source_class_name`. |
| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to |
| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, vertical is `com.vertical.cj.jdbc.VerticalXADataSource`, and please refer to appendix for other data sources |
diff --git a/docs/en/connector-v2/source/Clickhouse.md b/docs/en/connector-v2/source/Clickhouse.md
index 7596bf72a8f0..d70a8f0e33fb 100644
--- a/docs/en/connector-v2/source/Clickhouse.md
+++ b/docs/en/connector-v2/source/Clickhouse.md
@@ -66,7 +66,7 @@ The following example demonstrates how to create a data synchronization job that
```bash
# Set the basic configuration of the task to be performed
env {
- execution.parallelism = 1
+ execution.parallelism = 10
job.mode = "BATCH"
}
diff --git a/docs/en/connector-v2/source/DB2.md b/docs/en/connector-v2/source/DB2.md
index 7ea91b7165c7..c9eb6a578b6a 100644
--- a/docs/en/connector-v2/source/DB2.md
+++ b/docs/en/connector-v2/source/DB2.md
@@ -54,20 +54,20 @@ Read external data source data through JDBC.
## Source Options
-| Name | Type | Required | Default | Description |
-|------------------------------|--------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:db2://127.0.0.1:50000/dbname |
-| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, if you use db2 the value is `com.ibm.db2.jdbc.app.DB2Driver`. |
-| user | String | No | - | Connection instance user name |
-| password | String | No | - | Connection instance password |
-| query | String | Yes | - | Query statement |
-| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete |
-| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. |
-| partition_lower_bound | Long | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. |
-| partition_upper_bound | Long | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. |
-| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism |
-| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure the row fetch size used in the query toimprove performance by reducing the number database hits required to satisfy the selection criteria. Zero means use jdbc default value. |
-| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
+| Name | Type | Required | Default | Description |
+|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:db2://127.0.0.1:50000/dbname |
+| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, if you use db2 the value is `com.ibm.db2.jdbc.app.DB2Driver`. |
+| user | String | No | - | Connection instance user name |
+| password | String | No | - | Connection instance password |
+| query | String | Yes | - | Query statement |
+| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete |
+| partition_column | String | No | - | The column name used for parallel partitioning. Only numeric type columns are supported (only a numeric type primary key), and only one column can be configured. |
+| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. |
+| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. |
+| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism |
+| fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure the row fetch size used in the query to improve performance by reducing the number of database hits required to satisfy the selection criteria. Zero means use jdbc default value. |
+| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
### Tips
diff --git a/docs/en/connector-v2/source/FtpFile.md b/docs/en/connector-v2/source/FtpFile.md
index c692a7483a6d..c9fb8e70cdb8 100644
--- a/docs/en/connector-v2/source/FtpFile.md
+++ b/docs/en/connector-v2/source/FtpFile.md
@@ -58,9 +58,9 @@ The target ftp host is required
The target ftp port is required
-### username [string]
+### user [string]
-The target ftp username is required
+The target ftp user name is required
### password [string]
diff --git a/docs/en/connector-v2/source/Github.md b/docs/en/connector-v2/source/Github.md
index 5cc6beea76b9..900a207e6971 100644
--- a/docs/en/connector-v2/source/Github.md
+++ b/docs/en/connector-v2/source/Github.md
@@ -28,7 +28,7 @@ Used to read data from Github.
| body | String | No | - |
| json_field | Config | No | - |
| content_json | String | No | - |
-| poll_interval_ms | int | No | - |
+| poll_interval_millis | int | No | - |
| retry | int | No | - |
| retry_backoff_multiplier_ms | int | No | 100 |
| retry_backoff_max_ms | int | No | 10000 |
@@ -55,7 +55,7 @@ http params
http body
-### poll_interval_ms [int]
+### poll_interval_millis [int]
request http api interval(millis) in stream mode
diff --git a/docs/en/connector-v2/source/Gitlab.md b/docs/en/connector-v2/source/Gitlab.md
index b2c17c9f2465..ff3b6bc6423b 100644
--- a/docs/en/connector-v2/source/Gitlab.md
+++ b/docs/en/connector-v2/source/Gitlab.md
@@ -28,7 +28,7 @@ Used to read data from Gitlab.
| body | String | No | - |
| json_field | Config | No | - |
| content_json | String | No | - |
-| poll_interval_ms | int | No | - |
+| poll_interval_millis | int | No | - |
| retry | int | No | - |
| retry_backoff_multiplier_ms | int | No | 100 |
| retry_backoff_max_ms | int | No | 10000 |
@@ -55,7 +55,7 @@ http params
http body
-### poll_interval_ms [int]
+### poll_interval_millis [int]
request http api interval(millis) in stream mode
diff --git a/docs/en/connector-v2/source/HdfsFile.md b/docs/en/connector-v2/source/HdfsFile.md
index f479e40a2bc2..88c1e35f87e3 100644
--- a/docs/en/connector-v2/source/HdfsFile.md
+++ b/docs/en/connector-v2/source/HdfsFile.md
@@ -1,20 +1,14 @@
# HdfsFile
-> Hdfs file source connector
+> Hdfs File Source Connector
-## Description
-
-Read data from hdfs file system.
-
-:::tip
+## Support These Engines
-If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x.
+> Spark
+> Flink
+> SeaTunnel Zeta
-If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this.
-
-:::
-
-## Key features
+## Key Features
- [x] [batch](../../concept/connector-v2-features.md)
- [ ] [stream](../../concept/connector-v2-features.md)
@@ -33,238 +27,57 @@ Read all the data in a split in a pollNext call. What splits are read will be sa
- [x] json
- [x] excel
-## Options
-
-| name | type | required | default value |
-|---------------------------|---------|----------|---------------------|
-| path | string | yes | - |
-| file_format_type | string | yes | - |
-| fs.defaultFS | string | yes | - |
-| read_columns | list | yes | - |
-| hdfs_site_path | string | no | - |
-| delimiter | string | no | \001 |
-| parse_partition_from_path | boolean | no | true |
-| date_format | string | no | yyyy-MM-dd |
-| datetime_format | string | no | yyyy-MM-dd HH:mm:ss |
-| time_format | string | no | HH:mm:ss |
-| kerberos_principal | string | no | - |
-| kerberos_keytab_path | string | no | - |
-| skip_header_row_number | long | no | 0 |
-| schema | config | no | - |
-| common-options | | no | - |
-| sheet_name | string | no | - |
-| file_filter_pattern | string | no | - |
-
-### path [string]
-
-The source file path.
-
-### delimiter [string]
-
-Field delimiter, used to tell connector how to slice and dice fields when reading text files
-
-default `\001`, the same as hive's default delimiter
-
-### parse_partition_from_path [boolean]
-
-Control whether parse the partition keys and values from file path
-
-For example if you read a file from path `hdfs://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`
-
-Every record data from file will be added these two fields:
-
-| name | age |
-|---------------|-----|
-| tyrantlucifer | 26 |
-
-Tips: **Do not define partition fields in schema option**
-
-### date_format [string]
-
-Date type format, used to tell connector how to convert string to date, supported as the following formats:
-
-`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`
-
-default `yyyy-MM-dd`
-
-### datetime_format [string]
-
-Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:
-
-`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss`
-
-default `yyyy-MM-dd HH:mm:ss`
-
-### time_format [string]
-
-Time type format, used to tell connector how to convert string to time, supported as the following formats:
-
-`HH:mm:ss` `HH:mm:ss.SSS`
-
-default `HH:mm:ss`
-
-### skip_header_row_number [long]
-
-Skip the first few lines, but only for the txt and csv.
-
-For example, set like following:
-
-`skip_header_row_number = 2`
-
-then SeaTunnel will skip the first 2 lines from source files
-
-### file_format_type [string]
-
-File type, supported as the following file types:
-
-`text` `csv` `parquet` `orc` `json` `excel`
-
-If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want.
-
-For example:
-
-upstream data is the following:
-
-```json
-
-{"code": 200, "data": "get success", "success": true}
-
-```
-
-You can also save multiple pieces of data in one file and split them by newline:
-
-```json lines
-
-{"code": 200, "data": "get success", "success": true}
-{"code": 300, "data": "get failed", "success": false}
-
-```
-
-you should assign schema as the following:
-
-```hocon
-
-schema {
- fields {
- code = int
- data = string
- success = boolean
- }
-}
-
-```
-
-connector will generate data as the following:
-
-| code | data | success |
-|------|-------------|---------|
-| 200 | get success | true |
-
-If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically.
-
-If you assign file type to `text` `csv`, you can choose to specify the schema information or not.
+## Description
-For example, upstream data is the following:
+Read data from hdfs file system.
-```text
+## Supported DataSource Info
-tyrantlucifer#26#male
+| Datasource | Supported Versions |
+|------------|--------------------|
+| HdfsFile | hadoop 2.x and 3.x |
-```
+## Source Options
-If you do not assign data schema connector will treat the upstream data as the following:
+| Name | Type | Required | Default | Description |
+|---------------------------|---------|----------|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| path | string | yes | - | The source file path. |
+| file_format_type | string | yes | - | The following file types are supported: `text` `json` `csv` `orc` `parquet` `excel`. Please note that the final file name will end with the file_format's suffix; the suffix of the text file is `txt`. |
+| fs.defaultFS | string | yes | - | The hadoop cluster address that start with `hdfs://`, for example: `hdfs://hadoopcluster` |
+| read_columns | list | yes | - | The read column list of the data source, user can use it to implement field projection. The following file types support column projection: [text,json,csv,orc,parquet,excel]. Tips: If the user wants to use this feature when reading `text` `json` `csv` files, the schema option must be configured. |
+| hdfs_site_path | string | no | - | The path of `hdfs-site.xml`, used to load ha configuration of namenodes |
+| delimiter | string | no | \001 | Field delimiter, used to tell connector how to slice and dice fields when reading text files. default `\001`, the same as hive's default delimiter |
+| parse_partition_from_path | boolean | no | true | Controls whether to parse the partition keys and values from the file path. For example, if you read a file from path `hdfs://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`, every record read from the file will have these two fields added: [name:tyrantlucifer,age:26]. Tips: Do not define partition fields in schema option. |
+| date_format | string | no | yyyy-MM-dd | Date type format, used to tell connector how to convert string to date, supported as the following formats: `yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`. Default `yyyy-MM-dd`. |
+| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` .default `yyyy-MM-dd HH:mm:ss` |
+| time_format | string | no | HH:mm:ss | Time type format, used to tell connector how to convert string to time, supported as the following formats:`HH:mm:ss` `HH:mm:ss.SSS`.default `HH:mm:ss` |
+| kerberos_principal | string | no | - | The principal of kerberos |
+| kerberos_keytab_path | string | no | - | The keytab path of kerberos |
+| skip_header_row_number | long | no | 0 | Skip the first few lines, but only for the txt and csv. For example, set like following: `skip_header_row_number = 2`, then SeaTunnel will skip the first 2 lines from source files. |
+| schema | config | no | - | the schema fields of upstream data |
+| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. |
+| sheet_name | string | no | - | Read the sheet of the workbook. Only used when file_format_type is excel. |
-| content |
-|-----------------------|
-| tyrantlucifer#26#male |
+### Tips
-If you assign data schema, you should also assign the option `delimiter` too except CSV file type
+> If you use Spark/Flink, you must ensure that your Spark/Flink cluster is already integrated with Hadoop in order to use this connector. The tested hadoop version is 2.x. If you use SeaTunnel Engine, the hadoop jar is integrated automatically when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this.
-you should assign schema and delimiter as the following:
+## Task Example
-```hocon
+### Simple:
-delimiter = "#"
-schema {
- fields {
- name = string
- age = int
- gender = string
- }
-}
+> This example defines a SeaTunnel synchronization task that reads data from Hdfs and sends it to Hdfs.
```
-
-connector will generate data as the following:
-
-| name | age | gender |
-|---------------|-----|--------|
-| tyrantlucifer | 26 | male |
-
-### fs.defaultFS [string]
-
-Hdfs cluster address.
-
-### hdfs_site_path [string]
-
-The path of `hdfs-site.xml`, used to load ha configuration of namenodes
-
-### kerberos_principal [string]
-
-The principal of kerberos
-
-### kerberos_keytab_path [string]
-
-The keytab path of kerberos
-
-### schema [Config]
-
-#### fields [Config]
-
-the schema fields of upstream data
-
-### read_columns [list]
-
-The read column list of the data source, user can use it to implement field projection.
-
-The file type supported column projection as the following shown:
-
-- text
-- json
-- csv
-- orc
-- parquet
-- excel
-
-**Tips: If the user wants to use this feature when reading `text` `json` `csv` files, the schema option must be configured**
-
-### common options
-
-Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details.
-
-### sheet_name [string]
-
-Reader the sheet of the workbook,Only used when file_format_type is excel.
-
-### file_filter_pattern [string]
-
-Filter pattern, which used for filtering files.
-
-## Example
-
-```hocon
-
-HdfsFile {
- path = "/apps/hive/demo/student"
- file_format_type = "parquet"
- fs.defaultFS = "hdfs://namenode001"
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 1
+ job.mode = "BATCH"
}
-```
-
-```hocon
-
-HdfsFile {
+source {
+ HdfsFile {
schema {
fields {
name = string
@@ -274,24 +87,24 @@ HdfsFile {
path = "/apps/hive/demo/student"
type = "json"
fs.defaultFS = "hdfs://namenode001"
+ }
+ # If you would like to get more information about how to configure seatunnel and see full list of source plugins,
+ # please go to https://seatunnel.apache.org/docs/category/source-v2
}
-```
-
-## Changelog
-
-### 2.2.0-beta 2022-09-26
-
-- Add HDFS File Source Connector
-
-### 2.3.0-beta 2022-10-20
-
-- [BugFix] Fix the bug of incorrect path in windows environment ([2980](https://github.com/apache/seatunnel/pull/2980))
-- [Improve] Support extract partition from SeaTunnelRow fields ([3085](https://github.com/apache/seatunnel/pull/3085))
-- [Improve] Support parse field from file path ([2985](https://github.com/apache/seatunnel/pull/2985))
-
-### next version
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/category/transform-v2
+}
-- [Improve] Support skip header for csv and txt files ([3900](https://github.com/apache/seatunnel/pull/3840))
-- [Improve] Support kerberos authentication ([3840](https://github.com/apache/seatunnel/pull/3840))
+sink {
+ HdfsFile {
+ fs.defaultFS = "hdfs://hadoopcluster"
+ path = "/tmp/hive/warehouse/test2"
+ file_format = "orc"
+ }
+ # If you would like to get more information about how to configure seatunnel and see full list of sink plugins,
+ # please go to https://seatunnel.apache.org/docs/category/sink-v2
+}
+```
diff --git a/docs/en/connector-v2/source/Hive.md b/docs/en/connector-v2/source/Hive.md
index f9f35aaf733f..afa9893d5b2b 100644
--- a/docs/en/connector-v2/source/Hive.md
+++ b/docs/en/connector-v2/source/Hive.md
@@ -33,17 +33,18 @@ Read all the data in a split in a pollNext call. What splits are read will be sa
## Options
-| name | type | required | default value |
-|----------------------|--------|----------|---------------|
-| table_name | string | yes | - |
-| metastore_uri | string | yes | - |
-| kerberos_principal | string | no | - |
-| kerberos_keytab_path | string | no | - |
-| hdfs_site_path | string | no | - |
-| hive_site_path | string | no | - |
-| read_partitions | list | no | - |
-| read_columns | list | no | - |
-| common-options | | no | - |
+| name | type | required | default value |
+|-------------------------------|---------|----------|---------------|
+| table_name | string | yes | - |
+| metastore_uri | string | yes | - |
+| kerberos_principal | string | no | - |
+| kerberos_keytab_path | string | no | - |
+| hdfs_site_path | string | no | - |
+| hive_site_path | string | no | - |
+| read_partitions | list | no | - |
+| read_columns | list | no | - |
+| abort_drop_partition_metadata | boolean | no | true |
+| common-options | | no | - |
### table_name [string]
@@ -80,6 +81,10 @@ The keytab file path of kerberos authentication
The read column list of the data source, user can use it to implement field projection.
+### abort_drop_partition_metadata [boolean]
+
+Flag to decide whether to drop partition metadata from Hive Metastore during an abort operation. Note: this only affects the metadata in the metastore, the data in the partition will always be deleted (data generated during the synchronization process).
+
### common options
Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details
diff --git a/docs/en/connector-v2/source/Http.md b/docs/en/connector-v2/source/Http.md
index 291835b93ed6..f3e6a221bb03 100644
--- a/docs/en/connector-v2/source/Http.md
+++ b/docs/en/connector-v2/source/Http.md
@@ -52,9 +52,9 @@ They can be downloaded via install-plugin.sh or from the Maven central repositor
| format | String | No | json | The format of upstream data, now only support `json` `text`, default `json`. |
| method | String | No | get | Http request method, only supports GET, POST method. |
| headers | Map | No | - | Http headers. |
-| params | Map | No | - | Http params. |
-| body | String | No | - | Http body. |
-| poll_interval_ms | Int | No | - | Request http api interval(millis) in stream mode. |
+| params | Map | No | - | Http params. The program will automatically add the http header `application/x-www-form-urlencoded`. |
+| body | String | No | - | Http body. The program will automatically add the http header `application/json`; the body is a JSON body. |
+| poll_interval_millis | Int | No | - | Request http api interval(millis) in stream mode. |
| retry | Int | No | - | The max retry times if request http return to `IOException`. |
| retry_backoff_multiplier_ms | Int | No | 100 | The retry-backoff times(millis) multiplier if request http failed. |
| retry_backoff_max_ms | Int | No | 10000 | The maximum retry-backoff times(millis) if request http failed |
diff --git a/docs/en/connector-v2/source/Iceberg.md b/docs/en/connector-v2/source/Iceberg.md
index 6a42ee0ddd30..b6d3924b95f1 100644
--- a/docs/en/connector-v2/source/Iceberg.md
+++ b/docs/en/connector-v2/source/Iceberg.md
@@ -2,9 +2,15 @@
> Apache Iceberg source connector
-## Description
+## Support Iceberg Version
-Source connector for Apache Iceberg. It can support batch and stream mode.
+- 0.14.0
+
+## Support Those Engines
+
+> Spark
+> Flink
+> SeaTunnel Zeta
## Key features
@@ -22,126 +28,120 @@ Source connector for Apache Iceberg. It can support batch and stream mode.
- [x] hadoop(2.7.1 , 2.7.5 , 3.1.3)
- [x] hive(2.3.9 , 3.1.2)
-## Options
-
-| name | type | required | default value |
-|--------------------------|---------|----------|----------------------|
-| catalog_name | string | yes | - |
-| catalog_type | string | yes | - |
-| uri | string | no | - |
-| warehouse | string | yes | - |
-| namespace | string | yes | - |
-| table | string | yes | - |
-| schema | config | no | - |
-| case_sensitive | boolean | no | false |
-| start_snapshot_timestamp | long | no | - |
-| start_snapshot_id | long | no | - |
-| end_snapshot_id | long | no | - |
-| use_snapshot_id | long | no | - |
-| use_snapshot_timestamp | long | no | - |
-| stream_scan_strategy | enum | no | FROM_LATEST_SNAPSHOT |
-| common-options | | no | - |
-
-### catalog_name [string]
-
-User-specified catalog name.
-
-### catalog_type [string]
-
-The optional values are:
-- hive: The hive metastore catalog.
-- hadoop: The hadoop catalog.
-
-### uri [string]
-
-The Hive metastore’s thrift URI.
-
-### warehouse [string]
-
-The location to store metadata files and data files.
-
-### namespace [string]
-
-The iceberg database name in the backend catalog.
-
-### table [string]
-
-The iceberg table name in the backend catalog.
-
-### case_sensitive [boolean]
+## Description
-If data columns where selected via schema [config], controls whether the match to the schema will be done with case sensitivity.
+Source connector for Apache Iceberg. It can support batch and stream mode.
-### schema [config]
+## Supported DataSource Info
-#### fields [Config]
+| Datasource | Dependent | Maven |
+|------------|---------------------|---------------------------------------------------------------------------|
+| Iceberg | flink-shaded-hadoop | [Download](https://mvnrepository.com/search?q=flink-shaded-hadoop-) |
+| Iceberg | hive-exec | [Download](https://mvnrepository.com/artifact/org.apache.hive/hive-exec) |
+| Iceberg | libfb303 | [Download](https://mvnrepository.com/artifact/org.apache.thrift/libfb303) |
-Use projection to select data columns and columns order.
+## Database Dependency
-e.g.
+> In order to be compatible with different versions of Hadoop and Hive, the scope of hive-exec and flink-shaded-hadoop-2 in the project pom file are provided, so if you use the Flink engine, first you may need to add the following Jar packages to /lib directory, if you are using the Spark engine and integrated with Hadoop, then you do not need to add the following Jar packages.
```
-schema {
- fields {
- f2 = "boolean"
- f1 = "bigint"
- f3 = "int"
- f4 = "bigint"
- }
-}
+flink-shaded-hadoop-x-xxx.jar
+hive-exec-xxx.jar
+libfb303-xxx.jar
```
-### start_snapshot_id [long]
-
-Instructs this scan to look for changes starting from a particular snapshot (exclusive).
-
-### start_snapshot_timestamp [long]
-
-Instructs this scan to look for changes starting from the most recent snapshot for the table as of the timestamp. timestamp – the timestamp in millis since the Unix epoch
-
-### end_snapshot_id [long]
-
-Instructs this scan to look for changes up to a particular snapshot (inclusive).
-
-### use_snapshot_id [long]
-
-Instructs this scan to look for use the given snapshot ID.
-
-### use_snapshot_timestamp [long]
-
-Instructs this scan to look for use the most recent snapshot as of the given time in milliseconds. timestamp – the timestamp in millis since the Unix epoch
-
-### stream_scan_strategy [enum]
-
-Starting strategy for stream mode execution, Default to use `FROM_LATEST_SNAPSHOT` if don’t specify any value.
-The optional values are:
-- TABLE_SCAN_THEN_INCREMENTAL: Do a regular table scan then switch to the incremental mode.
-- FROM_LATEST_SNAPSHOT: Start incremental mode from the latest snapshot inclusive.
-- FROM_EARLIEST_SNAPSHOT: Start incremental mode from the earliest snapshot inclusive.
-- FROM_SNAPSHOT_ID: Start incremental mode from a snapshot with a specific id inclusive.
-- FROM_SNAPSHOT_TIMESTAMP: Start incremental mode from a snapshot with a specific timestamp inclusive.
-
-### common options
-
-Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details.
-
-## Example
-
-simple
+> Some versions of the hive-exec package do not have libfb303-xxx.jar, so you also need to manually import the Jar package.
+
+## Data Type Mapping
+
+| Iceberg Data type | SeaTunnel Data type |
+|-------------------|---------------------|
+| BOOLEAN | BOOLEAN |
+| INTEGER | INT |
+| LONG | BIGINT |
+| FLOAT | FLOAT |
+| DOUBLE | DOUBLE |
+| DATE | DATE |
+| TIME | TIME |
+| TIMESTAMP | TIMESTAMP |
+| STRING | STRING |
+| FIXED BINARY | BYTES |
+| DECIMAL | DECIMAL |
+| STRUCT | ROW |
+| LIST | ARRAY |
+| MAP | MAP |
+
+## Source Options
+
+| Name | Type | Required | Default | Description |
+|--------------------------|---------|----------|----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| catalog_name | string | yes | - | User-specified catalog name. |
+| catalog_type | string | yes | - | The optional values are: hive(The hive metastore catalog),hadoop(The hadoop catalog) |
+| uri | string | no | - | The Hive metastore’s thrift URI. |
+| warehouse | string | yes | - | The location to store metadata files and data files. |
+| namespace | string | yes | - | The iceberg database name in the backend catalog. |
+| table | string | yes | - | The iceberg table name in the backend catalog. |
+| schema | config | no | - | Use projection to select data columns and columns order. |
+| case_sensitive | boolean | no | false | If data columns were selected via schema [config], controls whether the match to the schema will be done with case sensitivity. |
+| start_snapshot_timestamp | long | no | - | Instructs this scan to look for changes starting from the most recent snapshot for the table as of the timestamp. timestamp – the timestamp in millis since the Unix epoch |
+| start_snapshot_id | long | no | - | Instructs this scan to look for changes starting from a particular snapshot (exclusive). |
+| end_snapshot_id | long | no | - | Instructs this scan to look for changes up to a particular snapshot (inclusive). |
+| use_snapshot_id | long | no | - | Instructs this scan to use the given snapshot ID. |
+| use_snapshot_timestamp | long | no | - | Instructs this scan to use the most recent snapshot as of the given time in milliseconds. timestamp – the timestamp in millis since the Unix epoch |
+| stream_scan_strategy | enum | no | FROM_LATEST_SNAPSHOT | Starting strategy for stream mode execution. Defaults to `FROM_LATEST_SNAPSHOT` if no value is specified. The optional values are: TABLE_SCAN_THEN_INCREMENTAL: Do a regular table scan then switch to the incremental mode. FROM_LATEST_SNAPSHOT: Start incremental mode from the latest snapshot inclusive. FROM_EARLIEST_SNAPSHOT: Start incremental mode from the earliest snapshot inclusive. FROM_SNAPSHOT_ID: Start incremental mode from a snapshot with a specific id inclusive. FROM_SNAPSHOT_TIMESTAMP: Start incremental mode from a snapshot with a specific timestamp inclusive. |
+| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. |
+
+## Task Example
+
+### Simple:
```hocon
+env {
+ execution.parallelism = 2
+ job.mode = "BATCH"
+}
+
source {
Iceberg {
+ schema {
+ fields {
+ f2 = "boolean"
+ f1 = "bigint"
+ f3 = "int"
+ f4 = "bigint"
+ f5 = "float"
+ f6 = "double"
+ f7 = "date"
+ f9 = "timestamp"
+ f10 = "timestamp"
+ f11 = "string"
+ f12 = "bytes"
+ f13 = "bytes"
+ f14 = "decimal(19,9)"
+ f15 = "array"
+ f16 = "map"
+ }
+ }
catalog_name = "seatunnel"
catalog_type = "hadoop"
- warehouse = "hdfs://your_cluster//tmp/seatunnel/iceberg/"
- namespace = "your_iceberg_database"
- table = "your_iceberg_table"
+ warehouse = "file:///tmp/seatunnel/iceberg/hadoop/"
+ namespace = "database1"
+ table = "source"
+ result_table_name = "iceberg"
+ }
+}
+
+transform {
+}
+
+sink {
+ Console {
+ source_table_name = "iceberg"
}
}
```
-Or
+### Hive Catalog:
```hocon
source {
@@ -156,7 +156,7 @@ source {
}
```
-column projection
+### Column Projection:
```hocon
source {
@@ -179,20 +179,6 @@ source {
}
```
-:::tip
-
-In order to be compatible with different versions of Hadoop and Hive, the scope of hive-exec and flink-shaded-hadoop-2 in the project pom file are provided, so if you use the Flink engine, first you may need to add the following Jar packages to /lib directory, if you are using the Spark engine and integrated with Hadoop, then you do not need to add the following Jar packages.
-
-:::
-
-```
-flink-shaded-hadoop-x-xxx.jar
-hive-exec-xxx.jar
-libfb303-xxx.jar
-```
-
-Some versions of the hive-exec package do not have libfb303-xxx.jar, so you also need to manually import the Jar package.
-
## Changelog
### 2.2.0-beta 2022-09-26
diff --git a/docs/en/connector-v2/source/IoTDB.md b/docs/en/connector-v2/source/IoTDB.md
index a20680ce638f..da0f198d3e1b 100644
--- a/docs/en/connector-v2/source/IoTDB.md
+++ b/docs/en/connector-v2/source/IoTDB.md
@@ -2,14 +2,16 @@
> IoTDB source connector
-## Description
+## Support Those Engines
-Read external data source data through IoTDB.
+> Spark
+> Flink
+> SeaTunnel Zeta
## Key features
- [x] [batch](../../concept/connector-v2-features.md)
-- [ ] [stream](../../concept/connector-v2-features.md)
+- [x] [stream](../../concept/connector-v2-features.md)
- [x] [exactly-once](../../concept/connector-v2-features.md)
- [x] [column projection](../../concept/connector-v2-features.md)
@@ -18,106 +20,53 @@ supports query SQL and can achieve projection effect.
- [x] [parallelism](../../concept/connector-v2-features.md)
- [ ] [support user-defined split](../../concept/connector-v2-features.md)
-## Options
-
-| name | type | required | default value |
-|----------------------------|---------|----------|---------------|
-| host | string | no | - |
-| port | int | no | - |
-| node_urls | string | no | - |
-| username | string | yes | - |
-| password | string | yes | - |
-| sql | string | yes | - |
-| schema | config | yes | - |
-| fetch_size | int | no | - |
-| lower_bound | long | no | - |
-| upper_bound | long | no | - |
-| num_partitions | int | no | - |
-| thrift_default_buffer_size | int | no | - |
-| enable_cache_leader | boolean | no | - |
-| version | string | no | - |
-| common-options | | no | - |
-
-### single node, you need to set host and port to connect to the remote data source.
-
-**host** [string] the host of the IoTDB when you select host of the IoTDB
-
-**port** [int] the port of the IoTDB when you select
-
-### multi node, you need to set node_urls to connect to the remote data source.
-
-**node_urls** [string] the node_urls of the IoTDB when you select
-
-e.g.
-
-```text
-127.0.0.1:8080,127.0.0.2:8080
-```
-
-### other parameters
-
-**sql** [string]
-execute sql statement e.g.
-
-```
-select name,age from test
-```
-
-### schema [config]
-
-#### fields [Config]
-
-The schema of the IoTDB that you want to generate
-
-e.g.
-
-```
-schema {
- fields {
- name = string
- age = int
- }
- }
-```
-
-### option parameters
-
-### fetch_size [int]
-
-the fetch_size of the IoTDB when you select
-
-### username [string]
-
-the username of the IoTDB when you select
-
-### password [string]
-
-the password of the IoTDB when you select
-
-### lower_bound [long]
-
-the lower_bound of the IoTDB when you select
-
-### upper_bound [long]
-
-the upper_bound of the IoTDB when you select
-
-### num_partitions [int]
-
-the num_partitions of the IoTDB when you select
-
-### thrift_default_buffer_size [int]
-
-the thrift_default_buffer_size of the IoTDB when you select
-
-### enable_cache_leader [boolean]
-
-enable_cache_leader of the IoTDB when you select
+## Description
-### version [string]
+Read external data source data through IoTDB.
-Version represents the SQL semantic version used by the client, which is used to be compatible with the SQL semantics of
-0.12 when upgrading 0.13. The possible values are: V_0_12, V_0_13.
+:::tip
+
+There is a conflict of thrift version between IoTDB and Spark. Therefore, you need to execute `rm -f $SPARK_HOME/jars/libthrift*` and `cp $IOTDB_HOME/lib/libthrift* $SPARK_HOME/jars/` to resolve it.
+
+:::
+
+## Supported DataSource Info
+
+| Datasource | Supported Versions | Url |
+|------------|--------------------|----------------|
+| IoTDB | `>= 0.13.0` | localhost:6667 |
+
+## Data Type Mapping
+
+| IoTDB Data type | SeaTunnel Data type |
+|-----------------|---------------------|
+| BOOLEAN | BOOLEAN |
+| INT32 | TINYINT |
+| INT32 | SMALLINT |
+| INT32 | INT |
+| INT64 | BIGINT |
+| FLOAT | FLOAT |
+| DOUBLE | DOUBLE |
+| TEXT | STRING |
+
+## Source Options
+
+| Name | Type | Required | Default Value | Description |
+|----------------------------|---------|----------|---------------|------------------------------------------------------------------------------------|
+| node_urls | string | yes | - | `IoTDB` cluster address, the format is `"host1:port"` or `"host1:port,host2:port"` |
+| username | string | yes | - | `IoTDB` user username |
+| password | string | yes | - | `IoTDB` user password |
+| sql | string | yes | - | execute sql statement |
+| schema | config | yes | - | the data schema |
+| fetch_size | int | no | - | the fetch_size of the IoTDB when you select |
+| lower_bound | long | no | - | the lower_bound of the IoTDB when you select |
+| upper_bound | long | no | - | the upper_bound of the IoTDB when you select |
+| num_partitions | int | no | - | the num_partitions of the IoTDB when you select |
+| thrift_default_buffer_size | int | no | - | the thrift_default_buffer_size of the IoTDB when you select |
+| thrift_max_frame_size | int | no | - | the thrift max frame size |
+| enable_cache_leader | boolean | no | - | enable_cache_leader of the IoTDB when you select |
+| version | string | no | - | SQL semantic version used by the client, The possible values are: V_0_12, V_0_13 |
+| common-options | | no | - | |
### split partitions
@@ -157,37 +106,37 @@ Source plugin common parameters, please refer to [Source Common Options](common-
## Examples
-### Case1
-
-Common options:
-
```hocon
+env {
+ execution.parallelism = 2
+ job.mode = "BATCH"
+}
+
source {
IoTDB {
node_urls = "localhost:6667"
username = "root"
password = "root"
+ sql = "SELECT temperature, moisture, c_int, c_bigint, c_float, c_double, c_string, c_boolean FROM root.test_group.* WHERE time < 4102329600000 align by device"
+ schema {
+ fields {
+ ts = timestamp
+ device_name = string
+ temperature = float
+ moisture = bigint
+ c_int = int
+ c_bigint = bigint
+ c_float = float
+ c_double = double
+ c_string = string
+ c_boolean = boolean
+ }
+ }
}
}
-```
-
-When you assign `sql`、`fields`、`partition`, for example:
-```hocon
sink {
- IoTDB {
- ...
- sql = "SELECT temperature, moisture FROM root.test_group.* WHERE time < 4102329600000 align by device"
- lower_bound = 1
- upper_bound = 4102329600000
- num_partitions = 10
- fields {
- ts = bigint
- device_name = string
-
- temperature = float
- moisture = bigint
- }
+ Console {
}
}
```
@@ -195,23 +144,23 @@ sink {
Upstream `IoTDB` data format is the following:
```shell
-IoTDB> SELECT temperature, moisture FROM root.test_group.* WHERE time < 4102329600000 align by device;
-+------------------------+------------------------+--------------+-----------+
-| Time| Device| temperature| moisture|
-+------------------------+------------------------+--------------+-----------+
-|2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 100|
-|2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101|
-|2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102|
-+------------------------+------------------------+--------------+-----------+
+IoTDB> SELECT temperature, moisture, c_int, c_bigint, c_float, c_double, c_string, c_boolean FROM root.test_group.* WHERE time < 4102329600000 align by device;
++------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+
+| Time| Device| temperature| moisture| c_int| c_bigint| c_float| c_double| c_string| c_boolean|
++------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+
+|2022-09-25T00:00:00.001Z|root.test_group.device_a| 36.1| 100| 1| 21474836470| 1.0f| 1.0d| abc| true|
+|2022-09-25T00:00:00.001Z|root.test_group.device_b| 36.2| 101| 2| 21474836470| 2.0f| 2.0d| abc| true|
+|2022-09-25T00:00:00.001Z|root.test_group.device_c| 36.3| 102| 3| 21474836470| 3.0f| 3.0d| abc| true|
++------------------------+------------------------+--------------+-----------+--------+--------------+----------+---------+---------+----------+
```
Loaded to SeaTunnelRow data format is the following:
-| ts | device_name | temperature | moisture |
-|---------------|--------------------------|-------------|----------|
-| 1664035200001 | root.test_group.device_a | 36.1 | 100 |
-| 1664035200001 | root.test_group.device_b | 36.2 | 101 |
-| 1664035200001 | root.test_group.device_c | 36.3 | 102 |
+| ts | device_name | temperature | moisture | c_int | c_bigint | c_float | c_double | c_string | c_boolean |
+|---------------|--------------------------|-------------|----------|-------|-------------|---------|----------|----------|-----------|
+| 1664035200001 | root.test_group.device_a | 36.1 | 100 | 1 | 21474836470 | 1.0f | 1.0d | abc | true |
+| 1664035200001 | root.test_group.device_b | 36.2 | 101 | 2 | 21474836470 | 2.0f | 2.0d | abc | true |
+| 1664035200001 | root.test_group.device_c | 36.3 | 102 | 3 | 21474836470 | 3.0f | 3.0d | abc | true |
## Changelog
diff --git a/docs/en/connector-v2/source/Jdbc.md b/docs/en/connector-v2/source/Jdbc.md
index a324316e5946..b86a7b33854b 100644
--- a/docs/en/connector-v2/source/Jdbc.md
+++ b/docs/en/connector-v2/source/Jdbc.md
@@ -76,11 +76,11 @@ The time in seconds to wait for the database operation used to validate the conn
The column name for parallelism's partition, only support numeric type.
-### partition_upper_bound [long]
+### partition_upper_bound [BigDecimal]
The partition_column max value for scan, if not set SeaTunnel will query database get max value.
-### partition_lower_bound [long]
+### partition_lower_bound [BigDecimal]
The partition_column min value for scan, if not set SeaTunnel will query database get min value.
@@ -125,6 +125,7 @@ there are some reference value for params above.
| Snowflake | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc |
| Redshift | com.amazon.redshift.jdbc42.Driver | jdbc:redshift://localhost:5439/testdb?defaultRowFetchSize=1000 | https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42 |
| Vertica | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433 | https://repo1.maven.org/maven2/com/vertica/jdbc/vertica-jdbc/12.0.3-0/vertica-jdbc-12.0.3-0.jar |
+| Kingbase | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar |
| OceanBase | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2881 | https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.3/oceanbase-client-2.4.3.jar |
## Example
@@ -145,15 +146,25 @@ Jdbc {
parallel:
```
-Jdbc {
- url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8"
- driver = "com.mysql.cj.jdbc.Driver"
- connection_check_timeout_sec = 100
- user = "root"
- password = "123456"
- query = "select * from type_bin"
- partition_column = "id"
- partition_num = 10
+env {
+ execution.parallelism = 10
+ job.mode = "BATCH"
+}
+source {
+ Jdbc {
+ url = "jdbc:mysql://localhost/test?serverTimezone=GMT%2b8"
+ driver = "com.mysql.cj.jdbc.Driver"
+ connection_check_timeout_sec = 100
+ user = "root"
+ password = "123456"
+ query = "select * from type_bin"
+ partition_column = "id"
+ partition_num = 10
+ }
+}
+
+sink {
+ Console {}
}
```
diff --git a/docs/en/connector-v2/source/Jira.md b/docs/en/connector-v2/source/Jira.md
index 6452b66c9312..dcfe6cc11d37 100644
--- a/docs/en/connector-v2/source/Jira.md
+++ b/docs/en/connector-v2/source/Jira.md
@@ -29,7 +29,7 @@ Used to read data from Jira.
| body | String | No | - |
| json_field | Config | No | - |
| content_json | String | No | - |
-| poll_interval_ms | int | No | - |
+| poll_interval_millis | int | No | - |
| retry | int | No | - |
| retry_backoff_multiplier_ms | int | No | 100 |
| retry_backoff_max_ms | int | No | 10000 |
@@ -62,7 +62,7 @@ http params
http body
-### poll_interval_ms [int]
+### poll_interval_millis [int]
request http api interval(millis) in stream mode
diff --git a/docs/en/connector-v2/source/Kingbase.md b/docs/en/connector-v2/source/Kingbase.md
new file mode 100644
index 000000000000..62e280675dd7
--- /dev/null
+++ b/docs/en/connector-v2/source/Kingbase.md
@@ -0,0 +1,148 @@
+# Kingbase
+
+> JDBC Kingbase Source Connector
+
+## Support Connector Version
+
+- 8.6
+
+## Support Those Engines
+
+> Spark
+> Flink
+> SeaTunnel Zeta
+
+## Key Features
+
+- [x] [batch](../../concept/connector-v2-features.md)
+- [ ] [stream](../../concept/connector-v2-features.md)
+- [ ] [exactly-once](../../concept/connector-v2-features.md)
+- [x] [column projection](../../concept/connector-v2-features.md)
+- [x] [parallelism](../../concept/connector-v2-features.md)
+- [x] [support user-defined split](../../concept/connector-v2-features.md)
+
+## Description
+
+Read external data source data through JDBC.
+
+## Supported DataSource Info
+
+| Datasource | Supported versions | Driver | Url | Maven |
+|------------|--------------------|----------------------|------------------------------------------|------------------------------------------------------------------------------------------------|
+| Kingbase | 8.6 | com.kingbase8.Driver | jdbc:kingbase8://localhost:54321/db_test | [Download](https://repo1.maven.org/maven2/cn/com/kingbase/kingbase8/8.6.0/kingbase8-8.6.0.jar) |
+
+## Database Dependency
+
+> Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example: cp kingbase8-8.6.0.jar $SEATUNNEL_HOME/plugins/jdbc/lib/
+
+## Data Type Mapping
+
+| Kingbase Data type | SeaTunnel Data type |
+|-------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------|
+| BOOL | BOOLEAN |
+| INT2 | SHORT |
+| SMALLSERIAL SERIAL INT4 | INT |
+| INT8 BIGSERIAL | BIGINT |
+| FLOAT4 | FLOAT |
+| FLOAT8 | DOUBLE |
+| NUMERIC | DECIMAL((Get the designated column's specified column size), (Gets the designated column's number of digits to right of the decimal point.)) |
+| BPCHAR CHARACTER VARCHAR TEXT | STRING |
+| TIMESTAMP | LOCALDATETIME |
+| TIME | LOCALTIME |
+| DATE | LOCALDATE |
+| Other data type | Not supported yet |
+
+## Source Options
+
+| Name | Type | Required | Default | Description |
+|------------------------------|------------|----------|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:kingbase8://localhost:54321/test |
+| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, should be `com.kingbase8.Driver`. |
+| user | String | No | - | Connection instance user name |
+| password | String | No | - | Connection instance password |
+| query | String | Yes | - | Query statement |
+| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete |
+| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type column and string type column. |
+| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. |
+| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. |
+| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. Default value is job parallelism. |
+| fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure the row fetch size used in the query to improve performance by reducing the number of database hits required to satisfy the selection criteria. Zero means use jdbc default value. |
+| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
+
+### Tips
+
+> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks.
+
+## Task Example
+
+### Simple:
+
+```
+env {
+ execution.parallelism = 2
+ job.mode = "BATCH"
+}
+
+source {
+ Jdbc {
+ driver = "com.kingbase8.Driver"
+ url = "jdbc:kingbase8://localhost:54321/db_test"
+ user = "root"
+ password = ""
+ query = "select * from source"
+ }
+}
+
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/transform/sql
+}
+
+sink {
+ Console {}
+}
+```
+
+### Parallel:
+
+> Read your query table in parallel with the shard field you configured and the shard data. You can do this if you want to read the whole table
+
+```
+source {
+ Jdbc {
+ driver = "com.kingbase8.Driver"
+ url = "jdbc:kingbase8://localhost:54321/db_test"
+ user = "root"
+ password = ""
+ query = "select * from source"
+ # Parallel sharding reads fields
+ partition_column = "id"
+ # Number of fragments
+ partition_num = 10
+ }
+}
+```
+
+### Parallel Boundary:
+
+> It is more efficient to read your data source according to the upper and lower boundaries you configured
+
+```
+source {
+ Jdbc {
+ driver = "com.kingbase8.Driver"
+ url = "jdbc:kingbase8://localhost:54321/db_test"
+ user = "root"
+ password = ""
+ query = "select * from source"
+ partition_column = "id"
+ partition_num = 10
+ # Read start boundary
+ partition_lower_bound = 1
+ # Read end boundary
+ partition_upper_bound = 500
+ }
+}
+```
+
diff --git a/docs/en/connector-v2/source/Klaviyo.md b/docs/en/connector-v2/source/Klaviyo.md
index 20ed8ded5015..e80a2434fdf1 100644
--- a/docs/en/connector-v2/source/Klaviyo.md
+++ b/docs/en/connector-v2/source/Klaviyo.md
@@ -30,7 +30,7 @@ Used to read data from Klaviyo.
| body | String | No | - |
| json_field | Config | No | - |
| content_json | String | No | - |
-| poll_interval_ms | int | No | - |
+| poll_interval_millis | int | No | - |
| retry | int | No | - |
| retry_backoff_multiplier_ms | int | No | 100 |
| retry_backoff_max_ms | int | No | 10000 |
@@ -63,7 +63,7 @@ http params
http body
-### poll_interval_ms [int]
+### poll_interval_millis [int]
request http api interval(millis) in stream mode
diff --git a/docs/en/connector-v2/source/Lemlist.md b/docs/en/connector-v2/source/Lemlist.md
index 5e7c4138c581..76cac3b9bf81 100644
--- a/docs/en/connector-v2/source/Lemlist.md
+++ b/docs/en/connector-v2/source/Lemlist.md
@@ -28,7 +28,7 @@ Used to read data from Lemlist.
| body | String | No | - |
| json_field | Config | No | - |
| content_json | String | No | - |
-| poll_interval_ms | int | No | - |
+| poll_interval_millis | int | No | - |
| retry | int | No | - |
| retry_backoff_multiplier_ms | int | No | 100 |
| retry_backoff_max_ms | int | No | 10000 |
@@ -57,7 +57,7 @@ http params
http body
-### poll_interval_ms [int]
+### poll_interval_millis [int]
request http api interval(millis) in stream mode
diff --git a/docs/en/connector-v2/source/MongoDB.md b/docs/en/connector-v2/source/MongoDB.md
index 137fb205b8c9..d63d303fa248 100644
--- a/docs/en/connector-v2/source/MongoDB.md
+++ b/docs/en/connector-v2/source/MongoDB.md
@@ -283,6 +283,10 @@ By utilizing `flat.sync-string`, only one field attribute value can be set, and
This operation will perform a string mapping on a single MongoDB data entry.
```bash
+env {
+ execution.parallelism = 10
+ job.mode = "BATCH"
+}
source {
MongoDB {
uri = "mongodb://user:password@127.0.0.1:27017"
@@ -296,6 +300,9 @@ source {
}
}
}
+sink {
+ Console {}
+}
```
Use the data samples synchronized with modified parameters, such as the following:
diff --git a/docs/en/connector-v2/source/MyHours.md b/docs/en/connector-v2/source/MyHours.md
index ec3a93553364..91321990ab2b 100644
--- a/docs/en/connector-v2/source/MyHours.md
+++ b/docs/en/connector-v2/source/MyHours.md
@@ -2,11 +2,13 @@
> My Hours source connector
-## Description
+## Support Those Engines
-Used to read data from My Hours.
+> Spark
+> Flink
+> SeaTunnel Zeta
-## Key features
+## Key Features
- [x] [batch](../../concept/connector-v2-features.md)
- [ ] [stream](../../concept/connector-v2-features.md)
@@ -15,71 +17,103 @@ Used to read data from My Hours.
- [ ] [parallelism](../../concept/connector-v2-features.md)
- [ ] [support user-defined split](../../concept/connector-v2-features.md)
-## Options
-
-| name | type | required | default value |
-|-----------------------------|---------|----------|---------------|
-| url | String | Yes | - |
-| email | String | Yes | - |
-| password | String | Yes | - |
-| method | String | No | get |
-| schema | Config | No | - |
-| schema.fields | Config | No | - |
-| format | String | No | json |
-| params | Map | No | - |
-| body | String | No | - |
-| json_field | Config | No | - |
-| content_json | String | No | - |
-| poll_interval_ms | int | No | - |
-| retry | int | No | - |
-| retry_backoff_multiplier_ms | int | No | 100 |
-| retry_backoff_max_ms | int | No | 10000 |
-| enable_multi_lines | boolean | No | false |
-| common-options | config | No | - |
-
-### url [String]
-
-http request url
-
-### email [String]
-
-email for login
-
-### password [String]
-
-password for login
-
-### method [String]
-
-http request method, only supports GET, POST method
-
-### params [Map]
-
-http params
-
-### body [String]
-
-http body
-
-### poll_interval_ms [int]
+## Description
-request http api interval(millis) in stream mode
+Used to read data from My Hours.
-### retry [int]
+## Key features
-The max retry times if request http return to `IOException`
+- [x] [batch](../../concept/connector-v2-features.md)
+- [ ] [stream](../../concept/connector-v2-features.md)
+- [ ] [exactly-once](../../concept/connector-v2-features.md)
+- [ ] [column projection](../../concept/connector-v2-features.md)
+- [ ] [parallelism](../../concept/connector-v2-features.md)
+- [ ] [support user-defined split](../../concept/connector-v2-features.md)
-### retry_backoff_multiplier_ms [int]
+## Supported DataSource Info
+
+In order to use the My Hours connector, the following dependencies are required.
+They can be downloaded via install-plugin.sh or from the Maven central repository.
+
+| Datasource | Supported Versions | Dependency |
+|------------|--------------------|---------------------------------------------------------------------------------------------|
+| My Hours | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2) |
+
+## Source Options
+
+| Name | Type | Required | Default | Description |
+|-----------------------------|---------|----------|---------|--------------------------------------------------------------------------------------------------------------------------------------|
+| url | String | Yes | - | Http request url. |
+| email | String | Yes | - | My hours login email address. |
+| password | String | Yes | - | My hours login password. |
+| schema | Config | No | - | Http and seatunnel data structure mapping |
+| schema.fields | Config | No | - | The schema fields of upstream data |
+| json_field | Config | No | - | This parameter helps you configure the schema, so this parameter must be used with schema. |
+| content_json | String | No | - | This parameter can get some json data. If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. |
+| format | String | No | json | The format of upstream data, now only support `json` `text`, default `json`. |
+| method | String | No | get | Http request method, only supports GET, POST method. |
+| headers | Map | No | - | Http headers. |
+| params | Map | No | - | Http params. |
+| body | String | No | - | Http body. |
+| poll_interval_millis | Int | No | - | Request http api interval(millis) in stream mode. |
+| retry | Int | No | - | The max retry times if request http return to `IOException`. |
+| retry_backoff_multiplier_ms | Int | No | 100 | The retry-backoff times(millis) multiplier if request http failed. |
+| retry_backoff_max_ms | Int | No | 10000 | The maximum retry-backoff times(millis) if request http failed |
+| enable_multi_lines | Boolean | No | false | |
+| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
+
+## How to Create a My Hours Data Synchronization Job
-The retry-backoff times(millis) multiplier if request http failed
+```hocon
+env {
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
-### retry_backoff_max_ms [int]
+MyHours{
+ url = "https://api2.myhours.com/api/Projects/getAll"
+ email = "seatunnel@test.com"
+ password = "seatunnel"
+ schema {
+ fields {
+ name = string
+ archived = boolean
+ dateArchived = string
+ dateCreated = string
+ clientName = string
+ budgetAlertPercent = string
+ budgetType = int
+ totalTimeLogged = double
+ budgetValue = double
+ totalAmount = double
+ totalExpense = double
+ laborCost = double
+ totalCost = double
+ billableTimeLogged = double
+ totalBillableAmount = double
+ billable = boolean
+ roundType = int
+ roundInterval = int
+ budgetSpentPercentage = double
+ budgetTarget = int
+ budgetPeriodType = string
+ budgetSpent = string
+ id = string
+ }
+ }
+}
-The maximum retry-backoff times(millis) if request http failed
+# Console printing of the read data
+sink {
+ Console {
+ parallelism = 1
+ }
+}
+```
-### format [String]
+## Parameter Interpretation
-the format of upstream data, now only support `json` `text`, default `json`.
+### format
when you assign format is `json`, you should also assign schema option, for example:
@@ -98,11 +132,11 @@ you should assign schema as the following:
```hocon
schema {
- fields {
- code = int
- data = string
- success = boolean
- }
+ fields {
+ code = int
+ data = string
+ success = boolean
+ }
}
```
@@ -131,13 +165,7 @@ connector will generate data as the following:
|----------------------------------------------------------|
| {"code": 200, "data": "get success", "success": true} |
-### schema [Config]
-
-#### fields [Config]
-
-the schema fields of upstream data
-
-### content_json [String]
+### content_json
This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`.
@@ -212,14 +240,14 @@ Here is an example:
- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json)
- See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf).
-### json_field [Config]
+### json_field
This parameter helps you configure the schema,so this parameter must be used with schema.
If your data looks something like this:
```json
-{
+{
"store": {
"book": [
{
@@ -273,47 +301,6 @@ source {
- Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json)
- See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf).
-### common options
-
-Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details
-
-## Example
-
-```hocon
-MyHours{
- url = "https://api2.myhours.com/api/Projects/getAll"
- email = "seatunnel@test.com"
- password = "seatunnel"
- schema {
- fields {
- name = string
- archived = boolean
- dateArchived = string
- dateCreated = string
- clientName = string
- budgetAlertPercent = string
- budgetType = int
- totalTimeLogged = double
- budgetValue = double
- totalAmount = double
- totalExpense = double
- laborCost = double
- totalCost = double
- billableTimeLogged = double
- totalBillableAmount = double
- billable = boolean
- roundType = int
- roundInterval = int
- budgetSpentPercentage = double
- budgetTarget = int
- budgetPeriodType = string
- budgetSpent = string
- id = string
- }
- }
-}
-```
-
## Changelog
### next version
diff --git a/docs/en/connector-v2/source/MySQL-CDC.md b/docs/en/connector-v2/source/MySQL-CDC.md
index caeeca062836..6740fd4b8b2e 100644
--- a/docs/en/connector-v2/source/MySQL-CDC.md
+++ b/docs/en/connector-v2/source/MySQL-CDC.md
@@ -2,10 +2,9 @@
> MySQL CDC source connector
-## Description
+## Support Those Engines
-The MySQL CDC connector allows for reading snapshot data and incremental data from MySQL database. This document
-describes how to set up the MySQL CDC connector to run SQL queries against MySQL databases.
+> SeaTunnel Zeta
## Key features
@@ -16,207 +15,202 @@ describes how to set up the MySQL CDC connector to run SQL queries against MySQL
- [x] [parallelism](../../concept/connector-v2-features.md)
- [x] [support user-defined split](../../concept/connector-v2-features.md)
-## Options
-
-| name | type | required | default value |
-|------------------------------------------------|----------|----------|---------------|
-| username | String | Yes | - |
-| password | String | Yes | - |
-| database-names | List | No | - |
-| table-names | List | Yes | - |
-| base-url | String | Yes | - |
-| startup.mode | Enum | No | INITIAL |
-| startup.timestamp | Long | No | - |
-| startup.specific-offset.file | String | No | - |
-| startup.specific-offset.pos | Long | No | - |
-| stop.mode | Enum | No | NEVER |
-| stop.timestamp | Long | No | - |
-| stop.specific-offset.file | String | No | - |
-| stop.specific-offset.pos | Long | No | - |
-| incremental.parallelism | Integer | No | 1 |
-| snapshot.split.size | Integer | No | 8096 |
-| snapshot.fetch.size | Integer | No | 1024 |
-| server-id | String | No | - |
-| server-time-zone | String | No | UTC |
-| connect.timeout.ms | Duration | No | 30000 |
-| connect.max-retries | Integer | No | 3 |
-| connection.pool.size | Integer | No | 20 |
-| chunk-key.even-distribution.factor.upper-bound | Double | No | 100 |
-| chunk-key.even-distribution.factor.lower-bound | Double | No | 0.05 |
-| sample-sharding.threshold | int | No | 1000 |
-| inverse-sampling.rate | int | No | 1000 |
-| exactly_once | Boolean | No | true |
-| debezium.* | config | No | - |
-| format | Enum | No | DEFAULT |
-| common-options | | no | - |
-
-### username [String]
-
-Name of the database to use when connecting to the database server.
-
-### password [String]
-
-Password to use when connecting to the database server.
-
-### database-names [List]
-
-Database name of the database to monitor.
-
-### table-names [List]
-
-Table name of the database to monitor. The table name needs to include the database name, for example: database_name.table_name
-
-### base-url [String]
-
-URL has to be with database, like "jdbc:mysql://localhost:5432/db" or "jdbc:mysql://localhost:5432/db?useSSL=true".
-
-### startup.mode [Enum]
-
-Optional startup mode for MySQL CDC consumer, valid enumerations are "initial", "earliest", "latest" and "specific".
-
-### startup.timestamp [Long]
-
-Start from the specified epoch timestamp (in milliseconds).
-
-**Note, This option is required when the "startup.mode" option used `'timestamp'`.**
-
-### startup.specific-offset.file [String]
-
-Start from the specified binlog file name.
-
-**Note, This option is required when the "startup.mode" option used `'specific'`.**
-
-### startup.specific-offset.pos [Long]
-
-Start from the specified binlog file position.
-
-**Note, This option is required when the "startup.mode" option used `'specific'`.**
-
-### stop.mode [Enum]
-
-Optional stop mode for MySQL CDC consumer, valid enumerations are "never".
-
-### stop.timestamp [Long]
-
-Stop from the specified epoch timestamp (in milliseconds).
-
-**Note, This option is required when the "stop.mode" option used `'timestamp'`.**
-
-### stop.specific-offset.file [String]
-
-Stop from the specified binlog file name.
-
-**Note, This option is required when the "stop.mode" option used `'specific'`.**
-
-### stop.specific-offset.pos [Long]
-
-Stop from the specified binlog file position.
-
-**Note, This option is required when the "stop.mode" option used `'specific'`.**
-
-### incremental.parallelism [Integer]
-
-The number of parallel readers in the incremental phase.
-
-### snapshot.split.size [Integer]
-
-The split size (number of rows) of table snapshot, captured tables are split into multiple splits when read the snapshot
-of table.
-
-### snapshot.fetch.size [Integer]
-
-The maximum fetch size for per poll when read table snapshot.
-
-### chunk-key.even-distribution.factor.upper-bound [Double]
-
-The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0.
-
-### chunk-key.even-distribution.factor.lower-bound [Double]
+## Description
-The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05.
+The MySQL CDC connector allows for reading snapshot data and incremental data from MySQL database. This document
+describes how to set up the MySQL CDC connector to run SQL queries against MySQL databases.
-### sample-sharding.threshold [Integer]
+## Supported DataSource Info
-This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards.
+| Datasource | Supported versions | Driver | Url | Maven |
+|------------|-------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------|----------------------------------|----------------------------------------------------------------------|
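+| MySQL | <li> [MySQL](https://dev.mysql.com/doc): 5.6, 5.7, 8.0.x </li><li> [RDS MySQL](https://www.aliyun.com/product/rds/mysql): 5.6, 5.7, 8.0.x </li> | com.mysql.cj.jdbc.Driver | jdbc:mysql://localhost:3306/test | https://mvnrepository.com/artifact/mysql/mysql-connector-java/8.0.28 |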
-### inverse-sampling.rate [Integer]
+## Database Dependency
-The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000.
+### Install Jdbc Driver
-### server-id [String]
+Please download the MySQL driver and put it in the `${SEATUNNEL_HOME}/lib/` directory. For example: `cp mysql-connector-java-xxx.jar $SEATUNNEL_HOME/lib/`
-A numeric ID or a numeric ID range of this database client, The numeric ID syntax is like '5400', the numeric ID range
-syntax is like '5400-5408'.
+### Creating MySQL user
-Every ID must be unique across all currently-running database processes in the MySQL cluster. This connector joins the
-MySQL cluster as another server (with this unique ID) so it can read the binlog.
+You have to define a MySQL user with appropriate permissions on all databases that the Debezium MySQL connector monitors.
-By default, a random number is generated between 5400 and 6400, though we recommend setting an explicit value.
+1. Create the MySQL user:
-### server-time-zone [String]
+```sql
+mysql> CREATE USER 'user'@'localhost' IDENTIFIED BY 'password';
+```
-The session time zone in database server. If not set, then ZoneId.systemDefault() is used to determine the server time zone.
+2. Grant the required permissions to the user:
-### connect.timeout.ms [long]
+```sql
+mysql> GRANT SELECT, RELOAD, SHOW DATABASES, REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'user' IDENTIFIED BY 'password';
+```
-The maximum time that the connector should wait after trying to connect to the database server before timing out.
+3. Finalize the user’s permissions:
-### connect.max-retries [Integer]
+```sql
+mysql> FLUSH PRIVILEGES;
+```
-The max retry times that the connector should retry to build database server connection.
+### Enabling the MySQL binlog
-### connection.pool.size [Integer]
+You must enable binary logging for MySQL replication. The binary logs record transaction updates for replication tools to propagate changes.
-The connection pool size.
+1. Check whether the `log-bin` option is already on:
-### exactly_once [Boolean]
+```sql
+mysql> show variables where variable_name in ('log_bin', 'binlog_format', 'binlog_row_image', 'gtid_mode', 'enforce_gtid_consistency');
++--------------------------+----------------+
+| Variable_name | Value |
++--------------------------+----------------+
+| binlog_format | ROW |
+| binlog_row_image | FULL |
+| enforce_gtid_consistency | ON |
+| gtid_mode | ON |
+| log_bin | ON |
++--------------------------+----------------+
+5 rows in set (0.00 sec)
+```
-Enable exactly once semantic.
+2. If inconsistent with the above results, configure your MySQL server configuration file(`$MYSQL_HOME/mysql.cnf`) with the following properties, which are described in the table below:
-### debezium [Config]
+```
+# Enable binary replication log and set the prefix, expiration, and log format.
+# The prefix is arbitrary, expiration can be short for integration tests but would
+# be longer on a production system. Row-level info is required for ingest to work.
+# Server ID is required, but this will vary on production systems
+server-id = 223344
+log_bin = mysql-bin
+expire_logs_days = 10
+binlog_format = row
+binlog_row_image = FULL
+
+# enable gtid mode
+gtid_mode = on
+enforce_gtid_consistency = on
+```
-Pass-through Debezium's properties to Debezium Embedded Engine which is used to capture data changes from MySQL server.
+3. Restart MySQL Server
-See more about
-the [Debezium's MySQL Connector properties](https://debezium.io/documentation/reference/1.6/connectors/mysql.html#mysql-connector-properties)
+```shell
+/etc/init.d/mysqld restart
+```
-### format [Enum]
+4. Confirm your changes by checking the binlog status once more:
+
+```sql
+mysql> show variables where variable_name in ('log_bin', 'binlog_format', 'binlog_row_image', 'gtid_mode', 'enforce_gtid_consistency');
++--------------------------+----------------+
+| Variable_name | Value |
++--------------------------+----------------+
+| binlog_format | ROW |
+| binlog_row_image | FULL |
+| enforce_gtid_consistency | ON |
+| gtid_mode | ON |
+| log_bin | ON |
++--------------------------+----------------+
+5 rows in set (0.00 sec)
+```
-Optional output format for MySQL CDC, valid enumerations are "DEFAULT"、"COMPATIBLE_DEBEZIUM_JSON".
+### Notes
+
+#### Setting up MySQL session timeouts
+
+When an initial consistent snapshot is made for large databases, your established connection could timeout while the tables are being read. You can prevent this behavior by configuring interactive_timeout and wait_timeout in your MySQL configuration file.
+- `interactive_timeout`: The number of seconds the server waits for activity on an interactive connection before closing it. See [MySQL’s documentation](https://dev.mysql.com/doc/refman/8.0/en/server-system-variables.html#sysvar_interactive_timeout) for more details.
+- `wait_timeout`: The number of seconds the server waits for activity on a non-interactive connection before closing it. See [MySQL’s documentation](https://dev.mysql.com/doc/refman/8.0/en/server-system-variables.html#sysvar_wait_timeout) for more details.
+
+*For more database settings see [Debezium MySQL Connector](https://debezium.io/documentation/reference/1.6/connectors/mysql.html#setting-up-mysql)*
+
+## Data Type Mapping
+
+| Mysql Data type | SeaTunnel Data type |
+|------------------------------------------------------------------------------------------|---------------------|
+| BIT(1) TINYINT(1) | BOOLEAN |
+| TINYINT | TINYINT |
+| TINYINT UNSIGNED SMALLINT | SMALLINT |
+| SMALLINT UNSIGNED MEDIUMINT MEDIUMINT UNSIGNED INT INTEGER YEAR | INT |
+| INT UNSIGNED INTEGER UNSIGNED BIGINT | BIGINT |
+| BIGINT UNSIGNED | DECIMAL(20,0) |
+| DECIMAL(p, s) DECIMAL(p, s) UNSIGNED NUMERIC(p, s) NUMERIC(p, s) UNSIGNED | DECIMAL(p,s) |
+| FLOAT FLOAT UNSIGNED | FLOAT |
+| DOUBLE DOUBLE UNSIGNED REAL REAL UNSIGNED | DOUBLE |
+| CHAR VARCHAR TINYTEXT MEDIUMTEXT TEXT LONGTEXT ENUM JSON | STRING |
+| DATE | DATE |
+| TIME | TIME |
+| DATETIME TIMESTAMP | TIMESTAMP |
+| BINARY VARBINARY BIT(p) TINYBLOB MEDIUMBLOB BLOB LONGBLOB | BYTES |
+
+## Source Options
+
+| Name | Type | Required | Default | Description |
+|------------------------------------------------|----------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| base-url | String | Yes | - | The URL of the JDBC connection. Refer to a case: `jdbc:mysql://localhost:3306/test`. |
+| username | String | Yes | - | Name of the database user to use when connecting to the database server. |
+| password | String | Yes | - | Password to use when connecting to the database server. |
+| database-names | List | No | - | Database name of the database to monitor. |
+| table-names | List | Yes | - | Table name of the database to monitor. The table name needs to include the database name, for example: `database_name.table_name` |
+| startup.mode | Enum | No | INITIAL | Optional startup mode for MySQL CDC consumer, valid enumerations are `initial`, `earliest`, `latest` and `specific`. `initial`: Synchronize historical data at startup, and then synchronize incremental data. `earliest`: Startup from the earliest offset possible. `latest`: Startup from the latest offset. `specific`: Startup from user-supplied specific offsets. |
+| startup.specific-offset.file | String | No | - | Start from the specified binlog file name. **Note, This option is required when the `startup.mode` option used `specific`.** |
+| startup.specific-offset.pos | Long | No | - | Start from the specified binlog file position. **Note, This option is required when the `startup.mode` option used `specific`.** |
+| stop.mode | Enum | No | NEVER | Optional stop mode for MySQL CDC consumer, valid enumerations are `never`, `latest` or `specific`. `never`: A real-time job never stops the source. `latest`: Stop from the latest offset. `specific`: Stop from user-supplied specific offset. |
+| stop.specific-offset.file | String | No | - | Stop from the specified binlog file name. **Note, This option is required when the `stop.mode` option used `specific`.** |
+| stop.specific-offset.pos | Long | No | - | Stop from the specified binlog file position. **Note, This option is required when the `stop.mode` option used `specific`.** |
+| snapshot.split.size | Integer | No | 8096 | The split size (number of rows) of table snapshot, captured tables are split into multiple splits when reading the snapshot of a table. |
+| snapshot.fetch.size | Integer | No | 1024 | The maximum fetch size per poll when reading the table snapshot. |
+| server-id | String | No | - | A numeric ID or a numeric ID range of this database client. The numeric ID syntax is like `5400`, the numeric ID range syntax is like `5400-5408`. Every ID must be unique across all currently-running database processes in the MySQL cluster. This connector joins the MySQL cluster as another server (with this unique ID) so it can read the binlog. By default, a random number is generated between 5400 and 6400, though we recommend setting an explicit value. |
+| server-time-zone | String | No | UTC | The session time zone in database server. If not set, then ZoneId.systemDefault() is used to determine the server time zone. |
+| connect.timeout.ms | Duration | No | 30000 | The maximum time that the connector should wait after trying to connect to the database server before timing out. |
+| connect.max-retries | Integer | No | 3 | The max retry times that the connector should retry to build database server connection. |
+| connection.pool.size | Integer | No | 20 | The jdbc connection pool size. |
+| chunk-key.even-distribution.factor.upper-bound | Double | No | 100 | The upper bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be less than or equal to this upper bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is greater, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 100.0. |
+| chunk-key.even-distribution.factor.lower-bound | Double | No | 0.05 | The lower bound of the chunk key distribution factor. This factor is used to determine whether the table data is evenly distributed. If the distribution factor is calculated to be greater than or equal to this lower bound (i.e., (MAX(id) - MIN(id) + 1) / row count), the table chunks would be optimized for even distribution. Otherwise, if the distribution factor is less, the table will be considered as unevenly distributed and the sampling-based sharding strategy will be used if the estimated shard count exceeds the value specified by `sample-sharding.threshold`. The default value is 0.05. |
+| sample-sharding.threshold | Integer | No | 1000 | This configuration specifies the threshold of estimated shard count to trigger the sample sharding strategy. When the distribution factor is outside the bounds specified by `chunk-key.even-distribution.factor.upper-bound` and `chunk-key.even-distribution.factor.lower-bound`, and the estimated shard count (calculated as approximate row count / chunk size) exceeds this threshold, the sample sharding strategy will be used. This can help to handle large datasets more efficiently. The default value is 1000 shards. |
+| inverse-sampling.rate | Integer | No | 1000 | The inverse of the sampling rate used in the sample sharding strategy. For example, if this value is set to 1000, it means a 1/1000 sampling rate is applied during the sampling process. This option provides flexibility in controlling the granularity of the sampling, thus affecting the final number of shards. It's especially useful when dealing with very large datasets where a lower sampling rate is preferred. The default value is 1000. |
+| exactly_once | Boolean | No | true | Enable exactly once semantic. |
+| format | Enum | No | DEFAULT | Optional output format for MySQL CDC, valid enumerations are `DEFAULT` and `COMPATIBLE_DEBEZIUM_JSON`. |
+| debezium | Config | No | - | Pass-through [Debezium's properties](https://debezium.io/documentation/reference/1.6/connectors/mysql.html#mysql-connector-properties) to Debezium Embedded Engine which is used to capture data changes from MySQL server. |
+| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
+
+## Task Example
+
+### Simple
+
+> Support multi-table reading
-#### example
+```
+env {
+ parallelism = 1
+ job.mode = "STREAMING"
+ checkpoint.interval = 10000
+}
-```conf
source {
MySQL-CDC {
- debezium {
- snapshot.mode = "never"
- decimal.handling.mode = "double"
+ catalog = {
+ factory = MySQL
}
+ base-url = "jdbc:mysql://localhost:3306/testdb"
+ username = "root"
+ password = "root@123"
+ table-names = ["testdb.table1", "testdb.table2"]
+
+ startup.mode = "initial"
}
}
-```
-
-### common options
-
-Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details.
-
-## Example
-```Jdbc {
-source {
- MySQL-CDC {
- result_table_name = "fake"
- parallelism = 1
- server-id = 5656
- username = "mysqluser"
- password = "mysqlpw"
- table-names = ["inventory_vwyw0n.products"]
- base-url = "jdbc:mysql://localhost:56725/inventory_vwyw0n"
+sink {
+ Console {
}
}
```
+### Support debezium-compatible format send to kafka
+
+> Must be used with kafka connector sink, see [compatible debezium format](../formats/cdc-compatible-debezium-json.md) for details
+
## Changelog
- Add MySQL CDC Source Connector
diff --git a/docs/en/connector-v2/source/Mysql.md b/docs/en/connector-v2/source/Mysql.md
index d04c7eec3020..bdac5c0aec61 100644
--- a/docs/en/connector-v2/source/Mysql.md
+++ b/docs/en/connector-v2/source/Mysql.md
@@ -2,6 +2,10 @@
> JDBC Mysql Source Connector
+## Support Mysql Version
+
+- 5.5/5.6/5.7/8.0
+
## Support Those Engines
> Spark
@@ -56,24 +60,24 @@ Read external data source data through JDBC.
## Source Options
-| Name | Type | Required | Default | Description |
-|------------------------------|--------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:mysql://localhost:3306:3306/test |
-| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, if you use MySQL the value is `com.mysql.cj.jdbc.Driver`. |
-| user | String | No | - | Connection instance user name |
-| password | String | No | - | Connection instance password |
-| query | String | Yes | - | Query statement |
-| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete |
-| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. |
-| partition_lower_bound | Long | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. |
-| partition_upper_bound | Long | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. |
-| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism |
-| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure the row fetch size used in the query toimprove performance by reducing the number database hits required to satisfy the selection criteria. Zero means use jdbc default value. |
-| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
+| Name | Type | Required | Default | Description |
+|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:mysql://localhost:3306/test |
+| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, if you use MySQL the value is `com.mysql.cj.jdbc.Driver`. |
+| user | String | No | - | Connection instance user name |
+| password | String | No | - | Connection instance password |
+| query | String | Yes | - | Query statement |
+| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete |
+| partition_column | String | No | - | The column name for parallelism's partition. Only a numeric type primary key is supported, and only one column can be configured. |
+| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. |
+| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. |
+| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism |
+| fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure the row fetch size used in the query to improve performance by reducing the number of database hits required to satisfy the selection criteria. Zero means use jdbc default value. |
+| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
### Tips
-> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks.
+> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. When your shard read field is a large number type such as bigint(30) and above and the data is not evenly distributed, it is recommended to set the parallelism level to 1 to ensure that the data skew problem is resolved.
## Task Example
@@ -90,7 +94,7 @@ env {
}
source{
Jdbc {
- url = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2b8"
+ url = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2b8&useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true"
driver = "com.mysql.cj.jdbc.Driver"
connection_check_timeout_sec = 100
user = "root"
@@ -114,9 +118,13 @@ sink {
> Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table
```
+env {
+ execution.parallelism = 10
+ job.mode = "BATCH"
+}
source {
Jdbc {
- url = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2b8"
+ url = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2b8&useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true"
driver = "com.mysql.cj.jdbc.Driver"
connection_check_timeout_sec = 100
user = "root"
@@ -129,6 +137,9 @@ source {
partition_num = 10
}
}
+sink {
+ Console {}
+}
```
### Parallel Boundary:
@@ -138,7 +149,7 @@ source {
```
source {
Jdbc {
- url = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2b8"
+ url = "jdbc:mysql://localhost:3306/test?serverTimezone=GMT%2b8&useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true"
driver = "com.mysql.cj.jdbc.Driver"
connection_check_timeout_sec = 100
user = "root"
diff --git a/docs/en/connector-v2/source/Notion.md b/docs/en/connector-v2/source/Notion.md
index 186294c6874d..d138c21c1d69 100644
--- a/docs/en/connector-v2/source/Notion.md
+++ b/docs/en/connector-v2/source/Notion.md
@@ -29,7 +29,7 @@ Used to read data from Notion.
| body | String | No | - |
| json_field | Config | No | - |
| content_json | String | No | - |
-| poll_interval_ms | int | No | - |
+| poll_interval_millis | int | No | - |
| retry | int | No | - |
| retry_backoff_multiplier_ms | int | No | 100 |
| retry_backoff_max_ms | int | No | 10000 |
@@ -62,7 +62,7 @@ http params
http body
-### poll_interval_ms [int]
+### poll_interval_millis [int]
request http api interval(millis) in stream mode
diff --git a/docs/en/connector-v2/source/OceanBase.md b/docs/en/connector-v2/source/OceanBase.md
index 9625ef4fbb94..434e25284ddc 100644
--- a/docs/en/connector-v2/source/OceanBase.md
+++ b/docs/en/connector-v2/source/OceanBase.md
@@ -71,21 +71,21 @@ Read external data source data through JDBC.
## Source Options
-| Name | Type | Required | Default | Description |
-|------------------------------|--------|----------|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:oceanbase://localhost:2883/test |
-| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, should be `com.oceanbase.jdbc.Driver`. |
-| user | String | No | - | Connection instance user name |
-| password | String | No | - | Connection instance password |
-| compatible_mode | String | Yes | - | The compatible mode of OceanBase, can be 'mysql' or 'oracle'. |
-| query | String | Yes | - | Query statement |
-| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete |
-| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type column and string type column. |
-| partition_lower_bound | Long | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. |
-| partition_upper_bound | Long | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. |
-| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. Default value is job parallelism. |
-| fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure the row fetch size used in the query to improve performance by reducing the number database hits required to satisfy the selection criteria. Zero means use jdbc default value. |
-| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
+| Name | Type | Required | Default | Description |
+|------------------------------|------------|----------|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:oceanbase://localhost:2883/test |
+| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, should be `com.oceanbase.jdbc.Driver`. |
+| user | String | No | - | Connection instance user name |
+| password | String | No | - | Connection instance password |
+| compatible_mode | String | Yes | - | The compatible mode of OceanBase, can be 'mysql' or 'oracle'. |
+| query | String | Yes | - | Query statement |
+| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete |
+| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type column and string type column. |
+| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. |
+| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. |
+| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. Default value is job parallelism. |
+| fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure the row fetch size used in the query to improve performance by reducing the number database hits required to satisfy the selection criteria. Zero means use jdbc default value. |
+| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
### Tips
@@ -127,6 +127,10 @@ sink {
> Read your query table in parallel with the shard field you configured and the shard data. You can do this if you want to read the whole table
```
+env {
+ execution.parallelism = 10
+ job.mode = "BATCH"
+}
source {
Jdbc {
driver = "com.oceanbase.jdbc.Driver"
@@ -141,6 +145,9 @@ source {
partition_num = 10
}
}
+sink {
+ Console {}
+}
```
### Parallel Boundary:
diff --git a/docs/en/connector-v2/source/OneSignal.md b/docs/en/connector-v2/source/OneSignal.md
index 52636cf5bdac..9fb6d65379be 100644
--- a/docs/en/connector-v2/source/OneSignal.md
+++ b/docs/en/connector-v2/source/OneSignal.md
@@ -29,7 +29,7 @@ Used to read data from OneSignal.
| body | String | No | - |
| json_field | Config | No | - |
| content_json | String | No | - |
-| poll_interval_ms | int | No | - |
+| poll_interval_millis | int | No | - |
| retry | int | No | - |
| retry_backoff_multiplier_ms | int | No | 100 |
| retry_backoff_max_ms | int | No | 10000 |
@@ -58,7 +58,7 @@ http params
http body
-### poll_interval_ms [int]
+### poll_interval_millis [int]
request http api interval(millis) in stream mode
diff --git a/docs/en/connector-v2/source/Oracle.md b/docs/en/connector-v2/source/Oracle.md
new file mode 100644
index 000000000000..f191cda9d998
--- /dev/null
+++ b/docs/en/connector-v2/source/Oracle.md
@@ -0,0 +1,161 @@
+# Oracle
+
+> JDBC Oracle Source Connector
+
+## Support Those Engines
+
+> Spark
+> Flink
+> SeaTunnel Zeta
+
+## Key Features
+
+- [x] [batch](../../concept/connector-v2-features.md)
+- [ ] [stream](../../concept/connector-v2-features.md)
+- [x] [exactly-once](../../concept/connector-v2-features.md)
+- [x] [column projection](../../concept/connector-v2-features.md)
+- [x] [parallelism](../../concept/connector-v2-features.md)
+- [x] [support user-defined split](../../concept/connector-v2-features.md)
+
+> supports query SQL and can achieve projection effect.
+
+## Description
+
+Read external data source data through JDBC.
+
+## Supported DataSource Info
+
+| Datasource | Supported versions | Driver | Url | Maven |
+|------------|----------------------------------------------------------|--------------------------|----------------------------------------|--------------------------------------------------------------------|
+| Oracle | Different dependency version has different driver class. | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@datasource01:1523:xe | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 |
+
+## Database Dependency
+
+> Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example Oracle datasource: cp ojdbc8-xxxxxx.jar $SEATUNNEL_HOME/lib/
+> To support the i18n character set, copy the orai18n.jar to the $SEATUNNEL_HOME/lib/ directory.
+
+## Data Type Mapping
+
+| Oracle Data type | SeaTunnel Data type |
+|--------------------------------------------------------------------------------------|---------------------|
+| INTEGER | INT |
+| FLOAT | DECIMAL(38, 18) |
+| NUMBER(precision <= 9, scale == 0) | INT |
+| NUMBER(9 < precision <= 18, scale == 0) | BIGINT |
+| NUMBER(18 < precision, scale == 0) | DECIMAL(38, 0) |
+| NUMBER(scale != 0) | DECIMAL(38, 18) |
+| BINARY_DOUBLE | DOUBLE |
+| BINARY_FLOAT REAL | FLOAT |
+| CHAR NCHAR NVARCHAR2 VARCHAR2 LONG ROWID NCLOB CLOB | STRING |
+| DATE | DATE |
+| TIMESTAMP TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP |
+| BLOB RAW LONG RAW BFILE | BYTES |
+
+## Source Options
+
+| Name | Type | Required | Default | Description |
+|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:oracle:thin:@datasource01:1523:xe |
+| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, if you use Oracle the value is `oracle.jdbc.OracleDriver`. |
+| user | String | No | - | Connection instance user name |
+| password | String | No | - | Connection instance password |
+| query | String | Yes | - | Query statement |
+| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete |
+| partition_column | String | No | - | The column name for parallelism's partition. Only a numeric type primary key is supported, and only one column can be configured. |
+| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. |
+| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. |
+| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism |
+| fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure the row fetch size used in the query to improve performance by reducing the number of database hits required to satisfy the selection criteria. Zero means use jdbc default value. |
+| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
+
+### Tips
+
+> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks.
+
+## Task Example
+
+### Simple:
+
+> This example queries the TEST_TABLE table in your database in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console.
+
+```
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 2
+ job.mode = "BATCH"
+}
+source{
+ Jdbc {
+ url = "jdbc:oracle:thin:@datasource01:1523:xe"
+ driver = "oracle.jdbc.OracleDriver"
+ user = "root"
+ password = "123456"
+ query = "SELECT * FROM TEST_TABLE"
+ }
+}
+
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/transform-v2/sql
+}
+
+sink {
+ Console {}
+}
+```
+
+### Parallel:
+
+> Read your query table in parallel with the shard field you configured and the shard data. You can do this if you want to read the whole table.
+
+```
+env {
+ execution.parallelism = 10
+ job.mode = "BATCH"
+}
+source {
+ Jdbc {
+ url = "jdbc:oracle:thin:@datasource01:1523:xe"
+ driver = "oracle.jdbc.OracleDriver"
+ connection_check_timeout_sec = 100
+ user = "root"
+ password = "123456"
+ # Define query logic as required
+ query = "SELECT * FROM TEST_TABLE"
+ # Parallel sharding reads fields
+ partition_column = "ID"
+ # Number of fragments
+ partition_num = 10
+ }
+}
+sink {
+ Console {}
+}
+```
+
+### Parallel Boundary:
+
+> It is more efficient to specify the data within the upper and lower bounds of the query. It is more efficient to read your data source according to the upper and lower boundaries you configured.
+
+```
+source {
+ Jdbc {
+ url = "jdbc:oracle:thin:@datasource01:1523:xe"
+ driver = "oracle.jdbc.OracleDriver"
+ connection_check_timeout_sec = 100
+ user = "root"
+ password = "123456"
+ # Define query logic as required
+ query = "SELECT * FROM TEST_TABLE"
+ partition_column = "ID"
+ # Read start boundary
+ partition_lower_bound = 1
+ # Read end boundary
+ partition_upper_bound = 500
+ partition_num = 10
+ }
+}
+```
+
diff --git a/docs/en/connector-v2/source/Persistiq.md b/docs/en/connector-v2/source/Persistiq.md
index e102b8b3edd6..c308efbb389c 100644
--- a/docs/en/connector-v2/source/Persistiq.md
+++ b/docs/en/connector-v2/source/Persistiq.md
@@ -29,7 +29,7 @@ Used to read data from Persistiq.
| body | String | No | - |
| json_field | Config | No | - |
| content_json | String | No | - |
-| poll_interval_ms | int | No | - |
+| poll_interval_millis | int | No | - |
| retry | int | No | - |
| retry_backoff_multiplier_ms | int | No | 100 |
| retry_backoff_max_ms | int | No | 10000 |
@@ -56,7 +56,7 @@ http params
http body
-### poll_interval_ms [int]
+### poll_interval_millis [int]
request http api interval(millis) in stream mode
diff --git a/docs/en/connector-v2/source/PostgreSQL.md b/docs/en/connector-v2/source/PostgreSQL.md
index 3f9e13d2e648..63ddbc25ecf9 100644
--- a/docs/en/connector-v2/source/PostgreSQL.md
+++ b/docs/en/connector-v2/source/PostgreSQL.md
@@ -38,28 +38,28 @@ Read external data source data through JDBC.
## Data Type Mapping
-| PostgreSQL Data type | SeaTunnel Data type |
-|----------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
-| BOOL | BOOLEAN |
-| _BOOL | ARRAY<BOOLEAN> |
-| BYTEA | BYTES |
-| _BYTEA | ARRAY<TINYINT> |
-| INT2 SMALLSERIAL INT4 SERIAL | INT |
-| _INT2 _INT4 | ARRAY<INT> |
-| INT8 BIGSERIAL | BIGINT |
-| _INT8 | ARRAY<BIGINT> |
-| FLOAT4 | FLOAT |
-| _FLOAT4 | ARRAY<FLOAT> |
-| FLOAT8 | DOUBLE |
-| _FLOAT8 | ARRAY<DOUBLE> |
-| NUMERIC(Get the designated column's specified column size>0) | DECIMAL(Get the designated column's specified column size,Gets the number of digits in the specified column to the right of the decimal point) |
-| NUMERIC(Get the designated column's specified column size<0) | DECIMAL(38, 18) |
-| BPCHAR CHARACTER VARCHAR TEXT GEOMETRY GEOGRAPHY | STRING |
-| _BPCHAR _CHARACTER _VARCHAR _TEXT | ARRAY<STRING> |
-| TIMESTAMP | TIMESTAMP |
-| TIME | TIME |
-| DATE | DATE |
-| OTHER DATA TYPES | NOT SUPPORTED YET |
+| PostgreSQL Data type | SeaTunnel Data type |
+|-----------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
+| BOOL | BOOLEAN |
+| _BOOL | ARRAY<BOOLEAN> |
+| BYTEA | BYTES |
+| _BYTEA | ARRAY<TINYINT> |
+| INT2 SMALLSERIAL INT4 SERIAL | INT |
+| _INT2 _INT4 | ARRAY<INT> |
+| INT8 BIGSERIAL | BIGINT |
+| _INT8 | ARRAY<BIGINT> |
+| FLOAT4 | FLOAT |
+| _FLOAT4 | ARRAY<FLOAT> |
+| FLOAT8 | DOUBLE |
+| _FLOAT8 | ARRAY<DOUBLE> |
+| NUMERIC(Get the designated column's specified column size>0) | DECIMAL(Get the designated column's specified column size,Gets the number of digits in the specified column to the right of the decimal point) |
+| NUMERIC(Get the designated column's specified column size<0) | DECIMAL(38, 18) |
+| BPCHAR CHARACTER VARCHAR TEXT GEOMETRY GEOGRAPHY JSON JSONB | STRING |
+| _BPCHAR _CHARACTER _VARCHAR _TEXT | ARRAY<STRING> |
+| TIMESTAMP | TIMESTAMP |
+| TIME | TIME |
+| DATE | DATE |
+| OTHER DATA TYPES | NOT SUPPORTED YET |
## Options
@@ -120,6 +120,10 @@ sink {
> Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table
```
+env {
+ execution.parallelism = 10
+ job.mode = "BATCH"
+}
source{
jdbc{
url = "jdbc:postgresql://localhost:5432/test"
@@ -131,6 +135,9 @@ source{
partition_num = 5
}
}
+sink {
+ Console {}
+}
```
### Parallel Boundary:
diff --git a/docs/en/connector-v2/source/S3File.md b/docs/en/connector-v2/source/S3File.md
index f7ad1cc8bd0f..54124a370382 100644
--- a/docs/en/connector-v2/source/S3File.md
+++ b/docs/en/connector-v2/source/S3File.md
@@ -1,22 +1,14 @@
# S3File
-> S3 file source connector
+> S3 File Source Connector
-## Description
-
-Read data from aws s3 file system.
-
-:::tip
-
-If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x.
-
-If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this.
+## Support Those Engines
-To use this connector you need put hadoop-aws-3.1.4.jar and aws-java-sdk-bundle-1.11.271.jar in ${SEATUNNEL_HOME}/lib dir.
+> Spark
+> Flink
+> SeaTunnel Zeta
-:::
-
-## Key features
+## Key Features
- [x] [batch](../../concept/connector-v2-features.md)
- [ ] [stream](../../concept/connector-v2-features.md)
@@ -35,104 +27,31 @@ Read all the data in a split in a pollNext call. What splits are read will be sa
- [x] json
- [x] excel
-## Options
-
-| name | type | required | default value |
-|---------------------------------|---------|----------|-------------------------------------------------------|
-| path | string | yes | - |
-| file_format_type | string | yes | - |
-| bucket | string | yes | - |
-| fs.s3a.endpoint | string | yes | - |
-| fs.s3a.aws.credentials.provider | string | yes | com.amazonaws.auth.InstanceProfileCredentialsProvider |
-| read_columns | list | no | - |
-| access_key | string | no | - |
-| access_secret | string | no | - |
-| hadoop_s3_properties | map | no | - |
-| delimiter | string | no | \001 |
-| parse_partition_from_path | boolean | no | true |
-| date_format | string | no | yyyy-MM-dd |
-| datetime_format | string | no | yyyy-MM-dd HH:mm:ss |
-| time_format | string | no | HH:mm:ss |
-| skip_header_row_number | long | no | 0 |
-| schema | config | no | - |
-| common-options | | no | - |
-| sheet_name | string | no | - |
-| file_filter_pattern | string | no | - |
-
-### path [string]
-
-The source file path.
-
-### fs.s3a.endpoint [string]
-
-fs s3a endpoint
-
-### fs.s3a.aws.credentials.provider [string]
-
-The way to authenticate s3a. We only support `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` now.
-
-More information about the credential provider you can see [Hadoop AWS Document](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Simple_name.2Fsecret_credentials_with_SimpleAWSCredentialsProvider.2A)
-
-### delimiter [string]
-
-Field delimiter, used to tell connector how to slice and dice fields when reading text files
-
-default `\001`, the same as hive's default delimiter
-
-### parse_partition_from_path [boolean]
-
-Control whether parse the partition keys and values from file path
-
-For example if you read a file from path `s3n://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`
-
-Every record data from file will be added these two fields:
-
-| name | age |
-|---------------|-----|
-| tyrantlucifer | 26 |
-
-Tips: **Do not define partition fields in schema option**
-
-### date_format [string]
-
-Date type format, used to tell connector how to convert string to date, supported as the following formats:
-
-`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`
-
-default `yyyy-MM-dd`
-
-### datetime_format [string]
-
-Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:
-
-`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss`
-
-default `yyyy-MM-dd HH:mm:ss`
-
-### time_format [string]
-
-Time type format, used to tell connector how to convert string to time, supported as the following formats:
-
-`HH:mm:ss` `HH:mm:ss.SSS`
-
-default `HH:mm:ss`
+## Description
-### skip_header_row_number [long]
+Read data from aws s3 file system.
-Skip the first few lines, but only for the txt and csv.
+## Supported DataSource Info
-For example, set like following:
+| Datasource | Supported versions |
+|------------|--------------------|
+| S3 | current |
-`skip_header_row_number = 2`
+## Dependency
-then SeaTunnel will skip the first 2 lines from source files
+> If you use Spark/Flink, in order to use this connector, you must ensure your Spark/Flink cluster has already integrated Hadoop. The tested Hadoop version is 2.x.
+>
+> If you use SeaTunnel Zeta, the Hadoop jar is integrated automatically when you download and install SeaTunnel Zeta. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this.
+> To use this connector you need to put hadoop-aws-3.1.4.jar and aws-java-sdk-bundle-1.11.271.jar in the ${SEATUNNEL_HOME}/lib dir.
-### file_format_type [string]
+## Data Type Mapping
-File type, supported as the following file types:
+Data type mapping depends on the type of file being read. The following file types are supported:
`text` `csv` `parquet` `orc` `json` `excel`
+### JSON File Type
+
If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want.
For example:
@@ -174,7 +93,7 @@ connector will generate data as the following:
|------|-------------|---------|
| 200 | get success | true |
-If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically.
+### Text Or CSV File Type
If you assign file type to `text` `csv`, you can choose to specify the schema information or not.
@@ -215,61 +134,102 @@ connector will generate data as the following:
|---------------|-----|--------|
| tyrantlucifer | 26 | male |
-### bucket [string]
-
-The bucket address of s3 file system, for example: `s3n://seatunnel-test`, if you use `s3a` protocol, this parameter should be `s3a://seatunnel-test`.
-
-### access_key [string]
-
-The access key of s3 file system. If this parameter is not set, please confirm that the credential provider chain can be authenticated correctly, you could check this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html)
-
-### access_secret [string]
-
-The access secret of s3 file system. If this parameter is not set, please confirm that the credential provider chain can be authenticated correctly, you could check this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html)
+### Orc File Type
-### hadoop_s3_properties [map]
-
-If you need to add a other option, you could add it here and refer to this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html)
-
-```
-hadoop_s3_properties {
- "xxx" = "xxx"
- }
-```
-
-### schema [config]
-
-#### fields [Config]
-
-The schema of upstream data.
-
-### read_columns [list]
-
-The read column list of the data source, user can use it to implement field projection.
-
-The file type supported column projection as the following shown:
-
-- text
-- json
-- csv
-- orc
-- parquet
-- excel
+If you assign file type to `parquet` or `orc`, the schema option is not required; the connector can find the schema of upstream data automatically.
-**Tips: If the user wants to use this feature when reading `text` `json` `csv` files, the schema option must be configured**
+| Orc Data type | SeaTunnel Data type |
+|----------------------------------|----------------------------------------------------------------|
+| BOOLEAN | BOOLEAN |
+| INT | INT |
+| BYTE | BYTE |
+| SHORT | SHORT |
+| LONG | LONG |
+| FLOAT | FLOAT |
+| DOUBLE | DOUBLE |
+| BINARY | BINARY |
+| STRING VARCHAR CHAR | STRING |
+| DATE | LOCAL_DATE_TYPE |
+| TIMESTAMP | LOCAL_DATE_TIME_TYPE |
+| DECIMAL | DECIMAL |
+| LIST(STRING) | STRING_ARRAY_TYPE |
+| LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE |
+| LIST(TINYINT) | BYTE_ARRAY_TYPE |
+| LIST(SMALLINT) | SHORT_ARRAY_TYPE |
+| LIST(INT) | INT_ARRAY_TYPE |
+| LIST(BIGINT) | LONG_ARRAY_TYPE |
+| LIST(FLOAT) | FLOAT_ARRAY_TYPE |
+| LIST(DOUBLE) | DOUBLE_ARRAY_TYPE |
+| Map | MapType, This type of K and V will transform to SeaTunnel type |
+| STRUCT | SeaTunnelRowType |
+
+### Parquet File Type
-### common options
+If you assign file type to `parquet` or `orc`, the schema option is not required; the connector can find the schema of upstream data automatically.
-Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details.
+| Parquet Data type | SeaTunnel Data type |
+|----------------------|----------------------------------------------------------------|
+| INT_8 | BYTE |
+| INT_16 | SHORT |
+| DATE | DATE |
+| TIMESTAMP_MILLIS | TIMESTAMP |
+| INT64 | LONG |
+| INT96 | TIMESTAMP |
+| BINARY | BYTES |
+| FLOAT | FLOAT |
+| DOUBLE | DOUBLE |
+| BOOLEAN | BOOLEAN |
+| FIXED_LEN_BYTE_ARRAY | TIMESTAMP DECIMAL |
+| DECIMAL | DECIMAL |
+| LIST(STRING) | STRING_ARRAY_TYPE |
+| LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE |
+| LIST(TINYINT) | BYTE_ARRAY_TYPE |
+| LIST(SMALLINT) | SHORT_ARRAY_TYPE |
+| LIST(INT) | INT_ARRAY_TYPE |
+| LIST(BIGINT) | LONG_ARRAY_TYPE |
+| LIST(FLOAT) | FLOAT_ARRAY_TYPE |
+| LIST(DOUBLE) | DOUBLE_ARRAY_TYPE |
+| Map | MapType, This type of K and V will transform to SeaTunnel type |
+| STRUCT | SeaTunnelRowType |
-### sheet_name [string]
+## Options
-Reader the sheet of the workbook,Only used when file_format_type is excel.
+| name | type | required | default value | Description |
+|---------------------------------|---------|----------|-------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| path | string | yes | - | The s3 path that needs to be read can have sub paths, but the sub paths need to meet certain format requirements. Specific requirements can be referred to "parse_partition_from_path" option |
+| file_format_type | string | yes | - | File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` |
+| bucket | string | yes | - | The bucket address of s3 file system, for example: `s3n://seatunnel-test`, if you use `s3a` protocol, this parameter should be `s3a://seatunnel-test`. |
+| fs.s3a.endpoint | string | yes | - | fs s3a endpoint |
+| fs.s3a.aws.credentials.provider | string | yes | com.amazonaws.auth.InstanceProfileCredentialsProvider | The way to authenticate s3a. We only support `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` now. More information about the credential provider you can see [Hadoop AWS Document](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Simple_name.2Fsecret_credentials_with_SimpleAWSCredentialsProvider.2A) |
+| read_columns | list | no | - | The list of columns to read from the data source; users can use it to implement field projection. The following file types support column projection: `text` `csv` `parquet` `orc` `json` `excel`. If the user wants to use this feature when reading `text` `json` `csv` files, the "schema" option must be configured. |
+| access_key | string | no | - | Only used when `fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider ` |
+| access_secret | string | no | - | Only used when `fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider ` |
+| hadoop_s3_properties | map | no | - | If you need to add other option, you could add it here and refer to this [link](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) |
+| delimiter | string | no | \001 | Field delimiter, used to tell connector how to slice and dice fields when reading text files. Default `\001`, the same as hive's default delimiter. |
+| parse_partition_from_path | boolean | no | true | Control whether to parse the partition keys and values from the file path. For example, if you read a file from path `s3n://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`, every record read from the file will have these two fields added: name="tyrantlucifer", age=26 |
+| date_format | string | no | yyyy-MM-dd | Date type format, used to tell connector how to convert string to date, supported as the following formats:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`. default `yyyy-MM-dd` |
+| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` |
+| time_format | string | no | HH:mm:ss | Time type format, used to tell connector how to convert string to time, supported as the following formats:`HH:mm:ss` `HH:mm:ss.SSS` |
+| skip_header_row_number | long | no | 0 | Skip the first few lines, but only for the txt and csv. For example, set like following:`skip_header_row_number = 2`. Then SeaTunnel will skip the first 2 lines from source files |
+| schema | config | no | - | The schema of upstream data. |
+| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. |
+| sheet_name | string | no | - | The sheet of the workbook to read. Only used when file_format_type is excel. |
## Example
-```hocon
+1. In this example, we read data from the s3 path `s3a://seatunnel-test/seatunnel/text`, and the file type in this path is orc.
+ We use `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` for authentication, so `access_key` and `secret_key` are required.
+ All columns in the file will be read and sent to the sink.
+
+```
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
+source {
S3File {
path = "/seatunnel/text"
fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn"
@@ -279,9 +239,21 @@ Reader the sheet of the workbook,Only used when file_format_type is excel.
bucket = "s3a://seatunnel-test"
file_format_type = "orc"
}
+}
+
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/category/transform-v2
+}
+sink {
+ Console {}
+}
```
+2. Use `InstanceProfileCredentialsProvider` for authentication.
+ The file type in S3 is json, so the schema option needs to be configured.
+
```hocon
S3File {
@@ -300,9 +272,47 @@ Reader the sheet of the workbook,Only used when file_format_type is excel.
```
-### file_filter_pattern [string]
+3. Use `InstanceProfileCredentialsProvider` for authentication.
+ The file type in S3 is json and has five fields (`id`, `name`, `age`, `sex`, `type`), so the schema option needs to be configured.
+ In this job, we only need to send the `id` and `name` columns to the sink.
-Filter pattern, which used for filtering files.
+```
+# Defining the runtime environment
+env {
+ # You can set flink configuration here
+ execution.parallelism = 1
+ job.mode = "BATCH"
+}
+
+source {
+ S3File {
+ path = "/seatunnel/json"
+ bucket = "s3a://seatunnel-test"
+ fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn"
+ fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider"
+ file_format_type = "json"
+ read_columns = ["id", "name"]
+ schema {
+ fields {
+ id = int
+ name = string
+ age = int
+ sex = int
+ type = string
+ }
+ }
+ }
+}
+
+transform {
+ # If you would like to get more information about how to configure seatunnel and see full list of transform plugins,
+ # please go to https://seatunnel.apache.org/docs/category/transform-v2
+}
+
+sink {
+ Console {}
+}
+```
## Changelog
diff --git a/docs/en/connector-v2/source/Snowflake.md b/docs/en/connector-v2/source/Snowflake.md
index cd824eab4635..a7835013d58a 100644
--- a/docs/en/connector-v2/source/Snowflake.md
+++ b/docs/en/connector-v2/source/Snowflake.md
@@ -56,20 +56,20 @@ Read external data source data through JDBC.
## Options
-| name | type | required | default | description |
-|------------------------------|--------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:snowflake://.snowflakecomputing.com |
-| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, if you use Snowflake the value is `net.snowflake.client.jdbc.SnowflakeDriver`. |
-| user | String | No | - | Connection instance user name |
-| password | String | No | - | Connection instance password |
-| query | String | Yes | - | Query statement |
-| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete |
-| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. |
-| partition_lower_bound | Long | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. |
-| partition_upper_bound | Long | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. |
-| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism |
-| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure the row fetch size used in the query toimprove performance by reducing the number database hits required to satisfy the selection criteria. Zero means use jdbc default value. |
-| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
+| name | type | required | default | description |
+|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:snowflake://.snowflakecomputing.com |
+| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, if you use Snowflake the value is `net.snowflake.client.jdbc.SnowflakeDriver`. |
+| user | String | No | - | Connection instance user name |
+| password | String | No | - | Connection instance password |
+| query | String | Yes | - | Query statement |
+| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete |
+| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. |
+| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. |
+| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. |
+| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism |
+| fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure the row fetch size used in the query to improve performance by reducing the number of database hits required to satisfy the selection criteria. Zero means use the jdbc default value. |
+| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
## tips
diff --git a/docs/en/connector-v2/source/Vertica.md b/docs/en/connector-v2/source/Vertica.md
index 66f18e7a4ed4..df387ac30bf0 100644
--- a/docs/en/connector-v2/source/Vertica.md
+++ b/docs/en/connector-v2/source/Vertica.md
@@ -56,20 +56,20 @@ Read external data source data through JDBC.
## Source Options
-| Name | Type | Required | Default | Description |
-|------------------------------|--------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:vertica://localhost:5433/vertica |
-| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, if you use Vertica the value is `com.vertica.jdbc.Driver`. |
-| user | String | No | - | Connection instance user name |
-| password | String | No | - | Connection instance password |
-| query | String | Yes | - | Query statement |
-| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete |
-| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. |
-| partition_lower_bound | Long | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. |
-| partition_upper_bound | Long | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. |
-| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism |
-| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure the row fetch size used in the query toimprove performance by reducing the number database hits required to satisfy the selection criteria. Zero means use jdbc default value. |
-| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
+| Name | Type | Required | Default | Description |
+|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:vertica://localhost:5433/vertica |
+| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, if you use Vertica the value is `com.vertica.jdbc.Driver`. |
+| user | String | No | - | Connection instance user name |
+| password | String | No | - | Connection instance password |
+| query | String | Yes | - | Query statement |
+| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete |
+| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. |
+| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. |
+| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. |
+| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism |
+| fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure the row fetch size used in the query to improve performance by reducing the number of database hits required to satisfy the selection criteria. Zero means use the jdbc default value. |
+| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details |
### Tips
diff --git a/docs/en/seatunnel-engine/checkpoint-storage.md b/docs/en/seatunnel-engine/checkpoint-storage.md
index a88f301439e4..f2a6487f28d2 100644
--- a/docs/en/seatunnel-engine/checkpoint-storage.md
+++ b/docs/en/seatunnel-engine/checkpoint-storage.md
@@ -59,8 +59,6 @@ seatunnel:
checkpoint:
interval: 6000
timeout: 7000
- max-concurrent: 1
- tolerable-failure: 2
storage:
type: hdfs
max-retained: 3
@@ -94,8 +92,6 @@ seatunnel:
checkpoint:
interval: 6000
timeout: 7000
- max-concurrent: 1
- tolerable-failure: 2
storage:
type: hdfs
max-retained: 3
@@ -119,8 +115,6 @@ seatunnel:
checkpoint:
interval: 6000
timeout: 7000
- max-concurrent: 1
- tolerable-failure: 2
storage:
type: hdfs
max-retained: 3
@@ -152,6 +146,28 @@ seatunnel:
kerberosKeytab: your-kerberos-keytab
```
+If HDFS is in HA mode, you can configure it like this:
+
+```yaml
+seatunnel:
+ engine:
+ checkpoint:
+ storage:
+ type: hdfs
+ max-retained: 3
+ plugin-config:
+ storage.type: hdfs
+ fs.defaultFS: hdfs://usdp-bing
+ seatunnel.hadoop.dfs.nameservices: usdp-bing
+ seatunnel.hadoop.dfs.ha.namenodes.usdp-bing: nn1,nn2
+ seatunnel.hadoop.dfs.namenode.rpc-address.usdp-bing.nn1: usdp-bing-nn1:8020
+ seatunnel.hadoop.dfs.namenode.rpc-address.usdp-bing.nn2: usdp-bing-nn2:8020
+ seatunnel.hadoop.dfs.client.failover.proxy.provider.usdp-bing: org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
+
+```
+
+If HDFS has other configs in `hdfs-site.xml` or `core-site.xml`, just set them by using the `seatunnel.hadoop.` prefix.
+
#### LocalFile
```yaml
@@ -160,8 +176,6 @@ seatunnel:
checkpoint:
interval: 6000
timeout: 7000
- max-concurrent: 1
- tolerable-failure: 2
storage:
type: hdfs
max-retained: 3
diff --git a/docs/en/seatunnel-engine/deployment.md b/docs/en/seatunnel-engine/deployment.md
index c07cd45d6b1a..18c1a587a2a3 100644
--- a/docs/en/seatunnel-engine/deployment.md
+++ b/docs/en/seatunnel-engine/deployment.md
@@ -75,14 +75,6 @@ The interval between two checkpoints, unit is milliseconds. If the `checkpoint.i
The timeout of a checkpoint. If a checkpoint cannot be completed within the timeout period, a checkpoint failure will be triggered. Therefore, Job will be restored.
-**max-concurrent**
-
-How many checkpoints can be performed simultaneously at most.
-
-**tolerable-failure**
-
-Maximum number of retries after checkpoint failure.
-
Example
```
@@ -95,14 +87,24 @@ seatunnel:
checkpoint:
interval: 300000
timeout: 10000
- max-concurrent: 1
- tolerable-failure: 2
```
**checkpoint storage**
About the checkpoint storage, you can see [checkpoint storage](checkpoint-storage.md)
+### 4.4 Historical Job expiration Config
+
+The information about each completed Job, such as status, counters, and error logs, is stored in the IMap object. As the number of running jobs increases, the memory increases and eventually the memory will overflow. Therefore, you can adjust the history-job-expire-minutes parameter to solve this problem. The time unit of this parameter is minute. The default value is 1440 minutes, that is, one day.
+
+Example
+
+```
+seatunnel:
+ engine:
+ history-job-expire-minutes: 1440
+```
+
## 5. Config SeaTunnel Engine Server
All SeaTunnel Engine Server config in `hazelcast.yaml` file.
diff --git a/docs/en/seatunnel-engine/rest-api.md b/docs/en/seatunnel-engine/rest-api.md
index 2edec3496adb..2f44421a3d60 100644
--- a/docs/en/seatunnel-engine/rest-api.md
+++ b/docs/en/seatunnel-engine/rest-api.md
@@ -180,3 +180,61 @@ network:
------------------------------------------------------------------------------------------
+### Submit Job.
+
+
+`POST` `/hazelcast/rest/maps/submit-job` (Returns jobId and jobName if the job is submitted successfully.)
+
+#### Parameters
+
+> | name | type | data type | description |
+> |----------------------|----------|-----------|-----------------------------------|
+> | jobId | optional | string | job id |
+> | jobName | optional | string | job name |
+> | isStartWithSavePoint | optional | string | if job is started with save point |
+
+#### Body
+
+```json
+{
+ "env": {
+ "job.mode": "batch"
+ },
+ "source": [
+ {
+ "plugin_name": "FakeSource",
+ "result_table_name": "fake",
+ "row.num": 100,
+ "schema": {
+ "fields": {
+ "name": "string",
+ "age": "int",
+ "card": "int"
+ }
+ }
+ }
+ ],
+ "transform": [
+ ],
+ "sink": [
+ {
+ "plugin_name": "Console",
+ "source_table_name": ["fake"]
+ }
+ ]
+}
+```
+
+#### Responses
+
+```json
+{
+ "jobId": 733584788375666689,
+ "jobName": "rest_api_test"
+}
+```
+
+
+
+------------------------------------------------------------------------------------------
+
diff --git a/docs/en/start-v2/locally/deployment.md b/docs/en/start-v2/locally/deployment.md
index 6f82a9d84890..1e5c0f9ed620 100644
--- a/docs/en/start-v2/locally/deployment.md
+++ b/docs/en/start-v2/locally/deployment.md
@@ -23,8 +23,8 @@ Or you can download it by terminal
```shell
export version="2.3.2"
-wget "https://archive.apache.org/dist/seatunnel/${version}/apache-seatunnel-incubating-${version}-bin.tar.gz"
-tar -xzvf "apache-seatunnel-incubating-${version}-bin.tar.gz"
+wget "https://archive.apache.org/dist/seatunnel/${version}/apache-seatunnel-${version}-bin.tar.gz"
+tar -xzvf "apache-seatunnel-${version}-bin.tar.gz"
```
diff --git a/docs/en/start-v2/locally/quick-start-flink.md b/docs/en/start-v2/locally/quick-start-flink.md
index cf01a0fccdb0..9fd9f1eb43d5 100644
--- a/docs/en/start-v2/locally/quick-start-flink.md
+++ b/docs/en/start-v2/locally/quick-start-flink.md
@@ -68,14 +68,14 @@ You could start the application by the following commands
flink version between `1.12.x` and `1.14.x`
```shell
-cd "apache-seatunnel-incubating-${version}"
+cd "apache-seatunnel-${version}"
./bin/start-seatunnel-flink-13-connector-v2.sh --config ./config/v2.streaming.conf.template
```
flink version between `1.15.x` and `1.16.x`
```shell
-cd "apache-seatunnel-incubating-${version}"
+cd "apache-seatunnel-${version}"
./bin/start-seatunnel-flink-15-connector-v2.sh --config ./config/v2.streaming.conf.template
```
diff --git a/docs/en/start-v2/locally/quick-start-seatunnel-engine.md b/docs/en/start-v2/locally/quick-start-seatunnel-engine.md
index db998897027f..f469c570e3a6 100644
--- a/docs/en/start-v2/locally/quick-start-seatunnel-engine.md
+++ b/docs/en/start-v2/locally/quick-start-seatunnel-engine.md
@@ -59,7 +59,7 @@ More information about config please check [config concept](../../concept/config
You could start the application by the following commands
```shell
-cd "apache-seatunnel-incubating-${version}"
+cd "apache-seatunnel-${version}"
./bin/seatunnel.sh --config ./config/v2.batch.config.template -e local
```
diff --git a/docs/en/start-v2/locally/quick-start-spark.md b/docs/en/start-v2/locally/quick-start-spark.md
index 88aebd5aa439..903217c8ec14 100644
--- a/docs/en/start-v2/locally/quick-start-spark.md
+++ b/docs/en/start-v2/locally/quick-start-spark.md
@@ -69,7 +69,7 @@ You could start the application by the following commands
spark 2.4.x
```bash
-cd "apache-seatunnel-incubating-${version}"
+cd "apache-seatunnel-${version}"
./bin/start-seatunnel-spark-2-connector-v2.sh \
--master local[4] \
--deploy-mode client \
@@ -79,7 +79,7 @@ cd "apache-seatunnel-incubating-${version}"
spark3.x.x
```shell
-cd "apache-seatunnel-incubating-${version}"
+cd "apache-seatunnel-${version}"
./bin/start-seatunnel-spark-3-connector-v2.sh \
--master local[4] \
--deploy-mode client \
diff --git a/docs/en/transform-v2/sql-udf.md b/docs/en/transform-v2/sql-udf.md
index 143044f5a797..ede3ef9ab4a8 100644
--- a/docs/en/transform-v2/sql-udf.md
+++ b/docs/en/transform-v2/sql-udf.md
@@ -39,16 +39,31 @@ public interface ZetaUDF {
## UDF Implements Example
-Add the dependency of transform-v2 and provided scope to your maven project:
+Add these dependencies with `provided` scope to your Maven project:
```xml
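-<dependency>
-  <groupId>org.apache.seatunnel</groupId>
-  <artifactId>seatunnel-transforms-v2</artifactId>
-  <version>2.3.x</version>
-  <scope>provided</scope>
-</dependency>
+<dependencies>
+  <dependency>
+    <groupId>org.apache.seatunnel</groupId>
+    <artifactId>seatunnel-transforms-v2</artifactId>
+    <version>2.3.2</version>
+    <scope>provided</scope>
+  </dependency>
+  <dependency>
+    <groupId>org.apache.seatunnel</groupId>
+    <artifactId>seatunnel-api</artifactId>
+    <version>2.3.2</version>
+    <scope>provided</scope>
+  </dependency>
+  <dependency>
+    <groupId>com.google.auto.service</groupId>
+    <artifactId>auto-service</artifactId>
+    <version>1.0.1</version>
+    <scope>provided</scope>
+  </dependency>
+</dependencies>
+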
```
Add a Java Class implements of ZetaUDF like this:
diff --git a/pom.xml b/pom.xml
index 3d619644952b..224a339fc6e4 100644
--- a/pom.xml
+++ b/pom.xml
@@ -768,6 +768,9 @@
${spotless.version}
+
+ src/main/java/org/apache/seatunnel/antlr4/generated/*.*
+ 1.7
diff --git a/release-note.md b/release-note.md
index 9ade9c614309..61664d773f48 100644
--- a/release-note.md
+++ b/release-note.md
@@ -15,6 +15,7 @@
### Connectors
- [Elasticsearch] Support https protocol & compatible with opensearch
- [Hbase] Add hbase sink connector #4049
+- [Clickhouse] Fix clickhouse old version compatibility #5326
### Formats
- [Canal]Support read canal format message #3950
- [Debezium]Support debezium canal format message #3981
@@ -81,9 +82,12 @@
- [E2E] [Kafka] Fix kafka e2e testcase (#4520)
- [Container Version] Fix risk of unreproducible test cases #4591
- [E2e] [Mysql-cdc] Removing the excess MySqlIncrementalSourceIT e2e reduces the CI time (#4738)
+- [E2E] [Common] Update test container version of seatunnel engine (#5323)
## Improve
+- [Improve][Connector-V2][Jdbc-Source] Support for Decimal types as split keys (#4634)
+
### Core
- [Core] [Spark] Push transform operation from Spark Driver to Executors (#4503)
@@ -113,6 +117,7 @@
### CI
- [CI] Fix error repository name in ci config files (#4795)
+- [CI][E2E][Zeta] Increase Zeta checkpoint timeout to avoid frequent connector-file-sftp-e2e failures (#5339)
### Zeta(ST-Engine)
@@ -154,6 +159,7 @@
- [Connector-V2] [Paimon] Introduce paimon connector (#4178)
- [Connector V2] [Cassandra] Expose configurable options in Cassandra (#3681)
- [Connector V2] [Jdbc] Supports GEOMETRY data type for PostgreSQL (#4673)
+- [Connector V2] [Jdbc] Supports Kingbase database (#4803)
- [Transform-V2] Add UDF SPI and an example implement for SQL Transform plugin (#4392)
- [Transform-V2] Support copy field list (#4404)
- [Transform-V2] Add support CatalogTable for FieldMapperTransform (#4423)
@@ -190,3 +196,4 @@
- [Docs] Redshift add defaultRowFetchSize (#4616)
- [Docs] Refactor connector-v2 docs using unified format Mysql (#4590)
- [Docs] Add Value types in Java to Schema features (#5087)
+- [Docs] Replace username by user in the options of FtpFile (#5421)
\ No newline at end of file
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/JobMetrics.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/JobMetrics.java
index 0149ad26497f..d39e8b96640e 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/JobMetrics.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/JobMetrics.java
@@ -26,9 +26,11 @@
import java.io.Serializable;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
@@ -72,6 +74,10 @@ public JobMetrics merge(JobMetrics jobMetrics) {
}
Map> metricsMap = new HashMap<>();
metrics.forEach((key, value) -> metricsMap.put(key, new ArrayList<>(value)));
+ // If a job is restarted, the node it runs on might change, so the node information
+ // (member and address tags) is excluded when comparing measurement tags.
+ Set keysToExclude =
+ new HashSet<>(Arrays.asList(MetricTags.MEMBER, MetricTags.ADDRESS));
jobMetrics.metrics.forEach(
(key, value) ->
metricsMap.merge(
@@ -82,7 +88,11 @@ public JobMetrics merge(JobMetrics jobMetrics) {
for (Measurement m1 : v1) {
if (v2.stream()
.noneMatch(
- m2 -> m2.getTags().equals(m1.getTags()))) {
+ m2 ->
+ areMapsEqualExcludingKeys(
+ m2.getTags(),
+ m1.getTags(),
+ keysToExclude))) {
ms.add(m1);
}
}
@@ -91,6 +101,40 @@ public JobMetrics merge(JobMetrics jobMetrics) {
return new JobMetrics(metricsMap);
}
+ /**
+ * Compares two Map objects excluding certain keys.
+ *
+ * @param map1 the first map
+ * @param map2 the second map
+ * @param keysToExclude the keys to be excluded during comparison
+ * @return true if the maps are equal excluding the specific keys, false otherwise
+ */
+ public static boolean areMapsEqualExcludingKeys(
+ Map map1, Map map2, Set keysToExclude) {
+ // Return false if either of the maps is null
+ if (map1 == null || map2 == null) {
+ return false;
+ }
+
+ // Sizes are compared before exclusion, so both maps must hold the same number of entries
+ if (map1.size() != map2.size()) {
+ return false;
+ }
+
+ // Create copies of the maps to avoid modifying the original maps
+ Map map1Copy = new HashMap<>(map1);
+ Map map2Copy = new HashMap<>(map2);
+
+ // Remove specific keys from the copies
+ for (String key : keysToExclude) {
+ map1Copy.remove(key);
+ map2Copy.remove(key);
+ }
+
+ // Return whether the copies are equal
+ return map1Copy.equals(map2Copy);
+ }
+
/** Returns all metrics present. */
public Set metrics() {
return Collections.unmodifiableSet(metrics.keySet());
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/ThreadSafeQPSMeter.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/ThreadSafeQPSMeter.java
index 627e9bd4ca7e..11a4376d7cd7 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/ThreadSafeQPSMeter.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/common/metrics/ThreadSafeQPSMeter.java
@@ -48,7 +48,6 @@ public void markEvent(long n) {
VOLATILE_VALUE_UPDATER.addAndGet(this, n);
}
- @SuppressWarnings("checkstyle:MagicNumber")
@Override
public double getRate() {
long cost = System.currentTimeMillis() - timestamp;
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/Options.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/Options.java
index a4ce408d73b0..72057aef5f52 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/Options.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/Options.java
@@ -30,7 +30,7 @@
import java.util.List;
import java.util.Map;
-import static com.google.common.base.Preconditions.checkArgument;
+import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument;
public class Options {
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/env/EnvCommonOptions.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/env/EnvCommonOptions.java
index bc80c6642889..d076cd5367bf 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/env/EnvCommonOptions.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/env/EnvCommonOptions.java
@@ -51,6 +51,12 @@ public interface EnvCommonOptions {
.withDescription(
"The interval (in milliseconds) between two consecutive checkpoints.");
+ Option CHECKPOINT_TIMEOUT =
+ Options.key("checkpoint.timeout")
+ .longType()
+ .noDefaultValue()
+ .withDescription("The timeout (in milliseconds) for a checkpoint.");
+
Option JARS =
Options.key("jars")
.stringType()
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/env/EnvOptionRule.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/env/EnvOptionRule.java
index 3a90b82e83bc..09310f080c53 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/env/EnvOptionRule.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/env/EnvOptionRule.java
@@ -30,6 +30,7 @@ public static OptionRule getEnvOptionRules() {
CommonOptions.PARALLELISM,
EnvCommonOptions.JARS,
EnvCommonOptions.CHECKPOINT_INTERVAL,
+ EnvCommonOptions.CHECKPOINT_TIMEOUT,
EnvCommonOptions.CUSTOM_PARAMETERS)
.build();
}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/serialization/DefaultSerializer.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/serialization/DefaultSerializer.java
index 2100b9529cdc..5fabe2a284a9 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/serialization/DefaultSerializer.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/serialization/DefaultSerializer.java
@@ -35,6 +35,9 @@ public byte[] serialize(T obj) throws IOException {
@Override
public T deserialize(byte[] serialized) throws IOException {
+ if (serialized == null) {
+ return null;
+ }
return SerializationUtils.deserialize(serialized);
}
}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SinkWriter.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SinkWriter.java
index c0fbe2c0299c..3b1e715ebee7 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SinkWriter.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/sink/SinkWriter.java
@@ -18,6 +18,7 @@
package org.apache.seatunnel.api.sink;
import org.apache.seatunnel.api.common.metrics.MetricsContext;
+import org.apache.seatunnel.api.table.event.SchemaChangeEvent;
import java.io.IOException;
import java.io.Serializable;
@@ -44,6 +45,14 @@ public interface SinkWriter {
*/
void write(T element) throws IOException;
+ /**
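+ * Applies a schema change to the third-party data receiver; does nothing by default.
+ *
+ * @param event the schema change event to apply
+ * @throws IOException if the schema change cannot be applied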
+ */
+ default void applySchemaChange(SchemaChangeEvent event) throws IOException {}
+
/**
* prepare the commit, will be called before {@link #snapshotState(long checkpointId)}. If you
* need to use 2pc, you can return the commit info in this method, and receive the commit info
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/source/Collector.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/source/Collector.java
index 0b924bb570a9..85435880c63b 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/source/Collector.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/source/Collector.java
@@ -17,6 +17,8 @@
package org.apache.seatunnel.api.source;
+import org.apache.seatunnel.api.table.event.SchemaChangeEvent;
+
/**
* A {@link Collector} is used to collect data from {@link SourceReader}.
*
@@ -26,6 +28,12 @@ public interface Collector {
void collect(T record);
+ default void markSchemaChangeBeforeCheckpoint() {}
+
+ default void collect(SchemaChangeEvent event) {}
+
+ default void markSchemaChangeAfterCheckpoint() {}
+
/**
* Returns the checkpoint lock.
*
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/TablePath.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/TablePath.java
index 7b2dd6d5533a..358e873b9915 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/TablePath.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/catalog/TablePath.java
@@ -22,6 +22,8 @@
import lombok.RequiredArgsConstructor;
import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
@Getter
@EqualsAndHashCode
@@ -54,14 +56,15 @@ public static TablePath of(String databaseName, String schemaName, String tableN
}
public String getSchemaAndTableName() {
- return String.format("%s.%s", schemaName, tableName);
+ return getNameCommon(null, schemaName, tableName, null, null);
+ }
+
+ public String getSchemaAndTableName(String quote) {
+ return getNameCommon(null, schemaName, tableName, quote, quote);
}
public String getFullName() {
- if (schemaName == null) {
- return String.format("%s.%s", databaseName, tableName);
- }
- return String.format("%s.%s.%s", databaseName, schemaName, tableName);
+ return getNameCommon(databaseName, schemaName, tableName, null, null);
}
public String getFullNameWithQuoted() {
@@ -69,13 +72,36 @@ public String getFullNameWithQuoted() {
}
public String getFullNameWithQuoted(String quote) {
- if (schemaName == null) {
- return String.format(
- "%s%s%s.%s%s%s", quote, databaseName, quote, quote, tableName, quote);
+ return getNameCommon(databaseName, schemaName, tableName, quote, quote);
+ }
+
+ public String getFullNameWithQuoted(String quoteLeft, String quoteRight) {
+ return getNameCommon(databaseName, schemaName, tableName, quoteLeft, quoteRight);
+ }
+
+ private String getNameCommon(
+ String databaseName,
+ String schemaName,
+ String tableName,
+ String quoteLeft,
+ String quoteRight) {
+ List joinList = new ArrayList<>();
+ quoteLeft = quoteLeft == null ? "" : quoteLeft;
+ quoteRight = quoteRight == null ? "" : quoteRight;
+
+ if (databaseName != null) {
+ joinList.add(quoteLeft + databaseName + quoteRight);
+ }
+
+ if (schemaName != null) {
+ joinList.add(quoteLeft + schemaName + quoteRight);
}
- return String.format(
- "%s%s%s.%s%s%s.%s%s%s",
- quote, databaseName, quote, quote, schemaName, quote, quote, tableName, quote);
+
+ if (tableName != null) {
+ joinList.add(quoteLeft + tableName + quoteRight);
+ }
+
+ return String.join(".", joinList);
}
@Override
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableAddColumnEvent.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableAddColumnEvent.java
new file mode 100644
index 000000000000..967452545265
--- /dev/null
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableAddColumnEvent.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.api.table.event;
+
+import org.apache.seatunnel.api.table.catalog.Column;
+import org.apache.seatunnel.api.table.catalog.TablePath;
+
+import lombok.Getter;
+import lombok.ToString;
+
+@Getter
+@ToString(callSuper = true)
+public class AlterTableAddColumnEvent extends AlterTableColumnEvent {
+ private final Column column;
+ private final boolean first;
+ private final String afterColumn;
+
+ public AlterTableAddColumnEvent(
+ TablePath tablePath, Column column, boolean first, String afterColumn) {
+ super(tablePath);
+ this.column = column;
+ this.first = first;
+ this.afterColumn = afterColumn;
+ }
+
+ public static AlterTableAddColumnEvent addFirst(TablePath tablePath, Column column) {
+ return new AlterTableAddColumnEvent(tablePath, column, true, null);
+ }
+
+ public static AlterTableAddColumnEvent add(TablePath tablePath, Column column) {
+ return new AlterTableAddColumnEvent(tablePath, column, false, null);
+ }
+
+ public static AlterTableAddColumnEvent addAfter(
+ TablePath tablePath, Column column, String afterColumn) {
+ return new AlterTableAddColumnEvent(tablePath, column, false, afterColumn);
+ }
+}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableChangeColumnEvent.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableChangeColumnEvent.java
new file mode 100644
index 000000000000..2a8ba71846c3
--- /dev/null
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableChangeColumnEvent.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.api.table.event;
+
+import org.apache.seatunnel.api.table.catalog.Column;
+import org.apache.seatunnel.api.table.catalog.TablePath;
+
+import lombok.Getter;
+import lombok.ToString;
+
+@Getter
+@ToString(callSuper = true)
+public class AlterTableChangeColumnEvent extends AlterTableAddColumnEvent {
+ private final String oldColumn;
+
+ public AlterTableChangeColumnEvent(
+ TablePath tablePath,
+ String oldColumn,
+ Column column,
+ boolean first,
+ String afterColumn) {
+ super(tablePath, column, first, afterColumn);
+ this.oldColumn = oldColumn;
+ }
+
+ public static AlterTableChangeColumnEvent changeFirst(
+ TablePath tablePath, String oldColumn, Column column) {
+ return new AlterTableChangeColumnEvent(tablePath, oldColumn, column, true, null);
+ }
+
+ public static AlterTableChangeColumnEvent change(
+ TablePath tablePath, String oldColumn, Column column) {
+ return new AlterTableChangeColumnEvent(tablePath, oldColumn, column, false, null);
+ }
+
+ public static AlterTableChangeColumnEvent changeAfter(
+ TablePath tablePath, String oldColumn, Column column, String afterColumn) {
+ return new AlterTableChangeColumnEvent(tablePath, oldColumn, column, false, afterColumn);
+ }
+}
diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/task/record/SchemaBarrier.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableColumnEvent.java
similarity index 69%
rename from seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/task/record/SchemaBarrier.java
rename to seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableColumnEvent.java
index 4268acd86726..a61dccc08d15 100644
--- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/task/record/SchemaBarrier.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableColumnEvent.java
@@ -15,22 +15,15 @@
* limitations under the License.
*/
-package org.apache.seatunnel.engine.server.task.record;
+package org.apache.seatunnel.api.table.event;
-/** Change the schema of the task and flow. */
-public class SchemaBarrier implements Barrier {
- @Override
- public long getId() {
- return -1;
- }
+import org.apache.seatunnel.api.table.catalog.TablePath;
- @Override
- public boolean snapshot() {
- return false;
- }
+import lombok.ToString;
- @Override
- public boolean prepareClose() {
- return false;
+@ToString(callSuper = true)
+public abstract class AlterTableColumnEvent extends AlterTableEvent {
+ public AlterTableColumnEvent(TablePath tablePath) {
+ super(tablePath);
}
}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableColumnsEvent.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableColumnsEvent.java
new file mode 100644
index 000000000000..eb81c67dd193
--- /dev/null
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableColumnsEvent.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.api.table.event;
+
+import org.apache.seatunnel.api.table.catalog.TablePath;
+
+import lombok.Getter;
+import lombok.ToString;
+
+import java.util.ArrayList;
+import java.util.List;
+
+@Getter
+@ToString(callSuper = true)
+public class AlterTableColumnsEvent extends AlterTableEvent {
+ private final List events;
+
+ public AlterTableColumnsEvent(TablePath tablePath) {
+ this(tablePath, new ArrayList<>());
+ }
+
+ public AlterTableColumnsEvent(TablePath tablePath, List events) {
+ super(tablePath);
+ this.events = events;
+ }
+
+ public AlterTableColumnsEvent addEvent(AlterTableColumnEvent event) {
+ events.add(event);
+ return this;
+ }
+}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableDropColumnEvent.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableDropColumnEvent.java
new file mode 100644
index 000000000000..3dbf5294594f
--- /dev/null
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableDropColumnEvent.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.api.table.event;
+
+import org.apache.seatunnel.api.table.catalog.TablePath;
+
+import lombok.Getter;
+import lombok.ToString;
+
+@Getter
+@ToString(callSuper = true)
+public class AlterTableDropColumnEvent extends AlterTableColumnEvent {
+ private final String column;
+
+ public AlterTableDropColumnEvent(TablePath tablePath, String column) {
+ super(tablePath);
+ this.column = column;
+ }
+}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableEvent.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableEvent.java
new file mode 100644
index 000000000000..0bf268dc210e
--- /dev/null
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableEvent.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.api.table.event;
+
+import org.apache.seatunnel.api.table.catalog.TablePath;
+
+import lombok.ToString;
+
+@ToString(callSuper = true)
+public abstract class AlterTableEvent extends TableEvent {
+ public AlterTableEvent(TablePath tablePath) {
+ super(tablePath);
+ }
+}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableModifyColumnEvent.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableModifyColumnEvent.java
new file mode 100644
index 000000000000..97be83f719eb
--- /dev/null
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableModifyColumnEvent.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.api.table.event;
+
+import org.apache.seatunnel.api.table.catalog.Column;
+import org.apache.seatunnel.api.table.catalog.TablePath;
+
+import lombok.Getter;
+import lombok.ToString;
+
+@Getter
+@ToString(callSuper = true)
+public class AlterTableModifyColumnEvent extends AlterTableAddColumnEvent {
+ public AlterTableModifyColumnEvent(
+ TablePath tablePath, Column column, boolean first, String afterColumn) {
+ super(tablePath, column, first, afterColumn);
+ }
+
+ public static AlterTableModifyColumnEvent modifyFirst(TablePath tablePath, Column column) {
+ return new AlterTableModifyColumnEvent(tablePath, column, true, null);
+ }
+
+ public static AlterTableModifyColumnEvent modify(TablePath tablePath, Column column) {
+ return new AlterTableModifyColumnEvent(tablePath, column, false, null);
+ }
+
+ public static AlterTableModifyColumnEvent modifyAfter(
+ TablePath tablePath, Column column, String afterColumn) {
+ return new AlterTableModifyColumnEvent(tablePath, column, false, afterColumn);
+ }
+}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableNameEvent.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableNameEvent.java
new file mode 100644
index 000000000000..cc01a916031b
--- /dev/null
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableNameEvent.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.api.table.event;
+
+import org.apache.seatunnel.api.table.catalog.TablePath;
+
+import lombok.Getter;
+import lombok.ToString;
+
+@Getter
+@ToString(callSuper = true)
+public class AlterTableNameEvent extends AlterTableColumnEvent {
+ private final TablePath newTablePath;
+
+ public AlterTableNameEvent(TablePath tablePath, TablePath newTablePath) {
+ super(tablePath);
+ this.newTablePath = newTablePath;
+ }
+}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/SchemaChangeEvent.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/SchemaChangeEvent.java
new file mode 100644
index 000000000000..3f01d8f867fd
--- /dev/null
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/SchemaChangeEvent.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.api.table.event;
+
+import org.apache.seatunnel.api.table.catalog.TablePath;
+
+import java.io.Serializable;
+
+/** Represents a structural change to a table schema. */
+public interface SchemaChangeEvent extends Serializable {
+
+ /**
+ * Returns the path of the table that this schema change applies to.
+ *
+ * @return the path of the changed table
+ */
+ TablePath tablePath();
+}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/TableEvent.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/TableEvent.java
new file mode 100644
index 000000000000..b81f18f88763
--- /dev/null
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/TableEvent.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.api.table.event;
+
+import org.apache.seatunnel.api.table.catalog.TablePath;
+
+import lombok.Getter;
+import lombok.RequiredArgsConstructor;
+import lombok.ToString;
+
+@Getter
+@ToString
+@RequiredArgsConstructor
+public abstract class TableEvent implements SchemaChangeEvent { // base event tied to one table path
+ protected final TablePath tablePath; // assigned by the Lombok-generated constructor
+
+ @Override // accessor declared by SchemaChangeEvent; @Getter additionally generates getTablePath()
+ public TablePath tablePath() {
+ return tablePath;
+ }
+}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/handler/AlterTableEventHandler.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/handler/AlterTableEventHandler.java
new file mode 100644
index 000000000000..b020e66a2a3e
--- /dev/null
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/handler/AlterTableEventHandler.java
@@ -0,0 +1,196 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.api.table.event.handler;
+
+import org.apache.seatunnel.api.table.catalog.Column;
+import org.apache.seatunnel.api.table.event.AlterTableAddColumnEvent;
+import org.apache.seatunnel.api.table.event.AlterTableChangeColumnEvent;
+import org.apache.seatunnel.api.table.event.AlterTableColumnEvent;
+import org.apache.seatunnel.api.table.event.AlterTableColumnsEvent;
+import org.apache.seatunnel.api.table.event.AlterTableDropColumnEvent;
+import org.apache.seatunnel.api.table.event.AlterTableEvent;
+import org.apache.seatunnel.api.table.event.AlterTableModifyColumnEvent;
+import org.apache.seatunnel.api.table.event.AlterTableNameEvent;
+import org.apache.seatunnel.api.table.event.SchemaChangeEvent;
+import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
+import org.apache.seatunnel.api.table.type.SeaTunnelRowType;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.List;
+
+public class AlterTableEventHandler implements DataTypeChangeEventHandler {
+    private SeaTunnelRowType dataType; // row type the next apply(event) starts from; set by reset()
+
+    @Override // returns the row type stored by the last reset(), possibly null
+    public SeaTunnelRowType get() {
+        return dataType;
+    }
+
+    @Override // stores the base row type and returns this handler so calls can be chained
+    public DataTypeChangeEventHandler reset(SeaTunnelRowType dataType) {
+        this.dataType = dataType;
+        return this;
+    }
+
+    @Override // computes the row type after the event; the stored dataType itself is not reassigned
+    public SeaTunnelRowType apply(SchemaChangeEvent event) {
+        AlterTableEvent alterTableEvent = (AlterTableEvent) event; // CCE for any other event kind
+        return apply(dataType, alterTableEvent);
+    }
+
+    private SeaTunnelRowType apply(SeaTunnelRowType dataType, AlterTableEvent alterTableEvent) {
+        if (alterTableEvent instanceof AlterTableNameEvent) {
+            return dataType; // a table rename does not change the columns
+        }
+        if (alterTableEvent instanceof AlterTableDropColumnEvent) {
+            return applyDropColumn(dataType, (AlterTableDropColumnEvent) alterTableEvent);
+        }
+        if (alterTableEvent instanceof AlterTableModifyColumnEvent) {
+            return applyModifyColumn(dataType, (AlterTableModifyColumnEvent) alterTableEvent);
+        }
+        if (alterTableEvent instanceof AlterTableChangeColumnEvent) {
+            return applyChangeColumn(dataType, (AlterTableChangeColumnEvent) alterTableEvent);
+        } // NOTE(review): Add is tested last, presumably because Modify/Change subclass it
+        if (alterTableEvent instanceof AlterTableAddColumnEvent) {
+            return applyAddColumn(dataType, (AlterTableAddColumnEvent) alterTableEvent);
+        }
+        if (alterTableEvent instanceof AlterTableColumnsEvent) {
+            SeaTunnelRowType newType = dataType;
+            for (AlterTableColumnEvent columnEvent :
+                    ((AlterTableColumnsEvent) alterTableEvent).getEvents()) {
+                newType = apply(newType, columnEvent); // each sub-event builds on the previous result
+            }
+            return newType;
+        }
+
+        throw new UnsupportedOperationException(
+                "Unsupported alter table event: " + alterTableEvent);
+    }
+
+    private SeaTunnelRowType applyAddColumn( // inserts the column, or modifies it if the name exists
+            SeaTunnelRowType dataType, AlterTableAddColumnEvent addColumnEvent) {
+        LinkedList<String> originFields = new LinkedList<>(Arrays.asList(dataType.getFieldNames()));
+        LinkedList<SeaTunnelDataType<?>> originFieldTypes =
+                new LinkedList<>(Arrays.asList(dataType.getFieldTypes()));
+        Column column = addColumnEvent.getColumn();
+        if (originFields.contains(column.getName())) { // name already present: handle as a modify
+            return applyModifyColumn(
+                    dataType,
+                    new AlterTableModifyColumnEvent(
+                            addColumnEvent.tablePath(),
+                            addColumnEvent.getColumn(),
+                            addColumnEvent.isFirst(),
+                            addColumnEvent.getAfterColumn()));
+        }
+
+        if (addColumnEvent.isFirst()) {
+            originFields.addFirst(column.getName());
+            originFieldTypes.addFirst(column.getDataType());
+        } else if (addColumnEvent.getAfterColumn() != null) {
+            int index = originFields.indexOf(addColumnEvent.getAfterColumn()); // -1 when not found
+            originFields.add(index + 1, column.getName()); // so an unknown anchor inserts at 0
+            originFieldTypes.add(index + 1, column.getDataType());
+        } else {
+            originFields.addLast(column.getName());
+            originFieldTypes.addLast(column.getDataType());
+        }
+
+        return new SeaTunnelRowType(
+                originFields.toArray(new String[0]),
+                originFieldTypes.toArray(new SeaTunnelDataType[0]));
+    }
+
+    private SeaTunnelRowType applyDropColumn( // copies every field except the dropped one
+            SeaTunnelRowType dataType, AlterTableDropColumnEvent dropColumnEvent) {
+        List<String> fieldNames = new ArrayList<>();
+        List<SeaTunnelDataType<?>> fieldTypes = new ArrayList<>();
+        for (int i = 0; i < dataType.getTotalFields(); i++) {
+            if (dataType.getFieldName(i).equals(dropColumnEvent.getColumn())) {
+                continue; // skip the dropped column; an unknown name drops nothing
+            }
+            fieldNames.add(dataType.getFieldName(i));
+            fieldTypes.add(dataType.getFieldType(i));
+        }
+        return new SeaTunnelRowType(
+                fieldNames.toArray(new String[0]), fieldTypes.toArray(new SeaTunnelDataType[0]));
+    }
+
+    private SeaTunnelRowType applyModifyColumn( // retypes/moves a column located by its own name
+            SeaTunnelRowType dataType, AlterTableModifyColumnEvent modifyColumnEvent) {
+        List<String> fieldNames = Arrays.asList(dataType.getFieldNames());
+        if (!fieldNames.contains(modifyColumnEvent.getColumn().getName())) {
+            return dataType; // unknown column: the row type is returned unchanged
+        }
+
+        String modifyColumnName = modifyColumnEvent.getColumn().getName();
+        int modifyColumnIndex = dataType.indexOf(modifyColumnName);
+        return applyModifyColumn(
+                dataType,
+                modifyColumnIndex,
+                modifyColumnEvent.getColumn(),
+                modifyColumnEvent.isFirst(),
+                modifyColumnEvent.getAfterColumn());
+    }
+
+    private SeaTunnelRowType applyChangeColumn( // replaces the old column with the event's column
+            SeaTunnelRowType dataType, AlterTableChangeColumnEvent changeColumnEvent) {
+        String oldColumn = changeColumnEvent.getOldColumn();
+        int oldColumnIndex = dataType.indexOf(oldColumn); // NOTE(review): no existence check here
+
+        return applyModifyColumn(
+                dataType,
+                oldColumnIndex,
+                changeColumnEvent.getColumn(),
+                changeColumnEvent.isFirst(),
+                changeColumnEvent.getAfterColumn());
+    }
+
+    private SeaTunnelRowType applyModifyColumn( // shared by modify and change: replace and/or move
+            SeaTunnelRowType dataType,
+            int columnIndex,
+            Column column,
+            boolean first,
+            String afterColumn) {
+        LinkedList<String> originFields = new LinkedList<>(Arrays.asList(dataType.getFieldNames()));
+        LinkedList<SeaTunnelDataType<?>> originFieldTypes =
+                new LinkedList<>(Arrays.asList(dataType.getFieldTypes()));
+
+        if (first) { // move to the front: remove the old slot, then prepend
+            originFields.remove(columnIndex);
+            originFieldTypes.remove(columnIndex);
+
+            originFields.addFirst(column.getName());
+            originFieldTypes.addFirst(column.getDataType());
+        } else if (afterColumn != null) { // move behind the anchor column
+            originFields.remove(columnIndex);
+            originFieldTypes.remove(columnIndex);
+
+            int index = originFields.indexOf(afterColumn); // looked up after removal; -1 if absent
+            originFields.add(index + 1, column.getName());
+            originFieldTypes.add(index + 1, column.getDataType());
+        } else { // no position given: replace name and type in place
+            originFields.set(columnIndex, column.getName());
+            originFieldTypes.set(columnIndex, column.getDataType());
+        }
+        return new SeaTunnelRowType(
+                originFields.toArray(new String[0]),
+                originFieldTypes.toArray(new SeaTunnelDataType[0]));
+    }
+}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/handler/DataTypeChangeEventDispatcher.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/handler/DataTypeChangeEventDispatcher.java
new file mode 100644
index 000000000000..ec4f69334f7a
--- /dev/null
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/handler/DataTypeChangeEventDispatcher.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.api.table.event.handler;
+
+import org.apache.seatunnel.api.table.event.AlterTableAddColumnEvent;
+import org.apache.seatunnel.api.table.event.AlterTableChangeColumnEvent;
+import org.apache.seatunnel.api.table.event.AlterTableColumnsEvent;
+import org.apache.seatunnel.api.table.event.AlterTableDropColumnEvent;
+import org.apache.seatunnel.api.table.event.AlterTableEvent;
+import org.apache.seatunnel.api.table.event.AlterTableModifyColumnEvent;
+import org.apache.seatunnel.api.table.event.AlterTableNameEvent;
+import org.apache.seatunnel.api.table.event.SchemaChangeEvent;
+import org.apache.seatunnel.api.table.type.SeaTunnelRowType;
+
+import lombok.extern.slf4j.Slf4j;
+
+import java.util.HashMap;
+import java.util.Map;
+
+@Slf4j
+public class DataTypeChangeEventDispatcher implements DataTypeChangeEventHandler {
+
+    private final Map<Class<?>, DataTypeChangeEventHandler> handlers; // event class -> handler
+    private SeaTunnelRowType dataType; // base row type handed to the selected handler
+
+    public DataTypeChangeEventDispatcher() {
+        this.handlers = createHandlers();
+    }
+
+    @Override // returns the row type stored by the last reset(), possibly null
+    public SeaTunnelRowType get() {
+        return dataType;
+    }
+
+    @Override // stores the base row type and returns this dispatcher so calls can be chained
+    public DataTypeChangeEventHandler reset(SeaTunnelRowType dataType) {
+        this.dataType = dataType;
+        return this;
+    }
+
+    @Override // routes the event to the handler registered for its runtime class
+    public SeaTunnelRowType apply(SchemaChangeEvent event) {
+        DataTypeChangeEventHandler handler = handlers.get(event.getClass()); // exact class only
+        if (handler == null) {
+            log.warn("No DataTypeChangeEventHandler for event: {}", event.getClass());
+            return dataType; // unregistered events leave the row type unchanged
+        }
+        return handler.reset(dataType).apply(event); // apply(), not handle(): delegate keeps dataType
+    }
+
+    private static Map<Class<?>, DataTypeChangeEventHandler> createHandlers() {
+        Map<Class<?>, DataTypeChangeEventHandler> handlers = new HashMap<>();
+
+        AlterTableEventHandler alterTableEventHandler = new AlterTableEventHandler(); // shared
+        handlers.put(AlterTableEvent.class, alterTableEventHandler);
+        handlers.put(AlterTableNameEvent.class, alterTableEventHandler);
+        handlers.put(AlterTableColumnsEvent.class, alterTableEventHandler);
+        handlers.put(AlterTableAddColumnEvent.class, alterTableEventHandler);
+        handlers.put(AlterTableModifyColumnEvent.class, alterTableEventHandler);
+        handlers.put(AlterTableDropColumnEvent.class, alterTableEventHandler);
+        handlers.put(AlterTableChangeColumnEvent.class, alterTableEventHandler);
+        return handlers;
+    }
+}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/handler/DataTypeChangeEventHandler.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/handler/DataTypeChangeEventHandler.java
new file mode 100644
index 000000000000..01d8924d531b
--- /dev/null
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/handler/DataTypeChangeEventHandler.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.api.table.event.handler;
+
+import org.apache.seatunnel.api.table.event.SchemaChangeEvent;
+import org.apache.seatunnel.api.table.type.SeaTunnelRowType;
+
+public interface DataTypeChangeEventHandler extends SchemaChangeEventHandler<SeaTunnelRowType> {
+
+    SeaTunnelRowType get(); // row type currently held by the handler; null means "not reset"
+
+    DataTypeChangeEventHandler reset(SeaTunnelRowType dataType); // sets the held row type
+
+    default SeaTunnelRowType handle(SchemaChangeEvent event) { // requires a prior reset(nonNull)
+        if (get() == null) {
+            throw new IllegalStateException("DataTypeChanger not reset");
+        }
+
+        try {
+            return apply(event);
+        } finally {
+            reset(null); // one-shot contract: the held row type is cleared after every handle()
+            if (get() != null) { // NOTE(review): throwing here would mask an exception from apply()
+                throw new IllegalStateException("DataTypeChanger not reset");
+            }
+        }
+    }
+
+    SeaTunnelRowType apply(SchemaChangeEvent event); // computes the row type after the event
+}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/handler/SchemaChangeEventHandler.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/handler/SchemaChangeEventHandler.java
new file mode 100644
index 000000000000..167dc6cc315e
--- /dev/null
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/handler/SchemaChangeEventHandler.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.api.table.event.handler;
+
+import org.apache.seatunnel.api.table.event.SchemaChangeEvent;
+
+import java.io.Serializable;
+
+public interface SchemaChangeEventHandler<T> extends Serializable { // T: result type of handle()
+
+    T handle(SchemaChangeEvent event); // processes one schema change event and returns the result
+}
diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SeaTunnelRowType.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SeaTunnelRowType.java
index 6bf0a2a865db..4eedb2255ad6 100644
--- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SeaTunnelRowType.java
+++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SeaTunnelRowType.java
@@ -20,7 +20,7 @@
import java.util.Arrays;
import java.util.List;
-import static com.google.common.base.Preconditions.checkArgument;
+import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument;
public class SeaTunnelRowType implements CompositeType {
private static final long serialVersionUID = 2L;
diff --git a/seatunnel-api/src/test/java/org/apache/seatunnel/api/configuration/ReadableConfigTest.java b/seatunnel-api/src/test/java/org/apache/seatunnel/api/configuration/ReadableConfigTest.java
index b1436edd0fc5..ffaae72d0f10 100644
--- a/seatunnel-api/src/test/java/org/apache/seatunnel/api/configuration/ReadableConfigTest.java
+++ b/seatunnel-api/src/test/java/org/apache/seatunnel/api/configuration/ReadableConfigTest.java
@@ -35,7 +35,6 @@
import java.util.List;
import java.util.Map;
-@SuppressWarnings("checkstyle:StaticVariableName")
public class ReadableConfigTest {
private static final String CONFIG_PATH = "/conf/option-test.conf";
private static ReadonlyConfig config;
diff --git a/seatunnel-common/src/main/java/org/apache/seatunnel/common/utils/RetryUtils.java b/seatunnel-common/src/main/java/org/apache/seatunnel/common/utils/RetryUtils.java
index aa1bbd5934bd..e8ee03a5013a 100644
--- a/seatunnel-common/src/main/java/org/apache/seatunnel/common/utils/RetryUtils.java
+++ b/seatunnel-common/src/main/java/org/apache/seatunnel/common/utils/RetryUtils.java
@@ -66,7 +66,7 @@ public static T retryWithException(
backoff);
Thread.sleep(backoff);
} else {
- log.debug(attemptMessage, ExceptionUtils.getMessage(e), i, retryTimes, 0);
+ log.info(attemptMessage, ExceptionUtils.getMessage(e), i, retryTimes, 0);
}
}
}
diff --git a/seatunnel-common/src/main/java/org/apache/seatunnel/common/utils/SerializationUtils.java b/seatunnel-common/src/main/java/org/apache/seatunnel/common/utils/SerializationUtils.java
index d0e9a5b941b7..46494bc5c525 100644
--- a/seatunnel-common/src/main/java/org/apache/seatunnel/common/utils/SerializationUtils.java
+++ b/seatunnel-common/src/main/java/org/apache/seatunnel/common/utils/SerializationUtils.java
@@ -44,7 +44,6 @@ public static T stringToObject(String str) {
return null;
}
- @SuppressWarnings("checkstyle:MagicNumber")
public static byte[] serialize(T obj) {
try (ByteArrayOutputStream b = new ByteArrayOutputStream(512);
ObjectOutputStream out = new ObjectOutputStream(b)) {
diff --git a/seatunnel-common/src/test/java/org/apache/seatunnel/common/utils/SerializationUtilsTest.java b/seatunnel-common/src/test/java/org/apache/seatunnel/common/utils/SerializationUtilsTest.java
index 8121a6b42c9f..b5085d85b2c8 100644
--- a/seatunnel-common/src/test/java/org/apache/seatunnel/common/utils/SerializationUtilsTest.java
+++ b/seatunnel-common/src/test/java/org/apache/seatunnel/common/utils/SerializationUtilsTest.java
@@ -23,7 +23,6 @@
import java.util.ArrayList;
import java.util.HashMap;
-@SuppressWarnings("checkstyle:RegexpSingleline")
public class SerializationUtilsTest {
@Test
diff --git a/seatunnel-config/seatunnel-config-shade/src/main/java/org/apache/seatunnel/shade/com/typesafe/config/impl/PropertiesParser.java b/seatunnel-config/seatunnel-config-shade/src/main/java/org/apache/seatunnel/shade/com/typesafe/config/impl/PropertiesParser.java
index eceacf997972..3cfdb7dba3b5 100644
--- a/seatunnel-config/seatunnel-config-shade/src/main/java/org/apache/seatunnel/shade/com/typesafe/config/impl/PropertiesParser.java
+++ b/seatunnel-config/seatunnel-config-shade/src/main/java/org/apache/seatunnel/shade/com/typesafe/config/impl/PropertiesParser.java
@@ -7,8 +7,7 @@
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
+import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
@@ -58,7 +57,15 @@ private static AbstractConfigObject fromEntrySet(
}
private static Map getPathMap(Set> entries) {
- Map pathMap = new LinkedHashMap();
+ Map pathMap = new LinkedHashMap<>();
+ System.getProperties()
+ .forEach(
+ (key, value) -> {
+ if (key instanceof String) {
+ Path path = pathFromPropertyKey((String) key);
+ pathMap.put(path, value);
+ }
+ });
for (Map.Entry entry : entries) {
Object key = entry.getKey();
if (key instanceof String) {
@@ -74,7 +81,7 @@ static AbstractConfigObject fromStringMap(ConfigOrigin origin, Map pathExpressionMap) {
- Map pathMap = new LinkedHashMap();
+ Map pathMap = new LinkedHashMap<>();
for (Map.Entry, ?> entry : pathExpressionMap.entrySet()) {
Object keyObj = entry.getKey();
if (!(keyObj instanceof String)) {
@@ -93,8 +100,8 @@ private static AbstractConfigObject fromPathMap(
* First, build a list of paths that will have values, either string or
* object values.
*/
- Set scopePaths = new LinkedHashSet();
- Set valuePaths = new LinkedHashSet();
+ Set scopePaths = new LinkedHashSet<>();
+ Set valuePaths = new LinkedHashSet<>();
for (Path path : pathMap.keySet()) {
// add value's path
valuePaths.add(path);
@@ -129,13 +136,11 @@ private static AbstractConfigObject fromPathMap(
/*
* Create maps for the object-valued values.
*/
- Map root = new LinkedHashMap();
- Map> scopes =
- new LinkedHashMap>();
+ Map root = new LinkedHashMap<>();
+ Map> scopes = new LinkedHashMap<>();
for (Path path : scopePaths) {
- Map scope =
- new LinkedHashMap();
+ Map scope = new LinkedHashMap<>();
scopes.put(path, scope);
}
@@ -150,7 +155,17 @@ private static AbstractConfigObject fromPathMap(
AbstractConfigValue value;
if (convertedFromProperties) {
if (rawValue instanceof String) {
- value = new ConfigString.Quoted(origin, (String) rawValue);
+ if (((String) rawValue).startsWith("[") && ((String) rawValue).endsWith("]")) {
+ List list =
+ Arrays.asList(
+ ((String) rawValue)
+ .substring(1, ((String) rawValue).length() - 1)
+ .split(","));
+ value = ConfigImpl.fromAnyRef(list, origin, FromMapMode.KEYS_ARE_PATHS);
+ } else {
+ value = new ConfigString.Quoted(origin, (String) rawValue);
+ }
+
} else {
// silently ignore non-string values in Properties
value = null;
@@ -167,19 +182,14 @@ private static AbstractConfigObject fromPathMap(
* Make a list of scope paths from longest to shortest, so children go
* before parents.
*/
- List sortedScopePaths = new ArrayList();
- sortedScopePaths.addAll(scopePaths);
+ List sortedScopePaths = new ArrayList<>(scopePaths);
// sort descending by length
- Collections.sort(
- sortedScopePaths,
- new Comparator() {
- @Override
- public int compare(Path a, Path b) {
- // Path.length() is O(n) so in theory this sucks
- // but in practice we can make Path precompute length
- // if it ever matters.
- return b.length() - a.length();
- }
+ sortedScopePaths.sort(
+ (a, b) -> {
+ // Path.length() is O(n) so in theory this sucks
+ // but in practice we can make Path precompute length
+ // if it ever matters.
+ return b.length() - a.length();
});
/*
diff --git a/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/config/AmazonDynamoDBConfig.java b/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/config/AmazonDynamoDBConfig.java
index ccb6808ffa0f..5194e50f7c6b 100644
--- a/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/config/AmazonDynamoDBConfig.java
+++ b/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/config/AmazonDynamoDBConfig.java
@@ -49,14 +49,12 @@ public class AmazonDynamoDBConfig implements Serializable {
.noDefaultValue()
.withDescription("The table of Amazon DynamoDB");
- @SuppressWarnings("checkstyle:MagicNumber")
public static final Option BATCH_SIZE =
Options.key("batch_size")
.intType()
.defaultValue(25)
.withDescription("The batch size of Amazon DynamoDB");
- @SuppressWarnings("checkstyle:MagicNumber")
public static final Option BATCH_INTERVAL_MS =
Options.key("batch_interval_ms")
.intType()
diff --git a/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/config/AmazonDynamoDBSourceOptions.java b/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/config/AmazonDynamoDBSourceOptions.java
index f92921ee140f..54f955f540e0 100644
--- a/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/config/AmazonDynamoDBSourceOptions.java
+++ b/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/config/AmazonDynamoDBSourceOptions.java
@@ -43,7 +43,6 @@ public class AmazonDynamoDBSourceOptions implements Serializable {
private Config schema;
public int batchSize = AmazonDynamoDBConfig.BATCH_SIZE.defaultValue();
- public int batchIntervalMs = AmazonDynamoDBConfig.BATCH_INTERVAL_MS.defaultValue();
public AmazonDynamoDBSourceOptions(Config config) {
this.url = config.getString(AmazonDynamoDBConfig.URL.key());
@@ -57,8 +56,5 @@ public AmazonDynamoDBSourceOptions(Config config) {
if (config.hasPath(AmazonDynamoDBConfig.BATCH_SIZE.key())) {
this.batchSize = config.getInt(AmazonDynamoDBConfig.BATCH_SIZE.key());
}
- if (config.hasPath(AmazonDynamoDBConfig.BATCH_INTERVAL_MS.key())) {
- this.batchIntervalMs = config.getInt(AmazonDynamoDBConfig.BATCH_INTERVAL_MS.key());
- }
}
}
diff --git a/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/sink/AmazonDynamoDBWriter.java b/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/sink/AmazonDynamoDBWriter.java
index 016036cc841b..d059bce7b578 100644
--- a/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/sink/AmazonDynamoDBWriter.java
+++ b/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/sink/AmazonDynamoDBWriter.java
@@ -25,6 +25,7 @@
import org.apache.seatunnel.connectors.seatunnel.common.sink.AbstractSinkWriter;
import java.io.IOException;
+import java.util.Optional;
public class AmazonDynamoDBWriter extends AbstractSinkWriter {
@@ -48,4 +49,10 @@ public void write(SeaTunnelRow element) throws IOException {
public void close() throws IOException {
dynamoDbSinkClient.close();
}
+
+ @Override
+ public Optional prepareCommit() {
+ dynamoDbSinkClient.flush();
+ return Optional.empty();
+ }
}
diff --git a/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/sink/DynamoDbSinkClient.java b/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/sink/DynamoDbSinkClient.java
index d8acf33ebeb8..e42f573dfb8a 100644
--- a/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/sink/DynamoDbSinkClient.java
+++ b/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/sink/DynamoDbSinkClient.java
@@ -24,7 +24,6 @@
import org.apache.seatunnel.connectors.seatunnel.amazondynamodb.serialize.DefaultSeaTunnelRowDeserializer;
import org.apache.seatunnel.connectors.seatunnel.amazondynamodb.serialize.SeaTunnelRowDeserializer;
-import com.google.common.util.concurrent.ThreadFactoryBuilder;
import software.amazon.awssdk.auth.credentials.AwsBasicCredentials;
import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
import software.amazon.awssdk.regions.Region;
@@ -40,15 +39,9 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import java.util.concurrent.Executors;
-import java.util.concurrent.ScheduledExecutorService;
-import java.util.concurrent.ScheduledFuture;
-import java.util.concurrent.TimeUnit;
public class DynamoDbSinkClient {
private final AmazonDynamoDBSourceOptions amazondynamodbSourceOptions;
- private ScheduledExecutorService scheduler;
- private ScheduledFuture> scheduledFuture;
private volatile boolean initialize;
private volatile Exception flushException;
private DynamoDbClient dynamoDbClient;
@@ -62,7 +55,7 @@ public DynamoDbSinkClient(
this.seaTunnelRowDeserializer = new DefaultSeaTunnelRowDeserializer(typeInfo);
}
- private void tryInit() throws IOException {
+ private void tryInit() {
if (initialize) {
return;
}
@@ -78,25 +71,6 @@ private void tryInit() throws IOException {
amazondynamodbSourceOptions.getAccessKeyId(),
amazondynamodbSourceOptions.getSecretAccessKey())))
.build();
-
- scheduler =
- Executors.newSingleThreadScheduledExecutor(
- new ThreadFactoryBuilder()
- .setNameFormat("DdynamoDb-sink-output-%s")
- .build());
- scheduledFuture =
- scheduler.scheduleAtFixedRate(
- () -> {
- try {
- flush();
- } catch (IOException e) {
- flushException = e;
- }
- },
- amazondynamodbSourceOptions.getBatchIntervalMs(),
- amazondynamodbSourceOptions.getBatchIntervalMs(),
- TimeUnit.MILLISECONDS);
-
initialize = true;
}
@@ -114,17 +88,13 @@ public synchronized void write(PutItemRequest putItemRequest) throws IOException
}
public synchronized void close() throws IOException {
- if (scheduledFuture != null) {
- scheduledFuture.cancel(false);
- scheduler.shutdown();
- }
if (dynamoDbClient != null) {
flush();
dynamoDbClient.close();
}
}
- synchronized void flush() throws IOException {
+ synchronized void flush() {
checkFlushException();
if (batchList.isEmpty()) {
return;
diff --git a/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/source/AmazonDynamoDBSourceReader.java b/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/source/AmazonDynamoDBSourceReader.java
index afaafa3f8a9f..c25f8b0e0b15 100644
--- a/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/source/AmazonDynamoDBSourceReader.java
+++ b/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/source/AmazonDynamoDBSourceReader.java
@@ -31,11 +31,13 @@
import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.dynamodb.DynamoDbClient;
+import software.amazon.awssdk.services.dynamodb.model.AttributeValue;
import software.amazon.awssdk.services.dynamodb.model.ScanRequest;
import software.amazon.awssdk.services.dynamodb.model.ScanResponse;
import java.io.IOException;
import java.net.URI;
+import java.util.Map;
@Slf4j
public class AmazonDynamoDBSourceReader extends AbstractSingleSplitReader {
@@ -78,18 +80,25 @@ public void close() throws IOException {
+ /**
+ * Scans the configured table page by page, deserializes every returned item and hands it to
+ * {@code output}, then signals that no more elements will be produced.
+ *
+ * @param output collector receiving the deserialized rows
+ */
@Override
@SuppressWarnings("magicnumber")
public void pollNext(Collector output) throws Exception {
- ScanResponse scan =
- dynamoDbClient.scan(
- ScanRequest.builder()
- .tableName(amazondynamodbSourceOptions.getTable())
- .build());
- if (scan.hasItems()) {
- scan.items()
- .forEach(
- item -> {
- output.collect(seaTunnelRowDeserializer.deserialize(item));
- });
- }
+ // Start key of the next page; null on the first request.
+ Map lastKeyEvaluated = null;
+
+ ScanResponse scan;
+ do {
+ scan =
+ dynamoDbClient.scan(
+ ScanRequest.builder()
+ .tableName(amazondynamodbSourceOptions.getTable())
+ .exclusiveStartKey(lastKeyEvaluated)
+ .build());
+ if (scan.hasItems()) {
+ scan.items()
+ .forEach(
+ item -> {
+ output.collect(seaTunnelRowDeserializer.deserialize(item));
+ });
+ }
+ // Keep requesting pages until the response carries no (or an empty) last evaluated key.
+ lastKeyEvaluated = scan.lastEvaluatedKey();
+ } while (lastKeyEvaluated != null && !lastKeyEvaluated.isEmpty());
context.signalNoMoreElement();
}
}
diff --git a/seatunnel-connectors-v2/connector-assert/src/main/java/org/apache/seatunnel/connectors/seatunnel/assertion/excecutor/AssertExecutor.java b/seatunnel-connectors-v2/connector-assert/src/main/java/org/apache/seatunnel/connectors/seatunnel/assertion/excecutor/AssertExecutor.java
index c8666cd9a55e..5868fba91276 100644
--- a/seatunnel-connectors-v2/connector-assert/src/main/java/org/apache/seatunnel/connectors/seatunnel/assertion/excecutor/AssertExecutor.java
+++ b/seatunnel-connectors-v2/connector-assert/src/main/java/org/apache/seatunnel/connectors/seatunnel/assertion/excecutor/AssertExecutor.java
@@ -20,6 +20,8 @@
import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
import org.apache.seatunnel.api.table.type.SeaTunnelRow;
import org.apache.seatunnel.api.table.type.SeaTunnelRowType;
+import org.apache.seatunnel.connectors.seatunnel.assertion.exception.AssertConnectorErrorCode;
+import org.apache.seatunnel.connectors.seatunnel.assertion.exception.AssertConnectorException;
import org.apache.seatunnel.connectors.seatunnel.assertion.rule.AssertFieldRule;
import org.apache.commons.lang3.StringUtils;
@@ -27,6 +29,7 @@
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
+import java.math.BigDecimal;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;
@@ -108,33 +111,8 @@ private boolean pass(Object value, AssertFieldRule.AssertRule valueRule) {
return ((Number) value).doubleValue() >= valueRule.getRuleValue();
}
if (valueRule.getEqualTo() != null) {
- if (value instanceof String) {
- return value.equals(valueRule.getEqualTo());
- }
- if (value instanceof Number) {
- return ((Number) value).doubleValue() == Double.parseDouble(valueRule.getEqualTo());
- }
- if (value instanceof Boolean) {
- return value.equals(Boolean.parseBoolean(valueRule.getEqualTo()));
- }
- if (value instanceof LocalDateTime) {
- TemporalAccessor parsedTimestamp =
- DateTimeFormatter.ISO_LOCAL_DATE_TIME.parse(valueRule.getEqualTo());
- LocalTime localTime = parsedTimestamp.query(TemporalQueries.localTime());
- LocalDate localDate = parsedTimestamp.query(TemporalQueries.localDate());
- return ((LocalDateTime) value).isEqual(LocalDateTime.of(localDate, localTime));
- }
- if (value instanceof LocalDate) {
- DateTimeFormatter fmt = DateTimeFormatter.ofPattern("yyyy-MM-dd");
- return ((LocalDate) value).isEqual(LocalDate.parse(valueRule.getEqualTo(), fmt));
- }
- if (value instanceof LocalTime) {
- DateTimeFormatter fmt = DateTimeFormatter.ofPattern("HH:mm:ss");
- return value.equals(LocalTime.parse(valueRule.getEqualTo(), fmt));
- }
- return false;
+ return compareValue(value, valueRule);
}
-
String valueStr = Objects.isNull(value) ? StringUtils.EMPTY : String.valueOf(value);
if (AssertFieldRule.AssertRuleType.MAX_LENGTH.equals(valueRule.getRuleType())) {
return valueStr.length() <= valueRule.getRuleValue();
@@ -146,6 +124,44 @@ private boolean pass(Object value, AssertFieldRule.AssertRule valueRule) {
return Boolean.TRUE;
}
+ /**
+  * Checks a field value against the rule's {@code equalTo} literal.
+  *
+  * <p>The literal is parsed according to the runtime type of {@code value} and then compared
+  * with it. Parsing failures (for example {@link NumberFormatException} or a date/time parse
+  * error) propagate to the caller.
+  *
+  * @param value the field value to check; {@code null} never matches
+  * @param valueRule the rule whose {@code getEqualTo()} literal is the expected value
+  * @return {@code true} if {@code value} matches the parsed literal, {@code false} otherwise
+  * @throws AssertConnectorException if the runtime type of {@code value} is not handled here
+  */
+ private boolean compareValue(Object value, AssertFieldRule.AssertRule valueRule) {
+     if (value == null) {
+         // instanceof is false for null and the fallback branch dereferences value, so a null
+         // field would otherwise end in a NullPointerException instead of a failed assertion.
+         return false;
+     }
+     String expected = valueRule.getEqualTo();
+     if (value instanceof String) {
+         return value.equals(expected);
+     } else if (value instanceof Integer) {
+         return value.equals(Integer.parseInt(expected));
+     } else if (value instanceof Long) {
+         return value.equals(Long.parseLong(expected));
+     } else if (value instanceof Short) {
+         return value.equals(Short.parseShort(expected));
+     } else if (value instanceof Float) {
+         return value.equals(Float.parseFloat(expected));
+     } else if (value instanceof Byte) {
+         return value.equals(Byte.parseByte(expected));
+     } else if (value instanceof Double) {
+         return value.equals(Double.parseDouble(expected));
+     } else if (value instanceof BigDecimal) {
+         // BigDecimal#equals also compares scale ("1.0" is not equal to "1.00");
+         // compareTo compares the numeric value only.
+         return ((BigDecimal) value).compareTo(new BigDecimal(expected)) == 0;
+     } else if (value instanceof Boolean) {
+         return value.equals(Boolean.parseBoolean(expected));
+     } else if (value instanceof LocalDateTime) {
+         TemporalAccessor parsedTimestamp = DateTimeFormatter.ISO_LOCAL_DATE_TIME.parse(expected);
+         LocalTime localTime = parsedTimestamp.query(TemporalQueries.localTime());
+         LocalDate localDate = parsedTimestamp.query(TemporalQueries.localDate());
+         return ((LocalDateTime) value).isEqual(LocalDateTime.of(localDate, localTime));
+     } else if (value instanceof LocalDate) {
+         DateTimeFormatter fmt = DateTimeFormatter.ofPattern("yyyy-MM-dd");
+         return ((LocalDate) value).isEqual(LocalDate.parse(expected, fmt));
+     } else if (value instanceof LocalTime) {
+         DateTimeFormatter fmt = DateTimeFormatter.ofPattern("HH:mm:ss");
+         return value.equals(LocalTime.parse(expected, fmt));
+     } else {
+         throw new AssertConnectorException(
+                 AssertConnectorErrorCode.TYPES_NOT_SUPPORTED_FAILED,
+                 String.format(" %s types not supported yet", value.getClass().getSimpleName()));
+     }
+ }
+
+ /**
+ * Returns whether the runtime class of {@code value} is exactly the class reported by
+ * {@code fieldType.getTypeClass()}.
+ *
+ * NOTE(review): {@code value.getClass()} is called unconditionally, so a null value would
+ * throw here; confirm that callers never pass null.
+ */
private Boolean checkType(Object value, SeaTunnelDataType> fieldType) {
return value.getClass().equals(fieldType.getTypeClass());
}
diff --git a/seatunnel-connectors-v2/connector-assert/src/main/java/org/apache/seatunnel/connectors/seatunnel/assertion/exception/AssertConnectorErrorCode.java b/seatunnel-connectors-v2/connector-assert/src/main/java/org/apache/seatunnel/connectors/seatunnel/assertion/exception/AssertConnectorErrorCode.java
index abb085e2837e..16ae8aed1c7d 100644
--- a/seatunnel-connectors-v2/connector-assert/src/main/java/org/apache/seatunnel/connectors/seatunnel/assertion/exception/AssertConnectorErrorCode.java
+++ b/seatunnel-connectors-v2/connector-assert/src/main/java/org/apache/seatunnel/connectors/seatunnel/assertion/exception/AssertConnectorErrorCode.java
@@ -20,7 +20,8 @@
import org.apache.seatunnel.common.exception.SeaTunnelErrorCode;
public enum AssertConnectorErrorCode implements SeaTunnelErrorCode {
- RULE_VALIDATION_FAILED("ASSERT-01", "Rule validate failed");
+ RULE_VALIDATION_FAILED("ASSERT-01", "Rule validate failed"),
+ TYPES_NOT_SUPPORTED_FAILED("ASSERT-02", "Types not supported");
private final String code;
private final String description;
diff --git a/seatunnel-connectors-v2/connector-assert/src/main/java/org/apache/seatunnel/connectors/seatunnel/assertion/rule/AssertRuleParser.java b/seatunnel-connectors-v2/connector-assert/src/main/java/org/apache/seatunnel/connectors/seatunnel/assertion/rule/AssertRuleParser.java
index f479dfa5c99b..eccf2c684505 100644
--- a/seatunnel-connectors-v2/connector-assert/src/main/java/org/apache/seatunnel/connectors/seatunnel/assertion/rule/AssertRuleParser.java
+++ b/seatunnel-connectors-v2/connector-assert/src/main/java/org/apache/seatunnel/connectors/seatunnel/assertion/rule/AssertRuleParser.java
@@ -20,6 +20,7 @@
import org.apache.seatunnel.shade.com.typesafe.config.Config;
import org.apache.seatunnel.api.table.type.BasicType;
+import org.apache.seatunnel.api.table.type.DecimalType;
import org.apache.seatunnel.api.table.type.LocalTimeType;
import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
@@ -105,5 +106,6 @@ private SeaTunnelDataType> getFieldType(String fieldTypeStr) {
TYPES.put("datetime", LocalTimeType.LOCAL_DATE_TIME_TYPE);
TYPES.put("date", LocalTimeType.LOCAL_DATE_TYPE);
TYPES.put("time", LocalTimeType.LOCAL_TIME_TYPE);
+ TYPES.put("decimal", new DecimalType(38, 18));
}
}
diff --git a/seatunnel-connectors-v2/connector-assert/src/main/java/org/apache/seatunnel/connectors/seatunnel/assertion/sink/AssertSinkWriter.java b/seatunnel-connectors-v2/connector-assert/src/main/java/org/apache/seatunnel/connectors/seatunnel/assertion/sink/AssertSinkWriter.java
index d397681189cf..ee865ad9da66 100644
--- a/seatunnel-connectors-v2/connector-assert/src/main/java/org/apache/seatunnel/connectors/seatunnel/assertion/sink/AssertSinkWriter.java
+++ b/seatunnel-connectors-v2/connector-assert/src/main/java/org/apache/seatunnel/connectors/seatunnel/assertion/sink/AssertSinkWriter.java
@@ -47,7 +47,6 @@ public AssertSinkWriter(
}
@Override
- @SuppressWarnings("checkstyle:RegexpSingleline")
public void write(SeaTunnelRow element) {
LONG_ACCUMULATOR.accumulate(1);
if (Objects.nonNull(assertFieldRules)) {
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/config/JdbcSourceConfigFactory.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/config/JdbcSourceConfigFactory.java
index 068ee4be116d..d5d920c2573f 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/config/JdbcSourceConfigFactory.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/config/JdbcSourceConfigFactory.java
@@ -29,7 +29,6 @@
import java.util.Properties;
/** A {@link SourceConfig.Factory} to provide {@link SourceConfig} of JDBC data source. */
-@SuppressWarnings("checkstyle:MagicNumber")
public abstract class JdbcSourceConfigFactory implements SourceConfig.Factory {
private static final long serialVersionUID = 1L;
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/option/JdbcSourceOptions.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/option/JdbcSourceOptions.java
index 715915c745c9..813d69b862a1 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/option/JdbcSourceOptions.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/option/JdbcSourceOptions.java
@@ -25,7 +25,6 @@
import java.util.List;
/** Configurations for {@link IncrementalSource} of JDBC data source. */
-@SuppressWarnings("checkstyle:MagicNumber")
public class JdbcSourceOptions extends SourceOptions {
public static final Option HOSTNAME =
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/relational/connection/JdbcConnectionFactory.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/relational/connection/JdbcConnectionFactory.java
index 5fad3f6f22c8..349adf37fdc1 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/relational/connection/JdbcConnectionFactory.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/relational/connection/JdbcConnectionFactory.java
@@ -44,7 +44,6 @@ public JdbcConnectionFactory(
this.jdbcConnectionPoolFactory = jdbcConnectionPoolFactory;
}
- @SuppressWarnings("checkstyle:MagicNumber")
@Override
public Connection connect(JdbcConfiguration config) throws SQLException {
final int connectRetryTimes = sourceConfig.getConnectMaxRetries();
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/schema/SchemaChangeResolver.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/schema/SchemaChangeResolver.java
new file mode 100644
index 000000000000..ee3ef08f7d22
--- /dev/null
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/schema/SchemaChangeResolver.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.seatunnel.connectors.cdc.base.schema;
+
+import org.apache.seatunnel.api.table.event.SchemaChangeEvent;
+import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
+
+import org.apache.kafka.connect.source.SourceRecord;
+
+import java.io.Serializable;
+
+/**
+ * Serializable contract for turning a Kafka Connect {@link SourceRecord} into a
+ * {@link SchemaChangeEvent}.
+ */
+public interface SchemaChangeResolver extends Serializable {
+
+ /**
+ * Presumably reports whether this resolver can handle the given record; the exact criteria
+ * are defined by implementations (TODO confirm).
+ *
+ * @param record the source record to inspect
+ * @return a boolean decided by the implementation
+ */
+ boolean support(SourceRecord record);
+
+ /**
+ * Produces a schema change event for the given record.
+ *
+ * @param record the source record to resolve
+ * @param dataType data type passed in by the caller; presumably the current row type
+ * (TODO confirm)
+ * @return the resolved schema change event
+ */
+ SchemaChangeEvent resolve(SourceRecord record, SeaTunnelDataType dataType);
+}
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/IncrementalSource.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/IncrementalSource.java
index 965d3fd27fe8..ed04fb0f5d7d 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/IncrementalSource.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/IncrementalSource.java
@@ -27,6 +27,7 @@
import org.apache.seatunnel.api.source.SeaTunnelSource;
import org.apache.seatunnel.api.source.SourceReader;
import org.apache.seatunnel.api.source.SourceSplitEnumerator;
+import org.apache.seatunnel.api.source.SupportCoordinate;
import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
import org.apache.seatunnel.api.table.type.SeaTunnelRow;
import org.apache.seatunnel.connectors.cdc.base.config.SourceConfig;
@@ -36,6 +37,7 @@
import org.apache.seatunnel.connectors.cdc.base.option.SourceOptions;
import org.apache.seatunnel.connectors.cdc.base.option.StartupMode;
import org.apache.seatunnel.connectors.cdc.base.option.StopMode;
+import org.apache.seatunnel.connectors.cdc.base.schema.SchemaChangeResolver;
import org.apache.seatunnel.connectors.cdc.base.source.enumerator.HybridSplitAssigner;
import org.apache.seatunnel.connectors.cdc.base.source.enumerator.IncrementalSourceEnumerator;
import org.apache.seatunnel.connectors.cdc.base.source.enumerator.IncrementalSplitAssigner;
@@ -75,7 +77,7 @@
@NoArgsConstructor
public abstract class IncrementalSource
- implements SeaTunnelSource {
+ implements SeaTunnelSource, SupportCoordinate {
protected ReadonlyConfig readonlyConfig;
protected SourceConfig.Factory configFactory;
@@ -167,17 +169,22 @@ public SourceReader createReader(SourceReader.Context reader
BlockingQueue> elementsQueue =
new LinkedBlockingQueue<>(2);
+ SchemaChangeResolver schemaChangeResolver = deserializationSchema.getSchemaChangeResolver();
Supplier> splitReaderSupplier =
() ->
new IncrementalSourceSplitReader<>(
- readerContext.getIndexOfSubtask(), dataSourceDialect, sourceConfig);
+ readerContext.getIndexOfSubtask(),
+ dataSourceDialect,
+ sourceConfig,
+ schemaChangeResolver);
return new IncrementalSourceReader<>(
elementsQueue,
splitReaderSupplier,
createRecordEmitter(sourceConfig, readerContext.getMetricsContext()),
new SourceReaderOptions(readonlyConfig),
readerContext,
- sourceConfig);
+ sourceConfig,
+ deserializationSchema);
}
protected RecordEmitter createRecordEmitter(
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/IncrementalSplitAssigner.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/IncrementalSplitAssigner.java
index d000d505363b..fe8204f6cd2f 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/IncrementalSplitAssigner.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/IncrementalSplitAssigner.java
@@ -17,6 +17,7 @@
package org.apache.seatunnel.connectors.cdc.base.source.enumerator;
+import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
import org.apache.seatunnel.connectors.cdc.base.config.SourceConfig;
import org.apache.seatunnel.connectors.cdc.base.source.enumerator.state.IncrementalPhaseState;
import org.apache.seatunnel.connectors.cdc.base.source.event.SnapshotSplitWatermark;
@@ -70,6 +71,7 @@ public class IncrementalSplitAssigner implements SplitAs
private final Map assignedSplits = new HashMap<>();
private boolean startWithSnapshotMinimumOffset = true;
+ private SeaTunnelDataType checkpointDataType;
public IncrementalSplitAssigner(
SplitAssigner.Context context,
@@ -152,6 +154,7 @@ public void addSplits(Collection splits) {
}
tableWatermarks.put(tableId, startupOffset);
}
+ checkpointDataType = incrementalSplit.getCheckpointDataType();
});
if (!tableWatermarks.isEmpty()) {
this.startWithSnapshotMinimumOffset = false;
@@ -249,6 +252,7 @@ private IncrementalSplit createIncrementalSplit(
capturedTables,
incrementalSplitStartOffset,
sourceConfig.getStopConfig().getStopOffset(offsetFactory),
- completedSnapshotSplitInfos);
+ completedSnapshotSplitInfos,
+ checkpointDataType);
}
}
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/AbstractJdbcSourceChunkSplitter.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/AbstractJdbcSourceChunkSplitter.java
index e956b111709b..e99e7dab4b19 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/AbstractJdbcSourceChunkSplitter.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/AbstractJdbcSourceChunkSplitter.java
@@ -112,6 +112,19 @@ private List splitTableIntoChunks(
final int chunkSize = sourceConfig.getSplitSize();
final double distributionFactorUpper = sourceConfig.getDistributionFactorUpper();
final double distributionFactorLower = sourceConfig.getDistributionFactorLower();
+ final int sampleShardingThreshold = sourceConfig.getSampleShardingThreshold();
+
+ log.info(
+ "Splitting table {} into chunks, split column: {}, min: {}, max: {}, chunk size: {}, "
+ + "distribution factor upper: {}, distribution factor lower: {}, sample sharding threshold: {}",
+ tableId,
+ splitColumnName,
+ min,
+ max,
+ chunkSize,
+ distributionFactorUpper,
+ distributionFactorLower,
+ sampleShardingThreshold);
if (isEvenlySplitColumn(splitColumn)) {
long approximateRowCnt = queryApproximateRowCnt(jdbc, tableId);
@@ -130,7 +143,7 @@ private List splitTableIntoChunks(
} else {
int shardCount = (int) (approximateRowCnt / chunkSize);
int inverseSamplingRate = sourceConfig.getInverseSamplingRate();
- if (sourceConfig.getSampleShardingThreshold() < shardCount) {
+ if (sampleShardingThreshold < shardCount) {
// It is necessary to ensure that the number of data rows sampled by the
// sampling rate is greater than the number of shards.
// Otherwise, if the sampling rate is too low, it may result in an insufficient
@@ -144,9 +157,17 @@ private List splitTableIntoChunks(
chunkSize);
inverseSamplingRate = chunkSize;
}
+ log.info(
+ "Use sampling sharding for table {}, the sampling rate is {}",
+ tableId,
+ inverseSamplingRate);
Object[] sample =
sampleDataFromColumn(
jdbc, tableId, splitColumnName, inverseSamplingRate);
+ log.info(
+ "Sample data from table {} end, the sample size is {}",
+ tableId,
+ sample.length);
return efficientShardingThroughSampling(
tableId, sample, approximateRowCnt, shardCount);
}
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/ChunkRange.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/ChunkRange.java
index 1cf62f3448bf..c543bad18cdf 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/ChunkRange.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/ChunkRange.java
@@ -22,7 +22,7 @@
import java.util.Objects;
-import static com.google.common.base.Preconditions.checkArgument;
+import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument;
/**
* An internal structure describes a chunk range with a chunk start (inclusive) and chunk end
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/IncrementalSourceReader.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/IncrementalSourceReader.java
index b251759ff7c2..ceb6215f41d4 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/IncrementalSourceReader.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/IncrementalSourceReader.java
@@ -19,6 +19,7 @@
import org.apache.seatunnel.api.source.Collector;
import org.apache.seatunnel.api.source.SourceReader;
+import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
import org.apache.seatunnel.connectors.cdc.base.config.SourceConfig;
import org.apache.seatunnel.connectors.cdc.base.source.event.CompletedSnapshotSplitsReportEvent;
import org.apache.seatunnel.connectors.cdc.base.source.event.SnapshotSplitWatermark;
@@ -29,6 +30,7 @@
import org.apache.seatunnel.connectors.cdc.base.source.split.state.IncrementalSplitState;
import org.apache.seatunnel.connectors.cdc.base.source.split.state.SnapshotSplitState;
import org.apache.seatunnel.connectors.cdc.base.source.split.state.SourceSplitStateBase;
+import org.apache.seatunnel.connectors.cdc.debezium.DebeziumDeserializationSchema;
import org.apache.seatunnel.connectors.seatunnel.common.source.reader.RecordEmitter;
import org.apache.seatunnel.connectors.seatunnel.common.source.reader.RecordsWithSplitIds;
import org.apache.seatunnel.connectors.seatunnel.common.source.reader.SingleThreadMultiplexSourceReaderBase;
@@ -38,6 +40,7 @@
import lombok.extern.slf4j.Slf4j;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -62,6 +65,7 @@ public class IncrementalSourceReader
private final int subtaskId;
private final C sourceConfig;
+ private final DebeziumDeserializationSchema debeziumDeserializationSchema;
public IncrementalSourceReader(
BlockingQueue> elementsQueue,
@@ -69,7 +73,8 @@ public IncrementalSourceReader(
RecordEmitter recordEmitter,
SourceReaderOptions options,
SourceReader.Context context,
- C sourceConfig) {
+ C sourceConfig,
+ DebeziumDeserializationSchema debeziumDeserializationSchema) {
super(
elementsQueue,
new SingleThreadFetcherManager<>(elementsQueue, splitReaderSupplier::get),
@@ -79,6 +84,7 @@ public IncrementalSourceReader(
this.sourceConfig = sourceConfig;
this.finishedUnackedSplits = new HashMap<>();
this.subtaskId = context.getIndexOfSubtask();
+ this.debeziumDeserializationSchema = debeziumDeserializationSchema;
}
@Override
@@ -163,6 +169,15 @@ protected SourceSplitStateBase initializedState(SourceSplitBase split) {
if (split.isSnapshotSplit()) {
return new SnapshotSplitState(split.asSnapshotSplit());
} else {
+ IncrementalSplit incrementalSplit = split.asIncrementalSplit();
+ if (incrementalSplit.getCheckpointDataType() != null) {
+ log.info(
+ "The incremental split[{}] has checkpoint datatype {} for restore.",
+ incrementalSplit.splitId(),
+ incrementalSplit.getCheckpointDataType());
+ debeziumDeserializationSchema.restoreCheckpointProducedType(
+ incrementalSplit.getCheckpointDataType());
+ }
return new IncrementalSplitState(split.asIncrementalSplit());
}
}
@@ -180,6 +195,10 @@ public List snapshotState(long checkpointId) {
// add finished snapshot splits that didn't receive ack yet
unfinishedSplits.addAll(finishedUnackedSplits.values());
+ if (isIncrementalSplitPhase(unfinishedSplits)) {
+ return snapshotCheckpointDataType(unfinishedSplits);
+ }
+
return unfinishedSplits;
}
@@ -187,4 +206,25 @@ public List snapshotState(long checkpointId) {
protected SourceSplitBase toSplitType(String splitId, SourceSplitStateBase splitState) {
return splitState.toSourceSplit();
}
+
+ /**
+  * Tells whether the given state consists of exactly one split and that split is an
+  * incremental split.
+  *
+  * @param stateSplits the splits captured for the current state
+  * @return {@code true} only for a single-element list holding an incremental split
+  */
+ private boolean isIncrementalSplitPhase(List stateSplits) {
+     if (stateSplits.size() != 1) {
+         return false;
+     }
+     return stateSplits.get(0).isIncrementalSplit();
+ }
+
+ /**
+  * Builds the state to snapshot for the incremental phase: a copy of the single incremental
+  * split that additionally carries the deserialization schema's current produced type.
+  *
+  * @param stateSplits the splits captured for the current state
+  * @return a one-element list holding the new incremental split
+  * @throws IllegalStateException if {@code stateSplits} is not a single incremental split
+  */
+ private List snapshotCheckpointDataType(List stateSplits) {
+     if (isIncrementalSplitPhase(stateSplits)) {
+         IncrementalSplit currentSplit = stateSplits.get(0).asIncrementalSplit();
+         // Snapshot current datatype to checkpoint
+         SeaTunnelDataType producedType = debeziumDeserializationSchema.getProducedType();
+         IncrementalSplit splitWithDataType = new IncrementalSplit(currentSplit, producedType);
+         log.debug(
+                 "Snapshot checkpoint datatype {} into split[{}] state.",
+                 producedType,
+                 currentSplit.splitId());
+         return Arrays.asList(splitWithDataType);
+     }
+     throw new IllegalStateException(
+             "The splits should be incremental split when snapshot checkpoint datatype");
+ }
}
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/IncrementalSourceRecordEmitter.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/IncrementalSourceRecordEmitter.java
index 2f8409b99a3a..eacb427acbcb 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/IncrementalSourceRecordEmitter.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/IncrementalSourceRecordEmitter.java
@@ -20,6 +20,7 @@
import org.apache.seatunnel.api.common.metrics.Counter;
import org.apache.seatunnel.api.common.metrics.MetricsContext;
import org.apache.seatunnel.api.source.Collector;
+import org.apache.seatunnel.api.table.event.SchemaChangeEvent;
import org.apache.seatunnel.connectors.cdc.base.source.offset.Offset;
import org.apache.seatunnel.connectors.cdc.base.source.offset.OffsetFactory;
import org.apache.seatunnel.connectors.cdc.base.source.split.SourceRecords;
@@ -37,6 +38,8 @@
import static org.apache.seatunnel.connectors.cdc.base.source.split.wartermark.WatermarkEvent.isHighWatermarkEvent;
import static org.apache.seatunnel.connectors.cdc.base.source.split.wartermark.WatermarkEvent.isLowWatermarkEvent;
+import static org.apache.seatunnel.connectors.cdc.base.source.split.wartermark.WatermarkEvent.isSchemaChangeAfterWatermarkEvent;
+import static org.apache.seatunnel.connectors.cdc.base.source.split.wartermark.WatermarkEvent.isSchemaChangeBeforeWatermarkEvent;
import static org.apache.seatunnel.connectors.cdc.base.source.split.wartermark.WatermarkEvent.isWatermarkEvent;
import static org.apache.seatunnel.connectors.cdc.base.utils.SourceRecordUtils.getFetchTimestamp;
import static org.apache.seatunnel.connectors.cdc.base.utils.SourceRecordUtils.getMessageTimestamp;
@@ -110,9 +113,12 @@ protected void processElement(
Offset watermark = getWatermark(element);
if (isLowWatermarkEvent(element) && splitState.isSnapshotSplitState()) {
splitState.asSnapshotSplitState().setLowWatermark(watermark);
- }
- if (isHighWatermarkEvent(element) && splitState.isSnapshotSplitState()) {
+ } else if (isHighWatermarkEvent(element) && splitState.isSnapshotSplitState()) {
splitState.asSnapshotSplitState().setHighWatermark(watermark);
+ } else if ((isSchemaChangeBeforeWatermarkEvent(element)
+ || isSchemaChangeAfterWatermarkEvent(element))
+ && splitState.isIncrementalSplitState()) {
+ emitElement(element, output);
}
} else if (isSchemaChangeEvent(element) && splitState.isIncrementalSplitState()) {
emitElement(element, output);
@@ -157,9 +163,24 @@ public void collect(T record) {
output.collect(record);
}
+ /** Forwards the schema change event unchanged to the underlying {@code output} collector. */
+ @Override
+ public void collect(SchemaChangeEvent event) {
+ output.collect(event);
+ }
+
+ /** Delegates to {@code output.markSchemaChangeBeforeCheckpoint()}. */
+ @Override
+ public void markSchemaChangeBeforeCheckpoint() {
+ output.markSchemaChangeBeforeCheckpoint();
+ }
+
+ /** Delegates to {@code output.markSchemaChangeAfterCheckpoint()}. */
+ @Override
+ public void markSchemaChangeAfterCheckpoint() {
+ output.markSchemaChangeAfterCheckpoint();
+ }
+
+ /** Returns the checkpoint lock of the underlying {@code output} collector. */
@Override
public Object getCheckpointLock() {
- return null;
+ return output.getCheckpointLock();
}
}
}
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/IncrementalSourceSplitReader.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/IncrementalSourceSplitReader.java
index 932b5f0e4e96..53f97362734f 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/IncrementalSourceSplitReader.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/IncrementalSourceSplitReader.java
@@ -20,6 +20,7 @@
import org.apache.seatunnel.common.utils.SeaTunnelException;
import org.apache.seatunnel.connectors.cdc.base.config.SourceConfig;
import org.apache.seatunnel.connectors.cdc.base.dialect.DataSourceDialect;
+import org.apache.seatunnel.connectors.cdc.base.schema.SchemaChangeResolver;
import org.apache.seatunnel.connectors.cdc.base.source.reader.external.FetchTask;
import org.apache.seatunnel.connectors.cdc.base.source.reader.external.Fetcher;
import org.apache.seatunnel.connectors.cdc.base.source.reader.external.IncrementalSourceScanFetcher;
@@ -50,13 +51,18 @@ public class IncrementalSourceSplitReader
private String currentSplitId;
private final DataSourceDialect dataSourceDialect;
private final C sourceConfig;
+ private final SchemaChangeResolver schemaChangeResolver;
public IncrementalSourceSplitReader(
- int subtaskId, DataSourceDialect dataSourceDialect, C sourceConfig) {
+ int subtaskId,
+ DataSourceDialect dataSourceDialect,
+ C sourceConfig,
+ SchemaChangeResolver schemaChangeResolver) {
this.subtaskId = subtaskId;
this.splits = new ArrayDeque<>();
this.dataSourceDialect = dataSourceDialect;
this.sourceConfig = sourceConfig;
+ this.schemaChangeResolver = schemaChangeResolver;
}
@Override
@@ -133,7 +139,9 @@ protected void checkSplitOrStartNext() throws IOException {
}
final FetchTask.Context taskContext =
dataSourceDialect.createFetchTaskContext(nextSplit, sourceConfig);
- currentFetcher = new IncrementalSourceStreamFetcher(taskContext, subtaskId);
+ currentFetcher =
+ new IncrementalSourceStreamFetcher(
+ taskContext, subtaskId, schemaChangeResolver);
log.info("Stream fetcher is created.");
}
currentFetcher.submitTask(dataSourceDialect.createFetchTask(nextSplit));
diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceStreamFetcher.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceStreamFetcher.java
index 2b8e9f7725fd..31fdaaf2e50a 100644
--- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceStreamFetcher.java
+++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceStreamFetcher.java
@@ -18,11 +18,14 @@
package org.apache.seatunnel.connectors.cdc.base.source.reader.external;
import org.apache.seatunnel.common.utils.SeaTunnelException;
+import org.apache.seatunnel.connectors.cdc.base.schema.SchemaChangeResolver;
import org.apache.seatunnel.connectors.cdc.base.source.offset.Offset;
import org.apache.seatunnel.connectors.cdc.base.source.split.CompletedSnapshotSplitInfo;
import org.apache.seatunnel.connectors.cdc.base.source.split.IncrementalSplit;
import org.apache.seatunnel.connectors.cdc.base.source.split.SourceRecords;
import org.apache.seatunnel.connectors.cdc.base.source.split.SourceSplitBase;
+import org.apache.seatunnel.connectors.cdc.base.source.split.wartermark.WatermarkEvent;
+import org.apache.seatunnel.connectors.cdc.base.utils.SourceRecordUtils;
import org.apache.kafka.connect.source.SourceRecord;
@@ -33,6 +36,7 @@
import lombok.extern.slf4j.Slf4j;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
@@ -53,6 +57,7 @@
@Slf4j
public class IncrementalSourceStreamFetcher implements Fetcher {
private final FetchTask.Context taskContext;
+ private final SchemaChangeResolver schemaChangeResolver;
private final ExecutorService executorService;
// has entered pure binlog mode
private final Set pureBinlogPhaseTables;
@@ -72,8 +77,12 @@ public class IncrementalSourceStreamFetcher implements Fetcher pollSplitRecords()
throws InterruptedException, SeaTunnelException {
checkReadException();
- final List sourceRecords = new ArrayList<>();
+
+ Iterator sourceRecordsIterator = Collections.emptyIterator();
if (streamFetchTask.isRunning()) {
List batch = queue.poll();
- for (DataChangeEvent event : batch) {
+ if (!batch.isEmpty()) {
+ if (schemaChangeResolver != null) {
+ sourceRecordsIterator = splitSchemaChangeStream(batch);
+ } else {
+ sourceRecordsIterator = splitNormalStream(batch);
+ }
+ }
+ }
+ return sourceRecordsIterator;
+ }
+
+ private Iterator splitNormalStream(List batchEvents) {
+ List sourceRecords = new ArrayList<>();
+ if (streamFetchTask.isRunning()) {
+ for (DataChangeEvent event : batchEvents) {
if (shouldEmit(event.getRecord())) {
sourceRecords.add(event.getRecord());
}
@@ -125,6 +149,92 @@ public Iterator pollSplitRecords()
return sourceRecordsSet.iterator();
}
+ /**
+ * Split schema change stream.
+ *
+ *