From 7135842c8f4af72c0add84846f9809a5ec8b9d74 Mon Sep 17 00:00:00 2001 From: Vitaliy Kukharik <37010174+vitabaks@users.noreply.github.com> Date: Thu, 21 Mar 2024 14:55:54 +0300 Subject: [PATCH] Analyze a PostgreSQL database (optimizer statistics) immediately after the upgrade (#601) --- pg_upgrade.yml | 42 ++++---------- roles/upgrade/README.md | 84 +++++++++++++--------------- roles/upgrade/tasks/post_upgrade.yml | 63 +++++++++++++++------ roles/upgrade/tasks/statistics.yml | 50 ++++++++++------- vars/upgrade.yml | 2 + 5 files changed, 131 insertions(+), 110 deletions(-) diff --git a/pg_upgrade.yml b/pg_upgrade.yml index bcd2dc23e..584862c3d 100644 --- a/pg_upgrade.yml +++ b/pg_upgrade.yml @@ -56,7 +56,7 @@ tags: - always -- name: "(1/7) PRE-UPGRADE: Perform Pre-Checks" +- name: "(1/6) PRE-UPGRADE: Perform Pre-Checks" hosts: 'primary:secondary' gather_facts: false become: true @@ -79,7 +79,7 @@ - upgrade - pre-checks -- name: "(2/7) PRE-UPGRADE: Install new PostgreSQL packages" +- name: "(2/6) PRE-UPGRADE: Install new PostgreSQL packages" hosts: 'primary:secondary' gather_facts: false become: true @@ -103,7 +103,7 @@ - upgrade-check - packages -- name: "(3/7) PRE-UPGRADE: Initialize new db, schema compatibility check, and pg_upgrade --check" +- name: "(3/6) PRE-UPGRADE: Initialize new db, schema compatibility check, and pg_upgrade --check" hosts: 'primary:secondary' gather_facts: false become: true @@ -143,7 +143,7 @@ - upgrade-check - schema-compatibility-check -- name: "(4/7) PRE-UPGRADE: Prepare the Patroni configuration" +- name: "(4/6) PRE-UPGRADE: Prepare the Patroni configuration" hosts: 'primary:secondary' gather_facts: false become: true @@ -165,7 +165,7 @@ - upgrade - update-config -- name: "(5/7) UPGRADE: Upgrade PostgreSQL" +- name: "(5/6) UPGRADE: Upgrade PostgreSQL" hosts: 'primary:secondary' gather_facts: false become: true @@ -228,7 +228,7 @@ tags: - upgrade -- name: "(6/7) POST-UPGRADE: Perform Post-Checks and Update extensions" +- name: "(6/6) POST-UPGRADE: Analyze a PostgreSQL database (update optimizer statistics) and Post-Upgrade tasks" hosts: 'primary:secondary' gather_facts: false become: true @@ -242,39 +242,23 @@ - name: Include upgrade variables ansible.builtin.include_vars: "vars/upgrade.yml" tasks: - - name: Running Post-Checks + - name: Analyze database ansible.builtin.include_role: name: upgrade - tasks_from: post_checks + tasks_from: statistics + tags: analyze, statistics - name: Update extensions ansible.builtin.include_role: name: upgrade tasks_from: extensions when: update_extensions | bool - tags: - - upgrade - - post-checks - - update-extensions + tags: update_extensions -- name: "(7/7) POST-UPGRADE: Analyze a PostgreSQL database (update optimizer statistics) and Post-Upgrade tasks" - hosts: 'primary:secondary' - gather_facts: false - become: true - become_user: postgres - any_errors_fatal: true - pre_tasks: - - name: Include main variables - ansible.builtin.include_vars: "vars/main.yml" - - name: Include OS-specific variables - ansible.builtin.include_vars: "vars/{{ ansible_os_family }}.yml" - - name: Include upgrade variables - ansible.builtin.include_vars: "vars/upgrade.yml" - tasks: - - name: Analyze database + - name: Running Post-Checks ansible.builtin.include_role: name: upgrade - tasks_from: statistics + tasks_from: post_checks - name: Running Post-Upgrade tasks ansible.builtin.include_role: @@ -283,7 +267,5 @@ tags: - upgrade - post-upgrade - - analyze - - statistics ... 
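For context, the consolidated final play does not change how the playbook is launched. A minimal sketch of an invocation (not part of the patch; the version numbers are placeholders, and re-running only the analyze step via the new `analyze`/`statistics` tags assumes the play's variable includes are still applied under tag filtering):

```bash
# Full upgrade run; substitute your actual old/new major versions.
ansible-playbook pg_upgrade.yml -e "pg_old_version=15 pg_new_version=16"

# Hypothetical targeted re-run of only the statistics collection step,
# using the tags added in this patch (analyze, statistics).
ansible-playbook pg_upgrade.yml -e "pg_old_version=15 pg_new_version=16" --tags analyze
```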
diff --git a/roles/upgrade/README.md b/roles/upgrade/README.md index db70850f0..dcb60b917 100644 --- a/roles/upgrade/README.md +++ b/roles/upgrade/README.md @@ -77,6 +77,7 @@ If these checks pass, the playbook switches back to the old PostgreSQL paths and | `schema_compatibility_check_timeout` | Max duration for compatibility check (pg_dumpall --schema-only) in seconds. | `3600` | | `vacuumdb_parallel_jobs` | Execute the analyze command in parallel by running `njobs` commands simultaneously. This option may reduce the processing time but it also increases the load on the database server. | all CPU cores | | `vacuumdb_analyze_timeout` | Max duration of analyze command in seconds. | `3600` | +| `vacuumdb_analyze_terminate_treshold` | Terminate active queries that are longer than the specified time (in seconds) during the collection of statistics (0 = do not terminate active backends). | `0` | | `update_extensions` | Automatically update all PostgreSQL extensions. | `true` | | `max_replication_lag_bytes` | Maximum allowed replication lag in bytes. | `10485760` | | `max_transaction_sec` | Maximum allowed duration for a transaction in seconds. | `15` | @@ -285,20 +286,10 @@ Please see the variable file vars/[upgrade.yml](../../vars/upgrade.yml) - Start vip-manager service - Make sure that the cluster ip address (VIP) is running -#### 6. POST-UPGRADE: Perform Post-Checks and Update extensions -- **Make sure that physical replication is active** - - if no active replication connections found, print error message: - - "No active replication connections found. Please check the replication status and PostgreSQL logs." -- **Create a table "test_replication" with 10000 rows on the Primary** -- **Wait until the PostgreSQL replica is synchronized** - - Notes: max wait time: 2 minutes -- **Drop a table "test_replication"** -- **Print the result of checking the number of records** - - if the number of rows match, print info message: - - "The PostgreSQL Replication is OK. The number of records in the 'test_replication' table the same as the Primary." - - if the number of rows does not match, print error message: - - "The number of records in the 'test_replication' table does not match the Primary. Please check the replication status and PostgreSQL logs." -- **Get a list of databases** +#### 6. POST-UPGRADE: Analyze a PostgreSQL database (update optimizer statistics) and Post-Upgrade tasks +- **Run vacuumdb to analyze the PostgreSQL databases** + - Note: Uses parallel processes equal to 50% of CPU cores ('`vacuumdb_parallel_jobs`' variable) + - Note: Before collecting statistics, the 'pg_terminator' script is launched to monitor and terminate any 'ANALYZE' blockers. Once statistics collection is complete, the script is stopped. - **Update extensions in each database** - Get list of installed PostgreSQL extensions - Get list of old PostgreSQL extensions @@ -310,33 +301,38 @@ Please see the variable file vars/[upgrade.yml](../../vars/upgrade.yml) - Notes: if pg_repack is installed - Notes: if there are no old extensions, print message: - "The extension versions are up-to-date for the database. No update is required." - -#### 7. POST-UPGRADE: Analyze a PostgreSQL database (update optimizer statistics) and Post-Upgrade tasks -- **Run vacuumdb to analyze the PostgreSQL databases** - - Notes: Uses parallel processes equal to CPU cores ('`vacuumdb_parallel_jobs`' variable) - - Notes: Before collecting statistics, the 'pg_terminator' script is launched to monitor and terminate any 'ANALYZE' blockers. 
Once statistics collection is complete, the script is stopped. - - Wait for the analyze to complete. - - Notes: max wait time: 1 hour ('`vacuumdb_analyze_timeout`' variable) -- **Ensure the current data directory is the new data directory** - - Notes: to prevent deletion the old directory if it is used -- **Delete the old PostgreSQL data directory** - - Notes: perform pg_dropcluster for Debian based -- **Delete the old PostgreSQL WAL directory** - - Notes: if 'pg_new_wal_dir' is defined -- **Remove old PostgreSQL packages** - - Notes: if 'pg_old_packages_remove' is 'true' -- **pgBackRest** (if 'pgbackrest_install' is 'true') - - Check pg-path option - - Update pg-path in pgbackrest.conf - - Upgrade stanza -- **WAL-G** (if 'wal_g_install' is 'true') - - Update PostgreSQL data directory path in .walg.json - - Update PostgreSQL data directory path in cron jobs -- **Check the Patroni cluster state** -- **Check the current PostgreSQL version** -- **Remove temporary local access rule from pg_hba.conf** - - Notes: if it has been changed - - Update the PostgreSQL configuration -- **Print info messages** - - List the Patroni cluster members - - Upgrade completed +- **Perform Post-Checks** + - Make sure that physical replication is active + - Note: if no active replication connections are found, print error message: "No active replication connections found. Please check the replication status and PostgreSQL logs." + - Create a table "test_replication" with 10000 rows on the Primary + - Wait until the PostgreSQL replica is synchronized (max wait time: 2 minutes) + - Drop a table "test_replication" + - Print the result of checking the number of records + - if the number of rows matches, print info message: "The PostgreSQL Replication is OK. The number of records in the 'test_replication' table the same as the Primary." + - if the number of rows does not match, print error message: "The number of records in the 'test_replication' table does not match the Primary. Please check the replication status and PostgreSQL logs." 
+- **Perform Post-Upgrade tasks** + - **Ensure the current data directory is the new data directory** + - Notes: to prevent deletion of the old directory if it is in use + - **Delete the old PostgreSQL data directory** + - Notes: perform pg_dropcluster for Debian-based systems + - **Delete the old PostgreSQL WAL directory** + - Notes: if 'pg_new_wal_dir' is defined + - **Remove old PostgreSQL packages** + - Notes: if 'pg_old_packages_remove' is 'true' + - **Remove temporary local access rule from pg_hba.conf** + - Notes: if it has been changed + - Update the PostgreSQL configuration + - **pgBackRest** (if 'pgbackrest_install' is 'true') + - Check pg-path option + - Update pg-path in pgbackrest.conf + - Upgrade stanza + - **WAL-G** (if 'wal_g_install' is 'true') + - Update PostgreSQL data directory path in .walg.json + - Update PostgreSQL data directory path in cron jobs + - **Wait for the analyze to complete.** + - Notes: max wait time: 1 hour ('`vacuumdb_analyze_timeout`' variable) + - **Check the Patroni cluster state** + - **Check the current PostgreSQL version** + - **Print info messages** + - List the Patroni cluster members + - Upgrade completed diff --git a/roles/upgrade/tasks/post_upgrade.yml b/roles/upgrade/tasks/post_upgrade.yml index 591eb9a89..6e78cc4d3 100644 --- a/roles/upgrade/tasks/post_upgrade.yml +++ b/roles/upgrade/tasks/post_upgrade.yml @@ -71,7 +71,21 @@ - pg_old_packages_remove | bool - ansible_os_family == "Debian" -# pgbackrest (local) +# Return the pg_hba.conf file to its original state (if it has been changed) +- block: + - name: Remove temporary local access rule from pg_hba.conf + ansible.builtin.blockinfile: + path: "{{ pg_new_confdir }}/pg_hba.conf" + marker: "# {mark} ANSIBLE TEMPORARY pg_upgrade RULE" + state: absent + + - name: Update the PostgreSQL configuration + ansible.builtin.command: "{{ pg_new_bindir }}/pg_ctl reload -D {{ pg_new_datadir }}" + when: + - socket_access_result.stderr is defined + - "'no pg_hba.conf entry' in socket_access_result.stderr" + +# pgBackRest (local) - block: - name: pgbackrest | Check pg-path option ansible.builtin.command: "grep -c '^pg.*-path=' {{ pgbackrest_conf_file }}" @@ -99,7 +113,7 @@ - pgbackrest_install | bool - pgbackrest_repo_host | length < 1 -# pgbackrest (dedicated) +# pgBackRest (dedicated) - block: - name: pgbackrest | Check pg-path option delegate_to: "{{ groups['pgbackrest'][0] }}" @@ -151,6 +165,36 @@ ignore_errors: true when: wal_g_install | bool +# Wait for the analyze to complete +- name: "Collecting statistics in progress. Wait for the analyze to complete." 
+ ansible.builtin.async_status: + jid: "{{ vacuumdb_analyze.ansible_job_id }}" + register: vacuumdb_analyze_job_result + until: vacuumdb_analyze_job_result.finished + retries: "{{ (vacuumdb_analyze_timeout | int) // 10 }}" # max wait time + delay: 10 + ignore_errors: true # ignore errors if the task runs longer than vacuumdb_analyze_timeout + when: + - vacuumdb_analyze is defined + - vacuumdb_analyze.ansible_job_id is defined + +- name: "Stop pg_terminator script" + ansible.builtin.shell: | + while read pid; do + if ps -p $pid > /dev/null 2>&1; then + echo "Stopping pg_terminator with pid: $pid" >> /tmp/pg_terminator.log + kill -9 $pid + else + echo "No process found for pid: $pid" >> /tmp/pg_terminator.log + fi + done < /tmp/pg_terminator.pid + args: + executable: /bin/bash + ignore_errors: true + when: (pg_terminator_analyze is defined and pg_terminator_analyze is changed) or + (pg_terminator_long_transactions is defined and pg_terminator_long_transactions is changed) + +# finish (info) - name: Check the Patroni cluster state run_once: true become: true @@ -171,21 +215,6 @@ changed_when: false when: inventory_hostname in groups['primary'] -# Return the pg_hba.conf file to its original state (if it has been changed) -- block: - - name: Remove temporary local access rule from pg_hba.conf - ansible.builtin.blockinfile: - path: "{{ pg_new_confdir }}/pg_hba.conf" - marker: "# {mark} ANSIBLE TEMPORARY pg_upgrade RULE" - state: absent - - - name: Update the PostgreSQL configuration - ansible.builtin.command: "{{ pg_new_bindir }}/pg_ctl reload -D {{ pg_new_datadir }}" - when: - - socket_access_result.stderr is defined - - "'no pg_hba.conf entry' in socket_access_result.stderr" - -# finish (info) - name: List the Patroni cluster members run_once: true ansible.builtin.debug: diff --git a/roles/upgrade/tasks/statistics.yml b/roles/upgrade/tasks/statistics.yml index 706171117..a2e57b18d 100644 --- a/roles/upgrade/tasks/statistics.yml +++ b/roles/upgrade/tasks/statistics.yml @@ -9,7 +9,7 @@ - block: # Monitor the locks and terminate the backend blocking the 'ANALYZE' query (for more than 15 seconds) - - name: "Start pg_terminator script: Monitor locks and terminate the 'ANALYZE' blockers" + - name: "pg_terminator: Monitor locks and terminate the 'ANALYZE' blockers" ansible.builtin.shell: | echo $$ > /tmp/pg_terminator.pid for i in {1..{{ vacuumdb_analyze_timeout // 10 }}}; do @@ -41,6 +41,36 @@ ignore_errors: true # ignore errors if the task runs for over an 'vacuumdb_analyze_timeout'. 
when: pg_new_version is version('9.6', '>=') + # Monitor long-running transactions and terminate them (running longer than 'vacuumdb_analyze_terminate_treshold') + - name: "pg_terminator: Monitor and terminate long-running transactions (more than {{ max_tx_sec }} seconds) while collecting statistics" + ansible.builtin.shell: | + echo $$ >> /tmp/pg_terminator.pid + for i in {1..{{ vacuumdb_analyze_timeout // 10 }}}; do + {{ pg_new_bindir }}/psql -p {{ postgresql_port }} -U {{ patroni_superuser_username }} -d postgres -tAXc " + select + clock_timestamp(), + pg_terminate_backend(pid), + pid, + clock_timestamp() - xact_start as xact_age, + left(regexp_replace(query, E'[ \\t\\n\\r]+', ' ', 'g'),150) as query + from pg_stat_activity + where + backend_type = 'client backend' and pid <> pg_backend_pid() + and query not ilike 'ANALYZE %' + and xact_start < clock_timestamp() - interval '{{ max_tx_sec }}s';" >> /tmp/pg_terminator.log + sleep 10 + done + args: + executable: /bin/bash + async: "{{ vacuumdb_analyze_timeout }}" # run the command asynchronously with a maximum duration + poll: 0 + register: pg_terminator_long_transactions + ignore_errors: true # ignore errors if the task runs longer than 'vacuumdb_analyze_timeout'. + vars: + max_tx_sec: "{{ vacuumdb_analyze_terminate_treshold }}" + when: pg_new_version is version('10', '>=') and vacuumdb_analyze_terminate_treshold | int > 0 + + # ANALYZE - name: "Run vacuumdb to analyze the PostgreSQL databases" ansible.builtin.command: > {{ pg_new_bindir }}/vacuumdb -p {{ postgresql_port }} @@ -49,24 +79,6 @@ poll: 0 register: vacuumdb_analyze ignore_errors: true # ignore errors if the task runs for over an 'vacuumdb_analyze_timeout'. - - - name: "Collecting statistics in progress. Wait for the analyze to complete." - ansible.builtin.async_status: - jid: "{{ vacuumdb_analyze.ansible_job_id }}" - register: vacuumdb_analyze_job_result - until: vacuumdb_analyze_job_result.finished - retries: "{{ (vacuumdb_analyze_timeout | int) // 10 }}" # max wait time - delay: 10 - ignore_errors: true # ignore errors if the task runs for over an vacuumdb_analyze_timeout - - - name: "Stop pg_terminator script" - ansible.builtin.shell: | - pid=$(cat /tmp/pg_terminator.pid) - ps -p $pid > /dev/null 2>&1 && kill -9 $pid - args: - executable: /bin/bash - ignore_errors: true - when: pg_terminator_analyze is changed when: inventory_hostname in groups['primary'] ... diff --git a/vars/upgrade.yml b/vars/upgrade.yml index 0e3f66fb5..55ee30fc7 100644 --- a/vars/upgrade.yml +++ b/vars/upgrade.yml @@ -66,6 +66,8 @@ update_extensions: true # if 'true', try to update extensions automatically vacuumdb_parallel_jobs: "{{ [ansible_processor_vcpus | int // 2, 1] | max }}" # use 50% CPU cores vacuumdb_analyze_timeout: 3600 # seconds. The maximum duration of analyze command (soft limit, exceeding won't halt playbook) +# terminate active queries that run longer than the specified time (in seconds) during the collection of statistics. +vacuumdb_analyze_terminate_treshold: 0 # (0 = do not terminate active backends) # Do not perform an upgrade if max_replication_lag_bytes: 10485760 # 10 MiB - Maximum allowed replication lag in bytes
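Since `vacuumdb` is started asynchronously and `pg_terminator` writes to `/tmp/pg_terminator.log`, a few read-only spot-checks on the primary can help when testing this change. This is a sketch rather than part of the patch: the port and user are placeholders for `postgresql_port` and `patroni_superuser_username`, and `pg_stat_progress_analyze` is only available on PostgreSQL 13 and newer.

```bash
# What pg_terminator has logged (blocked or terminated backends), if anything.
tail -n 50 /tmp/pg_terminator.log

# Sessions currently running ANALYZE statements issued by vacuumdb.
psql -p 5432 -U postgres -d postgres -tAXc \
  "select pid, state, left(query, 60) as query from pg_stat_activity where query ilike 'analyze%';"

# Per-table ANALYZE progress (PostgreSQL 13+ only).
psql -p 5432 -U postgres -d postgres -tAXc \
  "select relid::regclass as relation, phase from pg_stat_progress_analyze;"
```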