Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve error handling and add retry logic #11734

Merged
merged 13 commits into from
Nov 8, 2023
6 changes: 5 additions & 1 deletion salt/common/tools/sbin/so-common
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,10 @@ retry() {
echo "<Start of output>"
echo "$output"
echo "<End of output>"
if [[ $exitcode -eq 0 ]]; then
echo "Forcing exit code to 1"
exitcode=1
fi
fi
elif [ -n "$failedOutput" ]; then
if [[ "$output" =~ "$failedOutput" ]]; then
Expand All @@ -405,7 +409,7 @@ retry() {
echo "$output"
echo "<End of output>"
if [[ $exitcode -eq 0 ]]; then
echo "The exitcode was 0, but we are setting to 1 since we found $failedOutput in the output."
echo "Forcing exit code to 1"
exitcode=1
fi
else
Expand Down
10 changes: 7 additions & 3 deletions salt/elasticsearch/enabled.sls
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ escomponenttemplates:
- group: 939
- clean: True
- onchanges_in:
- cmd: so-elasticsearch-templates
- file: so-elasticsearch-templates-reload

# Auto-generate templates from defaults file
{% for index, settings in ES_INDEX_SETTINGS.items() %}
Expand All @@ -123,7 +123,7 @@ es_index_template_{{index}}:
TEMPLATE_CONFIG: {{ settings.index_template }}
- template: jinja
- onchanges_in:
- cmd: so-elasticsearch-templates
- file: so-elasticsearch-templates-reload
{% endif %}
{% endfor %}

Expand All @@ -142,7 +142,7 @@ es_template_{{TEMPLATE.split('.')[0] | replace("/","_") }}:
- user: 930
- group: 939
- onchanges_in:
- cmd: so-elasticsearch-templates
- file: so-elasticsearch-templates-reload
{% endfor %}
{% endif %}

Expand All @@ -167,6 +167,10 @@ so-elasticsearch-ilm-policy-load:
- onchanges:
- file: so-elasticsearch-ilm-policy-load-script

so-elasticsearch-templates-reload:
file.absent:
- name: /opt/so/state/estemplates.txt

so-elasticsearch-templates:
cmd.run:
- name: /usr/sbin/so-elasticsearch-templates-load
Expand Down
38 changes: 8 additions & 30 deletions salt/elasticsearch/tools/sbin/so-elasticsearch-pipelines
Original file line number Diff line number Diff line change
Expand Up @@ -6,49 +6,27 @@

. /usr/sbin/so-common


RETURN_CODE=0
ELASTICSEARCH_HOST=$1
ELASTICSEARCH_PORT=9200

# Define a default directory to load pipelines from
ELASTICSEARCH_INGEST_PIPELINES="/opt/so/conf/elasticsearch/ingest/"

# Wait for ElasticSearch to initialize

if [ ! -f /opt/so/state/espipelines.txt ]; then

echo "State file /opt/so/state/espipelines.txt not found. Running so-elasticsearch-pipelines."
echo -n "Waiting for ElasticSearch..."
COUNT=0
ELASTICSEARCH_CONNECTED="no"
while [[ "$COUNT" -le 240 ]]; do
curl -K /opt/so/conf/elasticsearch/curl.config -k --output /dev/null --silent --head --fail -L https://"$ELASTICSEARCH_HOST":"$ELASTICSEARCH_PORT"
if [ $? -eq 0 ]; then
ELASTICSEARCH_CONNECTED="yes"
echo "connected!"
break
else
((COUNT+=1))
sleep 1
echo -n "."
fi
done
if [ "$ELASTICSEARCH_CONNECTED" == "no" ]; then
echo
echo -e "Connection attempt timed out. Unable to connect to ElasticSearch. \nPlease try: \n -checking log(s) in /var/log/elasticsearch/\n -running 'sudo docker ps' \n -running 'sudo so-elastic-restart'"
echo
fi
retry 240 1 "so-elasticsearch-query / -k --output /dev/null --silent --head --fail" || fail "Connection attempt timed out. Unable to connect to ElasticSearch. \nPlease try: \n -checking log(s) in /var/log/elasticsearch/\n -running 'sudo docker ps' \n -running 'sudo so-elastic-restart'"

cd ${ELASTICSEARCH_INGEST_PIPELINES}

echo "Loading pipelines..."
for i in .[a-z]* *; do echo $i; RESPONSE=$(curl -K /opt/so/conf/elasticsearch/curl.config -k -XPUT -L https://${ELASTICSEARCH_HOST}:${ELASTICSEARCH_PORT}/_ingest/pipeline/$i -H 'Content-Type: application/json' -d@$i 2>/dev/null); echo $RESPONSE; if [[ "$RESPONSE" == *"error"* ]]; then RETURN_CODE=1; fi; done
for i in .[a-z]* *;
do
echo $i;
retry 5 5 "so-elasticsearch-query _ingest/pipeline/$i -d@$i -XPUT | grep '{\"acknowledged\":true}'" || fail "Could not load pipeline: $i"
done
echo

cd - >/dev/null
if [[ "$RETURN_CODE" != "1" ]]; then
touch /opt/so/state/espipelines.txt
fi
else
exit $RETURN_CODE
touch /opt/so/state/espipelines.txt
fi
236 changes: 137 additions & 99 deletions salt/elasticsearch/tools/sbin_jinja/so-elasticsearch-templates-load
Original file line number Diff line number Diff line change
Expand Up @@ -7,105 +7,143 @@
{% from 'vars/globals.map.jinja' import GLOBALS %}
{%- set SUPPORTED_PACKAGES = salt['pillar.get']('elasticfleet:packages', default=ELASTICFLEETDEFAULTS.elasticfleet.packages, merge=True) %}

. /usr/sbin/so-common
{% if GLOBALS.role != 'so-heavynode' %}
if [ -f /usr/sbin/so-elastic-fleet-common ]; then
. /usr/sbin/so-elastic-fleet-common
STATE_FILE_INITIAL=/opt/so/state/estemplates_initial_load_attempt.txt
STATE_FILE_SUCCESS=/opt/so/state/estemplates.txt

if [[ -f $STATE_FILE_INITIAL ]]; then
# The initial template load has already run. As this is a subsequent load, all dependencies should
# already be satisified. Therefore, immediately exit/abort this script upon any template load failure
# since this is an unrecoverable failure.
should_exit_on_failure=1
else
# This is the initial template load, and there likely are some components not yet setup in Elasticsearch.
# Therefore load as many templates as possible at this time and if an error occurs proceed to the next
# template. But if at least one template fails to load do not mark the templates as having been loaded.
# This will allow the next load to resume the load of the templates that failed to load initially.
should_exit_on_failure=0
echo "This is the initial template load"
fi
{% endif %}

default_conf_dir=/opt/so/conf

# Define a default directory to load pipelines from
ELASTICSEARCH_TEMPLATES="$default_conf_dir/elasticsearch/templates/"

{% if GLOBALS.role == 'so-heavynode' %}
file="/opt/so/conf/elasticsearch/templates/index/so-common-template.json"
{% else %}
file="/usr/sbin/so-elastic-fleet-common"
{% endif %}

if [ -f "$file" ]; then
# Wait for ElasticSearch to initialize
echo -n "Waiting for ElasticSearch..."
COUNT=0
ELASTICSEARCH_CONNECTED="no"
while [[ "$COUNT" -le 240 ]]; do
so-elasticsearch-query / -k --output /dev/null --silent --head --fail
if [ $? -eq 0 ]; then
ELASTICSEARCH_CONNECTED="yes"
echo "connected!"
break
else
((COUNT+=1))
sleep 1
echo -n "."
fi
done
if [ "$ELASTICSEARCH_CONNECTED" == "no" ]; then
echo
echo -e "Connection attempt timed out. Unable to connect to ElasticSearch. \nPlease try: \n -checking log(s) in /var/log/elasticsearch/\n -running 'sudo docker ps' \n -running 'sudo so-elastic-restart'"
echo
exit 1
fi

{% if GLOBALS.role != 'so-heavynode' %}
SESSIONCOOKIE=$(curl -s -K /opt/so/conf/elasticsearch/curl.config -c - -X GET http://localhost:5601/ | grep sid | awk '{print $7}')
INSTALLED=$(elastic_fleet_package_is_installed {{ SUPPORTED_PACKAGES[0] }} )
if [ "$INSTALLED" != "installed" ]; then
echo
echo "Packages not yet installed."
echo
exit 0
fi
{% endif %}
set -e

cd ${ELASTICSEARCH_TEMPLATES}/component/ecs

echo "Loading ECS component templates..."
for i in *; do TEMPLATE=$(echo $i | cut -d '.' -f1); echo "$TEMPLATE-mappings"; so-elasticsearch-query _component_template/$TEMPLATE-mappings -d@$i -XPUT 2>/dev/null; echo; done

cd ${ELASTICSEARCH_TEMPLATES}/component/elastic-agent

echo "Loading Elastic Agent component templates..."
{% if GLOBALS.role == 'so-heavynode' %}
component_pattern="so-*"
{% else %}
component_pattern="*"
{% endif %}
for i in $component_pattern; do TEMPLATE=${i::-5}; echo "$TEMPLATE"; so-elasticsearch-query _component_template/$TEMPLATE -d@$i -XPUT 2>/dev/null; echo; done

# Load SO-specific component templates
cd ${ELASTICSEARCH_TEMPLATES}/component/so

echo "Loading Security Onion component templates..."
for i in *; do TEMPLATE=$(echo $i | cut -d '.' -f1); echo "$TEMPLATE"; so-elasticsearch-query _component_template/$TEMPLATE -d@$i -XPUT 2>/dev/null; echo; done
echo

# Load SO index templates
cd ${ELASTICSEARCH_TEMPLATES}/index

echo "Loading Security Onion index templates..."
shopt -s extglob
{% if GLOBALS.role == 'so-heavynode' %}
pattern="!(*1password*|*aws*|*azure*|*cloudflare*|*elastic_agent*|*fim*|*github*|*google*|*osquery*|*system*|*windows*)"
{% else %}
pattern="*"
{% endif %}
for i in $pattern; do
TEMPLATE=${i::-14};
echo "$TEMPLATE";
so-elasticsearch-query _index_template/$TEMPLATE -d@$i -XPUT 2>/dev/null;
echo;
done
echo

load_failures=0

load_template() {
uri=$1
file=$2

echo "Loading template file $i"
if ! retry 3 5 "so-elasticsearch-query $uri -d@$file -XPUT" "{\"acknowledged\":true}"; then
if [[ $should_exit_on_failure -eq 1 ]]; then
fail "Could not load template file: $file"
else
load_failures=$((load_failures+1))
echo "Incremented load failure counter: $load_failures"
fi
fi
}

if [ ! -f $STATE_FILE_SUCCESS ]; then
echo "State file $STATE_FILE_SUCCESS not found. Running so-elasticsearch-templates-load."

. /usr/sbin/so-common

{% if GLOBALS.role != 'so-heavynode' %}
if [ -f /usr/sbin/so-elastic-fleet-common ]; then
. /usr/sbin/so-elastic-fleet-common
fi
{% endif %}

default_conf_dir=/opt/so/conf

# Define a default directory to load pipelines from
ELASTICSEARCH_TEMPLATES="$default_conf_dir/elasticsearch/templates/"

{% if GLOBALS.role == 'so-heavynode' %}
file="/opt/so/conf/elasticsearch/templates/index/so-common-template.json"
{% else %}
file="/usr/sbin/so-elastic-fleet-common"
{% endif %}

if [ -f "$file" ]; then
# Wait for ElasticSearch to initialize
echo -n "Waiting for ElasticSearch..."
retry 240 1 "so-elasticsearch-query / -k --output /dev/null --silent --head --fail" || fail "Connection attempt timed out. Unable to connect to ElasticSearch. \nPlease try: \n -checking log(s) in /var/log/elasticsearch/\n -running 'sudo docker ps' \n -running 'sudo so-elastic-restart'"
{% if GLOBALS.role != 'so-heavynode' %}
SESSIONCOOKIE=$(curl -s -K /opt/so/conf/elasticsearch/curl.config -c - -X GET http://localhost:5601/ | grep sid | awk '{print $7}')
INSTALLED=$(elastic_fleet_package_is_installed {{ SUPPORTED_PACKAGES[0] }} )
if [ "$INSTALLED" != "installed" ]; then
echo
echo "Packages not yet installed."
echo
exit 0
fi
{% endif %}

touch $STATE_FILE_INITIAL

cd ${ELASTICSEARCH_TEMPLATES}/component/ecs

echo "Loading ECS component templates..."
for i in *; do
TEMPLATE=$(echo $i | cut -d '.' -f1)
load_template "_component_template/${TEMPLATE}-mappings" "$i"
done
echo

cd ${ELASTICSEARCH_TEMPLATES}/component/elastic-agent

echo "Loading Elastic Agent component templates..."
{% if GLOBALS.role == 'so-heavynode' %}
component_pattern="so-*"
{% else %}
component_pattern="*"
{% endif %}
for i in $component_pattern; do
TEMPLATE=${i::-5}
load_template "_component_template/$TEMPLATE" "$i"
done
echo

# Load SO-specific component templates
cd ${ELASTICSEARCH_TEMPLATES}/component/so

echo "Loading Security Onion component templates..."
for i in *; do
TEMPLATE=$(echo $i | cut -d '.' -f1);
load_template "_component_template/$TEMPLATE" "$i"
done
echo

# Load SO index templates
cd ${ELASTICSEARCH_TEMPLATES}/index

echo "Loading Security Onion index templates..."
shopt -s extglob
{% if GLOBALS.role == 'so-heavynode' %}
pattern="!(*1password*|*aws*|*azure*|*cloudflare*|*elastic_agent*|*fim*|*github*|*google*|*osquery*|*system*|*windows*)"
{% else %}
pattern="*"
{% endif %}
for i in $pattern; do
TEMPLATE=${i::-14}
load_template "_index_template/$TEMPLATE" "$i"
done
else
{% if GLOBALS.role == 'so-heavynode' %}
echo "Common template does not exist. Exiting..."
{% else %}
echo "Elastic Fleet not configured. Exiting..."
{% endif %}
exit 0
fi

cd - >/dev/null

if [[ $load_failures -eq 0 ]]; then
echo "All template loaded successfully"
touch $STATE_FILE_SUCCESS
else
echo "Encountered $load_failures templates that were unable to load, likely due to missing dependencies that will be available later; will retry on next highstate"
fi
else
{% if GLOBALS.role == 'so-heavynode' %}
echo "Common template does not exist. Exiting..."
{% else %}
echo "Elastic Fleet not configured. Exiting..."
{% endif %}
exit 0
echo "Templates already loaded"
fi
cd - >/dev/null
4 changes: 2 additions & 2 deletions setup/so-verify
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ log_has_errors() {
# Failed to restart snapd.mounts-pre.target: Operation refused, unit snapd.mounts-pre.target
# may be requested by dependency only (it is configured to refuse manual start/stop).

# Exit code 100 failure is likely apt-get running in the background, we wait for it to unlock.
# Command failed with exit code is output during retry loops.

grep -E "FAILED|Failed|failed|ERROR|Result: False|Error is not recoverable" "$setup_log" | \
grep -vE "The Salt Master has cached the public key for this node" | \
Expand All @@ -57,7 +57,7 @@ log_has_errors() {
grep -vE "Login Failed Details" | \
grep -vE "response from daemon: unauthorized" | \
grep -vE "Reading first line of patchfile" | \
grep -vE "Command failed with exit code 100; will retry" | \
grep -vE "Command failed with exit code" | \
grep -vE "Running scope as unit" &> "$error_log"

if [[ $? -eq 0 ]]; then
Expand Down