Skip to content

Commit

Permalink
Make harvester more resilient to bad paths
Browse files Browse the repository at this point in the history
The harvester now rejects files that have control characters in their filename or invalid utf-8 sequences.

Program runner is also more resilient to parsing invalid characters.

Fixes #702
  • Loading branch information
atruskie committed Jan 16, 2025
1 parent bdf451e commit 3c61290
Show file tree
Hide file tree
Showing 9 changed files with 240 additions and 26 deletions.
1 change: 1 addition & 0 deletions baw-server.code-workspace
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@
"Railtie",
"rakefile",
"rdebug",
"readables",
"realdirpath",
"realpath",
"recaptcha",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ def initialize(timeout_sec, logger)
end

# Execute an external program.
# Do not use if your program will write binary (non UTF-8 compatible) data to stdout or stderr.
# ! Can return invalid UTF-8 byte sequences.
# @param [String] command
# @param [Boolean] raise_exit_error
# @return [Hash] result hash
Expand All @@ -44,17 +46,20 @@ def execute(command, raise_exit_error = true)
pid = pid_inner
end
rescue StandardError => e
@logger.fatal(@class_name) do e end
@logger.fatal(@class_name) do
e
end
raise e
end

status_msg = "status=#{status.exitstatus};killed=#{killed};pid=#{pid};"
timeout_msg = "time_out_sec=#{@timeout_sec};time_taken_sec=#{time};timed_out=#{timed_out};"
exceptions_msg = "exceptions=#{exceptions.inspect};"
output_msg = "\n\tStandard output: #{stdout_str}\n\tStandard Error: #{stderr_str}\n\n"

msg = "External Program: #{status_msg}#{timeout_msg}#{exceptions_msg}command=#{command}#{output_msg}"

if (!stderr_str.blank? && !status.success?) || timed_out || killed
if (stderr_str.present? && !status.success?) || timed_out || killed
@logger.warn(@class_name) { msg }
else
@logger.debug(@class_name) { msg }
Expand Down Expand Up @@ -94,17 +99,19 @@ def run_with_timeout(*opts)
timeout = 60 if timeout.nil?
cleanup_sleep = options[:cleanup_sleep]

output = ''
error = ''
# be specific about encoding to avoid encoding issues
# Note: external programs may output invalid UTF-8 byte sequences
output = String.new(encoding: Encoding::UTF_8)
error = String.new(encoding: Encoding::UTF_8)

# Start task in another thread, which spawns a process
Open3.popen3(*opts) do |_stdin, stdout, stderr, thread|
# Get the pid of the spawned process
pid = thread[:pid]
start = Time.now
start = Time.zone.now

exceptions = []
while (time_remaining = (Time.now - start) < timeout) && thread.alive?
while (time_remaining = (Time.zone.now - start) < timeout) && thread.alive?
exceptions.push read_to_stream(stdout, stderr, output, error, options)
end

Expand Down Expand Up @@ -141,8 +148,8 @@ def read_to_stream(stdout, stderr, output, error, options)
exceptions = []

# Wait up to `tick` seconds for output/error data
readables, writeables, = Kernel.select([stdout, stderr], nil, nil, tick)
unless readables.blank?
readables, = Kernel.select([stdout, stderr], nil, nil, tick)
if readables.present?
readables.each do |readable|
stream = readable == stdout ? output : error
begin
Expand All @@ -151,7 +158,7 @@ def read_to_stream(stdout, stderr, output, error, options)
# Need to read all of both streams
# Keep going until thread dies
exceptions.push(e)
rescue EOFError => e
rescue EOFError
# ignore EOFErrors
end
end
Expand All @@ -166,7 +173,11 @@ def read_to_stream(stdout, stderr, output, error, options)
end

def read_linux(stream, readable, buffer_size)
stream << readable.read_nonblock(buffer_size)
# Sometimes incompatible byte sequences are output by external programs.
# We want the encoding to stay UTF-8 so we don't get incompatible encoding errors
# on string operations. Invalid UTF-8 byte sequences aren't great but it's better
# than exceptions on normal operations.
stream << readable.read_nonblock(buffer_size).force_encoding(Encoding::UTF_8)
end
end
end
19 changes: 19 additions & 0 deletions lib/gems/baw-workers/lib/baw_workers/jobs/harvest/harvest_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class << self
# @return [Boolean] status of job enqueue, true if successful
def enqueue_file(harvest, rel_path, should_harvest:, debounce_on_recent_metadata_extraction: false)
rel_path = rel_path.to_s if rel_path.is_a?(Pathname)
rel_path = check_filename_encoding!(rel_path)

# try and find an existing record
item = existing_harvest_item(harvest, rel_path)
Expand Down Expand Up @@ -69,6 +70,24 @@ def enqueue(id, should_harvest:)

private

# Check if the path is valid UTF-8, if not rename the file to a scrubbed version
# replacing invalid characters with the unicode replacement character.
# @param rel_path [String]
# @return [String] the scrubbed rel_path
def check_filename_encoding!(rel_path)
return rel_path if rel_path.valid_encoding?

scrubbed = rel_path.scrub

# attempt to rename the file so we don't lose it
original = "#{Settings.root_to_do_path}/#{rel_path}"
new_path = "#{Settings.root_to_do_path}/#{scrubbed}"

File.rename(original, new_path)

scrubbed
end

# @param harvest [Harvest]
# @param rel_path [String] the path within harvester_to_do to process
def new_harvest_item(harvest, rel_path)
Expand Down
42 changes: 36 additions & 6 deletions lib/gems/baw-workers/lib/baw_workers/jobs/harvest/validations.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ def simple_validations
FileExists,
IsAFile,
FileEmpty,
TargetType

TargetType,
ValidFileNameCharacters
]
end

Expand Down Expand Up @@ -85,7 +85,7 @@ class FileEmpty < Validation

def validate(harvest_item)
size = harvest_item.absolute_path.size
return not_fixable("File has no content (#{size} bytes)") unless size.positive?
not_fixable("File has no content (#{size} bytes)") unless size.positive?
end
end

Expand All @@ -105,6 +105,36 @@ def validate(harvest_item)
end
end

# Do match C control characters (Control, Format, Unassigned, Private Use, Surrogate)
# Also match the replacement character (�). In other parts of the code we have
# to scrub invalid UTF-8 sequences and replace them with this character.
# We do this so we can generate a database record for the invalid harvest item
# (we can't save invalid UTF-8 to the database). Otherwise the files are ignored
# without error.
INVALID_FILENAME_CHARACTERS = /[\p{C}�]/

class ValidFileNameCharacters < Validation
validation_name :invalid_filename_characters

def validate(harvest_item)
name = File.basename(harvest_item.path).force_encoding(Encoding::UTF_8)

unless name.valid_encoding?

name = name.scrub

return fixable("Filename has invalid characters. Remove problem characters where indicated `#{name}`")

end

# check for non-printable characters
changed = name.gsub!(INVALID_FILENAME_CHARACTERS, '�')
return if changed.nil?

fixable("Filename has invalid characters. Remove problem characters where indicated `#{name}`")
end
end

class RecordedDate < Validation
validation_name :missing_date

Expand All @@ -127,7 +157,7 @@ def validate(harvest_item)
# if there's no date at all there's no point in checking this
return if recorded_date_local.blank?

return unless utc_offset.blank?
return if utc_offset.present?

fixable('Only a local date/time was found, supply a UTC offset')
end
Expand All @@ -143,7 +173,7 @@ def validate(harvest_item)
# (prior valiations should have already caught this)
return if recorded_date.blank?

return if recorded_date <= Time.now
return if recorded_date <= Time.zone.now

fixable('This file has a recorded date in the future')
end
Expand All @@ -154,7 +184,7 @@ class NoDuration < Validation

def validate(harvest_item)
duration = harvest_item.info.file_info[:duration_seconds]
return unless duration.blank?
return if duration.present?

not_fixable('Could not find a duration for this file')
end
Expand Down
5 changes: 5 additions & 0 deletions spec/fixtures/fixtures.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ def self.sqlite_fixture
'/sub_dir_1/BLENDED.Tile_20160727T124536Z_3.2.png' => 100_993
}.freeze

# A filename with some non-printable characters in it.
# https://github.com/QutEcoacoustics/baw-server/issues/702
# @return [String]
MIXED_ENCODING_FILENAME = "20231205T\xe2\x80\x8f\xe2\x80\x8e140158_TURTNEST22_n1_m1.wav"

# @return [Pathname]
def self.zip_fixture
ensure_exists FILES_PATH / 'compressed.zip'
Expand Down
38 changes: 28 additions & 10 deletions spec/lib/gems/baw_audio_tools/run_external_program_spec.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# frozen_string_literal: true



describe BawAudioTools::RunExternalProgram do
include_context 'audio base'

Expand All @@ -12,14 +10,16 @@ def test_is_not_running(message)
# 0 = A value of 0 will cause error checking to be performed (with no signal being sent).
# This can be used to check the validity of pid.
is_running = (begin
!!Process.kill(0, pid)
rescue StandardError
false
end)
!Process.kill(0, pid).nil?
rescue StandardError
false
end)

expect(is_running).to be_falsey
end

let(:sleep_range) { 0.5 }

it 'check timeout is enforced' do
run_program = BawAudioTools::RunExternalProgram.new(3, logger)

Expand All @@ -38,8 +38,6 @@ def test_is_not_running(message)
test_is_not_running(error.message)
end

let(:sleep_range) { 0.5 }

it 'check timeout does not impact successful execution' do
run_program = BawAudioTools::RunExternalProgram.new(
Settings.audio_tools_timeout_sec,
Expand Down Expand Up @@ -82,15 +80,35 @@ def test_is_not_running(message)
run_program = BawAudioTools::RunExternalProgram.new(100, logger)

command = "bash -c 'exit 255'"
error = nil
raise_exit_error = false

result = run_program.execute(command, raise_exit_error)

expect(result[:exit_code]).to eq(255)
expect(result[:success]).to eq(false)
expect(result[:success]).to be(false)
expect(result[:execute_msg]).to match(/#{command}/)

test_is_not_running(result[:execute_msg])
end

# https://github.com/QutEcoacoustics/baw-server/issues/702
it 'handles utf-8 strings with non-printable characters in stderr and stdout' do
run_program = BawAudioTools::RunExternalProgram.new(1, logger)

problematic_name = Fixtures::MIXED_ENCODING_FILENAME
target = temp_file(basename: problematic_name)
target.write(Fixtures.audio_file_mono.read)

result = run_program.execute(%(ffmpeg -hide_banner -loglevel repeat+verbose -nostdin -i "#{target}" -codec copy -f null -))

expect(result).to match(a_hash_including(
exit_code: 0,
success: true,
stdout: an_instance_of(String).and(string_to_have_encoding(Encoding::UTF_8)),
stderr: an_instance_of(String)
.and(string_to_have_encoding(Encoding::UTF_8))
.and(match(problematic_name)),
execute_msg: an_instance_of(String).and(string_to_have_encoding(Encoding::UTF_8))
))
end
end
Loading

0 comments on commit 3c61290

Please sign in to comment.