Skip to content

Enable test reruns on failed fragile tests #217

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Sep 24, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions lib/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import sys
import traceback
from functools import partial
from hashlib import md5

try:
from cStringIO import StringIO
Expand Down Expand Up @@ -152,9 +153,9 @@ def run(self, server):
it to stdout.

Returns short status of the test as a string: 'skip', 'pass',
'new', 'updated' or 'fail'. There is also one possible value for
short_status, 'disabled', but it returned in the caller,
TestSuite.run_test().
'new', 'updated' or 'fail' and results file checksum on fail.
There is also one possible value for short_status, 'disabled',
but it returned in the caller, TestSuite.run_test().
"""

# Note: test was created before certain worker become known, so we need
Expand Down Expand Up @@ -219,6 +220,7 @@ def run(self, server):
self.is_valgrind_clean = not bool(non_empty_logs)

short_status = None
result_checksum = None

if self.skip:
short_status = 'skip'
Expand Down Expand Up @@ -252,6 +254,8 @@ def run(self, server):
has_result = os.path.exists(self.tmp_result)
if has_result:
shutil.copy(self.tmp_result, self.reject)
with open(self.tmp_result, mode='rb') as result_file:
result_checksum = md5(result_file.read()).hexdigest()
short_status = 'fail'
color_stdout("[ fail ]\n", schema='test_fail')

Expand All @@ -277,7 +281,7 @@ def run(self, server):
"Test failed! Output from log file "
"{0}:\n".format(log_file))
where = ": there were warnings in the valgrind log file(s)"
return short_status
return short_status, result_checksum

def print_diagnostics(self, log_file, message):
"""Print whole lines of client program output leading to test
Expand Down
35 changes: 30 additions & 5 deletions lib/test_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def __init__(self, suite_path, args):
self.args = args
self.tests = []
self.ini = {}
self.fragile = {'retries': 0, 'tests': {}}
self.suite_path = suite_path
self.ini["core"] = "tarantool"

Expand Down Expand Up @@ -110,6 +111,17 @@ def __init__(self, suite_path, args):
lambda x: os.path.join(suite_path, x),
dict.fromkeys(self.ini[i].split())
if i in self.ini else dict())
if config.has_option("default", "fragile"):
fragiles = config.get("default", "fragile")
try:
self.fragile = json.loads(fragiles)
if 'tests' not in self.fragile:
raise RuntimeError(
"Key 'tests' absent in 'fragile' json: {}"
. format(self.fragile))
except ValueError:
# use old format dictionary
self.fragile['tests'] = self.ini['fragile']

self.parse_bool_opt('pretest_clean', False)
self.parse_bool_opt('use_unix_sockets', False)
Expand Down Expand Up @@ -154,22 +166,31 @@ def collect_tests(self):
self.tests_are_collected = True
return self.tests

def get_fragile_list(self):
    """Return the names of the tests marked as fragile for this suite."""
    fragile_tests = self.fragile['tests']
    return fragile_tests.keys()

def stable_tests(self):
self.collect_tests()
res = []
for test in self.tests:
if os.path.basename(test.name) not in self.ini['fragile']:
if os.path.basename(test.name) not in self.get_fragile_list():
res.append(test)
return res

def fragile_tests(self):
self.collect_tests()
res = []
for test in self.tests:
if os.path.basename(test.name) in self.ini['fragile']:
if os.path.basename(test.name) in self.get_fragile_list():
res.append(test)
return res

def get_test_fragile_checksums(self, test):
    """Return the list of known results-file checksums for the given
    fragile test name.

    Returns an empty list when the test is not in the fragile dict,
    has no 'checksums' key, or its entry is not a mapping (the old
    plain-list 'fragile' format stores non-dict entries).
    """
    try:
        return self.fragile['tests'][test]['checksums']
    except (KeyError, TypeError):
        # KeyError: test absent or no 'checksums' key for it.
        # TypeError: old-format entry that is not a dict (e.g. None).
        # Narrowed from a bare `except Exception` so genuine bugs
        # elsewhere are not silently swallowed.
        return []

def gen_server(self):
try:
return Server(self.ini, test_suite=self)
Expand Down Expand Up @@ -222,7 +243,7 @@ def stop_server(self, server, inspector, silent=False, cleanup=True):

def run_test(self, test, server, inspector):
""" Returns short status of the test as a string: 'skip', 'pass',
'new', 'fail', or 'disabled'.
'new', 'fail', or 'disabled' and results file checksum on fail.
"""
test.inspector = inspector
test_name = os.path.basename(test.name)
Expand All @@ -236,19 +257,23 @@ def run_test(self, test, server, inspector):
color_stdout(conf.ljust(16), schema='test_var')

if self.is_test_enabled(test, conf, server):
short_status = test.run(server)
short_status, result_checksum = test.run(server)
else:
color_stdout("[ disabled ]\n", schema='t_name')
short_status = 'disabled'
result_checksum = None

# cleanup only if test passed or if --force mode enabled
if lib.Options().args.is_force or short_status == 'pass':
inspector.cleanup_nondefault()

return short_status
return short_status, result_checksum

def is_parallel(self):
    """Whether this suite's tests may run in parallel (from suite.ini)."""
    parallel_flag = self.ini['is_parallel']
    return parallel_flag

def fragile_retries(self):
    """Number of reruns allowed for fragile tests; 0 when not configured."""
    retries = self.fragile.get('retries', 0)
    return retries

def show_reproduce_content(self):
    """Whether to print the reproduce file content for failed tests
    (from suite.ini).
    """
    flag = self.ini['show_reproduce_content']
    return flag
38 changes: 31 additions & 7 deletions lib/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,13 +142,15 @@ class WorkerTaskResult(BaseWorkerMessage):
""" Passed into the result queue when a task processed (done) by the
worker. The short_status (string) field intended to give short note whether
the task processed successfully or not, but with little more flexibility
than binary True/False. The task_id (any hashable object) field hold ID of
than binary True/False. The result_checksum (string) field saves the results
file checksum on test fail. The task_id (any hashable object) field hold ID of
the processed task. The show_reproduce_content configuration form suite.ini
"""
def __init__(self, worker_id, worker_name, task_id,
short_status, show_reproduce_content):
short_status, result_checksum, show_reproduce_content):
super(WorkerTaskResult, self).__init__(worker_id, worker_name)
self.short_status = short_status
self.result_checksum = result_checksum
self.task_id = task_id
self.show_reproduce_content = show_reproduce_content

Expand Down Expand Up @@ -214,8 +216,9 @@ def current_task(self, task_id):
return WorkerCurrentTask(self.id, self.name, task_name, task_param,
task_result, task_tmp_result)

def wrap_result(self, task_id, short_status):
def wrap_result(self, task_id, short_status, result_checksum):
return WorkerTaskResult(self.id, self.name, task_id, short_status,
result_checksum,
self.suite.show_reproduce_content())

def sigterm_handler(self, signum, frame):
Expand Down Expand Up @@ -302,7 +305,7 @@ def run_task(self, task_id):
with open(self.reproduce_file, 'a') as f:
task_id_str = yaml.safe_dump(task.id, default_flow_style=True)
f.write('- ' + task_id_str)
short_status = self.suite.run_test(
short_status, result_checksum = self.suite.run_test(
task, self.server, self.inspector)
except KeyboardInterrupt:
self.report_keyboard_interrupt()
Expand All @@ -312,7 +315,7 @@ def run_task(self, task_id):
'\nWorker "%s" received the following error; stopping...\n'
% self.name + traceback.format_exc() + '\n', schema='error')
raise
return short_status
return short_status, result_checksum

def run_loop(self, task_queue, result_queue):
""" called from 'run_all' """
Expand All @@ -326,9 +329,30 @@ def run_loop(self, task_queue, result_queue):
self.stop_worker(task_queue, result_queue)
break

short_status = None
result_checksum = None
result_queue.put(self.current_task(task_id))
short_status = self.run_task(task_id)
result_queue.put(self.wrap_result(task_id, short_status))
testname = os.path.basename(task_id[0])
fragile_checksums = self.suite.get_test_fragile_checksums(testname)
retries_left = self.suite.fragile_retries()
# let's run till short_status became 'pass'
while short_status != 'pass' and retries_left >= 0:
# print message only after some fails occurred
if short_status == 'fail':
color_stdout(
'Test "%s", conf: "%s"\n'
'\tfrom "fragile" list failed with results'
' file checksum: "%s", rerunning ...\n'
% (task_id[0], task_id[1], result_checksum), schema='error')
# run task and save the result to short_status
short_status, result_checksum = self.run_task(task_id)
# check if the results file checksum set on fail and if
# the newly created results file is known by checksum
if not result_checksum or (result_checksum not in fragile_checksums):
break
retries_left = retries_left - 1

result_queue.put(self.wrap_result(task_id, short_status, result_checksum))
if not lib.Options().args.is_force and short_status == 'fail':
color_stdout(
'Worker "%s" got failed test; stopping the server...\n'
Expand Down
4 changes: 3 additions & 1 deletion listeners.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def process_result(self, obj):
if obj.short_status == 'fail':
self.failed_tasks.append((obj.task_id,
obj.worker_name,
obj.result_checksum,
obj.show_reproduce_content))

def print_statistics(self):
Expand All @@ -58,10 +59,11 @@ def print_statistics(self):
return False

color_stdout('Failed tasks:\n', schema='test_var')
for task_id, worker_name, show_reproduce_content in self.failed_tasks:
for task_id, worker_name, result_checksum, show_reproduce_content in self.failed_tasks:
logfile = self.get_logfile(worker_name)
task_id_str = yaml.safe_dump(task_id, default_flow_style=True)
color_stdout('- %s' % task_id_str, schema='test_var')
color_stdout('# results file checksum: %s\n' % result_checksum)
color_stdout('# logfile: %s\n' % logfile)
reproduce_file_path = get_reproduce_file(worker_name)
color_stdout('# reproduce file: %s\n' % reproduce_file_path)
Expand Down