diff --git a/src/python/OBis/obis/dm/commands/openbis_command.py b/src/python/OBis/obis/dm/commands/openbis_command.py index bdb3beb09bac8f426f3e21e91477a8833f765c98..c53fc36bdb2906890062fb904ebcb18646375672 100644 --- a/src/python/OBis/obis/dm/commands/openbis_command.py +++ b/src/python/OBis/obis/dm/commands/openbis_command.py @@ -57,6 +57,9 @@ class OpenbisCommand(object): def fileservice_url(self): return self.config_dict.get('fileservice_url') + def git_annex_hash_as_checksum(self): + return self.config_dict.get('git_annex_hash_as_checksum') + def prepare_run(self): result = self.check_configuration() if result.failure(): diff --git a/src/python/OBis/obis/dm/commands/openbis_sync.py b/src/python/OBis/obis/dm/commands/openbis_sync.py index 3c2f81cbd6c60ed01f32b4bdcc0ba7ad238c4c11..85820add48a27de4193997926d2f4925f5472225 100644 --- a/src/python/OBis/obis/dm/commands/openbis_sync.py +++ b/src/python/OBis/obis/dm/commands/openbis_sync.py @@ -53,7 +53,7 @@ class OpenbisSync(OpenbisCommand): commit_id = result.output sample_id = self.object_id() experiment_id = self.collection_id() - contents = GitRepoFileInfo(self.git_wrapper).contents() + contents = GitRepoFileInfo(self.git_wrapper).contents(git_annex_hash_as_checksum=self.git_annex_hash_as_checksum()) try: data_set = self.openbis.new_git_data_set(data_set_type, top_level_path, commit_id, repository_id, external_dms.code, sample=sample_id, experiment=experiment_id, properties=properties, parents=parent_data_set_id, diff --git a/src/python/OBis/obis/dm/config.py b/src/python/OBis/obis/dm/config.py index 92083fc270890ee257e9c656f9fd40a831fe0b01..ebf4f941de070c8dcfabe77e114e6bbe83aca19f 100644 --- a/src/python/OBis/obis/dm/config.py +++ b/src/python/OBis/obis/dm/config.py @@ -30,7 +30,7 @@ class ConfigLocation(object): class ConfigParam(object): """Class for configuration parameters.""" - def __init__(self, name, private, is_json=False, ignore_global=False): + def __init__(self, name, private, is_json=False, 
ignore_global=False, default_value=None): """ :param name: Name of the parameter. :param private: Should the parameter be private to the repo or visible in the data set? @@ -40,6 +40,7 @@ class ConfigParam(object): self.private = private self.is_json = is_json self.ignore_global = ignore_global + self.default_value = default_value def location_path(self, loc): if loc == 'global': @@ -101,12 +102,13 @@ class ConfigEnv(object): self.add_param(ConfigParam(name='openbis_url', private=False)) self.add_param(ConfigParam(name='fileservice_url', private=False)) self.add_param(ConfigParam(name='user', private=True)) - self.add_param(ConfigParam(name='verify_certificates', private=True, is_json=True)) + self.add_param(ConfigParam(name='verify_certificates', private=True, is_json=True, default_value=True)) self.add_param(ConfigParam(name='object_id', private=False, ignore_global=True)) self.add_param(ConfigParam(name='collection_id', private=False, ignore_global=True)) self.add_param(ConfigParam(name='data_set_type', private=False)) self.add_param(ConfigParam(name='data_set_properties', private=False, is_json=True)) self.add_param(ConfigParam(name='hostname', private=False)) + self.add_param(ConfigParam(name='git_annex_hash_as_checksum', private=False, is_json=True, default_value=True)) def add_param(self, param): self.params[param.name] = param @@ -246,7 +248,10 @@ class ConfigResolverImpl(object): config = config['private'] else: config = config['public'] - return config.get(param.name) + value = config.get(param.name) + if value is None: + value = param.default_value + return value def set_cache_value_for_parameter(self, param, value, loc): config = self.location_cache[loc] diff --git a/src/python/OBis/obis/dm/data_mgmt.py b/src/python/OBis/obis/dm/data_mgmt.py index 90b099cb548f0fda947345d1939da86a8e8128fa..36d21714bb5a3a94ae658238278ccdaaf7e9280f 100644 --- a/src/python/OBis/obis/dm/data_mgmt.py +++ b/src/python/OBis/obis/dm/data_mgmt.py @@ -176,6 +176,20 @@ class 
def with_restore(f):
    """Decorate a data-management method so that a failed run is rolled back.

    The wrapper calls ``self.set_restorepoint()`` before invoking ``f``. If
    ``f`` returns a result whose ``failure()`` is true, or if anything raises
    while running it, ``self.restore()`` is called.

    :param f: Method taking ``self`` plus arbitrary positional and keyword
        arguments and returning an object that offers ``failure()``.
    :return: The wrapped method. When an exception occurs it is not
        propagated; a ``CommandResult`` with ``returncode=-1`` and the output
        ``"Error: <exception text>"`` is returned instead.
    """
    # Local import so this block does not depend on the module's import list.
    import functools

    @functools.wraps(f)  # keep the wrapped method's name and docstring
    def f_with_restore(self, *args, **kwargs):
        self.set_restorepoint()
        try:
            # Keyword arguments must be forwarded too; otherwise a call such
            # as sync(ignore_missing_parent=True) fails with a TypeError.
            result = f(self, *args, **kwargs)
            if result.failure():
                self.restore()
            return result
        except Exception as e:
            # Boundary handler: undo partial work and report the problem as a
            # failed command result rather than letting the exception escape.
            self.restore()
            return CommandResult(returncode=-1, output="Error: " + str(e))
    return f_with_restore
class ChecksumGeneratorCrc32(object):
    """Describe a file by the CRC and length reported by ``cksum``."""

    def get_checksum(self, file):
        """Run ``cksum`` on *file* and return its description.

        :param file: Path of the file to check.
        :return: dict with the keys 'crc32', 'fileLength' and 'path'.
        :raises CommandException: if the ``cksum`` command fails.
        """
        result = run_shell(['cksum', file])
        if result.failure():
            raise CommandException(result)
        # Only the first two fields are parsed; the path is taken from the
        # argument so that file names containing spaces are not split.
        fields = result.output.split(" ")
        return {
            'crc32': int(fields[0]),
            'fileLength': int(fields[1]),
            'path': file
        }


class ChecksumGeneratorMd5(object):
    """Describe a file by its MD5 digest."""

    def get_checksum(self, file):
        """Compute the MD5 digest of *file*.

        The digest is computed with ``hashlib`` instead of shelling out to an
        ``md5`` executable, which is not available on every platform (GNU
        systems ship ``md5sum`` with a different output format).

        :param file: Path of the file to check.
        :return: dict with the keys 'checksum' (hex digest), 'checksumType'
            ('MD5'), 'fileLength' and 'path'.
        :raises OSError: if the file cannot be read.
        """
        import hashlib  # local import: not part of the module's import list

        digest = hashlib.md5()
        with open(file, 'rb') as stream:
            # Read in chunks so large files are not loaded into memory at once.
            for chunk in iter(lambda: stream.read(65536), b''):
                digest.update(chunk)
        return {
            'checksum': digest.hexdigest(),
            'checksumType': 'MD5',
            'fileLength': os.path.getsize(file),
            'path': file
        }


class ChecksumGeneratorGitAnnex(object):
    """Reuse the hash stored in git-annex keys as the file checksum.

    Files that are not handled by git-annex (or whose content is not present)
    are passed to a supplement generator. If no ``annex.backend`` is
    configured in ``.gitattributes``, every file is handled by CRC32.
    """

    def __init__(self):
        backend = self._get_annex_backend()
        # Without a configured backend the annex keys are not used at all.
        self.checksum_generator_replacement = ChecksumGeneratorCrc32() if backend is None else None
        # define which generator to use for files which are not handled by annex
        if backend == 'MD5':
            self.checksum_generator_supplement = ChecksumGeneratorMd5()
        else:
            self.checksum_generator_supplement = ChecksumGeneratorCrc32()

    def get_checksum(self, file):
        """Return the checksum description dict for *file*.

        :param file: Path of the file to check.
        :return: dict as produced by the annex lookup or by one of the
            fallback generators.
        """
        if self.checksum_generator_replacement is not None:
            return self.checksum_generator_replacement.get_checksum(file)
        return self._get_checksum(file)

    def _get_checksum(self, file):
        """Look up *file* with ``git annex info -j`` and build its description."""
        annex_result = run_shell(['git', 'annex', 'info', '-j', file], raise_exception_on_failure=True)
        if 'Not a valid object name' in annex_result.output:
            return self.checksum_generator_supplement.get_checksum(file)
        annex_info = json.loads(annex_result.output)
        # Fall back when the content is not present or no key is reported,
        # instead of failing with a KeyError.
        if annex_info.get('present') is not True or 'key' not in annex_info:
            return self.checksum_generator_supplement.get_checksum(file)
        backend, checksum = self._split_annex_key(annex_info['key'])
        return {
            'checksum': checksum,
            'checksumType': backend,
            'fileLength': os.path.getsize(file),
            'path': file
        }

    @staticmethod
    def _split_annex_key(key):
        """Split an annex key into ``(backend, name)``.

        The backend is the text before the first '-'; the name is everything
        after the first '--', so names that themselves contain '--' are kept
        intact.

        :raises ValueError: if *key* contains no '--' separator.
        """
        head, separator, name = key.partition('--')
        if not separator:
            raise ValueError("Unexpected git-annex key: " + key)
        return head.partition('-')[0], name

    def _get_annex_backend(self):
        """Return the ``annex.backend`` value from ``.gitattributes``.

        The file is read from the current working directory. Comment lines
        are ignored, and the attribute is located by name so that other
        attributes containing '=' on the same line do not confuse the lookup.

        :return: The configured backend name, or None if the file is missing
            or does not set ``annex.backend``.
        """
        try:
            with open('.gitattributes') as gitattributes:
                for line in gitattributes:
                    if line.lstrip().startswith('#'):
                        continue
                    for token in line.split():
                        name, separator, value = token.partition('=')
                        if separator and name == 'annex.backend':
                            return value
        except FileNotFoundError:
            return None
        return None
def run_shell(args, shell=False, strip_whitespace=True, raise_exception_on_failure=False):
    """Run a command and wrap the finished process in a ``CommandResult``.

    Both stdout and stderr of the process are captured.

    :param args: Command and arguments, passed to ``subprocess.run``.
    :param shell: Passed through to ``subprocess.run``.
    :param strip_whitespace: Passed through to ``CommandResult``.
    :param raise_exception_on_failure: If truthy, a failed command raises
        instead of being returned.
    :return: The ``CommandResult`` for the finished process.
    :raises CommandException: if ``raise_exception_on_failure`` is truthy and
        the result reports a failure.
    """
    completed = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell)
    result = CommandResult(completed, strip_whitespace=strip_whitespace)
    # Plain truthiness instead of "== True": any truthy flag enables raising.
    if raise_exception_on_failure and result.failure():
        raise CommandException(result)
    return result
[file length], 'crc32': [crc32 checksum], + 'checksum': [checksum other than crc32], + 'checksumType': [checksum type if field checksum is used], 'directory': [is path a directory?] 'path': [the relative path string]} @@ -172,6 +174,8 @@ class GitDataSetCreation(object): result = {} transfer_to_file_creation(content, result, 'fileLength') transfer_to_file_creation(content, result, 'crc32', 'checksumCRC32') + transfer_to_file_creation(content, result, 'checksum', 'checksum') + transfer_to_file_creation(content, result, 'checksumType', 'checksumType') transfer_to_file_creation(content, result, 'directory') transfer_to_file_creation(content, result, 'path') return result