multi-data-set-archiving.md

archiver.class = ch.systemsx.cisd.openbis.dss.generic.server.plugins.standard.archiver.MultiDataSetArchiver

# Temporary folder (needed for sanity check). Default: Value provided by Java system property java.io.tmpdir. Usually /tmp
# archiver.temp-folder = <java temp folder>

# Archive destination
archiver.final-destination = path/to/strongbox/as/mounted/resource

# Staging folder (needed for 'staging workflow' and 'staging and replication workflow')
archiver.staging-destination = path/to/local/stage/area

# Replication folder (needed for 'replication workflow' and 'staging and replication workflow')
archiver.replicated-destination = path/to/mounted/replication/folder

# The archiver will refuse to archive a group of data sets which together are smaller than this value.
archiver.minimum-container-size-in-bytes = 15000000

# The archiver will refuse to archive a group of data sets which together are bigger than this value.
# The archiver will ignore this value when archiving a single data set.
archiver.maximum-container-size-in-bytes = 35000000

# This variable is meant for a different use case than this archiver, but it is shared among all archivers.
# For this archiver it should be set to a value safely larger than maximum-container-size-in-bytes.
archiver.batch-size-in-bytes = 80000000

# (since version 20.10.4) Check consistency between file meta data of the files in the store and from the pathinfo database.
# Default value: true
# check-consistency-between-store-and-pathinfo-db = true

# Archiving can be sped up by setting this flag to false (default value: true). But this works only if the data sets
# to be archived do not contain hdf5 files which are handled as folders (like the thumbnail h5ar files in screening/microscopy).
# archiver.hdf5-files-in-data-set = true

# Whether all data sets should be archived in a top-level directory of the archive or with sharding (the way data sets are stored in the openBIS internal store)
# archiver.with-sharding = false

# Polling time for evaluating free space on archive destination
# archiver.waiting-for-free-space-polling-time = 1 min

# Maximum waiting time for free space on archive destination
# archiver.waiting-for-free-space-time-out = 4 h

# If set to true, then an initial waiting time will be added before starting a sanity check.
# If the sanity check fails, it will be retried. The time between each sanity check attempt is doubled,
# starting from the initial waiting time up to the maximum waiting time (see properties below).
# Default: false
archiver.wait-for-sanity-check = true

# Initial waiting time before starting a sanity check. Works only if 'wait-for-sanity-check = true'
# Default: 10sec
archiver.wait-for-sanity-check-initial-waiting-time = 120 sec

# Maximum total waiting time for failed sanity check attempts. Works only if 'wait-for-sanity-check = true'
# Default: 30min
archiver.wait-for-sanity-check-max-waiting-time = 5 min

# A template of a shell command to be executed before unarchiving. The template may use ${container-path} and ${container-id} variables which will be replaced with an absolute container path (full path of the tar file to be unarchived)
# and a container id (id of the container to be unarchived used in the archiving database). The command created from the template is executed only once for a given container (just before the first unarchiving attempt) and is not retried.
# The unarchiver waits for the command to finish before proceeding. If the command exits with status zero, then the unarchiving is started. If the command exits with a non-zero value, then the unarchiving is marked as failed.
#
# Example: tar -tf ${container-path}
# Default: null
archiver.unarchiving-prepare-command-template

# If set to true, then the unarchiver waits for T flag to be removed from the file in the final destination before it tries to read the file.
# Default: false
archiver.unarchiving-wait-for-t-flag = true

# Maximum total waiting time for failed unarchiving attempts.
# Default: null
archiver.unarchiving-max-waiting-time = 1d

# Polling time for waiting on unarchiving.
# Default: null
archiver.unarchiving-polling-time = 5 min

# If set to true, then the archiver waits for T flag to be set on the file in the replicated destination. The check is done before a potential sanity check of the replicated file (see 'finalizer-sanity-check').
# Default: false
archiver.finalizer-wait-for-t-flag = true

# If set to true, then a sanity check for the replicated destination is also performed (in addition to a sanity check for the final destination which is always executed).
# Default: false
archiver.finalizer-sanity-check = true

# Minimum required free space at final destination before triggering archiving if > 0. This threshold can be
# specified as a percentage of total space or number of bytes. If both are specified the threshold is given by
# the maximum of both values.
# archiver.minimum-free-space-at-final-destination-in-percentage
# archiver.minimum-free-space-at-final-destination-in-bytes

# Minimum free space on archive destination after container file has been added.
# archiver.minimum-free-space-in-MB = 1024

# Polling time for waiting on replication. Only needed if archiver.replicated-destination is specified.
# archiver.finalizer-polling-time = 1 min

# Maximum waiting time for replication to finish. Only needed if archiver.replicated-destination is specified.
# archiver.finalizer-max-waiting-time = 1 d

# Maximum total size (in MB) of data sets that can be scheduled for unarchiving at any given time. When not specified, defaults to 1 TB.
# Note also that the value specified must be consistent with the scratch share size.
# maximum-unarchiving-capacity-in-megabytes = 200000

# Delay unarchiving. Needs MultiDataSetUnarchivingMaintenanceTask.
# archiver.delay-unarchiving = false

# Size of the buffer used for copying data. Default value: 1048576 (i.e. 1 MB). This value is only important in case of accurate
# measurements of data transfer rates. In case of expected fast transfer rates a larger value (e.g. 10 MB) should be used.
# archiver.buffer-size = 1048576

# Maximum size of the writing queue for copying data. Reading from the data store and writing to the TAR file is
# done in parallel. The default value is 5 * archiver.buffer-size.
# archiver.maximum-queue-size-in-bytes = 5242880

# Path (absolute or relative to store root) of an empty file. If this file is present starting
# archiving will be paused until this file has been removed.
# This property is useful for archiving media/facilities with maintenance downtimes.
# archiver.pause-file = pause-archiving

# Time interval between two checks whether pause file still exists or not.
# archiver.pause-file-polling-time = 10 min

#-------------------------------------------------------------------------------------------------------
# Clean up properties
#
# A comma-separated list of paths to folders which should be cleaned in a separate thread
#archiver.cleaner.file-path-prefixes-for-async-deletion = <absolute path 1>, <absolute path 2>, ...

# A folder which will contain deletion request files. This is a mandatory property if
# archiver.cleaner.file-path-prefixes-for-async-deletion is specified.
#archiver.cleaner.deletion-requests-dir = <some local folder>

# Polling time interval for looking for and performing deletion requests. Default value is 10 minutes.
#archiver.cleaner.deletion-polling-time = 10 min

# Time out of deletion requests. Default value is one day.
#archiver.cleaner.deletion-time-out = 24 h

# Optional e-mail address. If specified, an e-mail is sent to this address at every integer multiple of the
# timeout period, listing all deletion requests older than the specified timeout.
#archiver.cleaner.email-address = <some valid e-mail address>

# Optional e-mail address for the 'from' field.
#archiver.cleaner.email-from-address = <some well-formed e-mail address>

# Subject for the 'subject' field. Mandatory if an e-mail address is specified.
#archiver.cleaner.email-subject = Deletion failure

# Template with variable ${file-list} for the body text of the e-mail. The variable will be replaced by a list of
# lines. Two lines for each deletion request: one for the absolute file path and one for the request time stamp.
# Mandatory if an e-mail address is specified.
#archiver.cleaner.email-template = The following files couldn't be deleted:\n${file-list}

#-------------------------------------------------------------------------------------------------------
# The following properties are necessary in combination with data source configuration
multi-dataset-archive-database.kind = prod
multi-dataset-archive-sql-root-folder = datastore_server/sql/multi-dataset-archive
version-holder-class = ch.systemsx.cisd.openbis.dss.generic.server.plugins.standard.archiver.dataaccess.MultiDataSetArchiverDBVersionHolder
databaseEngineCode = postgresql
basicDatabaseName = multi_dataset_archive
urlHostPart = ${multi-dataset-archive-database.url-host-part:localhost}
databaseKind = ${multi-dataset-archive-database.kind:prod}
scriptFolder = ${multi-dataset-archive-sql-root-folder:}
owner = ${multi-dataset-archive-database.owner:}
password = ${multi-dataset-archive-database.password:}
unarchiving-scratch-share = true
unarchiving-scratch-share-maximum-size-in-GB = 100
data-set-command-queue-mapping = archiving:Archiving|Copying data sets to archive, unarchiving:Unarchiving, archiving-finalizer:Archiving Finalizer
SELECT id, size, present_in_archive, share_id, location FROM external_data WHERE status = 'ARCHIVE_PENDING';

openbis_prod=> SELECT id, size, present_in_archive, share_id, location FROM external_data WHERE status = 'ARCHIVE_PENDING';
 data_id |    size     | present_in_archive | share_id |                               location
---------+-------------+--------------------+----------+-----------------------------------------------------------------------
    3001 | 34712671864 | f                  | 1        | 585D8354-92A3-4C24-9621-F6B7063A94AC/17/65/a4/20170712111421297-37998
    3683 | 29574172672 | f                  | 1        | 585D8354-92A3-4C24-9621-F6B7063A94AC/39/6c/b0/20171106181516927-39987
    3688 | 53416316928 | f                  | 1        | 585D8354-92A3-4C24-9621-F6B7063A94AC/ca/3b/93/20171106183212074-39995
    3692 | 47547908096 | f                  | 1        | 585D8354-92A3-4C24-9621-F6B7063A94AC/b7/26/85/20171106185354378-40002
 openbis_prod=> UPDATE external_data SET status = 'AVAILABLE', present_in_archive = 'f'  WHERE id IN (SELECT id FROM data where code in ('20170712111421297-37998', '20171106181516927-39987'));
# To find out the containers:

SELECT * FROM data_sets WHERE CODE IN('20170712111421297-37998', '20171106181516927-39987', '20171106183212074-39995', '20171106185354378-40002');

multi_dataset_archive_prod=> SELECT * FROM data_sets WHERE CODE IN('20170712111421297-37998', '20171106181516927-39987', '20171106183212074-39995', '20171106185354378-40002');
 id  |          code           | ctnr_id | size_in_bytes
-----+-------------------------+---------+---------------
 294 | 20170712111421297-37998 |      60 |   34712671864
 295 | 20171106185354378-40002 |      61 |   47547908096
 296 | 20171106183212074-39995 |      61 |   53416316928
 297 | 20171106181516927-39987 |      61 |   29574172672
(4 rows)

multi_dataset_archive_prod=> SELECT * FROM containers WHERE id IN(60, 61);
 id |                    path                     | unarchiving_requested
----+---------------------------------------------+-----------------------
 60 | 20170712111421297-37998-20171108-105339.tar | f
 61 | 20171106185354378-40002-20171108-130342.tar | f

multi_dataset_archive_prod=> SELECT * FROM data_sets WHERE ctnr_id IN(SELECT ctnr_id FROM data_sets WHERE CODE IN('20170712111421297-37998', '20171106181516927-39987', '20171106183212074-39995', '20171106185354378-40002'));
multi_dataset_archive_prod=> DELETE FROM containers WHERE id IN (SELECT ctnr_id FROM data_sets WHERE CODE IN('20170712111421297-37998', '20171106181516927-39987', '20171106183212074-39995', '20171106185354378-40002'));