From 02c5ad4cb9aa5cae6039171f25dab35f15649f67 Mon Sep 17 00:00:00 2001 From: jakubs <jakubs> Date: Tue, 12 Jun 2012 13:50:05 +0000 Subject: [PATCH] BIS-21 SP-108 Add retry code to the data set storage algorithm. Do tests for testcase with zero retries SVN: 25651 --- .../DataSetStorageAlgorithmRunner.java | 138 +++++++++++++++++- .../RegistrationExceptionsCollection.java | 108 ++++++++++++++ .../impl/DataSetRegistrationTransaction.java | 7 + .../AbstractJythonDataSetHandlerTest.java | 23 +++ .../JythonDropboxRecoveryTest.java | 22 ++- 5 files changed, 286 insertions(+), 12 deletions(-) create mode 100644 datastore_server/source/java/ch/systemsx/cisd/etlserver/registrator/RegistrationExceptionsCollection.java diff --git a/datastore_server/source/java/ch/systemsx/cisd/etlserver/registrator/DataSetStorageAlgorithmRunner.java b/datastore_server/source/java/ch/systemsx/cisd/etlserver/registrator/DataSetStorageAlgorithmRunner.java index a1905d2aeb8..f43be216c6b 100644 --- a/datastore_server/source/java/ch/systemsx/cisd/etlserver/registrator/DataSetStorageAlgorithmRunner.java +++ b/datastore_server/source/java/ch/systemsx/cisd/etlserver/registrator/DataSetStorageAlgorithmRunner.java @@ -23,6 +23,7 @@ import java.util.List; import org.apache.commons.lang.exception.ExceptionUtils; import org.apache.log4j.Logger; +import ch.systemsx.cisd.common.concurrent.ConcurrencyUtilities; import ch.systemsx.cisd.common.logging.LogCategory; import ch.systemsx.cisd.common.logging.LogFactory; import ch.systemsx.cisd.etlserver.DssRegistrationLogger; @@ -34,6 +35,7 @@ import ch.systemsx.cisd.etlserver.registrator.recovery.IDataSetStorageRecoveryMa import ch.systemsx.cisd.openbis.dss.generic.shared.IEncapsulatedOpenBISService; import ch.systemsx.cisd.openbis.dss.generic.shared.dto.DataSetInformation; import ch.systemsx.cisd.openbis.dss.generic.shared.dto.DataSetRegistrationInformation; +import ch.systemsx.cisd.openbis.generic.shared.basic.EntityOperationsState; import ch.systemsx.cisd.openbis.generic.shared.basic.TechId; /** @@ -67,6 +69,8 @@ public class DataSetStorageAlgorithmRunner<T extends DataSetInformation> { public void registerDataSetsInApplicationServer(TechId registrationId, List<DataSetRegistrationInformation<T>> data) throws Throwable; + + public EntityOperationsState didEntityOperationsSucceeded(TechId registrationId); } public static interface IPrePostRegistrationHook<T extends DataSetInformation> @@ -525,8 +529,29 @@ public class DataSetStorageAlgorithmRunner<T extends DataSetInformation> return true; } + private static int RETRY_SLEEP_PERIOD = 1000; + + private static int MAX_RETRY_SAME_ERROR_COUNT = 0; + private boolean registerDataSetsInApplicationServer(TechId registrationId, List<DataSetRegistrationInformation<T>> registrationData) + { + boolean result; + if (shouldUseAutoRecovery()) + { + result = registerDataWithRecovery(registrationId, registrationData); + } else + { + result = registerData(registrationId, registrationData); + } + + dssRegistrationLog.log("Data has been registered with the openBIS Application Server."); + return result; + } + + // TODO: should we have DSARunnerV2 instead of these two methods? + private boolean registerData(TechId registrationId, + List<DataSetRegistrationInformation<T>> registrationData) { try { @@ -537,18 +562,117 @@ public class DataSetStorageAlgorithmRunner<T extends DataSetInformation> { operationLog.error("Error in registrating data in application server", throwable); dssRegistrationLog.log("Error in registrating data in application server"); - if (shouldUseAutoRecovery() && storageRecoveryManager.canRecoverFromError(throwable)) + rollbackDuringMetadataRegistration(throwable); + return false; + } + return true; + } + + private boolean registerDataWithRecovery(TechId registrationId, + List<DataSetRegistrationInformation<T>> registrationData) + { + RegistrationExceptionsCollection exceptionCollection = + new RegistrationExceptionsCollection(); + + EntityOperationsState result = EntityOperationsState.NO_OPERATION; + + Throwable problem = null; + int errorCount = 0; + + while (true) + { + + if (result == EntityOperationsState.NO_OPERATION) { - rollbackDelegate.markReadyForRecovery(this, throwable); - } else + try + { + applicationServerRegistrator.registerDataSetsInApplicationServer( + registrationId, registrationData); + return true; + } catch (final Throwable exception) + { + operationLog.error("Error in registrating data in application server", + exception); + dssRegistrationLog.log("Error in registrating data in application server"); + + problem = exception; + } + + // how many times has this error already happened? + errorCount = exceptionCollection.add(problem); + + if (!storageRecoveryManager.canRecoverFromError(problem)) + { + rollbackDuringMetadataRegistration(problem); + return false; + } + } + operationLog.debug("Will check the status of registration"); + + // check in openbis.registration succeeded + result = checkOperationsSucceededNoGiveUp(registrationId); + + operationLog.debug("The registration is in state: " + result); + + switch (result) { - rollbackDuringMetadataRegistration(throwable); + case IN_PROGRESS: + operationLog + .debug("The registration is in progress. Will wait until it's done."); + + waitTheRetryPeriod(); + break; + + case NO_OPERATION: + if (errorCount > MAX_RETRY_SAME_ERROR_COUNT) + { + operationLog.debug("The same error happened " + errorCount + + " times. Will stop registration."); + dssRegistrationLog.log("The same error happened " + errorCount + + " times. Will stop registration."); + + rollbackDelegate.markReadyForRecovery(this, problem); + return false; + } + waitTheRetryPeriod(); + break; + + case OPERATION_SUCCEEDED: + operationLog + .debug("The registration is in progress. Will wait until it's done."); + + // the operation has succeeded so we return + return true; } - return false; + } + } - dssRegistrationLog.log("Data has been registered with the openBIS Application Server."); - return true; + /** + * Checks if the operations have succeeded in AS. Never give up if can't connect. + */ + private EntityOperationsState checkOperationsSucceededNoGiveUp(TechId registrationId) + { + while (true) + { + try + { + EntityOperationsState result = + applicationServerRegistrator.didEntityOperationsSucceeded(registrationId); + return result; + } catch (Exception exception) + { + operationLog + .debug("Error in checking status of registration. Probably AS is down. Will wait.", + exception); + waitTheRetryPeriod(); + } + } + } + + private void waitTheRetryPeriod() + { + ConcurrencyUtilities.sleep(RETRY_SLEEP_PERIOD); } /** diff --git a/datastore_server/source/java/ch/systemsx/cisd/etlserver/registrator/RegistrationExceptionsCollection.java b/datastore_server/source/java/ch/systemsx/cisd/etlserver/registrator/RegistrationExceptionsCollection.java new file mode 100644 index 00000000000..cfc30e9b44b --- /dev/null +++ b/datastore_server/source/java/ch/systemsx/cisd/etlserver/registrator/RegistrationExceptionsCollection.java @@ -0,0 +1,108 @@ +/* + * Copyright 2012 ETH Zuerich, CISD + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ch.systemsx.cisd.etlserver.registrator; + +import java.util.HashMap; + +/** + * The collection that behaves like a MultiSet<Throwable> with two Exceptions beeing treaded equal + * if they have the same class, and their cause exceptions are equal in the same sense. The + * implementation has weak performance, but the expected size of data is very small. If that would + * ever change, consider reimplementing it. + * + * @author jakubs + */ +public class RegistrationExceptionsCollection +{ + private final HashMap<Throwable, Integer> collection; + + public RegistrationExceptionsCollection() + { + collection = new HashMap<Throwable, Integer>(); + } + + /** + * Adds exception to the list of exceptions and returns the integer denoting how many similar + * exceptions are already in this collection. + * + * @return the number of times this exception has been add to the collection (incl. this + * operation) + */ + public int add(Throwable t) + { + Throwable key = findEqualKey(t); + Integer oldValue = collection.get(key); + if (oldValue == null) + { + oldValue = 0; + } + collection.put(key, oldValue + 1); + System.err.println("Adding the throwable "+t.getMessage()+ " for the time "+(oldValue+1)); + return oldValue + 1; + } + + /** + * @return the count for the given exception + */ + public int get(Throwable t) + { + Integer value = collection.get(findEqualKey(t)); + if (value == null) + { + return 0; + } else + { + return value; + } + } + + /** + * @return the Throwable that can be used as a key in the internal collection. It's either + * already an existing key or the <code>param<code> if no suitable throwable exists. + */ + private Throwable findEqualKey(Throwable param) + { + for (Throwable key : collection.keySet()) + { + if (equal(key, param)) + { + return key; + } + } + return param; + } + + /** + * check the equality of two throwables + */ + private boolean equal(Throwable first, Throwable second) + { + if (first == null && second == null) + { + return true; + } + if (first == null || second == null) + { + return false; + } + if (first.getClass().equals(second.getClass())) + { + return equal(first.getCause(), second.getCause()); + } + return false; + } +} diff --git a/datastore_server/source/java/ch/systemsx/cisd/etlserver/registrator/api/v1/impl/DataSetRegistrationTransaction.java b/datastore_server/source/java/ch/systemsx/cisd/etlserver/registrator/api/v1/impl/DataSetRegistrationTransaction.java index cd6223a9d0b..ed745cab440 100644 --- a/datastore_server/source/java/ch/systemsx/cisd/etlserver/registrator/api/v1/impl/DataSetRegistrationTransaction.java +++ b/datastore_server/source/java/ch/systemsx/cisd/etlserver/registrator/api/v1/impl/DataSetRegistrationTransaction.java @@ -66,6 +66,7 @@ import ch.systemsx.cisd.openbis.dss.generic.shared.api.internal.v1.ISpaceImmutab import ch.systemsx.cisd.openbis.dss.generic.shared.dto.AtomicEntityOperationDetails; import ch.systemsx.cisd.openbis.dss.generic.shared.dto.DataSetInformation; import ch.systemsx.cisd.openbis.dss.generic.shared.dto.DataSetRegistrationInformation; +import ch.systemsx.cisd.openbis.generic.shared.basic.EntityOperationsState; import ch.systemsx.cisd.openbis.generic.shared.basic.TechId; import ch.systemsx.cisd.openbis.generic.shared.basic.dto.MaterialIdentifier; import ch.systemsx.cisd.openbis.generic.shared.dto.identifier.ExperimentIdentifier; @@ -519,6 +520,12 @@ public class DataSetRegistrationTransaction<T extends DataSetInformation> implem entityRegistrationService.performOperationsInApplcationServer(registrationDetails); } + @Override + public EntityOperationsState didEntityOperationsSucceeded(TechId registrationId) + { + return openBisService.didEntityOperationsSucceed(registrationId); + } + public boolean isCommittedOrRolledback() { return isCommitted() || isRolledback(); diff --git a/datastore_server/sourceTest/java/ch/systemsx/cisd/etlserver/registrator/AbstractJythonDataSetHandlerTest.java b/datastore_server/sourceTest/java/ch/systemsx/cisd/etlserver/registrator/AbstractJythonDataSetHandlerTest.java index 2e92b477770..cdd0e18df37 100644 --- a/datastore_server/sourceTest/java/ch/systemsx/cisd/etlserver/registrator/AbstractJythonDataSetHandlerTest.java +++ b/datastore_server/sourceTest/java/ch/systemsx/cisd/etlserver/registrator/AbstractJythonDataSetHandlerTest.java @@ -27,10 +27,15 @@ import java.util.Properties; import net.lemnik.eodsql.DynamicTransactionQuery; import org.apache.commons.io.FileUtils; +import org.apache.log4j.ConsoleAppender; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.PatternLayout; import org.jmock.Expectations; import org.jmock.Mockery; import org.testng.annotations.AfterMethod; import org.testng.annotations.AfterTest; +import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeMethod; import org.testng.annotations.BeforeTest; @@ -65,6 +70,24 @@ import ch.systemsx.cisd.openbis.generic.shared.dto.StorageFormat; */ public abstract class AbstractJythonDataSetHandlerTest extends AbstractFileSystemTestCase { + @BeforeClass + public static void classSetUp() + { + Logger rootLogger = Logger.getRootLogger(); + if (!rootLogger.getAllAppenders().hasMoreElements()) + { + rootLogger.setLevel(Level.INFO); + rootLogger.addAppender(new ConsoleAppender(new PatternLayout("%-5p [%t]: %m%n"))); + + // The TTCC_CONVERSION_PATTERN contains more info than + // the pattern we used for the root logger + Logger pkgLogger = + rootLogger.getLoggerRepository().getLogger("robertmaldon.moneymachine"); + pkgLogger.setLevel(Level.DEBUG); + pkgLogger.addAppender(new ConsoleAppender(new PatternLayout( + PatternLayout.TTCC_CONVERSION_PATTERN))); + } + } /** * Return the path of the folder that contains the registration scripts. diff --git a/datastore_server/sourceTest/java/ch/systemsx/cisd/etlserver/registrator/JythonDropboxRecoveryTest.java b/datastore_server/sourceTest/java/ch/systemsx/cisd/etlserver/registrator/JythonDropboxRecoveryTest.java index de0ffef188f..0c4cd4d4367 100644 --- a/datastore_server/sourceTest/java/ch/systemsx/cisd/etlserver/registrator/JythonDropboxRecoveryTest.java +++ b/datastore_server/sourceTest/java/ch/systemsx/cisd/etlserver/registrator/JythonDropboxRecoveryTest.java @@ -359,15 +359,15 @@ public class JythonDropboxRecoveryTest extends AbstractJythonDataSetHandlerTest { assertDirNotEmpty(precommitDirectory, "Precommit directory should not be empty"); } - assertRecoveryFile(expectedRetryCount, - RecoveryInfoDateConstraint.AFTER_ORIGINAL, testCase.recoveryLastTry); + assertRecoveryFile(expectedRetryCount, RecoveryInfoDateConstraint.AFTER_ORIGINAL, + testCase.recoveryLastTry); assertOriginalMarkerFileExists(); break; case RETRY_AT_STORAGE_CONFIRMED_FAILURE: assertStorageProcess(atomicatOperationDetails.recordedObject(), DATA_SET_CODE, "sub_data_set_1", 0); - assertRecoveryFile(expectedRetryCount, - RecoveryInfoDateConstraint.AFTER_ORIGINAL, testCase.recoveryLastTry); + assertRecoveryFile(expectedRetryCount, RecoveryInfoDateConstraint.AFTER_ORIGINAL, + testCase.recoveryLastTry); assertOriginalMarkerFileExists(); break; case GIVE_UP: @@ -710,7 +710,7 @@ public class JythonDropboxRecoveryTest extends AbstractJythonDataSetHandlerTest { initialContainerExpectations(); } - // first try - fail at registration + // first try - fail at registration and realize that the operation has not succeeded registerDataSetsAndThrow(true); // second handle - fail at storage @@ -911,6 +911,11 @@ public class JythonDropboxRecoveryTest extends AbstractJythonDataSetHandlerTest }; } + /** + * This method should make sure that the registration will fail, and it will go into the + * recovery mode. It means that it also has to assure that the subsequent retries introduce + * in SP-107 are failing. + */ protected void registerDataSetsAndThrow(boolean canRecoverFromError) { one(openBisService).drawANewUniqueID(); @@ -930,6 +935,13 @@ public class JythonDropboxRecoveryTest extends AbstractJythonDataSetHandlerTest } will(throwException(e)); + + if (canRecoverFromError) + { + // the check immediately after the exception fails as well + one(openBisService).didEntityOperationsSucceed(with(any(TechId.class))); + will(returnValue(EntityOperationsState.NO_OPERATION)); + } } protected void registerDataSetsAndSucceed() -- GitLab