From ecb01289e19693f3b2962d705e677f669d54a24c Mon Sep 17 00:00:00 2001 From: pkupczyk <pkupczyk> Date: Thu, 23 Nov 2017 15:11:54 +0000 Subject: [PATCH] SSDM-5370 : openBIS sync datasource doesn't scale - change the harvester to properly handle both regular resource lists as well as resource list indices SVN: 38947 --- .../DataSourceConnector.java | 145 +++++++++++++++--- 1 file changed, 125 insertions(+), 20 deletions(-) diff --git a/datastore_server/source/java/ch/ethz/sis/openbis/generic/server/dss/plugins/sync/harvester/synchronizer/datasourceconnector/DataSourceConnector.java b/datastore_server/source/java/ch/ethz/sis/openbis/generic/server/dss/plugins/sync/harvester/synchronizer/datasourceconnector/DataSourceConnector.java index 41229a7b258..b670d331457 100644 --- a/datastore_server/source/java/ch/ethz/sis/openbis/generic/server/dss/plugins/sync/harvester/synchronizer/datasourceconnector/DataSourceConnector.java +++ b/datastore_server/source/java/ch/ethz/sis/openbis/generic/server/dss/plugins/sync/harvester/synchronizer/datasourceconnector/DataSourceConnector.java @@ -18,8 +18,7 @@ package ch.ethz.sis.openbis.generic.server.dss.plugins.sync.harvester.synchroniz import java.io.ByteArrayInputStream; import java.io.IOException; -import java.net.URI; -import java.net.URISyntaxException; +import java.util.ArrayList; import java.util.List; import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeoutException; @@ -28,25 +27,34 @@ import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; +import org.apache.log4j.Logger; import org.eclipse.jetty.client.HttpClient; -import org.eclipse.jetty.client.api.AuthenticationStore; import org.eclipse.jetty.client.api.ContentResponse; import org.eclipse.jetty.client.api.Request; -import org.eclipse.jetty.client.util.BasicAuthentication; +import org.eclipse.jetty.http.HttpHeader; import org.eclipse.jetty.http.HttpStatus; +import org.eclipse.jetty.util.B64Code; import org.w3c.dom.Document; +import org.w3c.dom.Node; import org.xml.sax.SAXException; import ch.ethz.sis.openbis.generic.server.dss.plugins.sync.harvester.config.BasicAuthCredentials; import ch.systemsx.cisd.common.http.JettyHttpClientFactory; +import ch.systemsx.cisd.common.logging.LogCategory; +import ch.systemsx.cisd.common.logging.LogFactory; /** - * - * * @author Ganime Betul Akin */ public class DataSourceConnector implements IDataSourceConnector { + + private static final Logger operationLog = LogFactory.getLogger(LogCategory.OPERATION, DataSourceConnector.class); + + final String ENTRY_START_TAG = "<url>"; + + final String ENTRY_FINISH_TAG = "</url>"; + private final String dataSourceUrl; private final BasicAuthCredentials authCredentials; @@ -57,20 +65,113 @@ public class DataSourceConnector implements IDataSourceConnector this.authCredentials = authCredentials; } + @Override public Document getResourceListAsXMLDoc(List<String> spaceBlackList) throws Exception { HttpClient client = JettyHttpClientFactory.getHttpClient(); - addAuthenticationCredentials(client); - Request requestEntity = createNewHttpRequest(client, spaceBlackList); + Request requestEntity = createResourceListRequest(client, spaceBlackList); + + operationLog.info("Start loading a resource list from " + requestEntity.getURI()); + ContentResponse contentResponse = getResponse(requestEntity); - return parseResponse(contentResponse); + Document document = parse(contentResponse.getContent()); + + if (isResourceListIndex(document)) + { + operationLog.info("Received a resource list index (the resource list was too big and was split into parts)."); + List<String> locations = getResourceListPartLocations(document); + List<String> parts = loadResourceListParts(client, locations); + return mergeResourceListParts(parts); + } else + { + operationLog.info("Received the resource list."); + return document; + } } - private Document parseResponse(ContentResponse contentResponse) throws ParserConfigurationException, SAXException, IOException + private boolean isResourceListIndex(Document document) + { + return document != null && document.hasChildNodes() && document.getFirstChild().getNodeName().equals("sitemapindex"); + } + + private List<String> getResourceListPartLocations(Document document) + { + Node sitemapindex = document.getFirstChild(); + List<String> locations = new ArrayList<String>(); + + for (int i = 0; i < sitemapindex.getChildNodes().getLength(); i++) + { + Node child = sitemapindex.getChildNodes().item(i); + if (child.getNodeName().equals("sitemap")) + { + for (int j = 0; j < child.getChildNodes().getLength(); j++) + { + Node grandChild = child.getChildNodes().item(j); + if (grandChild.getNodeName().equals("loc")) + { + String location = grandChild.getTextContent().trim(); + operationLog.info("Resource list part location: " + location); + locations.add(location); + } + } + } + } + + if (locations.isEmpty()) + { + operationLog.info("No locations of the resource list parts were found in the index."); + } + + return locations; + } + + private List<String> loadResourceListParts(HttpClient client, List<String> locations) throws Exception + { + List<String> parts = new ArrayList<String>(); + + for (String location : locations) + { + Request request = createRequest(client, location); + operationLog.info("Start loading a resource list part from " + location); + ContentResponse response = getResponse(request); + operationLog.info("Received the resource list part."); + parts.add(response.getContentAsString()); + } + + return parts; + } + + private Document mergeResourceListParts(List<String> parts) throws Exception + { + StringBuilder merged = new StringBuilder(); + + if (parts.size() > 0) + { + merged.append(parts.get(0).substring(0, parts.get(0).indexOf(ENTRY_START_TAG))); + + for (String part : parts) + { + int firstEntryIndex = part.indexOf(ENTRY_START_TAG); + int lastEntryIndex = part.lastIndexOf(ENTRY_FINISH_TAG); + + if (firstEntryIndex != -1 && lastEntryIndex != -1 && firstEntryIndex < lastEntryIndex) + { + merged.append(part.substring(firstEntryIndex, lastEntryIndex + ENTRY_FINISH_TAG.length())); + } + } + + merged.append("</urlset>"); + } + + operationLog.info("Merged the resource list parts."); + + return parse(merged.toString().getBytes()); + } + + private Document parse(byte[] content) throws ParserConfigurationException, SAXException, IOException { DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance(); domFactory.setNamespaceAware(true); - byte[] content = contentResponse.getContent(); ByteArrayInputStream bis = new ByteArrayInputStream(content); DocumentBuilder builder = domFactory.newDocumentBuilder(); Document doc = builder.parse(bis); @@ -90,28 +191,32 @@ public class DataSourceConnector implements IDataSourceConnector return contentResponse; } - private Request createNewHttpRequest(HttpClient client, List<String> spaceBlackList) + private Request createResourceListRequest(HttpClient client, List<String> spaceBlackList) { StringBuffer sb = new StringBuffer(); + for (String dataSourceSpace : spaceBlackList) { sb.append(dataSourceSpace + ","); } - String req = dataSourceUrl + "?verb=resourcelist.xml"; + + String url = dataSourceUrl + "?verb=resourcelist.xml"; + if (sb.length() != 0) { String str = sb.toString(); str = str.substring(0, str.length() - 1); - req += "&black_list=" + str; + url += "&black_list=" + str; } - Request requestEntity = client.newRequest(req).method("GET"); - return requestEntity; + + return createRequest(client, url); } - private void addAuthenticationCredentials(HttpClient client) throws URISyntaxException + private Request createRequest(HttpClient client, String url) { - AuthenticationStore auth = client.getAuthenticationStore(); - auth.addAuthentication(new BasicAuthentication(new URI(dataSourceUrl), authCredentials.getRealm(), authCredentials.getUser(), authCredentials - .getPassword())); + Request requestEntity = client.newRequest(url).method("GET"); + requestEntity.header(HttpHeader.AUTHORIZATION, "Basic " + B64Code.encode(authCredentials.getUser() + ":" + authCredentials.getPassword())); + return requestEntity; } + } -- GitLab