From eba1e46008d625961c724f54174a47e75772d8cd Mon Sep 17 00:00:00 2001
From: Viktor Kovtun <viktor.kovtun@id.ethz.ch>
Date: Mon, 16 Aug 2021 13:42:29 +0200
Subject: [PATCH] SSDM-11382 Made improvements. Made the poputation of
 DataFrame and Objects configurable.

---
 pybis/src/python/pybis/pybis.py    | 330 +++++++++++++++++------------
 pybis/src/python/pybis/sample.py   |  14 +-
 pybis/src/python/pybis/things.py   |  12 +-
 pybis/src/python/pybis/utils.py    |  31 +--
 pybis/src/python/tests/conftest.py |   4 +-
 5 files changed, 228 insertions(+), 163 deletions(-)

diff --git a/pybis/src/python/pybis/pybis.py b/pybis/src/python/pybis/pybis.py
index c6aa9b38588..468969b282a 100644
--- a/pybis/src/python/pybis/pybis.py
+++ b/pybis/src/python/pybis/pybis.py
@@ -86,6 +86,8 @@ import errno
 import requests
 
 import urllib3
+import logging
+import sys
 
 
 # import the various openBIS entities
@@ -101,6 +103,8 @@ LOG_DEBUG = 7
 
 DEBUG_LEVEL = LOG_NONE
 
+def now():
+    return time.time_ns() // 1000000
 
 def get_search_type_for_entity(entity, operator=None):
     """Returns a dictionary containing the correct search criteria type
@@ -2034,6 +2038,7 @@ class Openbis:
         attrs=None,
         props=None,
         where=None,
+        thingsFetchOptions={"withDataFrame": True, "withObjects": True},
         **properties,
     ):
         """Returns a DataFrame of all samples for a given space/project/experiment (or any combination)
@@ -2068,6 +2073,10 @@ class Openbis:
                         b) property is not defined for this sampleType
         """
 
+        logger = logging.getLogger('get_samples')
+        logger.setLevel(logging.DEBUG)
+        logger.addHandler(logging.StreamHandler(sys.stdout))
+
         if collection is not None:
             experiment = collection
 
@@ -2150,27 +2159,39 @@ class Openbis:
                 fetchopts,
             ],
         }
+
+        time1 = now()
+        logger.debug("get_samples posting request")
         resp = self._post_request(self.as_v3, request)
 
-        samples = []
+        time2 = now()
+
+        logger.debug(f"get_samples got response. Delay: {time2 - time1}")
         parse_jackson(resp)
-        for obj in resp["objects"]:
-            sample = Sample(
-                openbis_obj=self,
-                type=self.get_sample_type(obj["type"]["code"]),
-                data=obj,
-            )
-            samples.append(sample)
 
-        return self._sample_list_for_response(
-            response=resp["objects"],
-            attrs=attrs,
-            props=props,
-            start_with=start_with,
-            count=count,
-            totalCount=resp["totalCount"],
-            objects=samples,
-        )
+        time3 = now()
+
+        response = resp["objects"]
+        # samples = range(len(objects))
+        logger.debug(f"get_samples got JSON. Delay: {time3 - time2}")
+
+        objects = None
+        if thingsFetchOptions["withObjects"]:
+            objects = list(map(lambda obj: Sample(openbis_obj=self, type=self.get_sample_type(obj["type"]["code"]),
+                                              data=obj), response))
+
+        time4 = now()
+
+        logger.debug(f"get_samples after result mapping. Delay: {time4 - time3}")
+
+        result = self._sample_list_for_response(response=response, thingsFetchOptions=thingsFetchOptions,
+                                                attrs=attrs, props=props, start_with=start_with, count=count,
+                                                totalCount=resp["totalCount"], objects=objects, parsed=True)
+
+        time5 = now()
+
+        logger.debug(f"get_samples computed final result. Delay: {time5 - time4}")
+        return result
 
     get_objects = get_samples  # Alias
 
@@ -2534,7 +2555,6 @@ class Openbis:
                 )
             fetchopts["kind"] = kind
             raise NotImplementedError("you cannot search for dataSet kinds yet")
-
         request = {
             "method": "searchDataSets",
             "params": [
@@ -2749,7 +2769,11 @@ class Openbis:
             if only_data:
                 return resp[projectId]
 
-            project = Project(openbis_obj=self, type=None, data=resp[projectId])
+            project = Project(
+                openbis_obj=self,
+                type=None,
+                data=resp[projectId]
+            )
             if self.use_cache:
                 self._object_cache(entity="project", code=projectId, value=project)
             return project
@@ -2771,7 +2795,11 @@ class Openbis:
             if only_data:
                 return resp["objects"][0]
 
-            project = Project(openbis_obj=self, type=None, data=resp["objects"][0])
+            project = Project(
+                openbis_obj=self,
+                type=None,
+                data=resp["objects"][0]
+            )
             if self.use_cache:
                 self._object_cache(entity="project", code=projectId, value=project)
             return project
@@ -3330,7 +3358,6 @@ class Openbis:
                         "code"
                     ]
                 obj["creationDate"] = format_timestamp(obj["creationDate"])
-
             return objects
 
     def get_semantic_annotations(self):
@@ -3466,7 +3493,6 @@ class Openbis:
 
     def new_plugin(self, name, pluginType, **kwargs):
         """Creates a new Plugin in openBIS.
-
         name        -- name of the plugin
         description --
         pluginType  -- DYNAMIC_PROPERTY, MANAGED_PROPERTY, ENTITY_VALIDATION
@@ -4195,7 +4221,13 @@ class Openbis:
         )
 
     def get_sample(
-        self, sample_ident, only_data=False, withAttachments=False, props=None, **kvals
+            self,
+            sample_ident,
+            only_data=False,
+            withAttachments=False,
+            props=None,
+            thingsFetchOptions={"withDataFrame": True, "withObjects": True},
+            **kvals
     ):
         """Retrieve metadata for the sample.
         Get metadata for the sample and any directly connected parents of the sample to allow access
@@ -4258,9 +4290,149 @@ class Openbis:
         else:
             return self._sample_list_for_response(
                 response=list(resp.values()),
+                thingsFetchOptions=thingsFetchOptions,
                 props=props,
             )
 
+    def _sample_list_for_response(
+        self,
+        response,
+        thingsFetchOptions,
+        attrs=None,
+        props=None,
+        start_with=None,
+        count=None,
+        totalCount=0,
+        objects=None,
+        parsed=False,
+    ):
+        """returns a Things object, containing a DataFrame plus additional information"""
+
+        def extract_attribute(attribute_to_extract):
+            def return_attribute(obj):
+                if obj is None:
+                    return ""
+                return obj.get(attribute_to_extract, "")
+
+            return return_attribute
+
+        logger = logging.getLogger('_sample_list_for_response')
+        logger.setLevel(logging.DEBUG)
+        logger.addHandler(logging.StreamHandler(sys.stdout))
+
+        time1 = now()
+
+        logger.debug("_sample_list_for_response before parsing JSON")
+        if not parsed:
+            parse_jackson(response)
+
+        time2 = now()
+
+        logger.debug(f"_sample_list_for_response got response. Delay: {time2 - time1}")
+
+        dataFrame = None
+        if thingsFetchOptions["withDataFrame"]:
+            if attrs is None:
+                attrs = []
+            default_attrs = [
+                "identifier",
+                "permId",
+                "type",
+                "registrator",
+                "registrationDate",
+                "modifier",
+                "modificationDate",
+            ]
+            display_attrs = default_attrs + attrs
+
+            if props is None:
+                props = []
+            else:
+                if isinstance(props, str):
+                    props = [props]
+
+            if len(response) == 0:
+                for prop in props:
+                    if prop == "*":
+                        continue
+                    display_attrs.append(prop)
+                samples = DataFrame(columns=display_attrs)
+            else:
+                time3 = now()
+                logger.debug(f"_sample_list_for_response computing attributes. Delay: {time3 - time2}")
+
+                samples = DataFrame(response)
+                for attr in attrs:
+                    if "." in attr:
+                        entity, attribute_to_extract = attr.split(".")
+                        samples[attr] = samples[entity].map(
+                            extract_attribute(attribute_to_extract)
+                        )
+                    # if no dot supplied, just display the code of the space, project or experiment
+                    elif attr in ["project", "experiment"]:
+                        samples[attr] = samples[attr].map(extract_nested_identifier)
+                    elif attr in ["space"]:
+                        samples[attr] = samples[attr].map(extract_code)
+
+                samples["registrationDate"] = samples["registrationDate"].map(
+                    format_timestamp
+                )
+                samples["modificationDate"] = samples["modificationDate"].map(
+                    format_timestamp
+                )
+                samples["registrator"] = samples["registrator"].map(extract_person)
+                samples["modifier"] = samples["modifier"].map(extract_person)
+                samples["identifier"] = samples["identifier"].map(extract_identifier)
+                samples["container"] = samples["container"].map(extract_nested_identifier)
+                for column in ["parents", "children", "components"]:
+                    if column in samples:
+                        samples[column] = samples[column].map(extract_identifiers)
+                samples["permId"] = samples["permId"].map(extract_permid)
+                samples["type"] = samples["type"].map(extract_nested_permid)
+
+                time4 = now()
+                logger.debug(f"_sample_list_for_response computed attributes. Delay: {time4 - time3}")
+
+                for prop in props:
+                    if prop == "*":
+                        # include all properties in dataFrame.
+                        # expand the dataFrame by adding new columns
+                        columns = []
+                        for i, sample in enumerate(response):
+                            for prop_name, val in sample.get("properties", {}).items():
+                                samples.loc[i, prop_name.upper()] = val
+                                columns.append(prop_name.upper())
+
+                        display_attrs += set(columns)
+                        continue
+                    else:
+                        # property name is provided
+                        for i, sample in enumerate(response):
+                            if "properties" in sample:
+                                properties = sample["properties"]
+                                val = properties.get(prop, "") or properties.get(prop.upper(), "")
+                                samples.loc[i, prop.upper()] = val
+                            else:
+                                samples.loc[i, prop.upper()] = ""
+                        display_attrs.append(prop.upper())
+
+                time5 = now()
+                logger.debug(f"_sample_list_for_response computed properties. Delay: {time5 - time4}")
+
+            dataFrame = samples[display_attrs]
+
+        time6 = now()
+        logger.debug("_sample_list_for_response computing result.")
+
+        result = Things(openbis_obj=self, entity="sample",
+                        df=dataFrame,
+                        identifier_name="identifier", start_with=start_with,
+                        count=count, totalCount=totalCount, objects=objects, response=response)
+
+        time7 = now()
+        logger.debug(f"_sample_list_for_response computed result. Delay: {time7 - time6}")
+        return result
+
     @staticmethod
     def decode_attribute(entity, attribute):
         params = {}
@@ -4328,118 +4500,6 @@ class Openbis:
 
         return params
 
-    def _sample_list_for_response(
-        self,
-        response,
-        attrs=None,
-        props=None,
-        start_with=None,
-        count=None,
-        totalCount=0,
-        objects=None,
-    ):
-        """returns a Things object, containing a DataFrame plus additional information"""
-
-        def extract_attribute(attribute_to_extract):
-            def return_attribute(obj):
-                if obj is None:
-                    return ""
-                return obj.get(attribute_to_extract, "")
-
-            return return_attribute
-
-        parse_jackson(response)
-
-        if attrs is None:
-            attrs = []
-        default_attrs = [
-            "identifier",
-            "permId",
-            "type",
-            "registrator",
-            "registrationDate",
-            "modifier",
-            "modificationDate",
-        ]
-        display_attrs = default_attrs + attrs
-
-        if props is None:
-            props = []
-        else:
-            if isinstance(props, str):
-                props = [props]
-
-        if len(response) == 0:
-            for prop in props:
-                if prop == "*":
-                    continue
-                display_attrs.append(prop)
-            samples = DataFrame(columns=display_attrs)
-        else:
-            samples = DataFrame(response)
-            for attr in attrs:
-                if "." in attr:
-                    entity, attribute_to_extract = attr.split(".")
-                    samples[attr] = samples[entity].map(
-                        extract_attribute(attribute_to_extract)
-                    )
-
-            for attr in attrs:
-                # if no dot supplied, just display the code of the space, project or experiment
-                if attr in ["project", "experiment"]:
-                    samples[attr] = samples[attr].map(extract_nested_identifier)
-                if attr in ["space"]:
-                    samples[attr] = samples[attr].map(extract_code)
-
-            samples["registrationDate"] = samples["registrationDate"].map(
-                format_timestamp
-            )
-            samples["modificationDate"] = samples["modificationDate"].map(
-                format_timestamp
-            )
-            samples["registrator"] = samples["registrator"].map(extract_person)
-            samples["modifier"] = samples["modifier"].map(extract_person)
-            samples["identifier"] = samples["identifier"].map(extract_identifier)
-            samples["container"] = samples["container"].map(extract_nested_identifier)
-            for column in ["parents", "children", "components"]:
-                if column in samples:
-                    samples[column] = samples[column].map(extract_identifiers)
-            samples["permId"] = samples["permId"].map(extract_permid)
-            samples["type"] = samples["type"].map(extract_nested_permid)
-
-            for prop in props:
-                if prop == "*":
-                    # include all properties in dataFrame.
-                    # expand the dataFrame by adding new columns
-                    columns = []
-                    for i, sample in enumerate(response):
-                        for prop_name, val in sample.get("properties", {}).items():
-                            samples.loc[i, prop_name.upper()] = val
-                            columns.append(prop_name.upper())
-
-                    display_attrs += set(columns)
-                    continue
-
-                else:
-                    # property name is provided
-                    for i, sample in enumerate(response):
-                        val = sample.get("properties", {}).get(prop, "") or sample.get(
-                            "properties", {}
-                        ).get(prop.upper(), "")
-                        samples.loc[i, prop.upper()] = val
-                    display_attrs.append(prop.upper())
-
-        return Things(
-            openbis_obj=self,
-            entity="sample",
-            df=samples[display_attrs],
-            identifier_name="identifier",
-            start_with=start_with,
-            count=count,
-            totalCount=totalCount,
-            objects=objects,
-        )
-
     get_object = get_sample  # Alias
 
     def get_external_data_management_systems(
diff --git a/pybis/src/python/pybis/sample.py b/pybis/src/python/pybis/sample.py
index 1bedb359095..7ae6c8ab428 100644
--- a/pybis/src/python/pybis/sample.py
+++ b/pybis/src/python/pybis/sample.py
@@ -1,6 +1,6 @@
 from .property import PropertyHolder
 from .attribute import AttrHolder
-from .openbis_object import OpenBisObject 
+from .openbis_object import OpenBisObject
 from .definitions import openbis_definitions
 from .utils import VERBOSE
 
@@ -23,11 +23,13 @@ class Sample(
         if data is not None:
             self._set_data(data)
 
+        # TODO: Why are we using getattr() and setattr() here? They are considerably slower.
         if project is not None:
-            setattr(self, 'project', project)
+            self.project = project
 
         if props is not None:
             for key in props:
+                # self.p[key] = props[key]
                 setattr(self.p, key, props[key])
 
         if kwargs is not None:
@@ -36,10 +38,10 @@ class Sample(
 
             if 'experiment' in kwargs:
                 try:
-                    experiment = getattr(self, 'experiment')
+                    experiment = self.experiment
                     if not 'space' in kwargs:
                         project = experiment.project
-                        setattr(self.a, 'space', project.space)
+                        self.a.space = project.space
                 except Exception:
                     pass
 
@@ -77,10 +79,10 @@ class Sample(
         return [
             'type',
             'get_parents()', 'get_children()', 'get_components()',
-            'add_parents()', 'add_children()', 'add_components()', 
+            'add_parents()', 'add_children()', 'add_components()',
             'del_parents()', 'del_children()', 'del_components()',
             'set_parents()', 'set_children()', 'set_components()',
-            'get_datasets()', 
+            'get_datasets()',
             'space', 'project', 'experiment', 'container', 'tags',
             'set_tags()', 'add_tags()', 'del_tags()',
             'add_attachment()', 'get_attachments()', 'download_attachments()',
diff --git a/pybis/src/python/pybis/things.py b/pybis/src/python/pybis/things.py
index 51497d3a6c9..166a330191d 100644
--- a/pybis/src/python/pybis/things.py
+++ b/pybis/src/python/pybis/things.py
@@ -16,18 +16,19 @@ class Things():
     Because the order of the elements cannot be ensured, you should choose the identifier instead:
         openbis.get_samples()['/SOME_SPACE/SAMPLE_CODE']
 
-    Of course, if you know the identifier already, you would rather do: 
+    Of course, if you know the identifier already, you would rather do:
         openbis.get_sample('/SOME_SPACE/SAMPLE_CODE')
-    
-    
+
+
     """
 
     def __init__(
         self, openbis_obj, entity, df,
-        identifier_name='code', additional_identifier=None, 
+        identifier_name='code', additional_identifier=None,
         start_with=None, count=None, totalCount=None,
         single_item_method=None,
-        objects=None
+        objects=None,
+        response=None
     ):
         self.openbis = openbis_obj
         self.entity = entity
@@ -39,6 +40,7 @@ class Things():
         self.totalCount=totalCount
         self.single_item_method=single_item_method
         self.objects=objects
+        self.response = response
 
     def __repr__(self):
         return tabulate(self.df, headers=list(self.df))
diff --git a/pybis/src/python/pybis/utils.py b/pybis/src/python/pybis/utils.py
index ad917f5e46e..6ebd6cf7245 100644
--- a/pybis/src/python/pybis/utils.py
+++ b/pybis/src/python/pybis/utils.py
@@ -16,14 +16,14 @@ def parse_jackson(input_json):
        Any further findings only carry this reference id.
        This function is used to dereference the output.
     """
-    interesting=['tags', 'registrator', 'modifier', 'owner', 'type', 
-        'parents', 'children', 'containers', # 'container', 
+    interesting=['tags', 'registrator', 'modifier', 'owner', 'type',
+        'parents', 'children', 'containers', # 'container',
         'properties', 'experiment', 'sample',
         'project', 'space', 'propertyType', 'entityType', 'propertyType', 'propertyAssignment',
         'externalDms', 'roleAssignments', 'user', 'users', 'authorizationGroup', 'vocabulary',
         'validationPlugin', 'dataSetPermId', 'dataStore'
     ]
-    found = {} 
+    found = {}
     def build_cache(graph):
         if isinstance(graph, list):
             for item in graph:
@@ -45,15 +45,17 @@ def parse_jackson(input_json):
                     build_cache(value)
                 elif isinstance(value, list):
                     build_cache(value)
-                    
-    def deref_graph(graph):            
+
+    def deref_graph(graph):
         if isinstance(graph, list):
             for i, list_item in enumerate(graph):
                 if isinstance(list_item, int):
-                    try:
-                        graph[i] = found[list_item]
-                    except KeyError:
-                        pass
+                    # try: # TODO: use "if list_item in found" with found.get()
+                    #     graph[i] = found[list_item]
+                    # except KeyError:
+                    #     pass
+                    if list_item in found:
+                        graph[i] = found.get(list_item)
                 else:
                     deref_graph(list_item)
         elif isinstance(graph, dict) and len(graph) > 0:
@@ -63,7 +65,6 @@ def parse_jackson(input_json):
                         deref_graph(value)
                     elif isinstance(value, int):
                         graph[key] = found.get(value)
-
                     elif isinstance(value, list):
                         for i, list_item in enumerate(value):
                             if isinstance(list_item, int):
@@ -189,9 +190,9 @@ def extract_identifiers(items):
         return []
     try:
         return [
-            data['identifier']['identifier'] 
-            if 'identifier' in data 
-            else data['permId']['permId'] 
+            data['identifier']['identifier']
+            if 'identifier' in data
+            else data['permId']['permId']
             for data in items
         ]
     except TypeError:
@@ -206,8 +207,8 @@ def extract_nested_identifier(ident):
 def extract_nested_permid(permid):
     if not isinstance(permid, dict):
         return '' if permid is None else str(permid)
-    return '' if permid['permId']['permId'] is None else permid['permId']['permId'] 
-    
+    return '' if permid['permId']['permId'] is None else permid['permId']['permId']
+
 def extract_nested_permids(items):
     if not isinstance(items, list):
         return []
diff --git a/pybis/src/python/tests/conftest.py b/pybis/src/python/tests/conftest.py
index 4bddb2c2294..edeaf72291f 100644
--- a/pybis/src/python/tests/conftest.py
+++ b/pybis/src/python/tests/conftest.py
@@ -4,12 +4,10 @@ import time
 import pytest
 from pybis import Openbis
 
-
 openbis_url = "https://localhost:8443"
 admin_username = "admin"
 admin_password = "changeit"
 
-
 @pytest.fixture(scope="module")
 def openbis_instance():
     instance = Openbis(
@@ -28,6 +26,7 @@ def other_openbis_instance():
     instance = Openbis(
         url=openbis_url,
         verify_certificates=False,
+
     )
     print("\nLOGGING IN...")
     instance.login(admin_username, admin_password)
@@ -41,6 +40,7 @@ def space():
     o = Openbis(
         url=openbis_url,
         verify_certificates=False,
+
     )
     o.login(admin_username, admin_password)
 
-- 
GitLab