pybis.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
pybis.py

A class with methods for interacting with openBIS.

Created by Chandrasekhar Ramakrishnan on 2016-05-10.
Copyright (c) 2016 ETH Zuerich. All rights reserved.
"""

import os
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

import time
import json
import re
from urllib.parse import urlparse

import pandas as pd
from pandas import DataFrame, Series

import threading
from threading import Thread
from queue import Queue
DROPBOX_PLUGIN = "jupyter-uploader-api"


class Openbis:
    """Interface for communicating with openBIS."""

    def __init__(self, url='https://localhost:8443', verify_certificates=True, token=None):
        """Initialize an interface to openBIS with information necessary to connect to the server.
        :param host:
        """

        url_obj = urlparse(url)
        if  url_obj.netloc is None:
            raise ValueError("please provide the url in this format: https://openbis.host.ch:8443")

        self.url_obj = url_obj
        self.url     = url_obj.geturl()
        self.port    = url_obj.port
        self.hostname = url_obj.hostname
        self.as_v3 = '/openbis/openbis/rmi-application-server-v3.json'
        self.as_v1 = '/openbis/openbis/rmi-general-information-v1.json'
        self.reg_v1 = '/openbis/openbis/rmi-query-v1.json'
        self.verify_certificates = verify_certificates
        self.token = token
        self.datastores = []
        self.spaces = []
        self.files_in_wsp = []
        self.token_path = None

        # use an existing token, if available
        if self.token is None:
            self.token = self.get_cached_token()


    def get_cached_token(self):
        """Read the token from the cache, and set the token ivar to it, if there, otherwise None.
        If the token is not valid anymore, delete it. 
        """
        token_path = self.gen_token_path()
        if not os.path.exists(token_path):
            return None
        try:
            with open(token_path) as f:
                token = f.read()
                if not self.is_token_valid(token):
                    os.remove(token_path)
                    return None
                else:
                    return token
        except FileNotFoundError:
            return None


    def gen_token_path(self, parent_folder=None):
        """generates a path to the token file.
        The token is usually saved in a file called
        ~/.pybis/hostname.token
        """
        if parent_folder is None:
            # save token under ~/.pybis folder
            parent_folder = os.path.join(
                os.path.expanduser("~"),
                '.pybis'
            )
        path = os.path.join(parent_folder, self.hostname + '.token')
        return path


    def save_token(self, token=None, parent_folder=None):
        if token is None:
            token = self.token

        token_path = None;
        if parent_folder is None:
            token_path = self.gen_token_path()
        else:
            token_path = self.gen_token_path(parent_folder)

        # create the necessary directories, if they don't exist yet
        os.makedirs(os.path.dirname(token_path), exist_ok=True)
        with open(token_path, 'w') as f:
            f.write(token)
            self.token_path = token_path


    def delete_token(self, token_path=None):
        if token_path is None:
            token_path = self.token_path
        os.remove(token_path)


    def post_request(self, resource, data):
        resp = requests.post(self.url + resource, json.dumps(data), verify=self.verify_certificates)

        if resp.ok:
            data = resp.json()
            if 'error' in data:
                raise ValueError('an error has occured: ' + data['error']['message'] )
            elif 'result' in data:
                return data['result']
            else:
                raise ValueError('request did not return either result nor error')
        else:
            raise ValueError('general error while performing post request')


    def logout(self):
        if self.token is None:
            return

        logout_request = {
            "method":"logout",
            "params":[self.token],
            "id":"1",
            "jsonrpc":"2.0"
        }
        resp = self.post_request(self.as_v3, logout_request)
        self.token = None
        self.token_path = None
        return resp


    def login(self, username=None, password=None, save_token=False):
        """Log into openBIS.
        Expects a username and a password and updates the token (session-ID).
        The token is then used for every request.
        Clients may want to store the credentials object in a credentials store after successful login.
        Throw a ValueError with the error message if login failed.
        """

        login_request = {
            "method":"login",
            "params":[username, password],
            "id":"1",
            "jsonrpc":"2.0"
        }
        result = self.post_request(self.as_v3, login_request)
        if result is None:
            raise ValueError("login to openBIS failed")
        else:
            self.token = result
            if save_token:
                self.save_token()
            return self.token


    def get_datastores(self):
        if len(self.datastores) == 0: 
            request = {
                "method": "listDataStores",
                "params": [ self.token ],
                "id": "1",
                "jsonrpc": "2.0"
            }
            resp = self.post_request(self.as_v1, request)
            if resp is not None:
                self.datastores = DataFrame(resp)[['code','downloadUrl', 'hostUrl']]
                return self.datastores
            else:
                raise ValueError("No datastore found!")
        else:
            return self.datastores


    def get_spaces(self, refresh=None):
        if len(self.spaces) == 0 or refresh is not None:
            request = {
                "method": "searchSpaces",
                "params": [ self.token, {}, {} ],
                "id": "1",
                "jsonrpc": "2.0"
            }
            resp = self.post_request(self.as_v3, request)
            if resp is not None:
                self.spaces = DataFrame(resp['objects'])[['code','description']]
                return self.spaces
            else:
                raise ValueError("No spaces found!")
        else:
            return self.spaces

    def get_space(self, spaceId):
        request = {
        "method": "getSpaces",
            "params": [ 
            self.token,
            [{ 
                "@id": 0,
                "permId": spaceId,
                "@type": "as.dto.space.id.SpacePermId"
            }],
            {
                "@id": 0,
                "@type": "as.dto.space.fetchoptions.SpaceFetchOptions",
                "registrator": None,
                "samples": None,
                "projects": None,
                "sort": None
            } 
            ],
                "id": "1",
                "jsonrpc": "2.0"
        } 
        resp = self.post_request(self.as_v3, request)
        return Space(self, resp[spaceId])

    def get_sample_types(self):
        request = {
            "method": "searchSampleTypes",
            "params": [ self.token, {}, {} ],
            "id": "1",
            "jsonrpc": "2.0"
        }
        resp = self.post_request(self.as_v3, request)
        if resp is not None:
            datasets = DataFrame(resp['objects'])[['code','description']]
            return datasets
        return DataFrame()


    def get_dataset_types(self):
        request = {
            "method": "searchDataSetTypes",
            "params": [ self.token, {}, {} ],
            "id": "1",
            "jsonrpc": "2.0"
        }
        resp = self.post_request(self.as_v3, request)
        if resp is not None:
            datasets = DataFrame(resp['objects'])[['code','description','kind']]
            return datasets
        return DataFrame()
        

    def is_session_active(self):
        return self.is_token_valid(self.token)


    def is_token_valid(self, token=None):
        """Check if the connection to openBIS is valid.
        This method is useful to check if a token is still valid or if it has timed out,
        requiring the user to login again.
        :return: Return True if the token is valid, False if it is not valid.
        """
        if token is None:
            token = self.token
        
        if token is None:
            return False

        request = {
            "method": "isSessionActive",
            "params": [ token ],
            "id": "1",
            "jsonrpc": "2.0"
        }
        resp = self.post_request(self.as_v1, request)
        return resp


    def get_dataset(self, permid):
        """fetch a dataset and some metadata attached to it:
        - properties
        - sample
        - parents
        - children
        - containers
        - dataStore
        - physicalData
        - linkedData
        :return: a DataSet object
        """

        dataset_request = {
            "method": "getDataSets",
            "params": [
                self.token,
                [
                    {
                        "permId": permid,
                        "@type": "as.dto.dataset.id.DataSetPermId"
                    }
                ],
                {
                "parents": {
                    "@type": "as.dto.dataset.fetchoptions.DataSetFetchOptions"
                },
                "children": {
                  "@type": "as.dto.dataset.fetchoptions.DataSetFetchOptions"
                },
                "containers": {
                    "@type": "as.dto.dataset.fetchoptions.DataSetFetchOptions"
                },
                "physicalData": {
                    "@type": "as.dto.dataset.fetchoptions.PhysicalDataFetchOptions"
                },
                "linkedData": {
                    "@type": "as.dto.dataset.fetchoptions.LinkedDataFetchOptions",
                },
                "dataStore": {
                    "@type": "as.dto.datastore.fetchoptions.DataStoreFetchOptions",
                },
                "sample": {
                    "@type": "as.dto.sample.fetchoptions.SampleFetchOptions"
                },
                "properties": {
                    "@type": "as.dto.property.fetchoptions.PropertyFetchOptions"
                },
                "@type": "as.dto.dataset.fetchoptions.DataSetFetchOptions"
                }
            ],
            "id": permid,
            "jsonrpc": "2.0"
        }

        resp = self.post_request(self.as_v3, dataset_request)
        if resp is not None:
            for permid in resp:
                return DataSet(self, permid, resp[permid])


    def get_sample(self, sample_ident):
        """Retrieve metadata for the sample.
        Get metadata for the sample and any directly connected parents of the sample to allow access
        to the same information visible in the ELN UI. The metadata will be on the file system.
        :param sample_identifiers: A list of sample identifiers to retrieve.
        """

        if self.token is None:
            raise ValueError("Please login first")

        search_request = None

        # assume we got a sample identifier e.g. /TEST/TEST-SAMPLE
        match = re.match('/', sample_ident)
        if match:
            search_request = {
                "identifier": sample_ident,
                "@type": "as.dto.sample.id.SampleIdentifier"
            }
        else:
            # look if we got a PermID eg. 234567654345-123
            match = re.match('\d+\-\d+', sample_ident)
            if match:
                search_request = {
                    "permId": sample_ident,
                    "@type": "as.dto.sample.id.SamplePermId"
                }
            else:
                raise ValueError(
                    '"' + sample_ident + '" is neither a Sample Identifier nor a PermID'
                )

        sample_request = {
            "method": "getSamples",
            "params": [
                self.token,
                [
                    search_request, 
                ],
                {
                    "type": {
                        "@type": "as.dto.sample.fetchoptions.SampleTypeFetchOptions"
                    },
                    "properties": {
                        "@type": "as.dto.property.fetchoptions.PropertyFetchOptions"
                    },
                    "parents": {
                        "@type": "as.dto.sample.fetchoptions.SampleFetchOptions",
                        "properties": {
                            "@type": "as.dto.property.fetchoptions.PropertyFetchOptions"
                        }
                    },
                    "children": {
                        "@type": "as.dto.sample.fetchoptions.SampleFetchOptions",
                        "properties": {
                            "@type": "as.dto.property.fetchoptions.PropertyFetchOptions"
                        }
                    },
                    "dataSets": {
                        "@type": "as.dto.dataset.fetchoptions.DataSetFetchOptions",
                        "properties": {
                            "@type": "as.dto.property.fetchoptions.PropertyFetchOptions"
                        }
                    },
                    "registrator": {
                        "@type": "as.dto.person.fetchoptions.PersonFetchOptions",
                    },
                    "tags": {
                        "@type": "as.dto.tag.fetchoptions.TagFetchOptions",
                    },
                }
            ],
            "id": sample_ident,
            "jsonrpc": "2.0"
        }
        resp = self.post_request(self.as_v3, sample_request)
        if resp is not None:
            for sample_ident in resp:
                return Sample(self, resp[sample_ident])


    def delete_sample(self, permid, reason):
        sample_delete_request = {
            "method": "deleteSamples",
            "params": [
                self.token,
                [
                    {
                        "permId": permid,
                        "@type": "as.dto.sample.id.SamplePermId"
                    }
                ],
                {
                    "reason": reason,
                    "@type": "as.dto.sample.delete.SampleDeletionOptions"
                }
            ],
            "id": "1",
            "jsonrpc": "2.0"
        }
        resp = self.post_request(self.as_v3, sample_delete_request)
        return


    def new_space(self, name, description=None):
        request = {
            "method": "createSpaces",
            "params": [
                self.token,
                [ {
                    "@id": 0,
                    "code": name,
                    "description": description,
                    "@type": "as.dto.space.create.SpaceCreation"
                } ]
            ],
            "id": "1",
            "jsonrpc": "2.0"
        }
        resp = self.post_request(self.as_v3, request)
        return self.get_spaces(refresh=True)


    def new_analysis(self, name, description=None, sample=None, dss_code=None, result_files=None,
    notebook_files=None, parents=[]):

        """ general method to upload datasets to the datastore server
            and starting the aggregation service. Technically it involves
            uploading files to the session workspace and activating the
            dropbox aka dataset ingestion service
        """

        if dss_code is None:
            dss_code = self.get_datastores()['code'][0]

        # if a sample identifier was given, use it as a string.
        # if a sample object was given, take its identifier
        # TODO: handle permId's 
        sample_identifier = None
        if isinstance(sample, str):
            sample_identifier = sample
        else:
            sample_identifier = sample.ident
        
        datastore_url = self.get_dss_url(dss_code)
        folder = time.strftime('%Y-%m-%d_%H-%M-%S')

        data_sets = []
        if notebook_files is not None:
            notebooks_folder = os.path.join(folder, 'notebook_files')
            self.upload_files(
                datastore_url = datastore_url,
                files=notebook_files,
                folder= notebooks_folder, 
                wait_until_finished=True
            )
            data_sets.append({
                "dataSetType" : "JUPYTER_NOTEBOOk",
                "sessionWorkspaceFolder": notebooks_folder,
                "fileNames" : notebook_files,
                "properties" : {}
            })
        if result_files is not None:
            results_folder = os.path.join(folder, 'result_files')
            self.upload_files(
                datastore_url = datastore_url,
                files=result_files,
                folder=results_folder,
                wait_until_finished=True
            )
            data_sets.append({
                "dataSetType" : "JUPYTER_RESULT",
                "sessionWorkspaceFolder" : results_folder,
                "fileNames" : result_files,
                "properties" : {}
            })


        request = {
          "method": "createReportFromAggregationService",
          "params": [
            self.token,
            dss_code,
            DROPBOX_PLUGIN,
            { 
            	"sample" : {
                	"identifier" : sample.identifier
                },
                "containers" : [ 
                    {
                    	"dataSetType" : "JUPYTER_CONTAINER",
                    	"properties" : {
                			"NAME" : "another Analysis name",
                			"DESCRIPTION" : "This is a new description of my analysis"
                    	}
                    }
                ],
                "dataSets" : data_sets,
                "parents" : parents,
            }
          ],
          "id": "1",
          "jsonrpc": "2.0"
        }

        return request


    def new_sample(self, sample_name, space_name, sample_type="UNKNOWN", tags=[]):

        if isinstance(tags, str):
            tags = [tags]
        tag_ids = []
        for tag in tags:
            tag_dict = {
                "code":tag,
                "@type":"as.dto.tag.id.TagCode"
            }
            tag_ids.append(tag_dict)


        sample_create_request = {
            "method":"createSamples",
            "params":[
                self.token,
                [ {
                    "properties":{},
                    "typeId":{
                        "permId": sample_type,
                        "@type":"as.dto.entitytype.id.EntityTypePermId"
                    },
                    "code": sample_name,
                    "spaceId":{
                        "permId": space_name,
                        "@type":"as.dto.space.id.SpacePermId"
                    },
                    "tagIds":tag_ids,
                    "@type":"as.dto.sample.create.SampleCreation",
                    "experimentId":None,
                    "containerId":None,
                    "componentIds":None,
                    "parentIds":None,
                    "childIds":None,
                    "attachments":None,
                    "creationId":None,
                    "autoGeneratedCode":None
                } ]
            ],
            "id":"1",
            "jsonrpc":"2.0"
        }
        resp = self.post_request(self.as_v3, sample_create_request)
        if 'permId' in resp[0]:
            return self.get_sample(resp[0]['permId'])
        else:
            raise ValueError("error while trying to fetch sample from server: " + str(resp))


    def get_dss_url(self, dss_code=None):
        
        dss = self.get_datastores()
        if dss_code is None:
            return dss['downloadUrl'][0]
        else:
            return dss[dss['code'] == dss_code]['downloadUrl'][0]
        

    def upload_files(self, datastore_url=None, files=None, folder=None, wait_until_finished=False):

        if datastore_url is None:
            datastore_url = self.get_dss_url()

        if files is None:
            raise ValueError("Please provide a filename.")

        if folder is None:
            # create a unique foldername
            folder = time.strftime('%Y-%m-%d_%H-%M-%S')

        if isinstance(files, str):
            files = [files]

        self.files = files
        self.startByte = 0
        self.endByte   = 0
    
        # define a queue to handle the upload threads
        queue = DataSetUploadQueue()

        real_files = []
        for filename in files:
            if os.path.isdir(filename):
                real_files.extend([os.path.join(dp, f) for dp, dn, fn in os.walk(os.path.expanduser(filename)) for f in fn])
            else:
                real_files.append(os.path.join(filename))

        # compose the upload-URL and put URL and filename in the upload queue 
        for filename in real_files:
            file_in_wsp = os.path.join(folder, filename)
            self.files_in_wsp.append(file_in_wsp)
            upload_url = (
                datastore_url + '/session_workspace_file_upload'
                + '?filename=' + os.path.join(folder,filename)
                + '&id=1'
                + '&startByte=0&endByte=0'
                + '&sessionID=' + self.token
            )
            queue.put([upload_url, filename, self.verify_certificates])

        # wait until all files have uploaded
        if wait_until_finished:
            queue.join()

        # return files with full path in session workspace
        return self.files_in_wsp


class DataSetUploadQueue:
   
    def __init__(self, workers=20):
        # maximum files to be uploaded at once
        self.upload_queue = Queue()

        # define number of threads and start them
        for t in range(workers):
            t = Thread(target=self.upload_file)
            t.daemon = True
            t.start()


    def put(self, things):
        """ expects a list [url, filename] which is put into the upload queue
        """
        self.upload_queue.put(things)


    def join(self):
        """ needs to be called if you want to wait for all uploads to be finished
        """
        self.upload_queue.join()


    def upload_file(self):
        while True:
            # get the next item in the queue
            upload_url, filename, verify_certificates = self.upload_queue.get()

            # upload the file to our DSS session workspace
            with open(filename, 'rb') as f:
                resp = requests.post(upload_url, data=f, verify=verify_certificates)
                resp.raise_for_status()

            # Tell the queue that we are done
            self.upload_queue.task_done()


class DataSetDownloadQueue:
    
    def __init__(self, workers=20):
        # maximum files to be downloaded at once
        self.download_queue = Queue()

        # define number of threads
        for t in range(workers):
            t = Thread(target=self.download_file)
            t.daemon = True
            t.start()


    def put(self, things):
        """ expects a list [url, filename] which is put into the download queue
        """
        self.download_queue.put(things)


    def join(self):
        """ needs to be called if you want to wait for all downloads to be finished
        """
        self.download_queue.join()


    def download_file(self):
        while True:
            url, filename, verify_certificates = self.download_queue.get()
            # create the necessary directory structure if they don't exist yet
            os.makedirs(os.path.dirname(filename), exist_ok=True)

            # request the file in streaming mode
            r = requests.get(url, stream=True, verify=verify_certificates)
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024): 
                    if chunk: # filter out keep-alive new chunks
                        f.write(chunk)

            self.download_queue.task_done()


class DataSet():
    """objects which contain datasets"""

    def __init__(self, openbis_obj, permid, data):
        self.openbis = openbis_obj
        self.permid  = permid
        self.data    = data
        self.v1_ds = '/datastore_server/rmi-dss-api-v1.json'
        self.downloadUrl = self.data['dataStore']['downloadUrl']


    def download(self, files=None, wait_until_finished=False, workers=10):
        """ download the actual files and put them in the following folder:
        __current_dir__/hostname/dataset_permid/
        """

        if files == None:
            files = self.file_list()
        elif isinstance(files, str):
            files = [files]

        base_url = self.downloadUrl + '/datastore_server/' + self.permid + '/'

        queue = DataSetDownloadQueue(workers=workers)

        # get file list and start download
        for filename in files:
            download_url = base_url + filename + '?sessionID=' + self.openbis.token 
            filename = os.path.join(self.openbis.hostname, self.permid, filename)
            queue.put([download_url, filename, self.openbis.verify_certificates])

        # wait until all files have downloaded
        if wait_until_finished:
            queue.join()


    def get_parents(self):
        parents = []
        for item in self.data['parents']:
            parent = self.openbis.get_dataset(item['code'])
            if parent is not None:
                parents.append(parent)
        return parents


    def get_children(self):
        children = []
        for item in self.data['children']:
            child = self.openbis.get_dataset(item['code'])
            if child is not None:
                children.append(child)
        return children


    def file_list(self):
        files = []
        for file in self.get_file_list(recursive=True):
            if file['isDirectory']:
                pass
            else:
                files.append(file['pathInDataSet'])
        return files

        
    def get_file_list(self, recursive=True, start_folder="/"):
        request = {
            "method" : "listFilesForDataSet",
            "params" : [ 
                self.openbis.token,
                self.permid, 
                start_folder,
                recursive,
            ],
            "id":"1"
        }

        resp = requests.post(
            self.downloadUrl + self.v1_ds, 
            json.dumps(request), 
            verify=self.openbis.verify_certificates
        )

        if resp.ok:
            data = resp.json()
            if 'error' in data:
                raise ValueError('Error from openBIS: ' + data['error'] )
            elif 'result' in data:
                return data['result']
            else:
                raise ValueError('request to openBIS did not return either result nor error')
        else:
            raise ValueError('internal error while performing post request')


class AttrDict(dict):
    """ this class is just transforming a dictionary into an object
    """
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


class Sample(dict):
    """ managing openBIS samples
    """

    def __init__(self, openbis_obj, *args, **kwargs):
        super(Sample, self).__init__(*args, **kwargs)
        self.__dict__ = self
        self.openbis = openbis_obj
        self.permid = self.permId['permId']
        self.ident = self.identifier['identifier']


    def delete(self, permid, reason):
        self.openbis.delete_sample(permid, reason)


    def get_datasets(self):
        datasets = []
        for item in self.dataSets:
            datasets.append(self.openbis.get_dataset(item['permId']['permId']))
        return datasets


    def get_parents(self):
        parents = []
        for item in self.parents:
            parent = self.openbis.get_sample(item['permId']['permId'])
            if parent is not None:
                parents.append(parent)
        return parents


    def get_children(self):
        children = []
        for item in self.children:
            child = self.openbis.get_sample(item['permId']['permId'])
            if child is not None:
                children.append(child)
        return children


class Space(dict):
    """ managing openBIS spaces
    """

    def __init__(self, openbis_obj, *args, **kwargs):
        super(Space, self).__init__(*args, **kwargs)
        self.__dict__ = self
        self.openbis = openbis_obj
        self.code = self.code

    def get_samples(self):
        fields = ['spaceCode','permId', 'identifier','experimentIdentifierOrNull']
        request = {
            "method": "searchForSamples",
            "params": [
                self.openbis.token,
                {
                "matchClauses": [
                    {
                    "@type": "AttributeMatchClause",
                    "fieldType": "ATTRIBUTE",
                    "attribute": "SPACE",
                    "desiredValue": self.code,
                    }
                ],
                    "subCriterias": [],
                    "operator": "MATCH_ALL_CLAUSES"
                },
                [
                    "PROPERTIES",
                    "PARENTS"
                ]
            ],
            "id": "1",
            "jsonrpc": "2.0"
        }
        resp = self.openbis.post_request(self.openbis.as_v1, request)
        if resp is not None and len(resp) > 0:
            datasets = DataFrame(resp)[fields]
            return datasets
        else:
            return None