Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
O
openbis
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
sispub
openbis
Commits
1f1bdeed
Commit
1f1bdeed
authored
12 years ago
by
kohleman
Browse files
Options
Downloads
Patches
Plain Diff
CISDTT-30: Work Package 3: Sample Sheet Creation for Novartis NGS Master Data Schema.
SVN: 27199
parent
0f057518
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
deep_sequencing_unit/source/Jython/createSampleSheet_nov.py
+387
-0
387 additions, 0 deletions
deep_sequencing_unit/source/Jython/createSampleSheet_nov.py
with
387 additions
and
0 deletions
deep_sequencing_unit/source/Jython/createSampleSheet_nov.py
0 → 100644
+
387
−
0
View file @
1f1bdeed
'''
@copyright:
Copyright 2012 ETH Zuerich, CISD
@license:
Licensed under the Apache License, Version 2.0 (the
'
License
'
);
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an
'
AS IS
'
BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
@author:
Manuel Kohler
@description:
Creates the SampleSheet.csv out of values from openBIS for Demultiplexing
used in the Illumina pipeline (configureBclToFastq.pl)
@attention:
Runs under Jython
@note:
Takes into account to replace special characters with an underscore so that the Illumina script
does not fail
HiSeq Header Description
========================
Column Header Description
FCID Flow cell ID
Lane Positive integer, indicating the lane number (1-8)
SampleID ID of the sample
SampleRef The reference used for alignment for the sample
Index Index sequences. Multiple index reads are separated by a hyphen (for example, ACCAGTAA-GGACATGA).
Description Description of the sample
Control Y indicates this lane is a control lane, N means sample
Recipe Recipe used during sequencing
Operator Name or ID of the operator
SampleProject The project the sample belongs to
'''
from
__future__
import
with_statement
import
os
import
logging
import
re
import
sys
import
string
import
smtplib
from
ConfigParser
import
SafeConfigParser
from
optparse
import
OptionParser
from
datetime
import
*
from
ch.systemsx.cisd.openbis.dss.client.api.v1
import
OpenbisServiceFacadeFactory
from
ch.systemsx.cisd.openbis.generic.shared.api.v1.dto
import
SearchCriteria
from
ch.systemsx.cisd.openbis.generic.shared.api.v1.dto
import
SearchSubCriteria
from
java.util
import
EnumSet
# End-of-line sequence for each value accepted by the --lineending option.
lineending = dict(win32='\r\n', linux='\n', mac='\r')

# Column separator used when assembling the rows of the sample sheet.
COMMA = ','
def login(configMap, logger):
    '''
    Opens a session on the openBIS server described by configMap.

    Reads the keys 'openbisUserName', 'openbisPassword', 'openbisServer' and
    'connectionTimeout' from configMap and passes them to
    OpenbisServiceFacadeFactory.tryCreate.

    Returns whatever tryCreate returns.
    Raises KeyError if one of the keys is missing from configMap and
    Exception (with a descriptive message) if tryCreate itself fails.
    '''
    # Look the settings up before the try block so that a missing key is
    # reported as such and not as a connection problem.
    server = configMap['openbisServer']
    userName = configMap['openbisUserName']
    password = configMap['openbisPassword']
    timeout = configMap['connectionTimeout']

    logger.info('Logging into ' + server)
    try:
        service = OpenbisServiceFacadeFactory.tryCreate(userName, password, server, timeout)
    except:
        # Deliberately bare: the call goes into Java code and the failure may
        # not be a Python exception class.
        # NOTE(review): confirm which exception types tryCreate can raise.
        message = ('Could not connect to ' + server + '. Please check if the server ' +
                   'address is OK, the firewall is not blocking the communication or openBIS is down.')
        logger.error(message)
        # A plain string is not a valid exception object, wrap it.
        raise Exception(message)
    # NOTE(review): the 'try' prefix suggests the factory may return None on a
    # failed login - confirm and handle if so.
    return service
def logout(service, logger):
    '''
    Ends the session by calling logout() on the given service and writes an
    info entry to the log afterwards.
    '''
    service.logout()
    logger.info('Logged out')
def setUpLogger(logPath, logLevel=logging.INFO):
    '''
    Configures logging into a time-stamped file below logPath.

    The file is named createSampleSheet_<YYYY-MM-DD_HH_MM_SS>.log. The root
    logger is configured through logging.basicConfig with the given logLevel
    and the format '%(asctime)s %(message)s'.

    logPath  -- directory for the log file; a trailing separator is optional
    logLevel -- level handed to basicConfig, logging.INFO by default

    Returns a logger whose name is the log file name.
    '''
    timeStamp = datetime.now().strftime('%Y-%m-%d_%H_%M_%S')
    logFileName = 'createSampleSheet' + '_' + timeStamp + '.log'
    # os.path.join gives the same result as plain concatenation when logPath
    # already ends with a separator, and still works when it does not.
    logging.basicConfig(filename=os.path.join(logPath, logFileName),
                        format='%(asctime)s %(message)s',
                        level=logLevel)
    return logging.getLogger(logFileName)
def parseConfigurationFile(logger, propertyFile='etc/createSampleSheet_nov.properties'):
    '''
    Parses the given config file and returns the parser holding its values.

    logger       -- receives an info entry naming the file, and an error entry
                    if the file could not be read
    propertyFile -- path of the ini-style configuration file

    Returns the SafeConfigParser instance. When the file could not be read the
    parser is returned empty, so later lookups on it will fail.
    '''
    logger.info('Reading config file ' + propertyFile)
    config = SafeConfigParser()
    # read() skips files it cannot open and returns the list of files it did
    # parse, so an empty result means nothing was loaded.
    if not config.read(propertyFile):
        logger.error('Could not read config file ' + propertyFile)
    return config
def parseOptions(logger):
    '''
    Parses the command line parameters of the script.

    Supported options:
      -f/--flowcell    flow cell to create the sample sheet for (mandatory)
      -l/--lineending  one of win32, linux, mac; defaults to linux
      -o/--outdir      output directory; defaults to ./
      -d/--debug       switches on verbose debug logging

    A non-empty outdir is normalised to end with '/'. If no flow cell is
    given, the help text is printed and the script exits with status -1.

    Returns the options object produced by OptionParser.parse_args.
    '''
    logger.info('Parsing command line parameters')
    parser = OptionParser(version='%prog 1.0')
    parser.add_option('-f', '--flowcell', dest='flowcell',
                      help='The flowcell which is used to create the SampleSheet.csv',
                      metavar='<flowcell>')
    parser.add_option('-l', '--lineending', dest='lineending', type='choice',
                      action='store', choices=['win32', 'linux', 'mac'], default='linux',
                      help='Specify end of line separator: win32, linux, mac. Default: linux',
                      metavar='<lineending>')
    parser.add_option('-o', '--outdir', dest='outdir', default='./',
                      help='Specify the ouput directory. Default: ./',
                      metavar='<outdir>')
    parser.add_option('-d', '--debug', dest='debug', default=False, action='store_true',
                      help='Verbose debug logging. Default: False')

    (options, args) = parser.parse_args()

    # An empty outdir is left alone (it means the current directory);
    # appending '/' to it would point at the file system root.
    if options.outdir and not options.outdir.endswith('/'):
        options.outdir = options.outdir + '/'

    if options.flowcell is None:
        parser.print_help()
        # sys.exit instead of the site-provided exit(), which is not
        # guaranteed to exist in every interpreter setup.
        sys.exit(-1)
    return options
def readConfig(logger):
    '''
    Loads the configuration file via parseConfigurationFile and flattens the
    settings the script needs into one dictionary keyed by option name.

    All values are read as text, except 'openbisPassword', which is read with
    raw=True, and 'connectionTimeout', which is read as an integer.
    '''
    parser = parseConfigurationFile(logger)

    def asText(section, option):
        return parser.get(section, option)

    def asRawText(section, option):
        return parser.get(section, option, raw=True)

    def asInteger(section, option):
        return parser.getint(section, option)

    # (section, option, reader) in the order the options are read.
    layout = (
        ('GENERAL', 'facilityName', asText),
        ('GENERAL', 'facilityNameShort', asText),
        ('GENERAL', 'facilityInstitution', asText),
        ('GENERAL', 'sampleSheetFileName', asText),
        ('GENERAL', 'lanePrefix', asText),
        ('GENERAL', 'separator', asText),
        ('GENERAL', 'indexSeparator', asText),
        ('OPENBIS', 'openbisUserName', asText),
        ('OPENBIS', 'openbisPassword', asRawText),
        ('OPENBIS', 'openbisServer', asText),
        ('OPENBIS', 'connectionTimeout', asInteger),
        ('OPENBIS', 'illuminaFlowCellTypeName', asText),
        ('OPENBIS', 'index1Name', asText),
        ('OPENBIS', 'index2Name', asText),
        ('ILLUMINA', 'hiSeqNames', asText),
        ('ILLUMINA', 'hiSeqHeader', asText),
    )

    configMap = {}
    for section, option, reader in layout:
        configMap[option] = reader(section, option)
    return configMap
def sanitizeString(myString):
    '''
    Replaces every run of characters other than ASCII letters and digits with
    a single underscore and returns the result.
    '''
    unwantedRun = re.compile('[^A-Za-z0-9]+')
    return unwantedRun.sub('_', myString)
def getVocabulary(vocabularyCode, service=None):
    '''
    Returns the vocabulary terms and vocabulary labels of a vocabulary in a dictionary
    specified by the parameter vocabularyCode

    vocabularyCode -- code of the vocabulary to look up
    service        -- object offering listVocabularies(); when omitted, a
                      module-level variable named 'service' is used

    The dictionary maps term code to term label. It is empty, and a message is
    printed, when no vocabulary with that code exists or it has no terms. If
    several vocabularies carry the code, the last one listed wins.

    Raises NameError if no service is passed and no module-level one exists.
    '''
    if service is None:
        # Keep older call sites working that relied on a global 'service'.
        try:
            service = globals()['service']
        except KeyError:
            raise NameError("global name 'service' is not defined")

    terms = []
    for vocabulary in service.listVocabularies():
        if vocabulary.getCode() == vocabularyCode:
            terms = vocabulary.getTerms()

    vocabularyDict = {}
    if terms:
        for term in terms:
            vocabularyDict[term.getCode()] = term.getLabel()
    else:
        print ('No vocabulary found for ' + vocabularyCode)
    return vocabularyDict
def getFlowCell(illuminaFlowCellTypeName, flowCellName, service, logger):
    '''
    Looks up the flow cell sample and the samples contained in it.

    The flow cell is searched by sample type (illuminaFlowCellTypeName) and
    code (flowCellName). Exactly one match is required; otherwise the number
    of matches is logged and printed and the script exits with status 1.

    Returns a tuple (flow cell sample, contained samples).
    '''
    sc = SearchCriteria()
    sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(
        SearchCriteria.MatchClauseAttribute.TYPE, illuminaFlowCellTypeName))
    sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(
        SearchCriteria.MatchClauseAttribute.CODE, flowCellName))
    foundSample = service.searchForSamples(sc)

    # Explicit check instead of assert: asserts can be compiled away, and the
    # indexing below would then fail with a far less helpful error.
    matchCount = foundSample.size()
    if matchCount != 1:
        message = str(matchCount) + ' flow cells found which match.'
        logger.error(message)
        print (message)
        sys.exit(1)

    flowCell = foundSample[0]
    logger.info('Found ' + flowCell.getCode() + ' in openBIS')

    # Search for contained samples: reuse the flow cell criteria as the
    # container criteria of a second search.
    sampleSc = SearchCriteria()
    sampleSc.addSubCriteria(SearchSubCriteria.createSampleContainerCriteria(sc))
    foundContainedSamples = service.searchForSamples(sampleSc)

    return flowCell, foundContainedSamples
def getParents(sampleName, service):
    '''
    Returns a list of parents of a sample

    sampleName -- code of the child sample
    service    -- object offering searchForSamples()

    Only the parent search is sent to the server; the child itself is not
    fetched because nothing here needs it.
    '''
    sc = SearchCriteria()
    sc.addMatchClause(SearchCriteria.MatchClause.createAttributeMatch(
        SearchCriteria.MatchClauseAttribute.CODE, sampleName))

    # set the criteria for getting the parents when providing the child name
    sampleSc = SearchCriteria()
    sampleSc.addSubCriteria(SearchSubCriteria.createSampleChildCriteria(sc))
    return service.searchForSamples(sampleSc)
def getContainedSampleProperties(containedSamples, service):
    '''
    Collects, for every contained sample (lane), the properties of each of its
    parents.

    Returns a dictionary keyed by the sanitized string
    '<parent code>_<lane code>'. Each value is a dictionary holding the
    parent's properties plus the entries 'LANE' (the lane code) and
    'SAMPLE_TYPE' (the parent's sample type code).
    '''
    collected = {}
    for lane in containedSamples:
        laneCode = lane.getCode()
        for parent in getParents(laneCode, service):
            parentProperties = parent.getProperties()
            entry = dict((name, parentProperties.get(name)) for name in parentProperties)
            entry['LANE'] = laneCode
            entry['SAMPLE_TYPE'] = parent.getSampleTypeCode()
            collected[sanitizeString(parent.getCode() + '_' + laneCode)] = entry
    return collected
def convertSampleToDict(foundFlowCell, configMap):
    '''
    Turns a ch.systemsx.cisd.openbis.generic.shared.api.v1.dto.Sample into a
    python dict.

    The result holds every property of the sample plus 'Name' (the last
    '/'-separated part of the identifier) and 'CODE' (the sample code).
    configMap is accepted but not used.
    '''
    sampleProperties = foundFlowCell.getProperties()
    converted = dict((name, sampleProperties.get(name)) for name in sampleProperties)
    identifierParts = foundFlowCell.getIdentifier().split('/')
    converted['Name'] = identifierParts[-1]
    converted['CODE'] = foundFlowCell.getCode()
    return converted
def getIndex(indx1, indx2, index1ReadLength, indexRead2Length, properties, configMap):
    '''
    Builds the value of the Index column from the index properties.

    indx1, indx2     -- property names of the first and second index
    index1ReadLength -- number of leading bases to keep from the first index
    indexRead2Length -- number of leading bases to keep from the second index
    properties       -- mapping that may contain the two index properties
    configMap        -- supplies 'indexSeparator', placed before index 2

    An index is used only if its property exists and its length is above 0.
    If only the second index qualifies, the result starts with the separator.

    Returns the index string, '' if neither index qualifies.
    '''
    if indx1 in properties and index1ReadLength > 0:
        index = properties[indx1][0:index1ReadLength]
    else:
        index = ''

    if indx2 in properties and indexRead2Length > 0:
        # Cut the second index with its own read length, not with the one of
        # the first index.
        index = index + configMap['indexSeparator'] + properties[indx2][0:indexRead2Length]
    return index
def getSampleProperties(parentsKey, service, logger):
    '''
    Walks one level up from the given samples and returns the last parent
    found together with its properties.

    parentsKey -- iterable of samples whose parents are looked up
    service    -- passed on to getParents
    logger     -- receives a debug entry (type code and code) for every
                  sample and every parent visited

    Returns a tuple (parent sample, parent properties) for the last parent of
    the last sample that has one.
    Raises ValueError if none of the samples has a parent.
    '''
    parentSample = None
    parentSampleProperties = None

    for sample in parentsKey:
        logger.debug(sample.getSampleTypeCode() + ' ' + sample.getCode())
        for candidate in getParents(sample.getCode(), service):
            logger.debug(candidate.getSampleTypeCode() + ' ' + candidate.getCode())
            parentSample = candidate
            parentSampleProperties = candidate.getProperties()

    if parentSample is None:
        # Without this check the return below would fail on unbound names.
        raise ValueError('No parent sample found for the given samples')
    return parentSample, parentSampleProperties
def createSampleSheetDict(configMap, control, sampleSheetDict, flowCellName, flowCellOperator,
                          end_type, cycles, lane, gaNumber, index, sample, sampleProperties):
    '''
    Adds one row to sampleSheetDict.

    The row is stored as a one-element list under the key
    '<lane>_<sample code>'. Its comma separated columns are: flow cell name,
    lane prefix + lane, sample code, the 'SPECIES' property, index, the
    sanitized 'SAMPLE_NAME' property, control, '<end_type>_<cycles>',
    operator and gaNumber.

    Nothing is returned; sampleSheetDict is modified in place.
    '''
    sampleCode = sample.getCode()
    columns = [
        flowCellName,
        configMap['lanePrefix'] + lane,
        sampleCode,
        sampleProperties['SPECIES'],
        index,
        sanitizeString(sampleProperties['SAMPLE_NAME']),
        control,
        end_type + '_' + cycles,
        flowCellOperator,
        gaNumber,
    ]
    rowKey = lane + '_' + sampleCode
    sampleSheetDict[rowKey] = [COMMA.join(columns)]
def createHiseqSampleSheet(laneParentDict, flowCellDict, configMap, service, logger, myoptions):
    '''
    Builds up a dictionary with all entries in the Sample Sheet and writes it
    to a file via writeSampleSheet.

    laneParentDict -- per lane/parent property dictionaries; each must hold
                      'LANE' and, when it carries the first index property,
                      'GA_NUMBER'
    flowCellDict   -- flow cell values; 'CODE', 'OPERATOR', 'END_TYPE',
                      'READ_LEN', 'LENGTH_OF_INDEX1' and 'LENGTH_OF_INDEX2'
                      are read
    configMap      -- 'hiSeqHeader', 'index1Name', 'index2Name' and
                      'sampleSheetFileName' are read here
    service        -- passed on to getParents / getSampleProperties
    logger         -- receives debug output
    myoptions      -- 'outdir' is read here; the object is passed on to
                      writeSampleSheet
    '''
    control = 'N'
    # the Illumina pipeline always uses one base less than the sequencer is sequencing
    demultiplexIndexLengthPenalty = -1
    sampleSheetDict = {}
    # Making sure this is on the top of the Sample Sheet: the keys are sorted
    # before writing and '!' sorts ahead of digits and letters.
    sampleSheetDict[u'!'] = ([configMap['hiSeqHeader']])
    indx1 = configMap['index1Name']
    indx2 = configMap['index2Name']
    flowCellName = flowCellDict['CODE']
    flowCellOperator = flowCellDict['OPERATOR']
    end_type = flowCellDict['END_TYPE']
    cycles = flowCellDict['READ_LEN']
    index1ReadLength = int(flowCellDict['LENGTH_OF_INDEX1']) + demultiplexIndexLengthPenalty
    indexRead2Length = int(flowCellDict['LENGTH_OF_INDEX2']) + demultiplexIndexLengthPenalty
    for key in laneParentDict.keys():
        # Only the last character of the lane code is used as the lane number.
        lane = laneParentDict[key]['LANE'][-1:]
        properties = laneParentDict[key]
        # already Library with index
        if indx1 in properties:
            gaNumber = laneParentDict[key]['GA_NUMBER']
            index = getIndex(indx1, indx2, index1ReadLength, indexRead2Length, properties, configMap)
            # The key's last two '_'-separated parts are dropped to get the name to look up.
            # NOTE(review): presumably this recovers the parent sample code - confirm
            # against how the keys of laneParentDict are built.
            sample, sampleProperties = getSampleProperties(getParents(key.rsplit('_', 2)[0], service), service, logger)
            createSampleSheetDict(configMap, control, sampleSheetDict, flowCellName, flowCellOperator, end_type, cycles, lane, gaNumber, index, sample, sampleProperties)
        else:
            # No index on this entry: take GA_NUMBER and the index from each
            # of its parents (the libraries) instead.
            for library in getParents(key.rsplit('_', 2)[0], service):
                libraryProperties = library.getProperties()
                gaNumber = libraryProperties['GA_NUMBER']
                logger.debug(library.getSampleTypeCode())
                index = getIndex(indx1, indx2, index1ReadLength, indexRead2Length, libraryProperties, configMap)
                sample, sampleProperties = getSampleProperties(getParents(library.getCode(), service), service, logger)
                createSampleSheetDict(configMap, control, sampleSheetDict, flowCellName, flowCellOperator, end_type, cycles, lane, gaNumber, index, sample, sampleProperties)
    logger.debug(sampleSheetDict)
    # keys() is sorted in place, which relies on it returning a list.
    sortedSampleSheetList = sampleSheetDict.keys()
    sortedSampleSheetList.sort()
    writeSampleSheet(flowCellName, sampleSheetDict, sortedSampleSheetList, myoptions, logger, fileName=myoptions.outdir + configMap['sampleSheetFileName'])
def writeSampleSheet(flowCellName, sampleSheetDict, sortedSampleSheetList, myoptions, logger,
                     fileName):
    '''
    Write the given dictionary to a csv file

    flowCellName          -- becomes part of the file name
    sampleSheetDict       -- maps row key to a list whose first element is the row text
    sortedSampleSheetList -- row keys in the order they are written
    myoptions             -- its 'lineending' attribute selects the end-of-line sequence
    logger                -- receives an info entry on success, an error entry on failure
    fileName              -- path prefix; the file is '<fileName>_<flowCellName>.csv'

    I/O errors are logged and printed, not raised.
    '''
    newline = lineending[myoptions.lineending]
    myFile = fileName + '_' + flowCellName + '.csv'
    try:
        with open(myFile, 'w') as sampleSheetFile:
            for listElement in sortedSampleSheetList:
                sampleSheetFile.write(sampleSheetDict[listElement][0] + newline)
        logger.info('Writing file ' + myFile)
        print ('Written ' + myFile)
    except IOError:
        # Fetch the active exception in a way that works on every Python
        # version this script may run on.
        err = sys.exc_info()[1]
        logger.error('File error: ' + str(err))
        print ('File error: ' + str(err))
def main():
    '''
    Main script

    Sets up logging below log/, parses the command line, reads the
    configuration, logs into openBIS, looks up the requested flow cell with
    its contained samples and creates the sample sheet from them. The session
    is logged out again even if one of the steps after login fails or exits.
    '''
    logger = setUpLogger('log/')
    logger.info('Started Creation of Sample Sheet...')

    myoptions = parseOptions(logger)
    if myoptions.debug:
        logger.setLevel(logging.DEBUG)

    configMap = readConfig(logger)
    service = login(configMap, logger)
    try:
        foundFlowCell, containedSamples = getFlowCell(configMap['illuminaFlowCellTypeName'],
                                                      myoptions.flowcell, service, logger)
        # From here on use the code as stored in openBIS.
        flowCellName = foundFlowCell.getCode()
        flowCellDict = convertSampleToDict(foundFlowCell, configMap)
        laneParentDict = getContainedSampleProperties(containedSamples, service)
        logger.info('Found ' + str(len(laneParentDict)) + ' samples on the flow cell ' + flowCellName)
        createHiseqSampleSheet(laneParentDict, flowCellDict, configMap, service, logger, myoptions)
    finally:
        # Do not leave the session open when something above went wrong.
        logout(service, logger)
    print ('DONE')
# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment