Skip to content

Commit 09d075a

Browse files
authored
Improve the speed of some post-cluster calculations (#932)
* Start adding scripts for benchmarking clustering * Determine number of trajin needed * Finish up cluster script * Add script to extract timings * Move to correct dir * Reduce # frames for initial testing * Dramatically improve cluster Summary execution time by creating an array with sieve status of each frame * Have ClusterDist take boolean array of frame is present * Remove some debug info * Benchmark silhouette * Use frameIsPresent arrays in silhouette calc * Clean up and better-document the Sieve class * Rewrite the frame is present array as being present inside Sieve, this way it only needs to be generated once. It may be better to just have it generated all the time. * Benchmark the bestreps calc * Add a debug number of frames * Add some debug print, commented out * Use frameIsPresent array instead of HasFrame. Make it a warning if no rep found when doing best rep search for cluster by parts and all frames just happen to be sieved out * Pass in the FrameIsPresent array for bestreps calc. Add part to Node for bookkeeping * Get rid of FIXME, fixed * Start new Silhouette class * Add Silhouette class * Silhouette calc is now in a separate class * Silhouette write moved to Silhouette * Use new Silhouette calc * Add missing depend for <list> * Write out actual frame numbers for cluster silhouette for each frame. Not sure why I wasn't doing this already... * Add option to choose either the sorted silhouette index or frame number for cluster silhouette frame output. * Sorted frame indices have been fixed for silhouette * Minor bump to 6.2.0. Fixes for post-clustering speed (summary etc), fix best reps for split, fix frame index in silhouette calc. * Add Silhouette.cpp to CMakeLists.txt
1 parent 272e0e2 commit 09d075a

31 files changed

+657
-295
lines changed
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/bin/bash
2+
3+
CPPTRAJ=`which cpptraj`
4+
#export OMP_NUM_THREADS=12
5+
#CPPTRAJ=`which cpptraj.OMP`
6+
7+
TOP=~/Cpptraj/Cpptraj-ExtendedTests/ChainA_1-268_NAD_TCL-gaff.tip3p.parm7
8+
TRJ=~/Cpptraj/Cpptraj-ExtendedTests/run9.nc # 2000 frames
9+
COUNT=2000
10+
11+
# Each trajectory is 100000 frames
12+
if [ ! -f '../framelist.sh' ] ; then
13+
echo "no ../framelist.sh"
14+
exit 1
15+
fi
16+
source ../framelist.sh
17+
18+
for TOTAL in $FRAMELIST ; do
19+
if [ -f 'CpptrajPairDist' ] ; then
20+
rm CpptrajPairDist
21+
fi
22+
cat > cpptraj.in <<EOF
23+
set PREFIX = test1
24+
parm $TOP
25+
EOF
26+
# Set trajin
27+
echo " ===== $TOTAL ===== "
28+
if [ $TOTAL -le $COUNT ] ; then
29+
echo "trajin $TRJ 1 $TOTAL 1" >> cpptraj.in
30+
else
31+
((NTRAJ = $TOTAL / $COUNT))
32+
((REM = $TOTAL % $COUNT))
33+
if [ $REM -gt 0 ] ; then
34+
((NTRAJ++))
35+
fi
36+
echo "$NTRAJ $REM"
37+
for (( i=1; i < $NTRAJ; i++ )) ; do
38+
echo "trajin $TRJ 1 $COUNT 1" >> cpptraj.in
39+
done
40+
if [ $REM -gt 0 ] ; then
41+
echo "trajin $TRJ 1 $REM 1" >> cpptraj.in
42+
else
43+
echo "trajin $TRJ 1 $COUNT 1" >> cpptraj.in
44+
fi
45+
fi
46+
47+
# Get number of frames we want from any given trajectory
48+
((NFRAMES = $TOTAL / 4))
49+
# Calculate split frames
50+
((SPLIT1 = $NFRAMES * 1))
51+
((SPLIT2 = $NFRAMES * 2))
52+
((SPLIT3 = $NFRAMES * 3))
53+
echo "Total $TOTAL, Each section $NFRAMES splits $SPLIT1 $SPLIT2 $SPLIT3"
54+
55+
cat >> cpptraj.in <<EOF
56+
57+
cluster \
58+
kmeans clusters 5 randompoint kseed 42 \
59+
rms :10-260@CA sieve 20 random sieveseed 23 savepairdist \
60+
summaryhalf \$PREFIX.split.dat splitframe $SPLIT1,$SPLIT2,$SPLIT3 \
61+
info \$PREFIX.info.dat \
62+
summary \$PREFIX.summary.dat \
63+
sil \$PREFIX.Sil \
64+
cpopvtime \$PREFIX.cpop.agr normframe \
65+
bestrep cumulative_nosieve
66+
EOF
67+
$CPPTRAJ -i cpptraj.in -o cpptraj.$TOTAL.out
68+
done
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# source this
2+
3+
FRAMELIST0="1000 2000 3000 4000 5000 6000 7000 8000 9000 10000"
4+
#FRAMELIST1="20000 30000 40000 50000"
5+
FRAMELIST="$FRAMELIST0 $FRAMELIST1"
6+
#FRAMELIST='3000'

devtools/benchmark/cluster/plot.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/bin/bash
2+
3+
4+
rm time.pw.dat time.cluster.dat time.restore.dat time.total.dat time.summary.dat \
5+
time.calcoutput.dat time.bestrep.dat
6+
source ../framelist.sh
7+
for TOTAL in $FRAMELIST ; do
8+
OUTFILE=cpptraj.$TOTAL.out
9+
grep TIME $OUTFILE | awk -v nframes=$TOTAL '{
10+
if ($2 == "Pairwise") printf("%i %f\n", nframes, $5) >> "time.pw.dat";
11+
if ($2 == "Clustering") printf("%i %f\n", nframes, $4) >> "time.cluster.dat";
12+
if ($4 == "restore") printf("%i %f\n", nframes, $6) >> "time.restore.dat";
13+
if ($2 == "Analyses") printf("%i %f\n", nframes, $4) >> "time.total.dat";
14+
if ($2 == "Summary") printf("%i %f\n", nframes, $5) >> "time.summary.dat";
15+
if ($2 == "Output") printf("%i %f\n", nframes, $5) >> "time.calcoutput.dat";
16+
if ($2 == "Find") printf("%i %f\n", nframes, $6) >> "time.bestrep.dat";
17+
}'
18+
done

doc/cpptraj.lyx

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40389,11 +40389,11 @@ cluster
4038940389
\end_layout
4039040390

4039140391
\begin_layout LyX-Code
40392-
[clustersvtime <file> [cvtwindow <#>]] [sil <prefix>] [metricstats <file>]
40392+
[clustersvtime <file> [cvtwindow <#>]] [sil <prefix> [silidx {idx|frm}]]
4039340393
\end_layout
4039440394

4039540395
\begin_layout LyX-Code
40396-
[cpopvtime <file> [{normpop|normframe}]] [lifetime]
40396+
[metricstats <file>] [cpopvtime <file> [{normpop|normframe}]] [lifetime]
4039740397
\end_layout
4039840398

4039940399
\begin_layout LyX-Code
@@ -41378,6 +41378,25 @@ uster.dat' and cluster silhouette value for each individual frame to '<prefix>.f
4137841378
me.dat'.
4137941379
\end_layout
4138041380

41381+
\begin_deeper
41382+
\begin_layout Description
41383+
[silidx
41384+
\begin_inset space ~
41385+
\end_inset
41386+
41387+
{idx|frm}] Choose what indices to write to the cluster silhouette frame
41388+
file:
41389+
\series bold
41390+
idx
41391+
\series default
41392+
(the default) specifies the sorted index (starting from 0),
41393+
\series bold
41394+
frm
41395+
\series default
41396+
specifies the actual frame number.
41397+
\end_layout
41398+
41399+
\end_deeper
4138141400
\begin_layout Description
4138241401
[metricstats
4138341402
\begin_inset space ~
@@ -41791,7 +41810,13 @@ The cluster silhouette is a measure of how well each point fits within a
4179141810
Values of -1 indicate the point is dissimilar and may fit better in a neighbori
4179241811
ng cluster.
4179341812
Values of 0 indicate the point is on a border between two clusters.
41794-
41813+
The file containing the cluster silhouette value for each frame can be
41814+
viewed directly with the XMGRACE plotting program using the default (sorted
41815+
by silhouette value) index ('
41816+
\series bold
41817+
silidx idx
41818+
\series default
41819+
').
4179541820
\end_layout
4179641821

4179741822
\begin_layout Subsubsection*

src/Cluster/Algorithm.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
double Cpptraj::Cluster::Algorithm::ClusterDistance(Node const& C1, Node const& C2,
88
MetricArray& pmatrix,
99
bool includeSieved,
10-
Cframes const& sievedOut) const
10+
std::vector<bool> const& frameIsPresent)
11+
const
1112
{
1213
if (C1.Cent().empty() || C2.Cent().empty()) {
1314
mprinterr("Internal Error: One or both centroids are null in ClusterDistance().\n");

src/Cluster/Algorithm.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#ifndef INC_CLUSTER_ALGORITHM_H
22
#define INC_CLUSTER_ALGORITHM_H
3+
#include <vector>
34
class ArgList;
45
class CpptrajFile;
56
namespace Cpptraj {
@@ -27,7 +28,7 @@ class Algorithm {
2728
virtual void Timing(double) const = 0;
2829
/// /return Algorithm-specific between-cluster distance. Default to centroid distance.
2930
virtual double ClusterDistance(Node const&, Node const&, MetricArray&,
30-
bool, Cframes const&) const;
31+
bool, std::vector<bool> const&) const;
3132
/// /return Epsilon for density-based algorithms; intended for use with sieve restore.
3233
virtual double Epsilon() const { return 0.0; }
3334
// -------------------------------------------

src/Cluster/Algorithm_DPeaks.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include <cmath> // fabs
22
#include <algorithm> // sort
3+
#include <list>
34
#include "Algorithm_DPeaks.h"
45
#include "List.h"
56
#include "MetricArray.h"

src/Cluster/Algorithm_HierAgglo.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include <limits> // double max
2+
#include <list>
23
#include "Algorithm_HierAgglo.h"
34
#include "MetricArray.h"
45
#include "Node.h"
@@ -346,7 +347,7 @@ void Cpptraj::Cluster::Algorithm_HierAgglo::calcAvgDist(List::cluster_it& C1_it,
346347
double Cpptraj::Cluster::Algorithm_HierAgglo::ClusterDistance(Node const& C1, Node const& C2,
347348
MetricArray& pmatrix,
348349
bool includeSieved,
349-
Cframes const& sievedOut)
350+
std::vector<bool> const& frameIsPresent)
350351
const
351352
{
352353
double dval = -1.0;
@@ -376,10 +377,10 @@ const
376377
// No sieved frames included.
377378
for (Node::frame_iterator f1 = C1.beginframe(); f1 != C1.endframe(); ++f1)
378379
{
379-
if (!sievedOut.HasFrame(*f1)) {
380+
if (frameIsPresent[*f1]) {
380381
for (Node::frame_iterator f2 = C2.beginframe(); f2 != C2.endframe(); ++f2)
381382
{
382-
if (!sievedOut.HasFrame(*f2)) {
383+
if (frameIsPresent[*f2]) {
383384
double Dist = pmatrix.Frame_Distance(*f1, *f2);
384385
//mprintf("\t\t\tFrame %i to frame %i = %f\n",*c1frames,*c2frames,Dist);
385386
switch (linkage_) {

src/Cluster/Algorithm_HierAgglo.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ class Algorithm_HierAgglo : public Algorithm {
1919
int DoClustering(List&, Cframes const&, MetricArray&);
2020
void Timing(double) const;
2121
double ClusterDistance(Node const&, Node const&, MetricArray&,
22-
bool, Cframes const&) const;
22+
bool, std::vector<bool> const&) const;
2323
private:
2424
void buildInitialClusters(List&, Cframes const&, MetricArray&);
2525
//void InitializeClusterDistances();

src/Cluster/Algorithm_Kmeans.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#include <list>
12
#include "Algorithm_Kmeans.h"
23
#include "Cframes.h"
34
#include "List.h"

0 commit comments

Comments
 (0)