%APN3_PROCEEDINGS_FORM%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% TEMPLATE.TEX -- APN3 (2003) ASP Conference Proceedings template.
%
% Derived from ADASS VIII (98) ASP Conference Proceedings template
% Updated by N. Manset for ADASS IX (99), F. Primini for ADASS 2000,
% D.Bohlender for ADASS 2001, and H. Payne for ADASS XII and LaTeX2e.
%
% Use this template to create your proceedings paper in LaTeX format
% by following the instructions given below.  Much of the input will
% be enclosed by braces (i.e., { }).  The percent sign, "%", denotes
% the start of a comment; text after it will be ignored by LaTeX.  
% You might also notice in some of the examples below the use of "\ "
% after a period; this prevents LaTeX from interpreting the period as
% the end of a sentence and putting extra space after it.  
% 
% You should check your paper by processing it with LaTeX.  For
% details about how to run LaTeX as well as how to print out the User
% Guide, consult the README file.  You should also consult the sample
% LaTeX papers, sample1.tex and sample2.tex, for examples of including
% figures, html links, special symbols, and other advanced features.
%
% If you do not have access to the LaTeX software or a laser printer
% at your site, you can still prepare your paper following the
% instructions in the User Guide.  In such cases, the editors will
% process the file and make any necessary editorial adjustments.
% 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% 
\documentclass[11pt,twoside]{article}  % Leave intact
\usepackage{adassconf}

% If you have the old LaTeX 2.09, and not the current LaTeX2e, comment
% out the \documentclass and \usepackage lines above and uncomment
% the following:

%\documentstyle[11pt,twoside,adassconf]{article}

\begin{document}   % Leave intact

%-----------------------------------------------------------------------
%			    Paper ID Code
%-----------------------------------------------------------------------
% Enter the proper paper identification code.  The ID code for your
% paper is the session number associated with your presentation as
% published in the official conference proceedings.  You can           
% find this number locating your abstract in the printed proceedings
% that you received at the meeting or on-line at the conference web
% site; the ID code is the letter/number sequence proceeding the title 
% of your presentation. 
%
% This will not appear in your paper; however, it allows different
% papers in the proceedings to cross-reference each other.  Note that
% you should only have one \paperID, and it should not include a
% trailing period.
%
% EXAMPLE: \paperID{O4-1}
% EXAMPLE: \paperID{P7-7}
%

\paperID{P2-6}
%%%% ID=P2-6

%-----------------------------------------------------------------------
%		            Paper Title 
%-----------------------------------------------------------------------
% Enter the title of the paper.
%
% EXAMPLE: \title{A Breakthrough in Astronomical Software Development}
% 
% If your title is so long as to fill the page header when you print it,
% then please supply a short form as a \titlemark.
%
% EXAMPLE: 
%  \title{Rapid Development for Distributed Computing, with Implications
%         for the Virtual Observatory}
%  \titlemark{Rapid Development for Distributed Computing}
%

\title{The SuperCOSMOS Science Archive}
%\titlemark{ }

%-----------------------------------------------------------------------
%		          Authors of Paper
%-----------------------------------------------------------------------
% Enter the authors followed by their affiliations.  The \author and
% \affil commands may appear multiple times as necessary (see example
% below).  List each author by giving the first name or initials first
% followed by the last name.  Authors with the same affiliations
% should grouped together. 
%
% EXAMPLE: \author{Raymond Plante, Doug Roberts, 
%                  R.\ M.\ Crutcher\altaffilmark{1}}
%          \affil{National Center for Supercomputing Applications, 
%                 University of Illinois Urbana-Champaign, Urbana, IL
%                 61801}
%          \author{Tom Troland}
%          \affil{University of Kentucky}
%
%          \altaffiltext{1}{Astronomy Department, UIUC}
%
% In this example, the first three authors, "Plante", "Roberts", and
% "Crutcher" are affiliated with "NCSA".  "Crutcher" has an alternate 
% affiliation with the "Astronomy Department".  The fourth author,
% "Troland", is affiliated with "University of Kentucky"

\author{Nigel Hambly, Mike Read, Bob Mann, Eckhard Sutorius, Ian Bond,
Harvey MacGillivray, Peredur Williams, Andrew Lawrence}
\affil{Wide Field Astronomy Unit, Institute for Astronomy, University of
Edinburgh, Blackford Hill, Edinburgh EH9~3HJ, UK}

%-----------------------------------------------------------------------
%			 Contact Information
%-----------------------------------------------------------------------
% This information will not appear in the paper but will be used by
% the editors in case you need to be contacted concerning your
% submission.  Enter your name as the contact along with your email
% address.
% 
% EXAMPLE:  \contact{Dennis Crabtree}
%           \email{crabtree@cfht.hawaii.edu}
%

\contact{Nigel Hambly}
\email{nch@roe.ac.uk}

%-----------------------------------------------------------------------
%		      Author Index Specification
%-----------------------------------------------------------------------
% Specify how each author name should appear in the author index.  The 
% \paindex{ } should be used to indicate the primary author, and the
% \aindex for all other co-authors.  You MUST use the following
% syntax: 
%
% SYNTAX:  \aindex{Lastname, F. M.}
% 
% where F is the first initial and M is the second initial (if
% used).  This guarantees that authors that appear in multiple papers
% will appear only once in the author index.  
%
% EXAMPLE: \paindex{Crabtree, D.}
%          \aindex{Manset, N.}        
%          \aindex{Veillet, C.}        
%
% NOTE: this information is also used to build the author list that
% appears in the table of contents.  Authors will be listed in the order
% of the \paindex and \aindex commmands.
%

\paindex{Hambly, N.}
\aindex{Read, M.}    
\aindex{Mann, R.}    
\aindex{Sutorius, E.}    
\aindex{Bond, I.}    
\aindex{Macgillivray@MacGillivray, H.}    
\aindex{Williams, P.}    
\aindex{Lawrence, A.}    

%-----------------------------------------------------------------------
%		      Author list for page header	
%-----------------------------------------------------------------------
% Please supply a list of author last names for the page header. in
% one of these formats:
%
% EXAMPLES:
% \authormark{Lastname}
% \authormark{Lastname1 \& Lastname2}
% \authormark{Lastname1, Lastname2, ... \& LastnameN}
% \authormark{Lastname et al.}
%
% Use the "et al." form in the case of seven or more authors, or if
% the preferred form is too long to fit in the header.

\authormark{Hambly et al.}

%-----------------------------------------------------------------------
%			Subject Index keywords
%-----------------------------------------------------------------------
% Enter a comma separated list of up to 6 keywords describing your
% paper.  These will NOT be printed as part of your paper; however,
% they will be used to generate the subject index for the proceedings.
% There is no standard list; however, you can consult the indices
% for past proceedings (http://adass.org/adass/proceedings/).
%
% EXAMPLE:  \keywords{visualization, astronomy: radio, parallel
%                     computing, AIPS++, Galactic Center}
%
% In this example, the author noticed that "radio astronomy" appeared
% in the ADASS VII Index as "astronomy" being the major keyword and
% "radio" as the minor keyword.  The colon is used to introduce another
% level into the index.

\keywords{astronomy: optical sky surveys, DBMS: Microsoft SQL Server,
          hardware: terabyte-scale storage, hardware: high IO bandwidth}

%-----------------------------------------------------------------------
%			       Abstract
%-----------------------------------------------------------------------
% Type abstract in the space below.  Consult the User Guide and Latex
% Information file for a list of supported macros (e.g. for typesetting 
% special symbols). Do not leave a blank line between \begin{abstract} 
% and the start of your text.

\begin{abstract}          % Leave intact
The SuperCOSMOS Sky Survey (
\htmladdnormallinkfoot{SSS}{http://www-wfau.roe.ac.uk/sss}; Hambly 
et al., 2001) consists 
of digitised scans of Schmidt photographic survey
material in a multi--colour 
(BRI), multi--epoch, uniformly calibrated
product.  It covers the whole southern hemisphere, with an extension into
the north currently underway. Public online access to 
the 2~Tbytes of SSS pixel data and
object catalogues has been available for some time; data are being
downloaded at a rate of several gigabytes per week, and many new science
results are emerging from community use of the data.
In this poster we describe the terabyte--scale 
\htmladdnormallinkfoot{SuperCOSMOS Science Archive}{http://thoth.roe.ac.uk/ssa}
(SSA), which is a recasting of 
the SSS object catalogue system from flat files into an RDBMS,
with an enhanced user interface. We describe some aspects of the
hardware and schema design of the SSA, which aims to produce a high
performance, VO--compatible 
database, suitable for data mining by `power
users', while maintaining the ease of use praised in the old SSS system. 
Initially, the SSA will allow access through web forms and a flexible 
SQL interface. It acts as the prototype for the next generation survey 
archives to be hosted by the University of Edinburgh's Wide Field 
Astronomy Unit, such as the WFCAM Science Archive of infrared sky survey 
data, as well as being a scalability testbed for use by AstroGrid, the 
UK's Virtual Observatory project. As a result of these roles, it will 
display subsequently an expanding functionality, as web --
and later, Grid -- services are deployed on it.
\end{abstract}

%-----------------------------------------------------------------------
%			      Main Body
%-----------------------------------------------------------------------
% Place the text for the main body of the paper here.  You should use
% the \section command to label the various sections; use of
% \subsection is optional.  Significant words in section titles should
% be capitalized.  Sections and subsections will be numbered
% automatically. 
%
% EXAMPLE:  \section{Introduction}
%           ...
%           \subsection{Our View of the World}
%           ...
%           \section{A New Approach}
%
% It is recommended that you look at the sample papers, sample1.tex
% and sample2.tex, for examples for formatting references, footnotes,
% figures, equations, html links, lists, and other special features.  

\section{Relational Model for the SSA Data}
                 
The photographic material used in the SSA
comprise the SERC J/EJ, ER/AAO--R and I original surveys
along with first epoch R data from the ESO--R and 
POSS--I E copies in the southern hemisphere; ultimately, the
northern hemisphere will be included via POSS--II J, R and~I
copies with POSS--I E copies again providing early epoch R
data. Hence, 8 single colour surveys on 3 different field systems
(ESO/SRC, POSS--I and POSS--II) make up the source
material. The rest of the schema follows from the several individual
colour/epoch plates in each field, each plate giving rise to a set of
detections, some of which are flagged as spurious (eg.\ Storkey et 
al.\ 2003), and all of which are merged into multi--colour,
multi--epoch sources (merging allows computation of 
colour--corrected magnitudes and proper motions). 
In this way, all information
concerning the source photographic material, its measurement on 
SuperCOSMOS and subsequent processing by in--house software
is available as metadata in the schema, in addition to the 
parameterised detection and source attributes. The raw data volume is 
dominated by two tables:
the Detection table contains 3.7 billion rows; with 228 bytes per
row, it is 0.86 Tbyte in size;
the Source table contains just over 1 billion rows; with 246 bytes
per row, it is 0.26 Tbyte in size.
So the raw data volume is a little over 1.1 Tbyte before adding in 
DBMS overheads such as indexing.

\section{Hardware Design}

The hardware design of the catalogue server for the terabyte scale SSA
is based around maximising IO bandwidth in order to service user queries
as fast as possible. The SSA catalogue server uses a Tyan Thunder PC
motherboard based on dual Xeon processors (2.8 GHz) and an 
associated Intel chipset. The bus architecture of the motherboard incorporates
three independent PCI--X (64 bit, 133 MHz) busses for the
highest potential aggregate IO bandwidth. Other aspects of the science
archive hardware design that are relevant to maintenance of Tbyte data
volumes include 1 Gbit/s LAN connectivity and an Ultrium--II
LTO tape backup facility (encpasulated in on Overland 30--slot
library system) which is capable of backing up Tbytes in several hours.

Design of the disk subsystem is important for high aggregate IO 
applications. We have experimented with a number of disk array 
configurations along with tests using different interfaces (IDE and
SCSI) and different controllers (hardware RAID, 
fibre--to--IDE, Ultra--SCSI etc). Our
findings were in agreement with those of others (eg. Gray et al.\ 2002)
in that hardware RAID fault tolerance comes at a significant performance
cost -- a simple design based around software striping over
multiple Ultra--SCSI channels yields the best IO performance
at reasonable cost. Hence, our SSA catalogue server employs four
dual channel Adaptec Ultra320 SCSI controllers on each of the three
external and one on--board PCI--X motherboard
interface slots. We have attached four Seagate 15 krpm, 150 Gbyte 
Ultra320 SCSI disks
to each of the eight controller channels -- measurements
using MemSpeed show that individual disks sustain nearly 60 Mbyte/s
read/write speed on a single channel, but also that software stripe
IO performance saturates at around 200 Mbyte/s (ie.\ 4 disks).
The 32 disks yield 4.8 Tbyte of storage that is arranged in four 
logical volumes of 1.2 Tbyte each with 8 disks (one on each independent
SCSI channel) used in each stripe set. Because there is no fault
tolerance in this design (ie.\ this is a RAID0 configuration), we are
mirroring database files on a separate system in addition to
backing up on LTO--2.

\section{OS, RDBMS and Software}

Implementation of the SSA leans heavily on developments in science archiving
for the Sloan Digital Sky Survey at Johns Hopkins (Gray et al.\ 2002 and
references therein). The SSA is deployed within Microsoft SQL Server running
on the Windows 2003 Server operating system. Figure~\ref{P2-6:dbpic} illustrates
the full SSA schema as implemented within SQL Server; primary keys
and foreign key relationships between the tables are shown.
Some software from the SDSS SkyServer design have been used, most notably
the 2d spatial indexing scheme known as 
\htmladdnormallinkfoot{Hierarchical Triangular Mesh}{http://www.sdss.jhu.edu/htm}.
We have used Windows software striping as described previously to set up
high IO bandwidth disk arrays, and have additionally used file groups to
distribute all large database files across the logical devices.

\begin{figure}
\epsscale{0.8}
\plotone{P2-6_f1.eps}
\caption{Database picture of the SSA tables generated in MS SQL Server.}
\label{P2-6:dbpic}
\end{figure}


\section{Data Loading}

Small data files are easily imported into RDBMSs like SQL Server using
comma--separated ASCII files. However, for terabyte volumes, 
numerical data files formatted in this way become impractically large.
We have designed a binary loading scheme for the SSA that uses native
format files (ie.\ 4 byte floats occupy 4 bytes in the ingest files;
short integers occupy 2 bytes etc). This has enabled efficient
transformation, transfer and ingest of the data stored in the 
pre--existing SSS flat files. The native binary files are
typically less than 1/3rd the size of their ASCII equivalents making
staging and network transfer significantly easier. Furthermore,
loading of the native files is much faster. We employed a minimally
logged, heaped load using \verb+BULK INSERT+ within SQL Server,
attaching primary key constraints after the loading process has finished.

\section{User Interfaces}

Currently, user access to the SSA is via browsable web forms. These action Java servlets which parse the query, connect to the MS~SQL~Server and
retrieve and format the results set. Features of the interface include
choice of traditional radial search, form--filled and
free--form SQL queries and a catalogue cross--match
facility;
choice of output formats (HTML table summary, comma--separated
ASCII, FITS binary and VOTable) with complete control over returned attribute
sets; and links to the existing SSS pixel image facilities.
Future enhancements to the SSA will include deployment of web services as
alternative client access points, and ultimately Grid services as we enter
the next phase of science archiving at WFAU: implementation of the WFCAM
Science Archive.

\section{The WFCAM Science Archive}

The next large scale imaging datasets that will be curated and archived at
WFAU will be the survey programmes undertaken with WFCAM, a large format
infrared imager for the UK Infrared Telescope UKIRT.
Because the 
\htmladdnormallinkfoot{UKIDSS survey programme}{http://www.ukidss.org/} is
more complex than the legacy Schmidt surveys, the relational model for
the associated science archive (hereafter, the 
\htmladdnormallinkfoot{WSA}{http://www.roe.ac.uk/~nch/wfcam}) is far more
complicated. Furthermore, curation of the programmes (eg.\ source
association, final photometric calibration, computation of derived
quantities like proper motion) will be database--driven and
will take place within the RDBMS (unlike analogous procedures for the
SSA). 
Other features of the WSA relational model include
provision for database driven products, eg.\ difference images,
image stacks and image mosaics; and
provision for pixel re--analysis given master lists of
positions and apertures (ie.\ SDSS--like consistent
image analysis across all colours available in a given field).
WSA development also includes investigation of scale--out issues
for 10s to 100s of Tbytes of data, eg.\
choice of horizontal (clustered) versus vertical 
(multiprocessor) hardware scaling; spatial indexing, eg.\ HTM versus 
others, eg.\ HEALPix;
and investigation of other enterprise class RDBMSs, ie. DB2 and Oracle.

\acknowledgements
We would like to thank Jim Gray and Alex Szalay for much advice and help
concerning design and implementation of the SSA. Funding for WFAU is
provided by the UK PPARC. We acknowledge software and support
from the Microsoft Development Network Academic Alliance.

%-----------------------------------------------------------------------
%			      References
%-----------------------------------------------------------------------
% List your references below within the reference environment
% (i.e. between the \begin{references} and \end{references} tags).
% Each new reference should begin with a \reference command which sets
% up the proper indentation.  Observe the following order when listing
% bibliographical information for each reference:  author name(s),
% publication year, journal name, volume, and page number for
% articles.  Note that many journal names are available as macros; see
% the User Guide listing "macro-ized" journals.   
%
% EXAMPLE:  \reference Hagiwara, K., \& Zeppenfeld, D.\  1986, 
%                Nucl.Phys., 274, 1
%           \reference H\'enon, M.\  1961, Ann.d'Ap., 24, 369
%           \reference King, I.\ R.\  1966, \aj, 71, 276
%           \reference King, I.\ R.\  1975, in Dynamics of Stellar 
%                Systems, ed.\ A.\ Hayli (Dordrecht: Reidel), 99
%           \reference Tody, D.\  1998, \adassvii, 146
%           \reference Zacharias, N.\ \& Zacharias, M.\ 2003,
%                \adassxii, \paperref{P7.6}
% 
% Note the following tricks used in the example above:
%
%   o  \& is used to format an ampersand symbol (&).
%   o  \'e puts an accent agu over the letter e.  See the User Guide
%      and the sample files for details on formatting special
%      characters.  
%   o  "\ " after a period prevents LaTeX from interpreting the period 
%      as an end of a sentence.
%   o  \aj is a macro that expands to "Astron. J."  See the User Guide
%      for a full list of journal macros
%   o  \adassvii is a macro that expands to the full title, editor,
%      and publishing information for the ADASS VII conference
%      proceedings.  Such macros are defined for ADASS conferences I
%      through XI.
%   o  When referencing a paper in the current volume, use the
%      \adassxii and \paperref macros.  The argument to \paperref is
%      the paper ID code for the paper you are referencing.  See the 
%      note in the "Paper ID Code" section above for details on how to 
%      determine the paper ID code for the paper you reference.  
%
\begin{references}
\reference
Gray, J. et al.\ 2002, 
{\it Data Mining the SDSS SkyServer Database}, 
Microsoft Technical Report MSR--TR--2002--01 
\reference
Hambly, N.\ et al.\ 2001, \mnras, 326, 1279
\reference
Storkey, A.\ et al.\ 2003, \mnras, in press (astro--ph/0309565)
\end{references}

% Do not place any material after the references section

\end{document}  % Leave intact
