%APN3_PROCEEDINGS_FORM%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% TEMPLATE.TEX -- APN3 (2003) ASP Conference Proceedings template.
%
% Derived from ADASS VIII (98) ASP Conference Proceedings template
% Updated by N. Manset for ADASS IX (99), F. Primini for ADASS 2000,
% D.Bohlender for ADASS 2001, and H. Payne for ADASS XII and LaTeX2e.
%
% Use this template to create your proceedings paper in LaTeX format
% by following the instructions given below.  Much of the input will
% be enclosed by braces (i.e., { }).  The percent sign, "%", denotes
% the start of a comment; text after it will be ignored by LaTeX.
% You might also notice in some of the examples below the use of "\ "
% after a period; this prevents LaTeX from interpreting the period as
% the end of a sentence and putting extra space after it.
%
% You should check your paper by processing it with LaTeX.  For
% details about how to run LaTeX as well as how to print out the User
% Guide, consult the README file.  You should also consult the sample
% LaTeX papers, sample1.tex and sample2.tex, for examples of including
% figures, html links, special symbols, and other advanced features.
%
% If you do not have access to the LaTeX software or a laser printer
% at your site, you can still prepare your paper following the
% instructions in the User Guide.  In such cases, the editors will
% process the file and make any necessary editorial adjustments.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
\documentclass[11pt,twoside]{article}
\usepackage{adassconf}
\begin{document}

%-----------------------------------------------------------------------
%               Paper ID Code
%-----------------------------------------------------------------------

\paperID{O7-2}
%%%% ID=O7-2

%-----------------------------------------------------------------------
%                   Paper Title
%-----------------------------------------------------------------------

\title{Compare: A Scaleable and Portable Catalog Cross-Comparison Engine for the NVO}
\titlemark{Compare: A Catalog Cross-Comparison Engine}

%-----------------------------------------------------------------------
%                 Authors of Paper
%-----------------------------------------------------------------------

\author{Serge Monkewitz, John Good}
\affil{Infrared Processing and Analysis Center,
    California Institute of Technology, Pasadena, CA}

%-----------------------------------------------------------------------
%            Contact Information
%-----------------------------------------------------------------------

\contact{Serge Monkewitz} \email{smm@ipac.caltech.edu}

%-----------------------------------------------------------------------
%             Author Index Specification
%-----------------------------------------------------------------------

\paindex{Monkewitz, S.}
\aindex{Good, J.}

%-----------------------------------------------------------------------
%             Author list for page header
%-----------------------------------------------------------------------

\authormark{Monkewitz \& Good}

%-----------------------------------------------------------------------
%           Subject Index keywords
%-----------------------------------------------------------------------
% Enter a comma separated list of up to 6 keywords describing your
% paper.  These will NOT be printed as part of your paper; however,
% they will be used to generate the subject index for the proceedings.
% There is no standard list; however, you can consult the indices
% for past proceedings (http://adass.org/adass/proceedings/).
%
% EXAMPLE:  \keywords{visualization, astronomy: radio, parallel
%                     computing, AIPS++, Galactic Center}
%
% In this example, the author noticed that "radio astronomy" appeared
% in the ADASS VII Index as "astronomy" being the major keyword and
% "radio" as the minor keyword.  The colon is used to introduce another
% level into the index.
\keywords{NVO, spatial join, cross identification, computing: parallel, computing: distributed}

%-----------------------------------------------------------------------
%                  Abstract
%-----------------------------------------------------------------------
% Type abstract in the space below.  Consult the User Guide and Latex
% Information file for a list of supported macros (e.g. for typesetting
% special symbols). Do not leave a blank line between \begin{abstract}
% and the start of your text.

\begin{abstract}
We describe the architecture of a general cross-comparison engine
capable of spatially matching sources in one astronomical source
catalog with those in another. The software is highly modular and
is written in portable C++. By performing many cross-comparisons
of small sky regions in parallel, the software will scale to very
large input catalog sizes. Support is provided for common catalog
formats and data sources (e.g., local disk, database servers), and
the addition of support for custom data formats and sources is
simplified by the modular architecture employed. Hooks for
customized source pre-processing and match-list post-processing
are also available. Taken together, these attributes will make
Compare a powerful package for cross-comparing astronomical
catalogs on all scales and for cross-identifying sources between
catalogs, allowing it to serve the needs of both large projects
and individual astronomers. In particular, the package will be
installed at the San Diego Supercomputer Center, where it will perform
cross-comparison between large-scale catalogs (such as MACHO and
2MASS) housed there. When complete, it will be a cornerstone
compute service for the NVO. We have applied an early version of
the package to the cross-comparison of the SDSS Early Data Release
and the 2MASS 2nd Incremental Data Release catalogs, a computation
central to the NVO Brown Dwarf demonstration project. Despite
being performed sequentially, the comparison of 9.8 million SDSS
sources to 0.5 million 2MASS sources completed in approximately
100 seconds when run on a 4-CPU Sun V480 with 16\,GB of memory.
\end{abstract}

%-----------------------------------------------------------------------
%                 Main Body
%-----------------------------------------------------------------------
% Place the text for the main body of the paper here.  You should use
% the \section command to label the various sections; use of
% \subsection is optional.  Significant words in section titles should
% be capitalized.  Sections and subsections will be numbered
% automatically.
%
% EXAMPLE:  \section{Introduction}
%           ...
%           \subsection{Our View of the World}
%           ...
%           \section{A New Approach}
%
% It is recommended that you look at the sample papers, sample1.tex
% and sample2.tex, for examples for formatting references, footnotes,
% figures, equations, html links, lists, and other special features.

\section{Introduction}
The \textbf{Compare} software package is a framework for
performing spatial joins between two lists of astronomical source
positions. For each source $s$ in a primary catalog $P$, it finds
all sources in a secondary catalog $S$ within a given angular
distance $d_m$ of $s$. This will be referred to as the \emph{match
list} for $s$, and corresponds to a list of candidates in $S$
which might be observations of the same astronomical object as
$s$. In addition, the software finds all the sources $s_n\in{P}$
such that $\forall t\in{S}$, $\mathrm{dist}(s_n,t)>d_m$ (the \emph{primary
no-match list}), as well as all sources $t_n\in{S}$ such that
$\forall s\in{P}$, $\mathrm{dist}(s,t_n)>d_m$ (the \emph{secondary no-match
list}). The process of cross-comparing two catalogs is a necessary
first step when cross-identifying observations from different
missions, but also has many other applications. It can, for example,
be used to pick a ``best'' observation from a cluster of
observations, to merge or group observations in some way, or to
help identify artifacts in a catalog.

\section{Design Goals}
The primary goals targeted by the \textbf{Compare} software
package are high performance regardless of input catalog size,
generality (the ability to process catalogs of arbitrary size and
format on a variety of run-time platforms), and extensibility. The
existing cross-comparison codes at
\htmladdnormallinkfoot{IRSA}{Infrared Science Archive:
http://irsa.ipac.caltech.edu} suffer from various limitations
which render them incapable of meeting these goals. They were
written for a fixed hardware platform and do not make any attempt
to be portable; they are tied to specific input and output data
formats; finally, they operate on fixed column sets, limiting
their use to specific catalogs. This last limitation means that
any processing requiring column values not initially retrieved
requires an additional pass through the potentially very large
input catalogs. \textbf{Compare} is designed to overcome all of
these limitations.

\section{Design}
The software, written in portable C++, is partitioned into five
major components:
\begin{description}
    \item[Data Access:] The component responsible for reading
    source positions (as well as any other requested fields).
    Implementations which read data from ASCII/binary table files
    and from Informix database tables are provided.
    \item[Source List Processing:] A component which can filter sources,
    as well as modify or generate the data fields associated with each source.
    \item[Cross-Comparison:] This component computes match lists as well as
    primary and secondary no-match lists.
    \item[Match List Processing:] This component allows for customizable match
    list filtering and processing.
    \item[Data Storage:] This component is responsible for storing match and
    no-match lists. Implementations which store data to ASCII/binary table files
    are provided.
\end{description}
These are illustrated in Figures~\ref{O7-2:fig-1} and~\ref{O7-2:fig-2}. Each
component implementation conforms to a simple interface, and
communication between different components is limited to the
consumption and production of sources and match lists. This makes
it easy to add support for new input/output data formats,
cross-comparison algorithms, and source or match list processing
modules. Writing a working cross-comparison application becomes a
matter of choosing and linking component implementations.

\begin{figure}
\epsscale{0.75} \plotone{O7-2_1.eps} \caption{Data access and
source list processing} \label{O7-2:fig-1}
\end{figure}
\begin{figure}
\epsscale{1.0} \plotone{O7-2_2.eps} \caption{Cross-comparison,
match list processing, and data storage} \label{O7-2:fig-2}
\end{figure}

\section{Implementation and Scaleability}
The problem of scaling to very large catalog sizes is handled by
splitting the sky into smaller disjoint regions which fit into
machine RAM. A single \textbf{Compare} process is only capable of
cross-comparing sources one region at a time; however, regions can
be distributed across multiple \textbf{Compare} processes for
simultaneous execution. It is important to note that the single
largest performance bottleneck is I/O, so performance gains from
parallelization will be modest unless I/O is also partitioned
across multiple independent storage devices. Redundant I/O can be
avoided by spatially indexing the input catalogs, and by
performing all required source and match-list processing inside
\textbf{Compare}. To summarize, high performance (when comparing
large catalogs) requires high I/O bandwidth and spatially ordered
data access, a fast way to retrieve sources very close to a
position, and a fast way to retrieve sources for larger regions of
the sky.

\subsection{Results} An early version of the \textbf{Compare}
software was used by the
\htmladdnormallinkfoot{NVO}{http://us-vo.org}
\htmladdnormallinkfoot{Brown Dwarf Demonstration
Project}{http://irsa.ipac.caltech.edu/applications/WebCompare/nvodemo/}.
The prototype compared 9.8 million SDSS sources to 0.5 million
2MASS sources, finding 326,020 source pairs within 3~arcseconds of
each other in approximately 100 seconds. Roughly 80\% of execution
time was spent in I/O. Further confirmation that I/O is the major
performance bottleneck for large-scale cross-comparisons is the
fact that a series of simple SQL queries which retrieve the entire
2MASS working point source catalog (1.3 billion sources, 1.2\,TB of
disk) take a total of around 4 days to complete.

\section{Applications and Future Work}
\textbf{Compare} was funded by the National Partnership for
Advanced Computational Infrastructure as a demonstration project
for Grid computing, and as such will be ported to the TeraGrid as
soon as it matures. Furthermore, spatial joins of the 2MASS, SDSS,
USNO, and MACHO catalogs are planned, with results to be served as
publicly available data sets. In the interest of promoting
research, we would like to make source code for the software
available to individual astronomers. In addition, there are
several avenues of future development to explore. Firstly, support
for more input formats (specifically the VOTable format) and data
sources (RDBMSes other than Informix) is desirable. Secondly,
performance could be improved by generating I/O code and data
structures specific to a desired catalog column set at
compile-time (or even run-time). Thirdly, allowing the use of SQL
expressions to filter sources and matches (or to generate new
column values) would allow astronomers unfamiliar with C++ to make
wider and more efficient use of the software. Finally, including a
module capable of determining whether or not the neighborhood of a
source from one mission has been observed by another would be
invaluable when processing primary and secondary no-match lists.
Software availability is expected in early 2004.


%-----------------------------------------------------------------------
%                 References
%-----------------------------------------------------------------------
% List your references below within the reference environment
% (i.e. between the \begin{references} and \end{references} tags).
% Each new reference should begin with a \reference command which sets
% up the proper indentation.  Observe the following order when listing
% bibliographical information for each reference:  author name(s),
% publication year, journal name, volume, and page number for
% articles.  Note that many journal names are available as macros; see
% the User Guide listing "macro-ized" journals.
%
% EXAMPLE:  \reference Hagiwara, K., \& Zeppenfeld, D.\  1986,
%                Nucl.Phys., 274, 1
%           \reference H\'enon, M.\  1961, Ann.d'Ap., 24, 369
%           \reference King, I.\ R.\  1966, \aj, 71, 276
%           \reference King, I.\ R.\  1975, in Dynamics of Stellar
%                Systems, ed.\ A.\ Hayli (Dordrecht: Reidel), 99
%           \reference Tody, D.\  1998, \adassvii, 146
%           \reference Zacharias, N.\ \& Zacharias, M.\ 2003,
%                \adassxii, \paperref{P7.6}
%
% Note the following tricks used in the example above:
%
%   o  \& is used to format an ampersand symbol (&).
%   o  \'e puts an acute accent over the letter e.  See the User Guide
%      and the sample files for details on formatting special
%      characters.
%   o  "\ " after a period prevents LaTeX from interpreting the period
%      as an end of a sentence.
%   o  \aj is a macro that expands to "Astron. J."  See the User Guide
%      for a full list of journal macros
%   o  \adassvii is a macro that expands to the full title, editor,
%      and publishing information for the ADASS VII conference
%      proceedings.  Such macros are defined for ADASS conferences I
%      through XI.
%   o  When referencing a paper in the current volume, use the
%      \adassxii and \paperref macros.  The argument to \paperref is
%      the paper ID code for the paper you are referencing.  See the
%      note in the "Paper ID Code" section above for details on how to
%      determine the paper ID code for the paper you reference.
%
%\begin{references}
%\end{references}

\end{document}
