%APN3_PROCEEDINGS_FORM%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% TEMPLATE.TEX -- APN3 (2003) ASP Conference Proceedings template.
%
% Derived from ADASS VIII (98) ASP Conference Proceedings template
% Updated by N. Manset for ADASS IX (99), F. Primini for ADASS 2000,
% D.Bohlender for ADASS 2001, and H. Payne for ADASS XII and LaTeX2e.
%
% Use this template to create your proceedings paper in LaTeX format
% by following the instructions given below.  Much of the input will
% be enclosed by braces (i.e., { }).  The percent sign, "%", denotes
% the start of a comment; text after it will be ignored by LaTeX.
% You might also notice in some of the examples below the use of "\ "
% after a period; this prevents LaTeX from interpreting the period as
% the end of a sentence and putting extra space after it.
%
% You should check your paper by processing it with LaTeX.  For
% details about how to run LaTeX as well as how to print out the User
% Guide, consult the README file.  You should also consult the sample
% LaTeX papers, sample1.tex and sample2.tex, for examples of including
% figures, html links, special symbols, and other advanced features.
%
% If you do not have access to the LaTeX software or a laser printer
% at your site, you can still prepare your paper following the
% instructions in the User Guide.  In such cases, the editors will
% process the file and make any necessary editorial adjustments.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
\documentclass[11pt,twoside]{article}  % Leave intact
\usepackage{adassconf}

% If you have the old LaTeX 2.09, and not the current LaTeX2e, comment
% out the \documentclass and \usepackage lines above and uncomment
% the following:

%\documentstyle[11pt,twoside,adassconf]{article}

\begin{document}   % Leave intact

%-----------------------------------------------------------------------
%               Paper ID Code
%-----------------------------------------------------------------------
% Enter the proper paper identification code.  The ID code for your
% paper is the session number associated with your presentation as
% published in the official conference proceedings.  You can
% find this number by locating your abstract in the printed proceedings
% that you received at the meeting or on-line at the conference web
% site; the ID code is the letter/number sequence preceding the title
% of your presentation.
%
% This will not appear in your paper; however, it allows different
% papers in the proceedings to cross-reference each other.  Note that
% you should only have one \paperID, and it should not include a
% trailing period.
%
% EXAMPLE: \paperID{O4-1}
% EXAMPLE: \paperID{P7-7}
%

\paperID{P3-7}
%%%% ID=P3-7

%-----------------------------------------------------------------------
%                   Paper Title
%-----------------------------------------------------------------------
% Enter the title of the paper.
%
% EXAMPLE: \title{A Breakthrough in Astronomical Software Development}
%
% If your title is so long as to fill the page header when you print it,
% then please supply a short form as a \titlemark.
%
% EXAMPLE:
%  \title{Rapid Development for Distributed Computing, with Implications
%         for the Virtual Observatory}
%  \titlemark{Rapid Development for Distributed Computing}
%

\title{Astronomical Catalogues -- Simultaneous Querying and Matching}
\titlemark{Astronomical Catalogues: Simultaneous Querying and Matching}
\titlemark{ Simultaneous Querying and Matching }	%% FO 22.04.04

%-----------------------------------------------------------------------
%                 Authors of Paper
%-----------------------------------------------------------------------
% Enter the authors followed by their affiliations.  The \author and
% \affil commands may appear multiple times as necessary (see example
% below).  List each author by giving the first name or initials first
% followed by the last name.  Authors with the same affiliations
% should be grouped together.
%
% EXAMPLE: \author{Raymond Plante, Doug Roberts,
%                  R.\ M.\ Crutcher\altaffilmark{1}}
%          \affil{National Center for Supercomputing Applications,
%                 University of Illinois Urbana-Champaign, Urbana, IL
%                 61801}
%          \author{Tom Troland}
%          \affil{University of Kentucky}
%
%          \altaffiltext{1}{Astronomy Department, UIUC}
%
% In this example, the first three authors, "Plante", "Roberts", and
% "Crutcher" are affiliated with "NCSA".  "Crutcher" has an alternate
% affiliation with the "Astronomy Department".  The fourth author,
% "Troland", is affiliated with "University of Kentucky"

\author{Hans-Martin Adorf, Gerard Lemson, Wolfgang Voges}
\affil{Max-Planck-Institut f\"ur extraterrestrische Physik,
Garching, Germany}
\author{Harry Enke, Matthias Steinmetz}
\affil{Astrophysikalisches Institut Potsdam, Germany}

%-----------------------------------------------------------------------
%            Contact Information
%-----------------------------------------------------------------------
% This information will not appear in the paper but will be used by
% the editors in case you need to be contacted concerning your
% submission.  Enter your name as the contact along with your email
% address.
%
% EXAMPLE:  \contact{Dennis Crabtree}
%           \email{crabtree@cfht.hawaii.edu}
%

\contact{Hans-Martin Adorf}
\email{adorf@mpe.mpg.de}

%-----------------------------------------------------------------------
%             Author Index Specification
%-----------------------------------------------------------------------
% Specify how each author name should appear in the author index.  The
% \paindex{ } should be used to indicate the primary author, and the
% \aindex for all other co-authors.  You MUST use the following
% syntax:
%
% SYNTAX:  \aindex{Lastname, F. M.}
%
% where F is the first initial and M is the second initial (if
% used).  This guarantees that authors that appear in multiple papers
% will appear only once in the author index.
%
% EXAMPLE: \paindex{Crabtree, D.}
%          \aindex{Manset, N.}
%          \aindex{Veillet, C.}
%
% NOTE: this information is also used to build the author list that
% appears in the table of contents.  Authors will be listed in the order
% of the \paindex and \aindex commands.
%

\paindex{Adorf, H.-M.}
\aindex{Lemson, G.}     % Remove this line if there is only one author
\aindex{Voges, W.}
\aindex{Enke, H.}
\aindex{Steinmetz, M.}

%-----------------------------------------------------------------------
%             Author list for page header
%-----------------------------------------------------------------------
% Please supply a list of author last names for the page header, in
% one of these formats:
%
% EXAMPLES:
% \authormark{Lastname}
% \authormark{Lastname1 \& Lastname2}
% \authormark{Lastname1, Lastname2, ... \& LastnameN}
% \authormark{Lastname et al.}
%
% Use the "et al." form in the case of seven or more authors, or if
% the preferred form is too long to fit in the header.

\authormark{Adorf, Lemson, Voges, Enke \& Steinmetz}

%-----------------------------------------------------------------------
%           Subject Index keywords
%-----------------------------------------------------------------------
% Enter a comma separated list of up to 6 keywords describing your
% paper.  These will NOT be printed as part of your paper; however,
% they will be used to generate the subject index for the proceedings.
% There is no standard list; however, you can consult the indices
% for past proceedings (http://adass.org/adass/proceedings/).
%
% EXAMPLE:  \keywords{visualization, astronomy: radio, parallel
%                     computing, AIPS++, Galactic Center}
%
% In this example, the author noticed that "radio astronomy" appeared
% in the ADASS VII Index as "astronomy" being the major keyword and
% "radio" as the minor keyword.  The colon is used to introduce another
% level into the index.

\keywords{ archives: interoperability, catalogs, ConeSearch,
cross matching: uncertainties, data: mining, Java,
Virtual Observatory: GAVO }
% grid computing

%-----------------------------------------------------------------------
%                  Abstract
%-----------------------------------------------------------------------
% Type abstract in the space below.  Consult the User Guide and Latex
% Information file for a list of supported macros (e.g. for typesetting
% special symbols). Do not leave a blank line between \begin{abstract}
% and the start of your text.

\begin{abstract}          % Leave intact
We report on our experience gained by executing multiple simple
cone searches on a number of published astronomical catalogues.
Individual search results are fed into a catalogue cross-matcher
developed by GAVO. The matcher is designed to perform a
probabilistic ``fuzzy outer join'' based on sky-positions and
their uncertainties. We describe current features of the GAVO
architecture that support such simultaneous queries, and outline
some requirements for future versions.
\end{abstract}

%-----------------------------------------------------------------------
%                 Main Body
%-----------------------------------------------------------------------
% Place the text for the main body of the paper here.  You should use
% the \section command to label the various sections; use of
% \subsection is optional.  Significant words in section titles should
% be capitalized.  Sections and subsections will be numbered
% automatically.
%
% EXAMPLE:  \section{Introduction}
%           ...
%           \subsection{Our View of the World}
%           ...
%           \section{A New Approach}
%
% It is recommended that you look at the sample papers, sample1.tex
% and sample2.tex, for examples for formatting references, footnotes,
% figures, equations, html links, lists, and other special features.

\section{Introduction}
The \htmladdnormallinkfoot{German Astrophysical Virtual
Observatory (GAVO)}{http://www.g-vo.org} is setting up an
infrastructure that will allow (1) exercising the existing simple
cone search (SCS) services; (2) searching for exotic objects like
isolated neutron stars, brown and white dwarfs; and (3)
constructing a multi-band spectral energy distribution (SED) from
various catalogues, useful e.g.\ for source identification and
classification purposes.

To this end GAVO is developing a multi-catalogue multi-cone (MCMC)
search service feeding a probabilistic cross-matcher. The overall
architecture of the search and matching service is depicted in
Fig.~\ref{P7-3-fig-1}.
%The three major building blocks are: the multi-catalogue
%multi-cone search ``download manager'', a VOTable processor, and
%the cross-matcher.

\begin{figure}
%\epsscale{0.95}
\plotone{P3-7_1.eps}

\caption{Dataflow through GAVO's search and matching service: the
MCMCS application queries a registry of available cone search
services. The MCMCS application takes one or more queries,
executes multiple simple cone searches, and retrieves catalogue
subsets in VOTable-format. Each dataset is pre-processed and
forwarded to the probabilistic matcher.}

\label{P7-3-fig-1}
\end{figure}


\section{The MCMCS Download Manager}
The MCMCS application
% (Fig.~\ref{P7-3-fig-2})
is similar to the \htmladdnormallinkfoot{IVOA ``VODownload''
manager}{http://skyservice.pha.jhu.edu/develop/vo/ivoa/default.aspx}.
Using a SOAP/WSDL-based Web-service, it queries the
\htmladdnormallinkfoot{Virtual Observatory Registry
Prototype}{http://skyservice.pha.jhu.edu/devel/registry} at
Johns Hopkins University in order to retrieve the base URLs of
available simple cone searches. The MCMCS download manager is an
event-based, multi-threaded Java application, designed to minimize
the latency between query submission and retrieval of the last
result. It passes the incoming VOTables (Ochsenbein et al.\ 2002,
Williams et al.\ 2002) to one or more registered ``result
handlers'' for further processing. The default result handler
stores the VOTables on disk in different directories.

GAVO intends to offer the MCMCS-functionality within its
Web-services. In addition, GAVO plans to make this application
generally available as a stand\-alone tool, and/or as a plug-in
component usable by other software systems.

%\begin{figure}
%\epsscale{1.0} \plotone{p3-7_2.eps}
%
%\caption{Screenshot of the multi-catalogue multi-cone search
%(MCMCS) download manager at work. A table (on the left) lists the
%available SCS-services. The user selects those to be queried, and
%specifies one or more simple cone searches. The download manager
%retrieves the corresponding VOTables. A control panel (on the
%right) allows the user to monitor the progress of the multiple
%queries. }
%
%\label{P7-3-fig-2}
%\end{figure}

\section{The VOTable Processor}
We are experimenting with different approaches for processing the
VOTables, in order to extract the data needed by the matcher: XSL
translation into tabular formats (e.g.\ comma-separated value
``CSV'' files), and XML-parsing using a JAXB parser compiled from
the VOTable schema. XSLT-processing is rather robust, but requires
the handling of table files. The alternative approach, JAXB-based
VOTable parsing, while elegant, is hampered by the fact that many
VOTables received do not (yet) validate, thus causing parsing
errors. Other VOTable parsers will be evaluated in the near
future.

\section{The Probabilistic Cross-Matcher}
GAVO's cross-matcher is based on positional information. It aims
at performing a probabilistic match of the sources found in
datasets -- equivalent to a ``fuzzy outer join'' in database
terminology. We are experimenting with a symmetric, recursive
algorithm. Match candidates are selected from a starting pair of
datasets; the result may be matched with further primary datasets
or with other intermediate datasets, in order to obtain
higher-order match candidates.

In our work we are pursuing goals similar to those of the SkyNode
/ SkyQuery project (Malik et al.\ 2002, Thakar et al.\ 2003). We
differ, however, in several aspects: firstly, we try to use
individual positional uncertainties on a per-object basis;
secondly, we try to take into account the full information on the
astrometric uncertainty including correlations between RA and Dec
(as displayed by some scanning sky-survey instruments). Whether
this additional complexity pays off in the end is still an open
question. Finally, our matcher does not run in a distributed
fashion, but locally, which simplifies the processing in some
respects.

For each sky-position, we are assuming a multivariate Gaussian
probability distribution. The inspected catalogues specify the
astrometric uncertainties in different ways, and so far we have
identified four types:

\begin{itemize}
\item {\it Type 0:} no error information is specified in the
catalogue/dataset;

\item {\it Type 1:} a single error column specifies an {\it
isotropic} astrometric error;

\item {\it Type 2:} two error columns specify two {\it
uncorrelated} errors, one in the direction of the right ascension
and the other in the direction of the declination;

\item {\it Type 3:} three error columns specify a general error
ellipse through its major and minor axes, and a position angle.
\end{itemize}

We assume that the error for the right ascension always specifies
the uncertainty in form of an arc-distance in the direction of the
right ascension, implying a correction with $\cos(\mathrm{Dec})$. However, it
is unclear whether this assumption can be relied upon (see Ortiz
2004). The difference would be most notable near the poles.

For each candidate match the matcher computes an estimated
position for the hypothetical astrophysical object, along with an
estimate of the uncertainty of this position.

Different statistical measures are conceivable for assessing the
``plausibility'' of a candidate match. We are exploring the use of
the Mahalanobis distance. Inferior match candidates are
discriminated against by applying a threshold.

\section{Observations and Issues}
Overall we found most of the advertised SCS services operational,
with a failure rate as low as 5\%. However, the results returned
vary syntactically and semantically to a degree that currently
prevents a fully automated, unassisted search and matching
service.

Here is a preliminary list of our findings: (1)~Many VOTables
received do not validate. (2)~The service name is not unique
(e.g.\ 2MASS-PSC is used by both Vizier and Irsatest). (3)~There
is no established standard for determining which columns are
returned at a given verbosity level. (4)~When no source is found,
some services return an error, others return an empty VOTable.
(5)~Some VOTables have more than one field with a {\tt
POS\_EQ\_RA\_MAIN} (or {\tt POS\_EQ\_DEC\_MAIN}) Unified Content
Descriptor (UCD). (6)~It is difficult to automatically detect the
type of the positional error information (see above). Likewise,
even if the type were known, it is not easily possible to
automatically find the columns containing the uncertainty
information. (7)~The positional uncertainty information may not be
available at SCS verbosity level~1. Thus different verbosity
levels have to be tried, or one has to resort to always using
verbosity level~3. (8)~It seems to be unclear whether the {\tt ID}
or the {\tt NAME} attribute should contain the ``official'' name
of a data column. Some VOTables use both attributes. (9)~The
angular units are not homogeneously specified; sometimes
``degrees'' was found. The positional uncertainties are usually
not given in units {\it deg}, but {\it arcsec}, so a unit
conversion is required.

%Some of the issues mentioned above, e.g.\ the
%\htmladdnormallinkfoot{``NAME or ID
%problem''}{http://www.ivoa.net/forum/votable/0250.htm} has been
%noted before. Others are addressed in the
%\htmladdnormallinkfoot{``Proposed Extensions to the VOTable 1.0''
%standard.}{http://www.ivoa.net/internal/IVOA/IvoaVOTable/votable-1x.html}
%Also \htmladdnormallinkfoot{``Column Groups in
%VOTable''}{http://www.ivoa.net/forum/votable/0190.htm} are being
%proposed.

\section{Suggestions}
We should like to make some suggestions for improving the format
and content of VOTables, so that a fully automated search and
match process will be possible in the future: (1)~Use unique
service names in the registry, and include them in the VOTable
(e.g.\ 2MASS-PSC@Vizier). (2)~Replicate the SCS query in the
VOTable. (3)~Standardize on a mechanism that allows retrieving
just the field descriptions, e.g.\ by issuing an SCS with a zero or
negative search radius. (4)~Always return the positional error
information along with the positions at the same SCS verbosity
level. (5)~Specify and implement a unique mechanism that allows an
automatic identification of the position and uncertainty fields.
(6)~Support groupings of VOTable fields. (7)~Indicate the type of
the astrometric uncertainty specification (0 to 3 error columns).
(8)~Standardize on how angular units are specified. Perhaps,
always use decimal degrees, also for the positional uncertainties.
(9)~As a stop-gap measure, include extensive comments in the field
descriptions (following Vizier's practice), so that at least
humans can find out what the fields are.

\section{Conclusion}
It is certainly an impressive accomplishment of the VO community
that, with rather modest effort, it is possible to invoke a
simultaneous cone search on 60+ catalogue services on the
Internet. It is likewise impressive that the resulting datasets
are available in ``almost'' the same data format. However, in
order to enable a fully automated search and matcher service, the
VO community needs to spend some further work on straightening out
different interpretations of the existing standards, as well as on
augmenting these.

\acknowledgments This work was carried out as part of the GAVO
project, funded by the Bundesministerium f\"ur Bildung und
Forschung (BMBF). The MCMCS download manager was kindly made
available to GAVO by Julius E.\ Adorf.


%-----------------------------------------------------------------------
%                 References
%-----------------------------------------------------------------------
% List your references below within the reference environment
% (i.e. between the \begin{references} and \end{references} tags).
% Each new reference should begin with a \reference command which sets
% up the proper indentation.  Observe the following order when listing
% bibliographical information for each reference:  author name(s),
% publication year, journal name, volume, and page number for
% articles.  Note that many journal names are available as macros; see
% the User Guide listing "macro-ized" journals.
%
% EXAMPLE:  \reference Hagiwara, K., \& Zeppenfeld, D.\  1986,
%                Nucl.Phys., 274, 1
%           \reference H\'enon, M.\  1961, Ann.d'Ap., 24, 369
%           \reference King, I.\ R.\  1966, \aj, 71, 276
%           \reference King, I.\ R.\  1975, in Dynamics of Stellar
%                Systems, ed.\ A.\ Hayli (Dordrecht: Reidel), 99
%           \reference Tody, D.\  1998, \adassvii, 146
%           \reference Zacharias, N.\ \& Zacharias, M.\ 2003,
%                \adassxii, \paperref{P7.6}
%
% Note the following tricks used in the example above:
%
%   o  \& is used to format an ampersand symbol (&).
%   o  \'e puts an acute accent over the letter e.  See the User Guide
%      and the sample files for details on formatting special
%      characters.
%   o  "\ " after a period prevents LaTeX from interpreting the period
%      as an end of a sentence.
%   o  \aj is a macro that expands to "Astron. J."  See the User Guide
%      for a full list of journal macros
%   o  \adassvii is a macro that expands to the full title, editor,
%      and publishing information for the ADASS VII conference
%      proceedings.  Such macros are defined for ADASS conferences I
%      through XI.
%   o  When referencing a paper in the current volume, use the
%      \adassxii and \paperref macros.  The argument to \paperref is
%      the paper ID code for the paper you are referencing.  See the
%      note in the "Paper ID Code" section above for details on how to
%      determine the paper ID code for the paper you reference.
%
\begin{references}

\reference Malik, T., et al.\ 2002, \htmladdnormallink{SkyQuery --
A distributed Web-based Query Service for
Astronomy.}{http://www.skyquery.net/images/skyquery.doc} The Johns
Hopkins University: Baltimore

\reference Ochsenbein, F., et al.\ 2002,
\htmladdnormallink{VOTable: Tabular Data for Virtual
Observatory.}{http://www.eso.org/gen-fac/meetings/vo2002/up/talks/ochsenbein/Ochsenbein.ppt}

\reference Ortiz, P. 2004, ``Merging data from a collection of
    sources'', \adassxiii, \paperref{P2-16}\ok

\reference Thakar, A.R., et al.\ 2003, \htmladdnormallink{SkyQuery
-- A Prototype Distributed Query and Cross-Matching Web Service
for the Virtual
Observatory.}{http://www.aas.org/publications/baas/v34n4/aas201/1137.htm}
in AAS 201\textsuperscript{st} Meeting, 606-607

\reference Williams, R., et al.\ 2002, \htmladdnormallink{VOTable:
A Proposed XML Format for Astronomical
Tables.}{http://cdsweb.u-strasbg.fr/doc/VOTable/VOTable-1-0.pdf}
 CDS: Strasbourg. 28

\end{references}

% Do not place any material after the references section

\end{document}  % Leave intact
