import os 
import sys

import pyvo as vo
from pyvo.dal import tap

# Verify the version of pyvo 
from pkg_resources import parse_version
if parse_version(vo.__version__) < parse_version('1.4'):
    raise ImportError('pyvo version must be 1.4 or higher')
    
print('pyvo             version {version}'.format(version=vo.__version__))

# Defining the ESO tap service to query for phase 3 products:
tap = vo.dal.TAPService("https://archive.eso.org/tap_obs")

pyvo             version 1.4


## Given a certain product, how do I find all the files belonging it?
# Suppose that the ESO identifier of a product is `ADP.2024-03-19T05:58:04.407`
# To find all the files that belong to it, issue the following query:

query="""SELECT archive_id, extension, eso_category
FROM phase3v2.product_files 
where product_id='ADP.2024-03-19T05:58:04.407'
"""
res = tap.search(query)
print(res)

<Table length=6>
         archive_id         extension        eso_category       
           object             object            object          
--------------------------- --------- --------------------------
ADP.2024-03-19T05:58:04.407      FITS           SCIENCE.CUBE.IFS
ADP.2024-03-19T05:58:04.408      FITS ANCILLARY.IMAGE.WHITELIGHT
ADP.2024-03-19T05:58:04.409       LOG           ANCILLARY.README
ADP.2024-03-19T05:58:04.410       PNG          ANCILLARY.PREVIEW
ADP.2024-03-19T05:58:04.411       PNG          ANCILLARY.PREVIEW
ADP.2024-03-19T05:58:04.412       PNG          ANCILLARY.PREVIEW


# Use the `access_url` and the `access_estsize` columns
# to get the estimated size in kbytes and the download link of each file.

query="""SELECT archive_id, access_url, access_estsize
FROM phase3v2.product_files 
where product_id='ADP.2024-03-19T05:58:04.407'
"""
res = tap.search(query)
print(res)

<Table length=6>
         archive_id                                       access_url                               access_estsize
                                                                                                       kbyte     
           object                                           object                                     int64     
--------------------------- ---------------------------------------------------------------------- --------------
ADP.2024-03-19T05:58:04.407 https://dataportal.eso.org/dataPortal/file/ADP.2024-03-19T05:58:04.407        2968672
ADP.2024-03-19T05:58:04.408 https://dataportal.eso.org/dataPortal/file/ADP.2024-03-19T05:58:04.408            529
ADP.2024-03-19T05:58:04.409 https://dataportal.eso.org/dataPortal/file/ADP.2024-03-19T05:58:04.409            241
ADP.2024-03-19T05:58:04.410 https://dataportal.eso.org/dataPortal/file/ADP.2024-03-19T05:58:04.410            340
ADP.2024-03-19T05:58:04.411 https://dataportal.eso.org/dataPortal/file/ADP.2024-03-19T05:58:04.411            341
ADP.2024-03-19T05:58:04.412 https://dataportal.eso.org/dataPortal/file/ADP.2024-03-19T05:58:04.412            335


# Use the `original_filename` column
# to get the original name the file had before ingestion in the ESO Science Archive

query="""SELECT archive_id, original_filename
FROM phase3v2.product_files 
where product_id='ADP.2024-03-19T05:58:04.407'
"""
res = tap.search(query)
print(res)

<Table length=6>
         archive_id                              original_filename                     
           object                                      object                          
--------------------------- -----------------------------------------------------------
ADP.2024-03-19T05:58:04.407 MU_SCBY_3442284_2024-03-07T08:09:46.977_WFM-NOAO-N_SKY.fits
ADP.2024-03-19T05:58:04.408 MU_SIMY_3442284_2024-03-07T08:09:46.977_WFM-NOAO-N_SKY.fits
ADP.2024-03-19T05:58:04.409                      r.MUSE.2024-03-07T08:09:46.977_tpl.log
ADP.2024-03-19T05:58:04.410                      r.MUSE.2024-03-07T08:09:46.977_tpl.png
ADP.2024-03-19T05:58:04.411                      r.MUSE.2024-03-07T08:09:46.977_pst.png
ADP.2024-03-19T05:58:04.412                      r.MUSE.2024-03-07T08:36:46.807_pst.png


# Note: you could get all those columns at once in a single query,
# while here above the query was split in 3 parts, as large tables 
# do not display well in a jupyter notebook. 

query = """SELECT * 
FROM phase3v2.product_files 
where product_id='ADP.2024-03-19T05:58:04.407'
"""


# The `original_filename` column can be useful to isolate specific ancillary files in a given collection.
# Let's take as an example an ESPRESSO stacked 1d spectrum: `ADP.2021-04-12T12:27:33.887`

query="""SELECT archive_id, original_filename, eso_category
from phase3v2.product_files
where product_id='ADP.2021-04-12T12:27:33.887' 
"""
res = tap.search(query)
print(res)

<Table length=18>
         archive_id                           original_filename                          eso_category     
           object                                   object                                  object        
--------------------------- ------------------------------------------------------ -----------------------
ADP.2021-04-12T12:27:33.887 ES_SOBF_2174806_2018-09-03T09:11:43.369_HR_2x1_U3.fits        SCIENCE.SPECTRUM
ADP.2021-04-12T12:27:33.888 ES_SFLA_2174806_2018-09-03T09:11:43.369_HR_2x1_U3.fits      ANCILLARY.SPECTRUM
ADP.2021-04-12T12:27:33.889 ES_SFLA_2174806_2018-09-03T09:22:49.937_HR_2x1_U3.fits      ANCILLARY.SPECTRUM
ADP.2021-04-12T12:27:33.890 ES_SFLA_2174806_2018-09-03T09:33:56.370_HR_2x1_U3.fits      ANCILLARY.SPECTRUM
ADP.2021-04-12T12:27:33.891 ES_SFLA_2174806_2018-09-03T09:45:02.953_HR_2x1_U3.fits      ANCILLARY.SPECTRUM
ADP.2021-04-12T12:27:33.892 ES_SFLA_2174806_2018-09-03T09:56:07.370_HR_2x1_U3.fits      ANCILLARY.SPECTRUM
ADP.2021-04-12T12:27:33.893 ES_SFLB_2174806_2018-09-03T09:11:43.369_HR_2x1_U3.fits      ANCILLARY.SPECTRUM
ADP.2021-04-12T12:27:33.894 ES_SFLB_2174806_2018-09-03T09:22:49.937_HR_2x1_U3.fits      ANCILLARY.SPECTRUM
ADP.2021-04-12T12:27:33.895 ES_SFLB_2174806_2018-09-03T09:33:56.370_HR_2x1_U3.fits      ANCILLARY.SPECTRUM
ADP.2021-04-12T12:27:33.896 ES_SFLB_2174806_2018-09-03T09:45:02.953_HR_2x1_U3.fits      ANCILLARY.SPECTRUM
ADP.2021-04-12T12:27:33.897 ES_SFLB_2174806_2018-09-03T09:56:07.370_HR_2x1_U3.fits      ANCILLARY.SPECTRUM
ADP.2021-04-12T12:27:33.898 ES_SCCA_2174806_2018-09-03T09:11:43.369_HR_2x1_U3.fits           ANCILLARY.CCF
ADP.2021-04-12T12:27:33.899 ES_SCCA_2174806_2018-09-03T09:22:49.937_HR_2x1_U3.fits           ANCILLARY.CCF
ADP.2021-04-12T12:27:33.900 ES_SCCA_2174806_2018-09-03T09:33:56.370_HR_2x1_U3.fits           ANCILLARY.CCF
ADP.2021-04-12T12:27:33.901 ES_SCCA_2174806_2018-09-03T09:45:02.953_HR_2x1_U3.fits           ANCILLARY.CCF
ADP.2021-04-12T12:27:33.902 ES_SCCA_2174806_2018-09-03T09:56:07.370_HR_2x1_U3.fits           ANCILLARY.CCF
ADP.2021-04-12T12:27:33.903  ES_SOBF_2174806_2018-09-03T09:11:43.369_HR_2x1_U3.tar ANCILLARY.2DECHELLE.TAR
ADP.2021-04-12T12:27:33.904           r.ESPRE.2018-09-03T09:11:43.369_com_0000.png       ANCILLARY.PREVIEW


# As of 2024, the ESPRESSO pipeline stores measured radial velocities in the header of the individual
# non-stacked 1D spectra. It does not report any estimator of the radial velocity in the stacked spectra.
# A stacked product stores, along the science file, many ancillary files; among them, the object and the 
# sky (or Fabry-Perot [FP]) spectra that contributed to the product are stored in the ANCILLARY.SPECTRUM files. 
# Therefore, to get to the radial velocities measurements of a stacked spectral product, one has to get to
# the headers of the ANCILLARY.SPECTRUM taken on source (fiber A), excluding the ones on sky/FP (fiber B).
# The fiber A spectra can be recognised by looking at their original_filenames which start with the string:
# ES_SFLA (as opposed to ES_SFLAB).
# Hence, the query to perform is the following:

query="""SELECT archive_id, original_filename
from phase3v2.product_files
where product_id='ADP.2021-04-12T12:27:33.887'
and eso_category='ANCILLARY.SPECTRUM'
and original_filename like 'ES_SFLA%'
"""
res = tap.search(query)
print(res)

# How would you know all the above?
# The above details, the nomenclature and description of those original filenames, 
# can be found in the data documentation of the ESPRESSO collection. Link to it
# can be found, as described above (see the original_filename description), at the address:
# https://archive.eso.org/wdb/wdb/adp/phase3_main/query?dp_id=ADP.2021-04-12T12:27:33.887
# where the link to the release description is provided,
# i.e., https://www.eso.org/rm/api/v1/public/releaseDescriptions/176

<Table length=5>
         archive_id                           original_filename                   
           object                                   object                        
--------------------------- ------------------------------------------------------
ADP.2021-04-12T12:27:33.888 ES_SFLA_2174806_2018-09-03T09:11:43.369_HR_2x1_U3.fits
ADP.2021-04-12T12:27:33.889 ES_SFLA_2174806_2018-09-03T09:22:49.937_HR_2x1_U3.fits
ADP.2021-04-12T12:27:33.890 ES_SFLA_2174806_2018-09-03T09:33:56.370_HR_2x1_U3.fits
ADP.2021-04-12T12:27:33.891 ES_SFLA_2174806_2018-09-03T09:45:02.953_HR_2x1_U3.fits
ADP.2021-04-12T12:27:33.892 ES_SFLA_2174806_2018-09-03T09:56:07.370_HR_2x1_U3.fits


# A user is interested in getting the access_url of all preview files of the MUSE collection.
# Here it is shown the query that can be used to get that info, followed by a detailed explanation.

query = """SELECT product_id,archive_id, product_files.access_url
           from phase3v2.product_files product_files
           inner join ivoa.ObsCore obscore on obscore.dp_id = product_files.product_id
           where obscore.obs_collection = 'MUSE'
           and eso_category='ANCILLARY.PREVIEW'"""

# Showing here only the first 3 previews as an example of what the query returns:
res = tap.search(query, maxrec=3)
print(res)

<Table length=3>
         product_id                  archive_id                                       access_url                              
           object                      object                                           object                                
--------------------------- --------------------------- ----------------------------------------------------------------------
ADP.2016-06-01T14:07:30.134 ADP.2016-06-01T14:07:30.137 https://dataportal.eso.org/dataPortal/file/ADP.2016-06-01T14:07:30.137
ADP.2016-06-01T14:07:30.134 ADP.2016-06-01T14:07:30.138 https://dataportal.eso.org/dataPortal/file/ADP.2016-06-01T14:07:30.138
ADP.2016-06-01T14:07:30.134 ADP.2016-06-01T14:07:30.139 https://dataportal.eso.org/dataPortal/file/ADP.2016-06-01T14:07:30.139

/Users/amicol/miniconda3/lib/python3.8/site-packages/pyvo/dal/query.py:324: DALOverflowWarning: Partial result set. Potential causes MAXREC, async storage space, etc.
  warn("Partial result set. Potential causes MAXREC, async storage space, etc.",


# Explanation of the above MUSE query

# The ivoa.ObsCore table stores the metadata of all the Phase 3 data products (+ ALMA products).
# As an example, the following query returns the id of one product from the MUSE collection:
query = """SELECT top 1 dp_id from ivoa.ObsCore where obs_collection='MUSE' order by dp_id desc"""
res = tap.search(query)
print(res)

<Table length=1>
           dp_id           
           object          
---------------------------
ADP.2024-11-19T12:54:21.960


# You already know that you need to query the `phase3v2.product_files` table 
# to find all the files that belong to that product, and that you can restrict
# to just the ANCILLARY.PREVIEW files:

query = """ SELECT product_id,archive_id, product_files.access_url 
            from phase3v2.product_files product_files
            where product_id='ADP.2024-03-19T05:58:04.407' 
            and eso_category='ANCILLARY.PREVIEW'
            """
res = tap.search(query)
print(res)

<Table length=3>
         product_id                  archive_id                                       access_url                              
           object                      object                                           object                                
--------------------------- --------------------------- ----------------------------------------------------------------------
ADP.2024-03-19T05:58:04.407 ADP.2024-03-19T05:58:04.410 https://dataportal.eso.org/dataPortal/file/ADP.2024-03-19T05:58:04.410
ADP.2024-03-19T05:58:04.407 ADP.2024-03-19T05:58:04.411 https://dataportal.eso.org/dataPortal/file/ADP.2024-03-19T05:58:04.411
ADP.2024-03-19T05:58:04.407 ADP.2024-03-19T05:58:04.412 https://dataportal.eso.org/dataPortal/file/ADP.2024-03-19T05:58:04.412


# You could combine the above two queries into a single one,
# joining the two tables ivoa.ObsCore and phase3v2.product_files.

# To find out the correct joining condition, we query the standard TAP_SCHEMA keys and key_columns tables:
query = """SELECT from_table, from_column, target_table, target_column
        from keys, key_columns
        where keys.key_id=key_columns.key_id
        and target_table='ivoa.ObsCore' and from_table='phase3v2.product_files'"""
res = tap.search(query)
print(res)

<Table length=1>
      from_table       from_column target_table target_column
        object            object      object        object   
---------------------- ----------- ------------ -------------
phase3v2.product_files  product_id ivoa.ObsCore         dp_id


# The join is therefore to be performed via the:
#     ivoa.ObsCore dp_id primary key,
# and the 
#     phase3v2.product_files archive_id foreign key,
# using the syntax:
#
#     inner join ivoa.ObsCore obscore on obscore.dp_id = product_files.product_id
#
# and so obtaining the query:

query = """SELECT product_id,archive_id, original_filename 
           from phase3v2.product_files product_files
           inner join ivoa.ObsCore obscore on obscore.dp_id = product_files.product_id
           where obscore.obs_collection = 'MUSE'
           and eso_category='ANCILLARY.PREVIEW'"""


# If we want to download all the MUSE previews, we need some disk space available...
# How much disk space is needed? Let's make a query to discover that:

query = """SELECT count(*) num_files, sum(product_files.access_estsize)/1000./1000. size_GB
           from phase3v2.product_files product_files
           inner join ivoa.ObsCore obscore on obscore.dp_id = product_files.product_id
           where obscore.obs_collection = 'MUSE'
           and eso_category='ANCILLARY.PREVIEW'
           """
res = tap.search(query)
print(res)

<Table length=1>
num_files  size_gb 
  int32    float64 
--------- ---------
    64782 16.777803


#
# Be sure to have available at least 16 GB of disk space before starting the download.
#


# Here it is shown how to get the links to the ancillary files.
# It is left to the user to actually perform the download using those links.

# As an example here the query shows only the first 6 links:
query = """SELECT top 6 product_files.access_url as url
           from phase3v2.product_files product_files
           inner join ivoa.ObsCore obscore on obscore.dp_id = product_files.product_id
           where obscore.obs_collection = 'MUSE'
           and eso_category='ANCILLARY.PREVIEW'"""

res = tap.search(query)

print(res['url'])

['https://dataportal.eso.org/dataPortal/file/ADP.2016-06-01T14:07:30.137'
 'https://dataportal.eso.org/dataPortal/file/ADP.2016-06-01T14:07:30.138'
 'https://dataportal.eso.org/dataPortal/file/ADP.2016-06-01T14:07:30.139'
 'https://dataportal.eso.org/dataPortal/file/ADP.2016-06-01T14:07:30.143'
 'https://dataportal.eso.org/dataPortal/file/ADP.2016-06-01T14:07:30.144'
 'https://dataportal.eso.org/dataPortal/file/ADP.2016-06-01T14:07:30.145']

Column	Description
product_id	ESO identifier of a published phase 3 product. A product is composed of a science file, and optionally of a number of ancillary files. Same as the dp_id field in ivoa.ObCore.
archive_id	ESO identifier of a file belonging to the product: either a science file (eso_category: "SCIENCE.") or an ancillary file (eso_category: "ANCILLARY."). If a science file, its id is the same as the id of the product (product_id=archive_id). Ancillary files are not listed in the ivoa.ObsCore table.
eso_category	ESO file category; It starts with "SCIENCE." or "ANCILLARY." followed by one or more dot-separated and uppercased tokens describing the file at hand. For the full list of categories, please refer to the ESO Science Data Product standard available at: https://www.eso.org/sci/observing/phase3.html
extension	File name extension (upper case) of the product file, e.g.: FITS, PNG, TAR, FZ, etc.
original_filename	Original name of the product file before ingestion in the ESO archive. It may provide useful hints on the file content (usually described in the release description of the product, available at: https://archive.eso.org/wdb/wdb/adp/phase3_main/query?dp_id=here_the_product_id). It is not an identifier, as in general, there could be multiple files with the same original_filename but different archive_ids.
access_url	The download link of the individual archive_id file.
access_estsize	Estimated size of the downloaded file in KBytes. It is only "estimated" as in general, FITS headers can be patched at download time, making the file size varying with time.
internal_file_id	Internal file identification number, useful in combination with the provenance table.

How to search for ancillary files¶

Ancillary files :== files like weightmaps, previews, spectra, etc., that are associated to the science data products stored in the ESO Science Archive.¶

Introduction¶

The `phase3v2.product_files` columns¶

Example 1: How to find all the files belonging to a given product¶

Example 2: The ESPRESSO ancillary files¶

Example 3: Getting all the ANCILLARY.PREVIEW of the MUSE collection¶

Disk space required to download all the MUSE previews¶

How to search for ancillary files¶

Ancillary files :== files like weightmaps, previews, spectra, etc., that are associated to the science data products stored in the ESO Science Archive.¶

Introduction¶

The phase3v2.product_files columns¶

Example 1: How to find all the files belonging to a given product¶

Example 2: The ESPRESSO ancillary files¶

Example 3: Getting all the ANCILLARY.PREVIEW of the MUSE collection¶

Disk space required to download all the MUSE previews¶

The `phase3v2.product_files` columns¶