Source code for pysagereader.sage_ii_reader

import numpy as np
import xarray as xr
import pandas as pd
import os
from collections import OrderedDict
# from astropy.time import Time
import logging
import copy
from typing import List, Dict, Union, Tuple
import pysagereader


class SAGEIILoaderV700(object):
    """
    Class designed to load the v7.00 SAGE II spec and index files provided by NASA ADSC into python

    Data files must be accessible by the user's machine, and can be downloaded from:
    https://eosweb.larc.nasa.gov/project/sage2/sage2_v7_table

    Parameters
    ----------
    data_folder
        location of sage ii index and spec files.
    output_format
        format for the output data. If ``'xarray'`` the output is returned as an ``xarray.Dataset``.
        If None the output is returned as a dictionary of numpy arrays.

        **NOTE: the following options only apply to xarray output types**

    species
        Species to be returned in the output data. If None all species are returned. Options are
        ``aerosol``, ``ozone``, ``h2o``, and ``no2``. If more than one species is returned fields will be
        NaN-padded where data is not available. ``species`` is only used if ``'xarray'`` is set as the
        ``output_data`` format, otherwise it has no effect.
    cf_names
        If True then CF-1.7 naming conventions are used for the output_data when ``xarray`` is selected.
    filter_aerosol
        filter the aerosol using the cloud flag
    filter_ozone
        filter the ozone using the criteria recommended in the release notes

        * Exclusion of all data points with an uncertainty estimate of 300% or greater
        * Exclusion of all profiles with an uncertainty greater than 10% between 30 and 50 km
        * Exclusion of all data points at and below the altitude where the aerosol extinction
          exceeds 0.006 km^-1
        * Exclusion of all data points at and below the altitude where both the 525 nm aerosol
          extinction exceeds 0.001 km^-1 and the 525/1020 extinction ratio falls below 1.4
        * Exclusion of all data points below 35 km with a 200% or larger uncertainty estimate
    enumerate_flags
        expand the index and species flags to their boolean values.
    normalize_percent_error
        give the species error as percent rather than percent * 100
    return_separate_flags
        return the enumerated flags as a separate data array

    Example
    -------
    >>> sage = SAGEIILoaderV700()
    >>> sage.data_folder = 'path/to/data'
    >>> data = sage.load_data('2004-1-1', '2004-5-1')

    In addition to the sage ii fields reported in the files, two additional time fields are provided
    to allow for easier subsetting of the data.

    ``data['mjd']`` is a numpy array containing the modified julian dates of each scan

    ``data['time']`` is a pandas time series object containing the times of each scan
    """

    def __init__(self,
                 data_folder: str = None,
                 output_format: str = 'xarray',
                 species: List[str] = ('aerosol', 'h2o', 'no2', 'ozone', 'background'),
                 cf_names: bool = False,
                 filter_aerosol: bool = False,
                 filter_ozone: bool = False,
                 enumerate_flags: bool = False,
                 normalize_percent_error: bool = False,
                 return_separate_flags: bool = False):

        if type(species) == str:
            species = [species]

        self.data_folder = data_folder  # Type: str
        self.version = '7.00'
        self.index_file = 'SAGE_II_INDEX_'
        self.spec_file = 'SAGE_II_SPEC_'

        self.fill_value = np.nan

        self.spec_format = self.get_spec_format()
        self.index_format = self.get_index_format()

        self.output_format = output_format
        self.species = [s.lower() for s in species]
        self.cf_names = cf_names
        self.filter_aerosol = filter_aerosol
        self.filter_ozone = filter_ozone
        self.normalize_percent_error = normalize_percent_error
        self.enumerate_flags = enumerate_flags
        self.return_separate_flags = return_separate_flags
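    # Illustrative usage (not part of the original module): the constructor options control
    # the xarray output, e.g. loading only filtered ozone with errors in percent.
    # 'path/to/data' is a placeholder for a local download of the NASA files.
    #
    # >>> sage = SAGEIILoaderV700(data_folder='path/to/data', species=['ozone'],
    # ...                         filter_ozone=True, normalize_percent_error=True)
    # >>> data = sage.load_data('2004-1-1', '2004-5-1')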
    @staticmethod
    def get_spec_format() -> Dict[str, Tuple[str, int]]:
        """
        spec format taken from sg2_specinfo.pro provided in the v7.00 download

        used for reading the binary data format

        Returns
        -------
        Dict
            Ordered dictionary of variables provided in the spec file. Each dictionary field contains
            a tuple with the information (data type, number of data points). Ordering is important as
            the sage ii binary files are read sequentially.
        """
        spec = OrderedDict()
        spec['Tan_Alt'] = ('float32', 8)            # Subtangent Altitudes (km)
        spec['Tan_Lat'] = ('float32', 8)            # Subtangent Latitudes @ Tan_Alt (deg)
        spec['Tan_Lon'] = ('float32', 8)            # Subtangent Longitudes @ Tan_Alt (deg)

        spec['NMC_Pres'] = ('float32', 140)         # Gridded Pressure profile (mb)
        spec['NMC_Temp'] = ('float32', 140)         # Gridded Temperature profile (K)
        spec['NMC_Dens'] = ('float32', 140)         # Gridded Density profile (cm^-3)
        spec['NMC_Dens_Err'] = ('int16', 140)       # Error in NMC_Dens (% * 1000)

        spec['Trop_Height'] = ('float32', 1)        # NMC Tropopause Height (km)
        spec['Wavelength'] = ('float32', 7)         # Wavelength of each channel (nm)

        spec['O3'] = ('float32', 140)               # O3 Density profile 0 - 70 km (cm^-3)
        spec['NO2'] = ('float32', 100)              # NO2 Density profile 0 - 50 km (cm^-3)
        spec['H2O'] = ('float32', 100)              # H2O Volume Mixing Ratio 0 - 50 km (ppp)

        spec['Ext386'] = ('float32', 80)            # 386 nm Extinction 0 - 40 km (1/km)
        spec['Ext452'] = ('float32', 80)            # 452 nm Extinction 0 - 40 km (1/km)
        spec['Ext525'] = ('float32', 80)            # 525 nm Extinction 0 - 40 km (1/km)
        spec['Ext1020'] = ('float32', 80)           # 1020 nm Extinction 0 - 40 km (1/km)

        spec['Density'] = ('float32', 140)          # Calculated Density 0 - 70 km (cm^-3)
        spec['SurfDen'] = ('float32', 80)           # Aerosol surface area density 0 - 40 km (um^2/cm^3)
        spec['Radius'] = ('float32', 80)            # Aerosol effective radius 0 - 40 km (um)
        spec['Dens_Mid_Atm'] = ('float32', 70)      # Middle Atmosphere Density (cm^-3)

        spec['O3_Err'] = ('int16', 140)             # Error in O3 density profile (% * 100)
        spec['NO2_Err'] = ('int16', 100)            # Error in NO2 density profile (% * 100)
        spec['H2O_Err'] = ('int16', 100)            # Error in H2O mixing ratio (% * 100)

        spec['Ext386_Err'] = ('int16', 80)          # Error in 386 nm Extinction (% * 100)
        spec['Ext452_Err'] = ('int16', 80)          # Error in 452 nm Extinction (% * 100)
        spec['Ext525_Err'] = ('int16', 80)          # Error in 525 nm Extinction (% * 100)
        spec['Ext1020_Err'] = ('int16', 80)         # Error in 1020 nm Extinction (% * 100)

        spec['Density_Err'] = ('int16', 140)        # Error in Density (% * 100)
        spec['SurfDen_Err'] = ('int16', 80)         # Error in surface area density (% * 100)
        spec['Radius_Err'] = ('int16', 80)          # Error in aerosol radius (% * 100)
        spec['Dens_Mid_Atm_Err'] = ('int16', 70)    # Error in Middle Atm. Density (% * 100)

        spec['InfVec'] = ('uint16', 140)            # Informational Bit flags

        return spec
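    # Illustrative sketch (not part of the original module): the byte size of a single spec
    # record follows directly from this format, mirroring the arithmetic in ``read_spec_file``:
    #
    # >>> fmt = SAGEIILoaderV700.get_spec_format()
    # >>> record_bytes = sum(np.dtype(dt).itemsize * n for dt, n in fmt.values())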
    @staticmethod
    def get_index_format() -> Dict[str, Tuple[str, int]]:
        """
        index format taken from sg2_indexinfo.pro provided in the v7.00 download

        used for reading the binary data format

        Returns
        -------
        Dict
            an ordered dictionary of variables provided in the index file. Each dictionary field contains
            a tuple with the information (data type, length). Ordering is important as the sage ii binary
            files are read sequentially.
        """
        info = OrderedDict()

        info['num_prof'] = ('uint32', 1)            # Number of profiles in these files
        info['Met_Rev_Date'] = ('uint32', 1)        # LaRC Met Model Revision Date (YYYYMMDD)
        info['Driver_Rev'] = ('S1', 8)              # LaRC Driver Version (e.g. 6.20)
        info['Trans_Rev'] = ('S1', 8)               # LaRC Transmission Version
        info['Inv_Rev'] = ('S1', 8)                 # LaRC Inversion Version
        info['Spec_Rev'] = ('S1', 8)                # LaRC Inversion Version
        info['Eph_File_Name'] = ('S1', 32)          # Ephemeris data file name
        info['Met_File_Name'] = ('S1', 32)          # Meteorological data file name
        info['Ref_File_Name'] = ('S1', 32)          # Refraction data file name
        info['Tran_File_Name'] = ('S1', 32)         # Transmission data file name
        info['Spec_File_Name'] = ('S1', 32)         # Species profile file name

        info['FillVal'] = ('float32', 1)            # Fill value

        # Altitude grid and range info
        info['Grid_Size'] = ('float32', 1)          # Altitude grid spacing (0.5 km)
        info['Alt_Grid'] = ('float32', 200)         # Geometric altitudes (0.5, 1.0, ..., 100.0 km)
        info['Alt_Mid_Atm'] = ('float32', 70)       # Middle atmosphere geometric altitudes
        info['Range_Trans'] = ('float32', 2)        # Transmission min & max altitudes [0.5, 100.]
        info['Range_O3'] = ('float32', 2)           # Ozone min & max altitudes [0.5, 70.0]
        info['Range_NO2'] = ('float32', 2)          # NO2 min & max altitudes [0.5, 50.0]
        info['Range_H2O'] = ('float32', 2)          # Water vapor min & max altitudes [0.5, 50.0]
        info['Range_Ext'] = ('float32', 2)          # Aerosol extinction min & max altitudes [0.5, 40.0]
        info['Range_Dens'] = ('float32', 2)         # Density min & max altitudes [0.5, 70.0]
        info['Spare'] = ('float32', 2)              #

        # Event specific info useful for data subsetting
        info['YYYYMMDD'] = ('int32', 930)           # Event date at 20 km subtangent point
        info['Event_Num'] = ('int32', 930)          # Event number
        info['HHMMSS'] = ('int32', 930)             # Event time at 20 km
        info['Day_Frac'] = ('float32', 930)         # Time of year (DDD.frac) at 20 km
        info['Lat'] = ('float32', 930)              # Subtangent latitude at 20 km (-90, +90)
        info['Lon'] = ('float32', 930)              # Subtangent longitude at 20 km (-180, +180)
        info['Beta'] = ('float32', 930)             # Spacecraft beta angle (deg)
        info['Duration'] = ('float32', 930)         # Duration of event (sec)
        info['Type_Sat'] = ('int16', 930)           # Event Type, Instrument (0 = SR, 1 = SS)
        info['Type_Tan'] = ('int16', 930)           # Event Type, Local (0 = SR, 1 = SS)

        # Process tracking and flag info
        info['Dropped'] = ('int32', 930)            # Dropped event flag
        info['InfVec'] = ('uint32', 930)            # Bit flags relating to processing
        # NOTE: readme_sage2_v6.20.txt says InfVec is 16 bit but appears to actually be 32 (also in IDL software)

        # Record creation dates and times
        info['Eph_Cre_Date'] = ('int32', 930)       # Record creation date (YYYYMMDD format)
        info['Eph_Cre_Time'] = ('int32', 930)       # Record creation time (HHMMSS format)
        info['Met_Cre_Date'] = ('int32', 930)       # Record creation date (YYYYMMDD format)
        info['Met_Cre_Time'] = ('int32', 930)       # Record creation time (HHMMSS format)
        info['Ref_Cre_Date'] = ('int32', 930)       # Record creation date (YYYYMMDD format)
        info['Ref_Cre_Time'] = ('int32', 930)       # Record creation time (HHMMSS format)
        info['Tran_Cre_Date'] = ('int32', 930)      # Record creation date (YYYYMMDD format)
        info['Tran_Cre_Time'] = ('int32', 930)      # Record creation time (HHMMSS format)
        info['Spec_Cre_Date'] = ('int32', 930)      # Record creation date (YYYYMMDD format)
        info['Spec_Cre_Time'] = ('int32', 930)      # Record creation time (HHMMSS format)

        return info
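    # Illustrative note (not part of the original module): the index file always holds 930
    # slots per event array; only the first ``num_prof`` entries are valid, which is why
    # ``load_data`` trims each 930-element field. ``loader`` is a hypothetical instance:
    #
    # >>> idx = loader.read_index_file(loader.get_index_filename(2004, 1))
    # >>> valid_lats = idx['Lat'][:idx['num_prof']]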
    def get_spec_filename(self, year: int, month: int) -> str:
        """
        Returns the spec filename given a year and month

        Parameters
        ----------
        year
            year of the data that will be loaded
        month
            month of the data that will be loaded

        Returns
        -------
            filename of the spec file where the data is stored
        """
        file = os.path.join(self.data_folder,
                            self.spec_file + str(int(year)) + str(int(month)).zfill(2) + '.' + self.version)

        if not os.path.isfile(file):
            file = None

        return file
    def get_index_filename(self, year: int, month: int) -> str:
        """
        Returns the index filename given a year and month

        Parameters
        ----------
        year
            year of the data that will be loaded
        month
            month of the data that will be loaded

        Returns
        -------
            filename of the index file where the data is stored
        """
        file = os.path.join(self.data_folder,
                            self.index_file + str(int(year)) + str(int(month)).zfill(2) + '.' + self.version)

        if not os.path.isfile(file):
            file = None

        return file
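    # Illustrative sketch (not part of the original module): both helpers build names of the
    # form PREFIX + YYYYMM + '.' + version, so January 2004 resolves to
    # 'SAGE_II_SPEC_200401.7.00' / 'SAGE_II_INDEX_200401.7.00' inside ``data_folder``,
    # and None is returned when the file is missing.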
    def read_spec_file(self, file: str, num_profiles: int) -> List[Dict]:
        """
        Parameters
        ----------
        file
            name of the spec file to be read
        num_profiles
            number of profiles to read from the spec file (usually determined from the index file)

        Returns
        -------
            list of dictionaries containing the spec data. Each list entry is one event
        """
        # load the file into the buffer
        file_format = self.spec_format
        with open(file, "rb") as f:
            buffer = f.read()

        # initialize the list of dictionaries
        data = [None] * num_profiles
        for p in range(num_profiles):
            data[p] = dict()

        # load the data from the buffer
        bidx = 0
        for p in range(num_profiles):
            for key in file_format.keys():
                nbytes = np.dtype(file_format[key][0]).itemsize * file_format[key][1]
                data[p][key] = copy.copy(np.frombuffer(buffer[bidx:bidx + nbytes],
                                                       dtype=file_format[key][0]))
                bidx += nbytes

        return data
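    # Illustrative usage (not part of the original module): the index file supplies the
    # profile count that sizes the spec read, exactly as ``load_data`` does internally.
    # ``loader`` is a hypothetical instance:
    #
    # >>> idx = loader.read_index_file(loader.get_index_filename(2004, 1))
    # >>> spec = loader.read_spec_file(loader.get_spec_filename(2004, 1), idx['num_prof'])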
    def read_index_file(self, file: str) -> Dict:
        """
        Read the binary file into a python data structure

        Parameters
        ----------
        file
            filename to be read

        Returns
        -------
            data from the file
        """
        file_format = self.index_format
        with open(file, "rb") as f:
            buffer = f.read()

        data = dict()

        # load the data from file into a list
        bidx = 0
        for key in file_format.keys():
            nbytes = np.dtype(file_format[key][0]).itemsize * file_format[key][1]
            if file_format[key][0] == 'S1':
                data[key] = copy.copy(buffer[bidx:bidx + nbytes].decode('utf-8'))
            else:
                data[key] = copy.copy(np.frombuffer(buffer[bidx:bidx + nbytes],
                                                    dtype=file_format[key][0]))
                if len(data[key]) == 1:
                    data[key] = data[key][0]
            bidx += nbytes

        # make a more useable time field
        date_str = []

        # If the time overflows by less than the scan time just set it to midnight
        data['HHMMSS'][(data['HHMMSS'] >= 240000) & (data['HHMMSS'] < (240000 + data['Duration']))] = 235959
        # otherwise, set it as invalid
        data['HHMMSS'][data['HHMMSS'] >= 240000] = -999

        for idx, (ymd, hms) in enumerate(zip(data['YYYYMMDD'], data['HHMMSS'])):
            if (ymd < 0) | (hms < 0):
                date_str.append('1970-1-1 00:00:00')  # invalid sage ii date
            else:
                hours = int(hms / 10000)
                mins = int((hms % 10000) / 100)
                secs = hms % 100
                date_str.append(str(ymd)[0:4] + '-' + str(ymd)[4:6].zfill(2) + '-'
                                + str(ymd)[6::].zfill(2) + ' '
                                + str(hours).zfill(2) + ':' + str(mins).zfill(2) + ':' + str(secs).zfill(2))

        # data['time'] = Time(date_str, format='iso')
        data['time'] = pd.to_datetime(date_str)
        data['mjd'] = np.array((data['time'] - pd.Timestamp('1858-11-17')) / pd.Timedelta(1, 'D'))
        data['mjd'][data['mjd'] < 40588] = -999  # get rid of invalid dates

        return data
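    # Illustrative sketch (not part of the original module): 'mjd' counts days since the
    # 1858-11-17 epoch, so it converts back to timestamps with pandas, using the same
    # arithmetic as ``convert_to_xarray``:
    #
    # >>> times = pd.Timestamp('1858-11-17') + pd.to_timedelta(data['mjd'], 'D')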
    def load_data(self, min_date: str, max_date: str,
                  min_lat: float = -90, max_lat: float = 90,
                  min_lon: float = -180, max_lon: float = 360) -> Union[Dict, xr.Dataset]:
        """
        Load the SAGE II data for the specified dates and locations.

        Parameters
        ----------
        min_date
            start date where data will be loaded in iso format, eg: '2004-1-1'
        max_date
            end date where data will be loaded in iso format, eg: '2004-1-1'
        min_lat
            minimum latitude (optional)
        max_lat
            maximum latitude (optional)
        min_lon
            minimum longitude (optional)
        max_lon
            maximum longitude (optional)

        Returns
        -------
            Variables are returned as numpy arrays (1 or 2 dimensional depending on the variable),
            or as an ``xarray.Dataset`` if ``output_format='xarray'``.
        """
        min_time = pd.Timestamp(min_date)
        max_time = pd.Timestamp(max_date)

        data = dict()
        init = False

        # create a list of unique year/month combinations between the start/end dates
        # (the 27 day stride is shorter than any month, so every month in the range is visited)
        uniq = OrderedDict()
        for ym in [(t.date().year, t.date().month) for t in
                   pd.date_range(min_time, max_time + pd.Timedelta(27, 'D'), freq='27D')]:
            uniq[ym] = ym

        # load in the data from the desired months
        for (year, month) in list(uniq.values()):

            logging.info('loading data for : ' + str(year) + '/' + str(month))
            indx_file = self.get_index_filename(year, month)

            # if the file does not exist move on to the next month
            if indx_file is None:
                continue

            indx_data = self.read_index_file(indx_file)
            numprof = indx_data['num_prof']
            spec_data = self.read_spec_file(self.get_spec_filename(year, month), numprof)

            # get rid of the duplicate names for InfVec
            for sp in spec_data:
                sp['ProfileInfVec'] = copy.copy(sp['InfVec'])
                del sp['InfVec']

            for key in indx_data.keys():

                # get rid of extraneous profiles in the index so index and spec are the same lengths
                if hasattr(indx_data[key], '__len__') and type(indx_data[key]) is not str:
                    if len(indx_data[key]) == 930:
                        indx_data[key] = np.delete(indx_data[key], np.arange(numprof, 930))

                # add the index values to the data set
                if key in data.keys():
                    # we don't want to replicate certain fields
                    if (key[0:3] != 'Alt') & (key[0:5] != 'Range') & (key[0:7] != 'FillVal'):
                        data[key] = np.append(data[key], indx_data[key])
                else:
                    if key == 'FillVal':
                        data[key] = indx_data[key]
                    else:
                        data[key] = [indx_data[key]]

            # initialize the data dictionaries as lists
            if init is False:
                for key in spec_data[0].keys():
                    data[key] = []
                init = True

            # add the spec values to the data set
            for key in spec_data[0].keys():
                data[key].append(np.asarray([sp[key] for sp in spec_data]))

        # join all of our lists into an array - this could be done more elegantly with vstack to avoid
        # the temporary lists, but this is much faster
        for key in data.keys():
            if key == 'FillVal':
                data[key] = float(data[key])  # make this a simple float rather than zero dimensional array
            elif type(data[key][0]) is str:
                data[key] = str(data[key][0])
            elif len(data[key][0].shape) > 0:
                data[key] = np.concatenate(data[key], axis=0)
            else:
                data[key] = np.asarray(data[key])

        data = self.subset_data(data, min_date, max_date, min_lat, max_lat, min_lon, max_lon)
        if not data:
            return None

        if self.output_format == 'xarray':
            data = self.convert_to_xarray(data)

        return data
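    # Illustrative usage (not part of the original module): restricting the load to a
    # latitude band, with dictionary output instead of xarray:
    #
    # >>> loader = SAGEIILoaderV700(data_folder='path/to/data', output_format=None)
    # >>> data = loader.load_data('2004-1-1', '2004-5-1', min_lat=-10, max_lat=10)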
    @staticmethod
    def subset_data(data: Dict, min_date: str, max_date: str,
                    min_lat: float, max_lat: float,
                    min_lon: float, max_lon: float) -> Dict:
        """
        Removes any data from the dictionary that does not meet the specified time, latitude
        and longitude requirements.

        Parameters
        ----------
        data
            dictionary of sage ii data. Must have the fields 'mjd', 'Lat' and 'Lon'. All others are optional
        min_date
            start date where data will be loaded in iso format, eg: '2004-1-1'
        max_date
            end date where data will be loaded in iso format, eg: '2004-1-1'
        min_lat
            minimum latitude (optional)
        max_lat
            maximum latitude (optional)
        min_lon
            minimum longitude (optional)
        max_lon
            maximum longitude (optional)

        Returns
        -------
            returns the dictionary with only data in the valid latitude, longitude and time range
        """
        min_mjd = (pd.Timestamp(min_date) - pd.Timestamp('1858-11-17')) / pd.Timedelta(1, 'D')
        max_mjd = (pd.Timestamp(max_date) - pd.Timestamp('1858-11-17')) / pd.Timedelta(1, 'D')

        good = (data['mjd'] > min_mjd) & (data['mjd'] < max_mjd) & \
               (data['Lat'] > min_lat) & (data['Lat'] < max_lat) & \
               (data['Lon'] > min_lon) & (data['Lon'] < max_lon)

        if np.any(good):
            for key in data.keys():
                if type(data[key]) is str:
                    pass
                elif hasattr(data[key], '__len__'):
                    if data[key].shape[0] == len(good):
                        data[key] = data[key][good]
        else:
            print('no data satisfies the criteria')
            data = {}

        return data
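    # Illustrative sketch (not part of the original module): the same boolean-mask pattern
    # works for ad hoc subsetting of the returned dictionary, e.g. keeping only sunset
    # events (Type_Sat: 0 = SR, 1 = SS per the index format):
    #
    # >>> keep = data['Type_Sat'] == 1
    # >>> sunset_lats = data['Lat'][keep]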
    def convert_to_xarray(self, data: Dict) -> Union[xr.Dataset, Tuple[xr.Dataset, xr.Dataset]]:
        """
        Parameters
        ----------
        data
            Data from the ``load_data`` function

        Returns
        -------
            data formatted to an xarray Dataset
        """

        # split up the fields into ones of different sizes and optional returns
        fields = dict()

        # not currently returned
        fields['geometry'] = ['Tan_Alt', 'Tan_Lat', 'Tan_Lon']
        fields['flags'] = ['InfVec', 'Dropped']
        fields['profile_flags'] = ['ProfileInfVec']

        # always returned - 1 per profile
        fields['general'] = ['Event_Num', 'Lat', 'Lon', 'Beta', 'Duration', 'Type_Sat', 'Type_Tan', 'Trop_Height']

        # optional return parameters
        fields['background'] = ['NMC_Pres', 'NMC_Temp', 'NMC_Dens', 'NMC_Dens_Err', 'Density', 'Density_Err']
        fields['ozone'] = ['O3', 'O3_Err']
        fields['no2'] = ['NO2', 'NO2_Err']
        fields['h2o'] = ['H2O', 'H2O_Err']
        fields['aerosol'] = ['Ext386', 'Ext452', 'Ext525', 'Ext1020',
                             'Ext386_Err', 'Ext452_Err', 'Ext525_Err', 'Ext1020_Err']
        fields['particle_size'] = ['SurfDen', 'Radius', 'SurfDen_Err', 'Radius_Err']

        xr_data = []
        index_flags = self.convert_index_bit_flags(data)
        species_flags = self.convert_species_bit_flags(data)
        time = pd.to_timedelta(data['mjd'], 'D') + pd.Timestamp('1858-11-17')

        data['Trop_Height'] = data['Trop_Height'].flatten()
        for key in fields['general']:
            xr_data.append(xr.DataArray(data[key], coords=[time], dims=['time'], name=key))

        if 'aerosol' in self.species or self.filter_ozone:  # we need aerosol to filter ozone
            altitude = data['Alt_Grid'][0:80]
            wavel = np.array([386.0, 452.0, 525.0, 1020.0])
            ext = np.array([data['Ext386'], data['Ext452'], data['Ext525'], data['Ext1020']])
            xr_data.append(xr.DataArray(ext, coords=[wavel, time, altitude],
                                        dims=['wavelength', 'time', 'Alt_Grid'], name='Ext'))
            ext = np.array([data['Ext386_Err'], data['Ext452_Err'], data['Ext525_Err'], data['Ext1020_Err']])
            xr_data.append(xr.DataArray(ext, coords=[wavel, time, altitude],
                                        dims=['wavelength', 'time', 'Alt_Grid'], name='Ext_Err'))
            for key in fields['particle_size']:
                xr_data.append(xr.DataArray(data[key], coords=[time, altitude],
                                            dims=['time', 'Alt_Grid'], name=key))

        if 'no2' in self.species:
            altitude = data['Alt_Grid'][0:100]
            for key in fields['no2']:
                xr_data.append(xr.DataArray(data[key], coords=[time, altitude],
                                            dims=['time', 'Alt_Grid'], name=key))

        if 'h2o' in self.species:
            altitude = data['Alt_Grid'][0:100]
            for key in fields['h2o']:
                xr_data.append(xr.DataArray(data[key], coords=[time, altitude],
                                            dims=['time', 'Alt_Grid'], name=key))

        if any(i in ['ozone', 'o3'] for i in self.species):
            altitude = data['Alt_Grid'][0:140]
            for key in fields['ozone']:
                xr_data.append(xr.DataArray(data[key], coords=[time, altitude],
                                            dims=['time', 'Alt_Grid'], name=key))

        if 'background' in self.species:
            altitude = data['Alt_Grid'][0:140]
            for key in fields['background']:
                xr_data.append(xr.DataArray(data[key], coords=[time, altitude],
                                            dims=['time', 'Alt_Grid'], name=key))

        xr_data = xr.merge(xr_data)

        if self.enumerate_flags:
            xr_data = xr.merge([xr_data, index_flags, species_flags])

        for var in xr_data.variables.keys():
            if xr_data[var].dtype == 'float32' or 'Err' in var:
                xr_data[var] = xr_data[var].where(xr_data[var] != data['FillVal'])

        # determine cloud filter for aerosol data
        cloud_filter = xr.full_like(species_flags.Cloud_Bit_1, fill_value=True, dtype=bool)
        min_alt = (xr_data.Alt_Grid * (species_flags.Cloud_Bit_1 & species_flags.Cloud_Bit_2)).max(dim='Alt_Grid')
        cloud_filter = cloud_filter.where(cloud_filter.Alt_Grid > min_alt)
        xr_data['cloud_filter'] = np.isnan(cloud_filter)

        # determine valid ozone altitudes
        if any(i in ['ozone', 'o3'] for i in self.species):
            # add an ozone filter field for convenience
            ozone_good = xr.full_like(species_flags.Cloud_Bit_1, fill_value=True, dtype=bool)

            # Exclusion of all data points with an uncertainty estimate of 300% or greater
            ozone_good = ozone_good.where(xr_data.O3_Err < 30000)

            # Exclusion of all profiles with an uncertainty greater than 10% between 30 and 50 km
            no_good = (xr_data.O3_Err > 1000) & (xr_data.Alt_Grid > 30) & (xr_data.Alt_Grid < 50)
            ozone_good = ozone_good.where(~no_good)

            # Exclusion of all data points at and below the altitude where the aerosol extinction
            # exceeds 0.006 km^-1
            # NOTE: the wavelength to use as the filter is not specified in the documentation, so I have
            # chosen the wavelength with the smallest extinction and therefore the strictest filtering
            min_alt = (xr_data.Alt_Grid * (xr_data.Ext.sel(wavelength=1020) > 0.006)).max(dim='Alt_Grid')
            ozone_good = ozone_good.where(xr_data.Alt_Grid > min_alt)

            # Exclusion of all data points at and below the altitude where both the 525 nm aerosol
            # extinction exceeds 0.001 km^-1 and the 525/1020 extinction ratio falls below 1.4
            min_alt = (xr_data.Alt_Grid * ((xr_data.Ext.sel(wavelength=525) > 0.001) &
                                           ((xr_data.Ext.sel(wavelength=525) /
                                             xr_data.Ext.sel(wavelength=1020)) < 1.4))).max(dim='Alt_Grid')
            ozone_good = ozone_good.where(xr_data.Alt_Grid > min_alt)

            # Exclusion of all data points below 35 km with a 200% or larger uncertainty estimate
            no_good = (xr_data.O3_Err > 20000) & (xr_data.Alt_Grid < 35)
            ozone_good = ~np.isnan(ozone_good.where(~no_good))
            xr_data['ozone_filter'] = ozone_good

        if self.filter_aerosol:
            xr_data['Ext'] = xr_data.Ext.where(~xr_data.cloud_filter)

        if self.filter_ozone:
            xr_data['O3'] = xr_data.O3.where(ozone_good)

        # drop aerosol if it was only loaded to filter the ozone
        # (note: Dataset.drop returns a new dataset, so the result must be reassigned)
        if self.filter_ozone and not ('aerosol' in self.species):
            xr_data = xr_data.drop(['Ext', 'Ext_Err', 'wavelength'])

        if self.normalize_percent_error:
            for var in xr_data.variables.keys():
                if 'Err' in var:  # put error units back into percent
                    xr_data[var] = (xr_data[var] / 100).astype('float32')

        # transpose only the dimensions that are present (wavelength may have been dropped above)
        xr_data = xr_data.transpose(*[d for d in ('time', 'Alt_Grid', 'wavelength') if d in xr_data.dims])
        xr_data = self.apply_cf_conventions(xr_data)

        if self.return_separate_flags:
            return xr_data, xr.merge([index_flags, species_flags])
        else:
            return xr_data
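    # Illustrative usage (not part of the original module): with the ``filter_*`` options
    # left False, the convenience fields can be applied manually to the returned Dataset:
    #
    # >>> good_ozone = data.O3.where(data.ozone_filter)
    # >>> clear_sky_ext = data.Ext.where(~data.cloud_filter)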
    def apply_cf_conventions(self, data):

        attrs = {'time': {'standard_name': 'time'},
                 'Lat': {'standard_name': 'latitude',
                         'units': 'degrees_north'},
                 'Lon': {'standard_name': 'longitude',
                         'units': 'degrees_east'},
                 'Alt_Grid': {'units': 'km'},
                 'wavelength': {'units': 'nm',
                                'description': 'wavelength at which aerosol extinction is retrieved'},
                 'O3': {'standard_name': 'number_concentration_of_ozone_molecules_in_air',
                        'units': 'cm-3'},
                 'NO2': {'standard_name': 'number_concentration_of_nitrogen_dioxide_molecules_in_air',
                         'units': 'cm-3'},
                 'H2O': {'standard_name': 'number_concentration_of_water_vapor_in_air',
                         'units': 'cm-3'},
                 'Ext': {'standard_name': 'volume_extinction_coefficient_in_air_due_to_ambient_aerosol_particles',
                         'units': 'km-1'},
                 'O3_Err': {'standard_name': 'number_concentration_of_ozone_molecules_in_air_error',
                            'units': 'percent'},
                 'NO2_Err': {'standard_name': 'number_concentration_of_nitrogen_dioxide_molecules_in_air_error',
                             'units': 'percent'},
                 'H2O_Err': {'standard_name': 'number_concentration_of_water_vapor_in_air_error',
                             'units': 'percent'},
                 'Ext_Err': {'standard_name': 'volume_extinction_coefficient_in_air_due_to_ambient_aerosol_'
                                              'particles_error',
                             'units': 'percent'},
                 'Duration': {'units': 'seconds',
                              'description': 'duration of the sunrise/sunset event'},
                 'Beta': {'units': 'degrees',
                          'description': 'angle between the satellite orbit plane and the sun'},
                 'Trop_Height': {'units': 'km'},
                 'Radius': {'units': 'microns'},
                 'SurfDen': {'units': 'microns2 cm-3'}}

        for key in attrs.keys():
            if key in data:  # skip fields that were not loaded (e.g. aerosol when not requested)
                data[key].attrs = attrs[key]

        git_repo = 'https://github.com/LandonRieger/pySAGE.git'
        data.attrs = {'description': 'Retrieved vertical profiles of aerosol extinction, ozone, '
                                     'nitrogen dioxide, water vapor, and meteorological profiles from SAGE II '
                                     'version 7.00',
                      'publication reference': 'Damadeo, R. P., Zawodny, J. M., Thomason, L. W., & Iyer, N. (2013). '
                                               'SAGE version 7.0 algorithm: application to SAGE II. Atmospheric '
                                               'Measurement Techniques, 6(12), 3539-3561.',
                      'title': 'SAGE II version 7.00',
                      'date_created': pd.Timestamp.now().strftime('%B %d %Y'),
                      'source_code': 'repository: ' + git_repo + ' revision: ' + pysagereader.__version__,
                      'source_data': 'https://eosweb.larc.nasa.gov/project/sage2/sage2_v7_table',
                      'version': pysagereader.__version__,
                      'Conventions': 'CF-1.7'}

        if self.cf_names:
            names = {'Lat': 'latitude',
                     'Lon': 'longitude',
                     'Alt_Grid': 'altitude',
                     'Beta': 'beta_angle',
                     'Ext': 'aerosol_extinction',
                     'Ext_Err': 'aerosol_extinction_error',
                     'O3': 'ozone',
                     'O3_Err': 'ozone_error',
                     'NO2': 'no2',
                     'NO2_Err': 'no2_error',
                     'SurfDen': 'surface_area_density',
                     'SurfDen_Err': 'surface_area_density_error',
                     'Radius': 'effective_radius',
                     'Radius_Err': 'effective_radius_error',
                     'Density': 'air_density',
                     'Density_Err': 'air_density_error',
                     'Type_Sat': 'satellite_sunset',
                     'Type_Tan': 'local_sunset',
                     'Trop_Height': 'tropopause_altitude',
                     'Duration': 'event_duration'}

            for key in names.keys():
                try:
                    data = data.rename({key: names[key]})
                except ValueError:
                    pass

        return data
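    # Illustrative usage (not part of the original module): with ``cf_names=True`` the
    # output variables follow the rename table above:
    #
    # >>> sage = SAGEIILoaderV700(data_folder='path/to/data', cf_names=True)
    # >>> data = sage.load_data('2004-1-1', '2004-2-1')
    # >>> data.ozone  # instead of data.O3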
    @staticmethod
    def convert_index_bit_flags(data: Dict) -> xr.Dataset:
        """
        Convert the int32 index flags to a dataset of distinct flags

        Parameters
        ----------
        data
            Dictionary of input data as returned by ``load_data``

        Returns
        -------
            Dataset of the index bit flags
        """
        flags = dict()
        flags['pmc_present'] = 0
        flags['h2o_zero_found'] = 1
        flags['h2o_slow_convergence'] = 2
        flags['h2o_ega_failure'] = 3
        flags['default_nmc_temp_errors'] = 4
        flags['ch2_aero_model_A'] = 5
        flags['ch2_aero_model_B'] = 6
        flags['ch2_new_wavelength'] = 7
        flags['incomplete_nmc_data'] = 8
        flags['mirror_model'] = 15
        flags['twomey_non_conv_rayleigh'] = 19
        flags['twomey_non_conv_386_Aero'] = 20
        flags['twomey_non_conv_452_Aero'] = 21
        flags['twomey_non_conv_525_Aero'] = 22
        flags['twomey_non_conv_1020_Aero'] = 23
        flags['twomey_non_conv_NO2'] = 24
        flags['twomey_non_conv_ozone'] = 25
        flags['no_shock_correction'] = 30

        f = dict()
        for key in flags.keys():
            f[key] = (data['InfVec'] & 2 ** flags[key]) > 0

        xr_data = []
        time = pd.to_timedelta(data['mjd'], 'D') + pd.Timestamp('1858-11-17')
        for key in f.keys():
            xr_data.append(xr.DataArray(f[key], coords=[time], dims=['time'], name=key))

        return xr.merge(xr_data)
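    # Illustrative sketch (not part of the original module): each boolean is a single bit
    # test against the packed flag word, e.g. bit 0 is the ``pmc_present`` flag:
    #
    # >>> pmc_present = (data['InfVec'] & 2 ** 0) > 0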
    @staticmethod
    def convert_species_bit_flags(data: Dict) -> xr.Dataset:
        """
        Convert the int32 species flags to a dataset of distinct flags

        Parameters
        ----------
        data
            Dictionary of input data as returned by ``load_data``

        Returns
        -------
            Dataset of the species bit flags
        """
        flags = dict()
        flags['separation_method'] = [0, 1, 2]
        flags['one_chan_aerosol_corr'] = 3
        flags['no_935_aerosol_corr'] = 4
        flags['Large_1020_OD'] = 5
        flags['NO2_Extrap'] = 6
        flags['Water_vapor_ratio'] = [7, 8, 9, 10]
        flags['Cloud_Bit_1'] = 11
        flags['Cloud_Bit_2'] = 12
        flags['No_H2O_Corr'] = 13
        flags['In_Troposphere'] = 14

        separation_method = dict()
        separation_method['no_aerosol_method'] = 0
        separation_method['trans_no_aero_to_five_chan'] = 1
        separation_method['standard_method'] = 2
        separation_method['trans_five_chan_to_low'] = 3
        separation_method['four_chan_method'] = 4
        separation_method['trans_four_chan_to_three_chan'] = 5
        separation_method['three_chan_method'] = 6
        separation_method['extension_method'] = 7

        f = dict()
        for key in flags.keys():
            if hasattr(flags[key], '__len__'):
                if key == 'separation_method':
                    for k in separation_method.keys():
                        temp = data['ProfileInfVec'] & np.sum([2 ** b for b in flags[key]])
                        f[k] = temp == separation_method[k]
                else:
                    temp = data['ProfileInfVec'] & np.sum([2 ** b for b in flags[key]])
                    f[key] = temp >> flags[key][0]  # shift flag to save only significant bits
            else:
                f[key] = (data['ProfileInfVec'] & 2 ** flags[key]) > 0

        xr_data = []
        time = pd.to_timedelta(data['mjd'], 'D') + pd.Timestamp('1858-11-17')
        for key in f.keys():
            xr_data.append(xr.DataArray(f[key], coords=[time, data['Alt_Grid'][0:140]],
                                        dims=['time', 'Alt_Grid'], name=key))

        return xr.merge(xr_data)
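    # Illustrative sketch (not part of the original module): multi-bit fields are masked and
    # then shifted down to their first bit, so Water_vapor_ratio (bits 7-10) decodes as:
    #
    # >>> ratio = (data['ProfileInfVec'] & sum(2 ** b for b in range(7, 11))) >> 7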