Source code for dgp.lib.trajectory_ingestor

# coding=utf-8

"""
trajectory_ingestor.py
Library for trajectory data import functions

"""

import csv
import numpy as np
import pandas as pd
import functools
import datetime

from .time_utils import leap_seconds, convert_gps_time, datenum_to_datetime
from .etc import interp_nans

def import_trajectory(filepath, delim_whitespace=False, interval=0, interp=False, is_utc=False,
                      columns=None, skiprows=None, timeformat='sow'):
    """
    import_trajectory

    Read and parse ASCII trajectory data into a DataFrame indexed by time
    and resampled onto a regular time grid.

    :param filepath: str
        Filesystem path to trajectory data file
    :param delim_whitespace: boolean, default False
        If True, fields are separated by runs of whitespace; otherwise the
        file is parsed as comma-delimited.
    :param interval: float, default 0
        Output data rate in seconds. Values <= 0 select the built-in default
        of 0.1 s (inferring the rate from the data is not yet implemented).
    :param interp: boolean, default False
        If True, gaps in all numeric columns are filled using interp_nans.
        Default behavior is not to interpolate.
    :param is_utc: boolean, default False
        Indicates that the timestamps are UTC. The index datetimes will be
        shifted to remove the GPS-UTC leap second offset.
    :param columns: list of strs, default: None
        Strings to use as the column names, in file order. Use None for a
        field that should be dropped. Fields beyond the end of this list are
        dropped as well. The list passed in is not modified. If omitted, the
        default layout for the selected timeformat is assumed.
    :param skiprows: list-like or integer or callable, default None
        Line numbers to skip (0-indexed) or number of lines to skip (int) at
        the start of the file. If callable, the callable function will be
        evaluated against the row indices, returning True if the row should
        be skipped and False otherwise. An example of a valid callable argument
        would be lambda x: x in [0, 2].
    :param timeformat: 'sow' | 'hms' | 'serial', default: 'sow'
        Indicates the time format to expect. The 'sow' format requires a field
        named 'week' with the GPS week, and a field named 'sow' with the GPS
        seconds of week. The 'hms' format requires a field named 'mdy' with the
        date in the format 'MM/DD/YYYY', and a field named 'hms' with the time
        in the format 'HH:MM:SS.SSS'. The 'serial' format (not yet implemented)
        requires a field named 'datenum' with the serial date number.
    :return: DataFrame
    :raises ValueError: if timeformat is not one of the recognized values
    :raises NotImplementedError: if timeformat is 'serial'
    """

    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True keyword and is supported by the C engine.
    sep = r'\s+' if delim_whitespace else ','
    df = pd.read_csv(filepath, sep=sep, header=None, engine='c', na_filter=False, skiprows=skiprows)

    # assumed position of the required fields for each time format
    default_columns = {
        'sow': ['week', 'sow', 'lat', 'long', 'ell_ht'],
        'hms': ['mdy', 'hms', 'lat', 'long', 'ell_ht'],
        'serial': ['datenum', 'lat', 'long', 'ell_ht'],
    }

    # Validate regardless of whether the caller supplied column names;
    # otherwise an unknown format would silently skip index creation.
    if timeformat not in default_columns:
        raise ValueError('timeformat value {fmt!r} not recognized'
                         .format(fmt=timeformat))

    if columns is None:
        columns = default_columns[timeformat]

    # Work on a copy so that padding below never mutates the caller's list.
    columns = list(columns)

    # 'None' indicates a not-needed field
    # if a field is after all non-essentials, and is not named, it will be removed
    if len(df.columns) > len(columns):
        columns.extend([None] * (len(df.columns) - len(columns)))

    # drop unwanted columns
    drop_list = [idx for idx, val in enumerate(columns) if val is None]
    columns = [x for x in columns if x is not None]

    if drop_list:
        df.drop(df.columns[drop_list], axis=1, inplace=True)

    df.columns = columns

    # create index
    if timeformat == 'sow':
        df.index = convert_gps_time(df['week'], df['sow'], format='datetime')
        df.drop(['sow', 'week'], axis=1, inplace=True)
    elif timeformat == 'hms':
        df.index = pd.to_datetime(df['mdy'].str.strip() + df['hms'].str.strip(), format="%m/%d/%Y%H:%M:%S.%f")
        df.drop(['mdy', 'hms'], axis=1, inplace=True)
    elif timeformat == 'serial':
        raise NotImplementedError
        #df.index = datenum_to_datetime(df['datenum'])

    # remove leap second
    if is_utc:
        # TO DO: Check dates at beginning and end to determine whether a leap second was added in the middle of the survey.
        # NOTE(review): assumes leap_seconds returns a number of seconds - confirm.
        shift = leap_seconds(df.index[0])
        # A Timedelta avoids the deprecated 'S' frequency alias.
        df.index = df.index - pd.Timedelta(seconds=shift)

    # set or infer the interval
    # TO DO: Need to infer interval for both cases to know whether resample
    if interval > 0:
        # round() rather than truncate so float noise (e.g. 569999.9999)
        # cannot shave a microsecond off the requested interval
        step = pd.Timedelta(microseconds=int(round(interval * 1e6)))
    else:
        step = pd.Timedelta(microseconds=100000)

    # fill gaps with NaNs
    new_index = pd.date_range(df.index[0], df.index[-1], freq=step)
    df = df.reindex(new_index)

    if interp:
        numeric = df.select_dtypes(include=[np.number])
        numeric = numeric.apply(interp_nans)

        # replace columns
        for col in numeric.columns:
            df[col] = numeric[col]

    return df