# Source code for dgp.lib.gravity_ingestor

# coding=utf-8

"""Library for gravity data import functions."""


import csv
import numpy as np
import pandas as pd
import functools
import datetime
import struct
import fnmatch
import os
import re

from .time_utils import convert_gps_time
from .etc import interp_nans

def _extract_bits(bitfield, columns=None, as_bool=False):
    """
    Extract individual bit values from a sequence of integers.

    Every integer is expanded into its 32 binary digits, least significant
    bit first, giving one row per input value and one column per bit.

    Parameters
    ----------
    bitfield : numpy.array or pandas.Series
        Non-negative integers that fit in an unsigned 32-bit word. Values
        outside that range make ``struct.pack`` raise ``struct.error``.
    columns : list, optional
        Names for the resulting columns, in bit order starting with the
        least significant bit. If fewer names than bits are given, the
        unnamed trailing bits are dropped; names beyond the number of
        available bits are ignored.
    as_bool : bool, optional
        If True, then values in returned DataFrame are type numpy.bool_;
        otherwise they are 1/0 stored as numpy.uint8.

    Returns
    -------
    pandas.DataFrame
        Bit values with a default integer index (the index of ``bitfield``
        is not carried over). Columns are numbered from 0 when ``columns``
        is None.
    """
    n_bits = 32

    def _unpack_bits(n):
        # Big-endian bytes unpack most-significant-bit first; flipping the
        # whole vector puts bit 0 in the first position.
        x = np.array(struct.unpack('4B', struct.pack('>I', n)), dtype=np.uint8)
        return np.flip(np.unpackbits(x), axis=0)

    # Going through numpy accepts both Series and plain arrays.
    values = np.asarray(bitfield).ravel().tolist()
    if values:
        bits = np.vstack([_unpack_bits(n) for n in values])
    else:
        # np.vstack rejects an empty list, so build the empty shape directly.
        bits = np.empty((0, n_bits), dtype=np.uint8)

    names = None
    if columns is not None:
        # Keep exactly as many bits as there are usable names.
        names = list(columns)[:n_bits]
        bits = bits[:, :len(names)]

    df = pd.DataFrame(bits, columns=names)

    return df.astype(np.bool_) if as_bool else df

def read_at1a(path, fill_with_nans=True, interp=False):
    """
    Read and parse gravity data file from DGS AT1A (Airborne) meter.

    CSV Columns:
        gravity, long, cross, beam, temp, status, pressure, Etemp,
        GPSweek, GPSweekseconds

    The integer ``status`` column is replaced by one boolean column per
    named status bit (``clamp``, ``unclamp``, ``gps_sync``, ...), and the
    frame is indexed by the timestamps that ``convert_gps_time`` returns
    for each (GPSweek, GPSweekseconds) pair.

    Parameters
    ----------
    path : str
        Filesystem path to gravity data file
    fill_with_nans : boolean, default True
        Keep only the rows whose ``gps_sync`` flag is set and reindex them
        onto a regular 100 ms grid running from the first to the last
        remaining timestamp, so that missing samples become NaN rows.
    interp : boolean, default False
        Interpolate all NaNs for fields of type numpy.number. Applied to
        the gap-filled frame, i.e. only when ``fill_with_nans`` is True.

    Returns
    -------
    pandas.DataFrame
        Gravity data indexed by datetime. If ``fill_with_nans`` is True and
        no row has ``gps_sync`` set, the empty frame is returned as is.
    """
    fields = ['gravity', 'long', 'cross', 'beam', 'temp', 'status',
              'pressure', 'Etemp', 'GPSweek', 'GPSweekseconds']

    # Names for the status word's bits, least significant bit first.
    status_field_names = ['clamp', 'unclamp', 'gps_sync', 'feedback',
                          'reserved1', 'reserved2', 'ad_lock', 'cmd_rcvd',
                          'nav_mode_1', 'nav_mode_2', 'plat_comm',
                          'sens_comm', 'gps_input', 'ad_sat', 'long_sat',
                          'cross_sat', 'on_line']

    df = pd.read_csv(path, header=None, engine='c', na_filter=False)
    df.columns = fields

    # expand status field
    status = _extract_bits(df['status'], columns=status_field_names,
                           as_bool=True)
    df = pd.concat([df, status], axis=1)
    df.drop('status', axis=1, inplace=True)

    # create datetime index
    df.index = pd.DatetimeIndex([
        convert_gps_time(week, sow, format='datetime')
        for week, sow in zip(df['GPSweek'], df['GPSweekseconds'])
    ])

    if fill_with_nans:
        # select rows where time is synced with the GPS NMEA
        df = df.loc[df['gps_sync']]
        if df.empty:
            # Nothing to span a time grid over; avoid indexing an empty index.
            return df

        # fill gaps with NaNs; '100ms' is the same interval as the
        # deprecated '100000U' (100000 microseconds) alias
        index = pd.date_range(df.index[0], df.index[-1], freq='100ms')
        df = df.reindex(index)

        # NOTE(review): the file is read with na_filter=False, so NaNs are
        # only introduced by the reindex above; interpolation is therefore
        # tied to the gap fill. Confirm this matches the intended nesting.
        if interp:
            numeric = df.select_dtypes(include=[np.number])
            numeric = numeric.apply(interp_nans)

            # replace columns
            for col in numeric.columns:
                df[col] = numeric[col]

    return df


def _parse_ZLS_file_name(filename):
    """
    Convert a ZLS data file name into a datetime.

    The name is split on ``_`` and ``.`` and every piece is converted to an
    integer. The first three pieces are read as year, hour and day of year,
    e.g. ``2017_13.045`` is hour 13 of day 45 of 2017.

    Parameters
    ----------
    filename : str
        File name (without directory) to parse.

    Returns
    -------
    datetime.datetime
        January 1st of the year, advanced by ``day - 1`` days and ``hour``
        hours.

    Raises
    ------
    ValueError
        If any piece of the name is not an integer.
    IndexError
        If the name has fewer than three pieces.
    """
    # Splitting on either separator yields the pieces in their original order.
    parts = [int(piece) for piece in filename.replace('.', '_').split('_')]
    year, hour, day_of_year = parts[0], parts[1], parts[2]

    return (datetime.datetime(year, 1, 1)
            + datetime.timedelta(days=day_of_year - 1, hours=hour))


def _read_ZLS_format_file(filepath):
    """
    Read one fixed-width ZLS data file into a DataFrame.

    Parameters
    ----------
    filepath : str
        Filesystem path of the file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record, indexed by the timestamp assembled from the
        year, day (of year), hour, minute and second fields; those five
        fields are removed from the columns.
    """
    col_names = ['line_name', 'year', 'day', 'hour', 'minute', 'second',
                 'sensor', 'spring_tension', 'cross_coupling', 'raw_beam',
                 'vcc', 'al', 'ax', 've2', 'ax2', 'xacc2', 'lacc2', 'xacc',
                 'lacc', 'par_port', 'platform_period']

    col_widths = [10, 4, 3, 2, 2, 2, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                  8, 6]

    time_columns = ['year', 'day', 'hour', 'minute', 'second']

    # read into dataframe
    df = pd.read_fwf(filepath, widths=col_widths, names=col_names)

    # Zero-pad each field so the concatenation lines up with '%Y%j%H%M%S'.
    stamp = df['year'].map(str) + df['day'].map('{:03d}'.format)
    for col in ('hour', 'minute', 'second'):
        stamp = stamp + df[col].map('{:02d}'.format)

    # index by datetime
    df.index = pd.to_datetime(stamp, format='%Y%j%H%M%S')
    df.drop(time_columns, axis=1, inplace=True)

    return df


def read_zls(dirpath, begin_time=None, end_time=None, excludes=('.*',)):
    """
    Read and parse gravity data file from ZLS meter.

    Files are segmented by hour, named ``<year>_<hour>.<day of year>``, and
    data is presented as ASCII in a fixed-width format.

    Columns:
        line name, year, day, hour, minute, second, gravity, spring tension,
        cross coupling, raw beam, vcc, al, ax, ve2, ax2, xacc2, lacc2, xacc,
        lacc, par port, platform period

    Parameters
    ----------
    dirpath : str
        Filesystem path to directory containing files
    begin_time : datetime, optional
        Data start time if not importing from the first file in the directory
    end_time : datetime, optional
        Data end time if not importing to the last file in the directory
    excludes : list
        Shell-style (fnmatch) patterns of files and directories to exclude
        from directory listing. The default skips names starting with a dot.

    Returns
    -------
    pandas.DataFrame
        Gravity data indexed by datetime, restricted to
        ``begin_time <= index <= end_time``. Empty if no file overlaps the
        requested window.

    Raises
    ------
    ValueError
        If the directory holds no data files after exclusion, if only one
        of ``begin_time``/``end_time`` is given and it lies outside the span
        covered by the files, or if ``begin_time`` is after ``end_time``.
    """
    one_hour = datetime.timedelta(hours=1)

    # '$.' can never match, which keeps the match call valid when nothing
    # is excluded.
    exclude_re = re.compile(
        r'|'.join(fnmatch.translate(x) for x in excludes) or r'$.')

    # list files in directory, as sorted start times
    files = sorted(
        _parse_ZLS_file_name(f) for f in os.listdir(dirpath)
        if os.path.isfile(os.path.join(dirpath, f))
        and not exclude_re.match(f))

    if not files:
        raise ValueError('no ZLS data files found in {path}'
                         .format(path=dirpath))

    # Each file covers one hour, so the data ends one hour after the last
    # file starts. Using this bound in every branch keeps a partially
    # specified window from truncating the last file.
    data_begin = files[0]
    data_end = files[-1] + one_hour

    # validate begin and end times
    if begin_time is None and end_time is None:
        begin_time, end_time = data_begin, data_end
    elif begin_time is None:
        begin_time = data_begin
        if end_time < begin_time or end_time > data_end:
            raise ValueError('end time ({end}) is out of bounds'
                             .format(end=end_time))
    elif end_time is None:
        end_time = data_end
        if begin_time > end_time or begin_time < data_begin:
            raise ValueError('begin time ({begin}) is out of bounds'
                             .format(begin=begin_time))
    elif begin_time > end_time:
        raise ValueError('begin time ({begin}) is after end time ({end})'
                         .format(begin=begin_time, end=end_time))

    # Keep every file whose hour [start, start + 1h] touches the window;
    # with begin_time <= end_time this is the same set the original
    # three-clause test selected.
    selected = [start for start in files
                if start <= end_time and start + one_hour >= begin_time]

    # convert back to ZLS-type file names and read each file once
    frames = [
        _read_ZLS_format_file(
            os.path.join(dirpath, start.strftime('%Y_%H.%j')))
        for start in selected
    ]

    if not frames:
        return pd.DataFrame()

    # A single concat avoids re-copying the accumulated data per file.
    df = pd.concat(frames)

    in_window = (df.index >= begin_time) & (df.index <= end_time)
    return df.loc[in_window]