diff --git a/.gitignore b/.gitignore index 4340cdf..32474de 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,6 @@ data example notebooks *.pyc -*.sh \ No newline at end of file +*.sh +Segments_Debugging.ipynb +segments.zip diff --git a/gtfs_functions/aux_functions.py b/gtfs_functions/aux_functions.py index 6db210c..6c2cfab 100644 --- a/gtfs_functions/aux_functions.py +++ b/gtfs_functions/aux_functions.py @@ -4,6 +4,9 @@ import geopandas as gpd import logging import numpy as np +from shapely import Point +from shapely import LineString +from shapely.ops import substring def add_runtime(st): @@ -72,6 +75,144 @@ def add_speed(speeds): return speeds +def detect_loops(df): + """ + Runs on a shape_stop df, grouped by patterns and warns on the number of + potential loops detected + + Returns false if there are loops detected, True if not + """ + df['cdsp_prev_stop'] = df['cut_distance_stop_point'].shift(1).fillna(0) + df['cdsp_next_stop'] = df['cut_distance_stop_point'].shift(-1).fillna(1) + df['prev_wrong'] = (df['cdsp_prev_stop'] > df['cut_distance_stop_point']) + df['next_wrong'] = (df['cdsp_next_stop'] < df['cut_distance_stop_point']) + #df['cdsp_next_ordered'] + if df['next_wrong'].any() or df['prev_wrong'].any(): + num_loops = (df['prev_wrong'].sum() + df['next_wrong'].sum())//2 + logging.warning(f"Identified {num_loops} loops in Pattern " + f"{df['shape_id'].iloc[0]} of Route " + f"{df['route_name'].iloc[0]}") + return False + else: + return True + +def loops_fixer(df:pd.DataFrame): + """ + Fixes loops using a relaxaation algorithm + + Parameter: df, a gtfs combined stop & shape dataframe + Returns a corrected dataframe + + Lilah Rosenfield | Utah Transit Authority + lilah.rosenfield@rideuta.com + """ + df = df.copy() + df['sub_string'] = np.nan + df.loc[df.index[0], 'cut_distance_stop_point'] = 0 # First stop + df.loc[df.index[-1], 'cut_distance_stop_point'] = 1 # Last stop + df = _diff_and_reproject(df) + df['cdsp_next_stop'] = df['cut_distance_stop_point'].shift(-1).fillna(1) + df['end_stop_name'] = df['stop_name'].shift(-1) + df['end_stop_id'] = df['stop_id'].shift(-1) + df = df.iloc[:-1] + df['geometry'] = df.apply(_final_substring, axis=1) + return df + +def _diff_and_reproject(df:pd.DataFrame, + max_iterations:int = 50): + """ + Iterates outwards from identified discontinuities in the linestring from + the sequence, looping until it finds a fix. + + Returns a corrected df. + + My initial implementation of this was recursive, which I liked, but + this version uses a while loop, which is more memory efficient. + + Lilah Rosenfield | Utah Transit Authority + (with the support of a large pile of linear algebra) + lilah.rosenfield@rideuta.com + """ + df = df.sort_index(ascending=True) + iteration = 0 + offset = 1 + while iteration < max_iterations: + iteration += 1 + df['cdsp_prev_stop'] = df['cut_distance_stop_point'].shift(offset).fillna(0) + df['cdsp_next_stop'] = df['cut_distance_stop_point'].shift(-offset).fillna(1) + df['prev_wrong'] = (df['cdsp_prev_stop'] > df['cut_distance_stop_point']) + df['next_wrong'] = (df['cdsp_next_stop'] < df['cut_distance_stop_point']) + if not (df['next_wrong'].any() or df['prev_wrong'].any()): + break + df['prev_cdsp'] = df['cut_distance_stop_point'].copy() + + # Fix Pattern + df['sub_string'] = df[df['next_wrong'] | df['prev_wrong']].apply( + _substring_general, axis=1 + ) + df['substproj'] = df['sub_string'].project( + df['geometry_stop'], normalized = True) + + # Convert back to full line coordinates + df['substproj'] = ( + df['substproj']* + (df['cdsp_next_stop']-df['cdsp_prev_stop'])+ + df['cdsp_prev_stop']) + valid_update = ( + (df['substproj'] < df['cdsp_next_stop']) & + (df['substproj'] > df['cdsp_prev_stop']) + ) + df['cut_distance_stop_point'][valid_update] = df['substproj'] + tester = df['prev_cdsp'] != df['cut_distance_stop_point'] + if tester.any(): + logging.info("Made corrections, re-setting offset to 1") + offset=1 + else: + logging.info("No corrections make, increasing offset") + offset = offset+1 + if offset > len(df) // 2: + raise pd.errors.InvalidIndexError("Offset got too large!") + if iteration >= max_iterations: + logging.warning("Maximum iteration depth reached for shape id " + f"{df['shape_id'].iloc[0]}. You are likely to" + "encounter further errors") + return df + + +def _final_substring(ser:pd.Series): + subst = substring(ser['geometry_shape'], + ser['cut_distance_stop_point'], + ser['cdsp_next_stop'], + normalized = True) + if type(subst) == Point: + logging.warning(f"Segment between {ser['stop_name']} and " + f"{ser['end_stop_name']} on {ser['route_name']}" + "is a point! This is likely due to a out-of-spec " + "GTFS implementation! We're returning the segment with " + "an empty linestring instead.") + return LineString() + return subst + +def _substring_general(ser:pd.Series): + """ + Applied substringification function for _diff_and_reproject + """ + logging.info(f'Trying {ser['stop_name']} of line {ser['route_name']}') + subst = substring(ser['geometry_shape'], + ser['cdsp_prev_stop'], + ser['cdsp_next_stop'], + normalized=True) + if type(subst) == Point: + logging.warning("Got a point from the substring function!") + logging.warning("This is probably a single-stop deviation, " + "in which case you are safe to ignore.") + logging.warning("However, if the loop fix fails, " \ + "your feed may have a corner case") + return ser['geometry_shape'] + else: + return subst + + def fix_outliers(speeds): # Calculate average speed to modify outliers diff --git a/gtfs_functions/gtfs_functions.py b/gtfs_functions/gtfs_functions.py index 122c4a2..42f1aaf 100644 --- a/gtfs_functions/gtfs_functions.py +++ b/gtfs_functions/gtfs_functions.py @@ -9,12 +9,15 @@ import pendulum as pl import hashlib from shapely.geometry import LineString, MultiPoint +from shapely.ops import substring from gtfs_functions.aux_functions import ( add_all_lines, add_runtime, add_distance, add_speed, + detect_loops, code, + loops_fixer, fix_outliers, num_to_letters, add_route_name, @@ -52,6 +55,7 @@ def __init__( patterns: bool = True, start_date: str = None, end_date: str = None, + fix_loops: bool = False ): self._gtfs_path = gtfs_path @@ -61,6 +65,7 @@ def __init__( self._patterns = patterns self._start_date = start_date self._end_date = end_date + self._fix_loops = fix_loops self._dates = None self._routes_patterns = None self._trips_patterns = None @@ -280,6 +285,21 @@ def dates_service_id(self): if self._dates_service_id is None: self._dates_service_id = self.get_dates_service_id() return self._dates_service_id + + @property + def loop_log(self): + """ + A dataframe logging changes that have been made if fix_loops is enabled + + Warns and returns false if disabled + """ + if self._fix_loops: + self.get_segments() + return self._loop_log.copy() + else: + logging.warning("fix_loops is set to false, " + "so loop_log does not exist!") + return False @trips.setter def trips(self, value): @@ -866,17 +886,26 @@ def get_lines_freq(self): line_frequencies = line_frequencies.loc[~line_frequencies.geometry.isnull(), keep_these] return line_frequencies + - def get_segments(self): - """Splits each route's shape into stop-stop LineString called segments + def _segments_helper(self): + """ + Splits each route's shape into stop-stop LineString called segments - Returns the segment geometry as well as additional segment information + This part just gets the segment geometry, and returns. Addtional + segment information is added in the main get_segments function. + + Lilah Rosenfield | Utah Transit Authority + lilah.rosenfield@rideuta.com """ logging.info("Getting segments...") stop_times = self.stop_times shapes = self.shapes - req_columns = ["shape_id", "stop_sequence", "stop_id", "geometry"] + # added route_pattern to handle the corner case two patterns can + # Have the same linestring but different stops + req_columns = ["shape_id", "stop_sequence", "stop_id", "geometry", + "route_pattern"] add_columns = ["route_id", "route_name", "direction_id", "stop_name"] # merge stop_times and shapes to calculate cut distance and interpolated point @@ -886,15 +915,29 @@ def get_segments(self): .merge(shapes, on="shape_id", suffixes=("_stop", "_shape")) ) logging.info("Projecting stops onto shape...") - df_shape_stop["cut_distance_stop_point"] = df_shape_stop[["geometry_stop", "geometry_shape"]].apply( - lambda x: x[1].project(x[0], normalized=True), axis=1 - ) + df_shape_stop["cut_distance_stop_point"] = \ + df_shape_stop['geometry_shape'].project( + df_shape_stop['geometry_stop'], normalized=True + ) + df_shape_stop_group = df_shape_stop.groupby(["shape_id","route_pattern"]) + test = df_shape_stop_group.apply(detect_loops) + if not self._fix_loops and test.any(): + logging.warning("Detected looped routes! It is recommended you " + "re-initialize the feed with fix_loops enabled") + segment_df = self._segments_helper_noloopfix(df_shape_stop) + if test.any() and self._fix_loops: + segment_df = self._segments_helper_with_loopfix(df_shape_stop_group, df_shape_stop) + + return segment_df + + def _segments_helper_noloopfix(self, df_shape_stop:pd.DataFrame): + stop_times = self.stop_times + shapes = self.shapes logging.info("Interpolating stops onto shape...") - df_shape_stop["projected_stop_point"] = df_shape_stop[["geometry_shape", "cut_distance_stop_point"]].apply( - lambda x: x[0].interpolate(x[1], normalized=True), axis=1 - ) - # calculate cut distance for + df_shape_stop["projected_stop_point"] = df_shape_stop[["geometry_shape", "cut_distance_stop_point"]].apply( + lambda x: x[0].interpolate(x[1], normalized=True), axis=1) + logging.info("Sorting shape points and stops...") df_shape = shapes[shapes.shape_id.isin(stop_times.shape_id.unique())] df_shape["list_of_points"] = df_shape.geometry.apply(lambda x: list(MultiPoint(x.coords).geoms)) @@ -957,7 +1000,47 @@ def get_segments(self): ], axis=1, inplace=True, - ) + ) + return segment_df + + def _segments_helper_with_loopfix(self, df_shape_stop_group, df_shape_stop): + old_cdsp = df_shape_stop["cut_distance_stop_point"] + fixed_loops = df_shape_stop_group.apply(loops_fixer) + df_shape_stop = fixed_loops.droplevel([0,1]) + + # Double checking may add a bit of inefficiency + # but we'll do it anyway + df_shape_stop_group = df_shape_stop.groupby(["shape_id","route_pattern"]) + logging.info("Finished fixing loops!") + test = df_shape_stop_group.apply(detect_loops) + if test.any(): + logging.warning("Warning we were not able to fix all loops!") + new_cdsp = df_shape_stop["cut_distance_stop_point"] + #changes_made = new_cdsp != old_cdsp + self._loop_log = pd.concat([ + df_shape_stop[['stop_id', 'stop_sequence']], + old_cdsp.rename('old_cut_distance'), + new_cdsp.rename('new_cut_distance'), + #changes_made.rename('changed') + ], axis=1) + df_shape_stop = df_shape_stop.drop( + ['cdsp_prev_stop','cdsp_next_stop','prev_wrong', + 'next_wrong','prev_cdsp','substproj', 'sub_string'], axis=1) + return df_shape_stop + + + def get_segments(self): + """Splits each route's shape into stop-stop LineString called segments + + Returns the segment geometry as well as additional segment information + + If fix_loops is set to true on init, will attempt experimental + loop-resolution loop. + """ + #stop_times = self.stop_times + #shapes = self.shapes + segment_gdf = self._segments_helper() + segment_gdf.crs = "EPSG:4326" # Add segment length in meters @@ -972,6 +1055,7 @@ def get_segments(self): "shape_id", "route_id", "route_name", + "route_pattern", "direction_id", "stop_sequence", "segment_name",