Skip to content

Commit e3a2ba5

Browse files
authored
Keep track of what retry method actually works (#216)
* For debug analytics, keep track of what retry method actually works * Bump testing dependencies * Add a global fixture to clean up asyncio tasks between tests * Log the name of the retry methods attempted, not the numerical value
1 parent c4db8d1 commit e3a2ba5

File tree

4 files changed

+117
-45
lines changed

4 files changed

+117
-45
lines changed

pyproject.toml

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,11 @@ exclude = ["tests", "tests.*"]
2626

2727
[project.optional-dependencies]
2828
testing = [
29-
"pytest>=7.1.2",
30-
"pytest-asyncio>=0.19.0",
29+
"pytest>=7.3.1",
30+
"pytest-asyncio>=0.21.0",
3131
"pytest-timeout>=2.1.0",
32-
"pytest-mock>=3.8.2",
33-
"pytest-cov>=3.0.0",
34-
"coveralls",
32+
"pytest-mock>=3.10.0",
33+
"pytest-cov>=4.1.0",
3534
]
3635

3736
[tool.setuptools-git-versioning]

tests/application/test_requests.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,9 @@ def set_route_discovered(req):
534534
await was_route_discovered
535535
await zdo_req
536536

537+
# 6 accounts for the loopback requests
538+
assert sum(c.value for c in app.state.counters["Retry_NONE"].values()) == 6 + 1
539+
537540
await app.shutdown()
538541

539542

@@ -602,6 +605,9 @@ def set_route_discovered(req):
602605
)
603606

604607
await was_route_discovered
608+
assert (
609+
sum(c.value for c in app.state.counters["Retry_RouteDiscovery"].values()) == 1
610+
)
605611

606612
await app.shutdown()
607613

@@ -666,6 +672,7 @@ def data_confirm_replier(req):
666672
)
667673

668674
assert was_ieee_addr_used
675+
assert sum(c.value for c in app.state.counters["Retry_IEEEAddress"].values()) == 1
669676

670677
await app.shutdown()
671678

tests/conftest.py

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1+
import gc
2+
import sys
13
import json
4+
import typing
25
import asyncio
36
import inspect
47
import logging
@@ -7,6 +10,7 @@
710

811
import pytest
912
import zigpy.types
13+
import zigpy.config
1014
import zigpy.device
1115

1216
try:
@@ -42,6 +46,46 @@ def pytest_collection_modifyitems(session, config, items):
4246
item.add_marker(pytest.mark.filterwarnings("error::RuntimeWarning"))
4347

4448

49+
@pytest.hookimpl(trylast=True)
50+
def pytest_fixture_post_finalizer(fixturedef, request) -> None:
51+
"""Called after fixture teardown"""
52+
if fixturedef.argname != "event_loop":
53+
return
54+
55+
policy = asyncio.get_event_loop_policy()
56+
try:
57+
loop = policy.get_event_loop()
58+
except RuntimeError:
59+
loop = None
60+
if loop is not None:
61+
# Cleanup code based on the implementation of asyncio.run()
62+
try:
63+
if not loop.is_closed():
64+
asyncio.runners._cancel_all_tasks(loop) # type: ignore[attr-defined]
65+
loop.run_until_complete(loop.shutdown_asyncgens())
66+
if sys.version_info >= (3, 9):
67+
loop.run_until_complete(loop.shutdown_default_executor())
68+
finally:
69+
loop.close()
70+
new_loop = policy.new_event_loop() # Replace existing event loop
71+
# Ensure subsequent calls to get_event_loop() succeed
72+
policy.set_event_loop(new_loop)
73+
74+
75+
@pytest.fixture
76+
def event_loop(
77+
request: pytest.FixtureRequest,
78+
) -> typing.Iterator[asyncio.AbstractEventLoop]:
79+
"""Create an instance of the default event loop for each test case."""
80+
yield asyncio.get_event_loop_policy().new_event_loop()
81+
# Call the garbage collector to trigger ResourceWarning's as soon
82+
# as possible (these are triggered in various __del__ methods).
83+
# Without this, resources opened in one test can fail other tests
84+
# when the warning is generated.
85+
gc.collect()
86+
# Event loop cleanup handled by pytest_fixture_post_finalizer
87+
88+
4589
class ForwardingSerialTransport:
4690
"""
4791
Serial transport that hooks directly into a protocol
@@ -86,7 +130,12 @@ def __repr__(self):
86130

87131

88132
def config_for_port_path(path):
89-
return conf.CONFIG_SCHEMA({conf.CONF_DEVICE: {conf.CONF_DEVICE_PATH: path}})
133+
return conf.CONFIG_SCHEMA(
134+
{
135+
conf.CONF_DEVICE: {conf.CONF_DEVICE_PATH: path},
136+
zigpy.config.CONF_NWK_BACKUP_ENABLED: False,
137+
}
138+
)
90139

91140

92141
@pytest.fixture

zigpy_znp/zigbee/application.py

Lines changed: 56 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,14 @@
6868
LOGGER = logging.getLogger(__name__)
6969

7070

71+
class RetryMethod(t.enum_flag_uint8):
72+
NONE = 0
73+
AssocRemove = 2 << 0
74+
RouteDiscovery = 2 << 1
75+
LastGoodRoute = 2 << 2
76+
IEEEAddress = 2 << 3
77+
78+
7179
class ControllerApplication(zigpy.application.ControllerApplication):
7280
SCHEMA = conf.CONFIG_SCHEMA
7381
SCHEMA_DEVICE = conf.SCHEMA_DEVICE
@@ -809,19 +817,19 @@ def _find_endpoint(self, dst_ep: int, profile: int, cluster: int) -> int:
809817

810818
async def _send_request_raw(
811819
self,
812-
dst_addr,
813-
dst_ep,
814-
src_ep,
815-
profile,
816-
cluster,
817-
sequence,
818-
options,
819-
radius,
820-
data,
820+
dst_addr: t.AddrModeAddress,
821+
dst_ep: int,
822+
src_ep: int,
823+
profile: int,
824+
cluster: int,
825+
sequence: int,
826+
options: c.af.TransmitOptions,
827+
radius: int,
828+
data: bytes,
821829
*,
822-
relays=None,
823-
extended_timeout=False,
824-
):
830+
relays: list[int] | None = None,
831+
extended_timeout: bool = False,
832+
) -> None:
825833
"""
826834
Used by `request`/`mrequest`/`broadcast` to send a request.
827835
Picks the correct request sending mechanism and fixes endpoint information.
@@ -922,9 +930,7 @@ async def _send_request_raw(
922930

923931
if dst_ep == ZDO_ENDPOINT or dst_addr.mode == t.AddrMode.Broadcast:
924932
# Broadcasts and ZDO requests will not receive a confirmation
925-
response = await self._znp.request(
926-
request=request, RspStatus=t.Status.SUCCESS
927-
)
933+
await self._znp.request(request=request, RspStatus=t.Status.SUCCESS)
928934
else:
929935
async with async_timeout.timeout(
930936
EXTENDED_DATA_CONFIRM_TIMEOUT
@@ -956,8 +962,6 @@ async def _send_request_raw(
956962
response,
957963
)
958964

959-
return response
960-
961965
@combine_concurrent_calls
962966
async def _discover_route(self, nwk: t.NWK) -> None:
963967
"""
@@ -1006,18 +1010,15 @@ async def send_packet(self, packet: zigpy.types.ZigbeePacket) -> None:
10061010

10071011
dst_addr = t.AddrModeAddress.from_zigpy_type(packet.dst)
10081012

1009-
status = None
1010-
response = None
1013+
succeeded = False
10111014
association = None
10121015
force_relays = None
10131016

10141017
if packet.source_route is not None:
10151018
force_relays = packet.source_route
10161019

1017-
tried_assoc_remove = False
1018-
tried_route_discovery = False
1019-
tried_last_good_route = False
1020-
tried_ieee_address = False
1020+
retry_methods = RetryMethod.NONE
1021+
last_retry_method = RetryMethod.NONE
10211022

10221023
# Don't release the concurrency-limiting semaphore until we are done trying.
10231024
# There is no point in allowing requests to take turns getting buffer errors.
@@ -1047,7 +1048,7 @@ async def send_packet(self, packet: zigpy.types.ZigbeePacket) -> None:
10471048
if route_status.Status != c.zdo.RoutingStatus.SUCCESS:
10481049
await self._discover_route(dst_addr.address)
10491050

1050-
response = await self._send_request_raw(
1051+
await self._send_request_raw(
10511052
dst_addr=dst_addr,
10521053
dst_ep=packet.dst_ep,
10531054
src_ep=packet.src_ep,
@@ -1060,7 +1061,7 @@ async def send_packet(self, packet: zigpy.types.ZigbeePacket) -> None:
10601061
relays=force_relays,
10611062
extended_timeout=packet.extended_timeout,
10621063
)
1063-
status = response.Status
1064+
succeeded = True
10641065
break
10651066
except InvalidCommandResponse as e:
10661067
status = e.response.Status
@@ -1078,23 +1079,27 @@ async def send_packet(self, packet: zigpy.types.ZigbeePacket) -> None:
10781079
or dst_addr.mode not in (t.AddrMode.NWK, t.AddrMode.IEEE)
10791080
):
10801081
LOGGER.debug(
1081-
"Request failed (%s), retry attempt %s of %s",
1082+
"Request failed (%s), retry attempt %s of %s (%s)",
10821083
e,
10831084
attempt + 1,
10841085
REQUEST_MAX_RETRIES,
1086+
retry_methods.name,
10851087
)
10861088
await asyncio.sleep(3 * REQUEST_ERROR_RETRY_DELAY)
10871089
continue
10881090

10891091
# If we can't contact the device by forcing a specific route,
1090-
# there is not point in trying this more than once.
1091-
if tried_last_good_route and force_relays is not None:
1092+
# there is no point in trying this more than once.
1093+
if (
1094+
retry_methods & RetryMethod.LastGoodRoute
1095+
and force_relays is not None
1096+
):
10921097
force_relays = None
10931098

10941099
# If we fail to contact the device with its IEEE address, don't
10951100
# try again.
10961101
if (
1097-
tried_ieee_address
1102+
retry_methods & RetryMethod.IEEEAddress
10981103
and dst_addr.mode == t.AddrMode.IEEE
10991104
and device is not None
11001105
):
@@ -1111,7 +1116,7 @@ async def send_packet(self, packet: zigpy.types.ZigbeePacket) -> None:
11111116
status == t.Status.MAC_TRANSACTION_EXPIRED
11121117
and device is not None
11131118
and association is None
1114-
and not tried_assoc_remove
1119+
and not retry_methods & RetryMethod.AssocRemove
11151120
and self._znp.version >= 3.30
11161121
):
11171122
association = await self._znp.request(
@@ -1129,7 +1134,8 @@ async def send_packet(self, packet: zigpy.types.ZigbeePacket) -> None:
11291134
await self._znp.request(
11301135
c.UTIL.AssocRemove.Req(IEEE=device.ieee)
11311136
)
1132-
tried_assoc_remove = True
1137+
retry_methods |= RetryMethod.AssocRemove
1138+
last_retry_method = RetryMethod.AssocRemove
11331139

11341140
# Route discovery must be performed right after
11351141
await self._discover_route(device.nwk)
@@ -1138,39 +1144,46 @@ async def send_packet(self, packet: zigpy.types.ZigbeePacket) -> None:
11381144
"The UTIL.AssocRemove command is available only"
11391145
" in Z-Stack 3 releases built after 20201017"
11401146
)
1141-
elif not tried_last_good_route and device is not None:
1147+
elif (
1148+
not retry_methods & RetryMethod.LastGoodRoute
1149+
and device is not None
1150+
):
11421151
# `ZDO.SrcRtgInd` callbacks tell us the last path taken by
11431152
# messages from the device back to the coordinator. Sending
11441153
# packets backwards via this same route may work.
11451154
force_relays = (device.relays or [])[::-1]
1146-
tried_last_good_route = True
1155+
retry_methods |= RetryMethod.LastGoodRoute
1156+
last_retry_method = RetryMethod.LastGoodRoute
11471157
elif (
1148-
not tried_route_discovery
1158+
not retry_methods & RetryMethod.RouteDiscovery
11491159
and dst_addr.mode == t.AddrMode.NWK
11501160
):
11511161
# If that doesn't work, try re-discovering the route.
11521162
# While we can in theory poll and wait until it is fixed,
11531163
# letting the retry mechanism deal with it simpler.
11541164
await self._discover_route(dst_addr.address)
1155-
tried_route_discovery = True
1165+
retry_methods |= RetryMethod.RouteDiscovery
1166+
last_retry_method = RetryMethod.RouteDiscovery
11561167
elif (
1157-
not tried_ieee_address
1168+
not retry_methods & RetryMethod.IEEEAddress
11581169
and device is not None
11591170
and dst_addr.mode == t.AddrMode.NWK
11601171
):
11611172
# Try using the device's IEEE address instead of its NWK.
11621173
# If it works, the NWK will be updated when relays arrive.
1163-
tried_ieee_address = True
1174+
retry_methods |= RetryMethod.IEEEAddress
1175+
last_retry_method = RetryMethod.IEEEAddress
11641176
dst_addr = t.AddrModeAddress(
11651177
mode=t.AddrMode.IEEE,
11661178
address=device.ieee,
11671179
)
11681180

11691181
LOGGER.debug(
1170-
"Request failed (%s), retry attempt %s of %s",
1182+
"Request failed (%s), retry attempt %s of %s (%s)",
11711183
e,
11721184
attempt + 1,
11731185
REQUEST_MAX_RETRIES,
1186+
retry_methods.name,
11741187
)
11751188

11761189
# We've tried everything already so at this point just wait
@@ -1181,11 +1194,15 @@ async def send_packet(self, packet: zigpy.types.ZigbeePacket) -> None:
11811194
f" {status!r}",
11821195
status=status,
11831196
)
1197+
1198+
self.state.counters[f"Retry_{last_retry_method.name}"][
1199+
attempt
1200+
].increment()
11841201
finally:
11851202
# We *must* re-add the device association if we previously removed it but
11861203
# the request still failed. Otherwise, it may be a direct child and we will
11871204
# not be able to find it again.
1188-
if tried_assoc_remove and response is None:
1205+
if not succeeded and retry_methods & RetryMethod.AssocRemove:
11891206
await self._znp.request(
11901207
c.UTIL.AssocAdd.Req(
11911208
NWK=association.Device.shortAddr,

0 commit comments

Comments
 (0)