diff --git a/dev/environment b/dev/environment index 0aab128a9568..6d9cdc64f9e0 100644 --- a/dev/environment +++ b/dev/environment @@ -43,3 +43,5 @@ WAREHOUSE_LEGACY_DOMAIN=pypi.python.org VAULT_URL="http://vault:8200" VAULT_TOKEN="an insecure vault access token" + +GITHUB_TOKEN_SCANNING_META_API_URL="http://notgithub:8000/meta/public_keys/token_scanning" diff --git a/docker-compose.yml b/docker-compose.yml index e066d2c6220a..37f07c92eb95 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -142,3 +142,10 @@ services: - "8125:8125/udp" volumes: - ./dev/notdatadog.py:/opt/warehouse/dev/notdatadog.py + + notgithub: + image: ewjoachim/notgithub-token-scanning + environment: + NOTGITHUB_DEFAULT_URL: "http://web:8000/_/github/disclose-token" + ports: + - "8964:8000" diff --git a/docs/development/index.rst b/docs/development/index.rst index e2c58151f9a8..a5f1896a57ec 100644 --- a/docs/development/index.rst +++ b/docs/development/index.rst @@ -32,6 +32,7 @@ or the `distutils-sig mailing list`_, to ask questions or get involved. development-database cloud malware-checks + token-scanning .. _`GitHub`: https://github.com/pypa/warehouse .. _`"What to put in your bug report"`: http://www.contribution-guide.org/#what-to-put-in-your-bug-report diff --git a/docs/development/token-scanning.rst b/docs/development/token-scanning.rst new file mode 100644 index 000000000000..3fbe8ac23053 --- /dev/null +++ b/docs/development/token-scanning.rst @@ -0,0 +1,100 @@ +Token Scanning +============== + +People make mistakes. Sometimes, they post their PyPI tokens publicly. Some +content managers run regexes to try and identify published secrets, and ideally +have them deactivated. PyPI has started integrating with such systems in order +to help secure packages. + +.. 
contents:: + :local: + +How to recognize a PyPI secret +------------------------------ + +A PyPI API token is a string consisting of a prefix (``pypi``), a separator +(``-``) and a macaroon serialized with PyMacaroonv2, which means it's the +``base64`` of:: + + \x02\x01\x08pypi.org\x02\x01b + +Thanks to this, we know that a PyPI token is bound to start with:: + + pypi-AgEIcHlwaS5vcmc[A-Za-z0-9-_]{70,} + +A token can be arbitrarily long because we may add arbitrarily many caveats. For +more details on the token format, see `pypitoken +`_. + +GitHub Secret Scanning +---------------------- + +GitHub's secret scanning feature used to be called "Token Scanning" and is now +"Secret Scanning". You may find both names in use. GitHub scans public commits with +the regex above (in practice, they limit matches to at least 130 characters long). For all +tokens identified within a "push" event, they send us reports in bulk. The +format is explained thoroughly in `their doc +`_ as well as +in the `warehouse implementation ticket +`_. + +In short: they send us a cryptographically signed payload describing each +leaked token, along with a public URL pointing to it. + +How to test it manually +^^^^^^^^^^^^^^^^^^^^^^^ + +A fake GitHub service is launched by Docker Compose. Point your browser to +``http://localhost:8964``. Create/reorder/... one or more public keys, make +sure one key is marked as current, then write your payload, using the following +format: + +.. code-block:: json + + [{ + "type": "pypi_api_token", + "token": "pypi-...", + "url": "https://example.com" + }] + +Send your payload. It is sent to your local Warehouse. If a match is found, you +should find that: + +- the token you sent has disappeared from the user account page, +- 2 new security events have been sent: one for the token deletion, one for the + notification email. + +After you send the token, the page will reload, and you'll find the details of +the request at the bottom.
If all went well, you should see a ``204`` ('No +Content'). + +Whether it worked or not, a bunch of metrics have been issued; you can see them +in the ``notdatadog`` container log. + +GitLab Secret Detection +----------------------- + +GitLab also has an equivalent mechanism, named "Secret Detection", not +implemented in Warehouse yet (see `#9280 +`_). + +PyPI token disclosure infrastructure +------------------------------------ + +The code is mainly in ``warehouse/integrations/github``. +There are 3 main parts in handling a token disclosure report: + +- The Web view, which is the top-level glue but does not implement the logic +- Vendor-specific authenticity check & loading. In the case of GitHub, we check + that the payload and the associated signature match with the public keys + available in their meta-API +- (Supposedly-)Vendor-independent disclosure analysis: + + - Each token is processed individually in its own celery task + - The token is analyzed: we check if its format is correct and if it + corresponds to a macaroon we have in the DB + - We don't check the signature. This is something that could change in the + future but for now, we consider that if a token identifier leaked, even + without a valid signature, it's enough to warrant deleting it.
+ - If it's valid, we delete it, log a security event and send an email + (which will spawn a second celery task) diff --git a/tests/unit/integration/github/test_utils.py b/tests/unit/integration/github/test_utils.py index 697c7089934b..62425a3674bd 100644 --- a/tests/unit/integration/github/test_utils.py +++ b/tests/unit/integration/github/test_utils.py @@ -114,15 +114,21 @@ def test_init(self): metrics = pretend.stub() session = pretend.stub() token = "api_token" + url = "http://foo" cache = utils.PublicKeysCache(cache_time=12) verifier = utils.GitHubTokenScanningPayloadVerifier( - session=session, metrics=metrics, api_token=token, public_keys_cache=cache + api_url=url, + session=session, + metrics=metrics, + api_token=token, + public_keys_cache=cache, ) assert verifier._session is session assert verifier._metrics is metrics assert verifier._api_token == token + assert verifier._api_url == url assert verifier._public_keys_cache is cache def test_verify_cache_miss(self): @@ -148,6 +154,7 @@ def test_verify_cache_miss(self): metrics = pretend.stub(increment=pretend.call_recorder(lambda str: None)) cache = utils.PublicKeysCache(cache_time=12) verifier = utils.GitHubTokenScanningPayloadVerifier( + api_url="http://foo", session=session, metrics=metrics, api_token="api-token", @@ -189,6 +196,7 @@ def test_verify_cache_hit(self): } ] verifier = utils.GitHubTokenScanningPayloadVerifier( + api_url="http://foo", session=session, metrics=metrics, api_token="api-token", @@ -219,6 +227,7 @@ def test_verify_error(self): metrics = pretend.stub(increment=pretend.call_recorder(lambda str: None)) cache = utils.PublicKeysCache(cache_time=12) verifier = utils.GitHubTokenScanningPayloadVerifier( + api_url="http://foo", session=pretend.stub(), metrics=metrics, api_token="api-token", @@ -237,6 +246,7 @@ def test_verify_error(self): def test_headers_auth_no_token(self): headers = utils.GitHubTokenScanningPayloadVerifier( + api_url="http://foo", session=pretend.stub(), 
metrics=pretend.stub(), api_token=None, @@ -246,6 +256,7 @@ def test_headers_auth_no_token(self): def test_headers_auth_token(self): headers = utils.GitHubTokenScanningPayloadVerifier( + api_url="http://foo", session=pretend.stub(), metrics=pretend.stub(), api_token="api-token", @@ -274,6 +285,7 @@ def test_retrieve_public_key_payload(self): metrics = pretend.stub(increment=pretend.call_recorder(lambda str: None)) verifier = utils.GitHubTokenScanningPayloadVerifier( + api_url="http://foo", session=session, metrics=metrics, api_token="api-token", @@ -282,7 +294,7 @@ def test_retrieve_public_key_payload(self): assert verifier._retrieve_public_key_payload() == meta_payload assert session.get.calls == [ pretend.call( - "https://api.github.com/meta/public_keys/token_scanning", + "http://foo", headers={"Authorization": "token api-token"}, ) ] @@ -295,7 +307,10 @@ def test_get_cached_public_key_cache_hit(self): cache.set(now=time.time(), value=cache_value) verifier = utils.GitHubTokenScanningPayloadVerifier( - session=session, metrics=metrics, public_keys_cache=cache + api_url="http://foo", + session=session, + metrics=metrics, + public_keys_cache=cache, ) assert verifier._get_cached_public_keys() is cache_value @@ -306,7 +321,10 @@ def test_get_cached_public_key_cache_miss_no_cache(self): cache = utils.PublicKeysCache(cache_time=12) verifier = utils.GitHubTokenScanningPayloadVerifier( - session=session, metrics=metrics, public_keys_cache=cache + api_url="http://foo", + session=session, + metrics=metrics, + public_keys_cache=cache, ) with pytest.raises(utils.CacheMiss): @@ -322,7 +340,10 @@ def test_retrieve_public_key_payload_http_error(self): get=lambda *a, **k: response, ) verifier = utils.GitHubTokenScanningPayloadVerifier( - session=session, metrics=pretend.stub(), public_keys_cache=pretend.stub() + api_url="http://foo", + session=session, + metrics=pretend.stub(), + public_keys_cache=pretend.stub(), ) with pytest.raises(utils.GitHubPublicKeyMetaAPIError) as exc: 
verifier._retrieve_public_key_payload() @@ -338,7 +359,10 @@ def test_retrieve_public_key_payload_json_error(self): ) session = pretend.stub(get=lambda *a, **k: response) verifier = utils.GitHubTokenScanningPayloadVerifier( - session=session, metrics=pretend.stub(), public_keys_cache=pretend.stub() + api_url="http://foo", + session=session, + metrics=pretend.stub(), + public_keys_cache=pretend.stub(), ) with pytest.raises(utils.GitHubPublicKeyMetaAPIError) as exc: verifier._retrieve_public_key_payload() @@ -350,7 +374,10 @@ def test_retrieve_public_key_payload_connection_error(self): session = pretend.stub(get=pretend.raiser(requests.ConnectionError)) verifier = utils.GitHubTokenScanningPayloadVerifier( - session=session, metrics=pretend.stub(), public_keys_cache=pretend.stub() + api_url="http://foo", + session=session, + metrics=pretend.stub(), + public_keys_cache=pretend.stub(), ) with pytest.raises(utils.GitHubPublicKeyMetaAPIError) as exc: @@ -375,7 +402,10 @@ def test_extract_public_keys(self): } cache = utils.PublicKeysCache(cache_time=12) verifier = utils.GitHubTokenScanningPayloadVerifier( - session=pretend.stub(), metrics=pretend.stub(), public_keys_cache=cache + api_url="http://foo", + session=pretend.stub(), + metrics=pretend.stub(), + public_keys_cache=cache, ) keys = verifier._extract_public_keys(pubkey_api_data=meta_payload) @@ -415,7 +445,10 @@ def test_extract_public_keys(self): def test_extract_public_keys_error(self, payload, expected): cache = utils.PublicKeysCache(cache_time=12) verifier = utils.GitHubTokenScanningPayloadVerifier( - session=pretend.stub(), metrics=pretend.stub(), public_keys_cache=cache + api_url="http://foo", + session=pretend.stub(), + metrics=pretend.stub(), + public_keys_cache=cache, ) with pytest.raises(utils.GitHubPublicKeyMetaAPIError) as exc: @@ -427,6 +460,7 @@ def test_extract_public_keys_error(self, payload, expected): def test_check_public_key(self): verifier = utils.GitHubTokenScanningPayloadVerifier( + 
api_url="http://foo", session=pretend.stub(), metrics=pretend.stub(), public_keys_cache=pretend.stub(), @@ -440,6 +474,7 @@ def test_check_public_key(self): def test_check_public_key_error(self): verifier = utils.GitHubTokenScanningPayloadVerifier( + api_url="http://foo", session=pretend.stub(), metrics=pretend.stub(), public_keys_cache=pretend.stub(), @@ -453,6 +488,7 @@ def test_check_public_key_error(self): def test_check_signature(self): verifier = utils.GitHubTokenScanningPayloadVerifier( + api_url="http://foo", session=pretend.stub(), metrics=pretend.stub(), public_keys_cache=pretend.stub(), @@ -482,6 +518,7 @@ def test_check_signature(self): def test_check_signature_invalid_signature(self): verifier = utils.GitHubTokenScanningPayloadVerifier( + api_url="http://foo", session=pretend.stub(), metrics=pretend.stub(), public_keys_cache=pretend.stub(), @@ -513,6 +550,7 @@ def test_check_signature_invalid_signature(self): def test_check_signature_invalid_crypto(self): verifier = utils.GitHubTokenScanningPayloadVerifier( + api_url="http://foo", session=pretend.stub(), metrics=pretend.stub(), public_keys_cache=pretend.stub(), diff --git a/tests/unit/integration/github/test_views.py b/tests/unit/integration/github/test_views.py index 03f7b8ba2b1b..ea1c97714a73 100644 --- a/tests/unit/integration/github/test_views.py +++ b/tests/unit/integration/github/test_views.py @@ -29,7 +29,10 @@ def test_github_disclose_token(self, pyramid_request, monkeypatch): pyramid_request.body = "[1, 2, 3]" pyramid_request.json_body = [1, 2, 3] - pyramid_request.registry.settings = {"github.token": "token"} + pyramid_request.registry.settings = { + "github.token": "token", + "github.token_scanning_meta_api.url": "http://foo", + } pyramid_request.find_service = lambda *a, **k: metrics http = pyramid_request.http = pretend.stub() @@ -46,7 +49,9 @@ def test_github_disclose_token(self, pyramid_request, monkeypatch): assert response.status_code == 204 assert verifier_cls.calls == [ - 
pretend.call(session=http, metrics=metrics, api_token="token") + pretend.call( + session=http, metrics=metrics, api_token="token", api_url="http://foo" + ) ] assert verify.calls == [ pretend.call(payload="[1, 2, 3]", key_id="foo", signature="bar") @@ -70,7 +75,9 @@ def test_github_disclose_token_no_token(self, pyramid_request, monkeypatch): pyramid_request.body = "[1, 2, 3]" pyramid_request.json_body = [1, 2, 3] - pyramid_request.registry.settings = {} + pyramid_request.registry.settings = { + "github.token_scanning_meta_api.url": "http://foo" + } pyramid_request.find_service = lambda *a, **k: metrics pyramid_request.http = pretend.stub() @@ -96,7 +103,10 @@ def test_github_disclose_token_verify_fail(self, monkeypatch, pyramid_request): pyramid_request.body = "[1, 2, 3]" pyramid_request.find_service = lambda *a, **k: metrics - pyramid_request.registry.settings = {"github.token": "token"} + pyramid_request.registry.settings = { + "github.token": "token", + "github.token_scanning_meta_api.url": "http://foo", + } pyramid_request.http = pretend.stub() @@ -137,7 +147,12 @@ def find_service(self, *a, **k): response = pretend.stub(status_int=200) http = pretend.stub() - registry = pretend.stub(settings={"github.token": "token"}) + registry = pretend.stub( + settings={ + "github.token": "token", + "github.token_scanning_meta_api.url": "http://foo", + } + ) request = Request() response = views.github_disclose_token(request) @@ -160,7 +175,10 @@ def metrics_increment(key): pyramid_request.body = "{}" pyramid_request.json_body = {} - pyramid_request.registry.settings = {"github.token": "token"} + pyramid_request.registry.settings = { + "github.token": "token", + "github.token_scanning_meta_api.url": "http://foo", + } pyramid_request.find_service = lambda *a, **k: metrics_service pyramid_request.http = pretend.stub() diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 94c1fdc3834d..a758d34bdd35 100644 --- a/tests/unit/test_config.py +++ 
b/tests/unit/test_config.py @@ -231,6 +231,9 @@ def __init__(self): "token.default.max_age": 21600, "warehouse.xmlrpc.client.ratelimit_string": "3600 per hour", "warehouse.xmlrpc.search.enabled": True, + "github.token_scanning_meta_api.url": ( + "https://api.github.com/meta/public_keys/token_scanning" + ), } if environment == config.Environment.development: expected_settings.update( diff --git a/warehouse/config.py b/warehouse/config.py index ff723bd0e1c5..595f86a08977 100644 --- a/warehouse/config.py +++ b/warehouse/config.py @@ -164,6 +164,12 @@ def configure(settings=None): settings, "warehouse.release_files_table", "WAREHOUSE_RELEASE_FILES_TABLE" ) maybe_set(settings, "github.token", "GITHUB_TOKEN") + maybe_set( + settings, + "github.token_scanning_meta_api.url", + "GITHUB_TOKEN_SCANNING_META_API_URL", + default="https://api.github.com/meta/public_keys/token_scanning", + ) maybe_set(settings, "warehouse.trending_table", "WAREHOUSE_TRENDING_TABLE") maybe_set(settings, "celery.broker_url", "BROKER_URL") maybe_set(settings, "celery.result_url", "REDIS_URL") diff --git a/warehouse/integrations/github/utils.py b/warehouse/integrations/github/utils.py index d3c72a1f0a82..eaa7c2a0163b 100644 --- a/warehouse/integrations/github/utils.py +++ b/warehouse/integrations/github/utils.py @@ -164,6 +164,7 @@ def __init__( *, session, metrics, + api_url: str, api_token: Optional[str] = None, public_keys_cache=PUBLIC_KEYS_CACHE, ): @@ -171,6 +172,7 @@ def __init__( self._session = session self._api_token = api_token self._public_keys_cache = public_keys_cache + self._api_url = api_url def verify(self, *, payload, key_id, signature): @@ -217,15 +219,8 @@ def _headers_auth(self): return {"Authorization": f"token {self._api_token}"} def _retrieve_public_key_payload(self): - - token_scanning_pubkey_api_url = ( - "https://api.github.com/meta/public_keys/token_scanning" - ) - try: - response = self._session.get( - token_scanning_pubkey_api_url, headers=self._headers_auth() - ) + 
response = self._session.get(self._api_url, headers=self._headers_auth()) response.raise_for_status() return response.json() except requests.HTTPError as exc: diff --git a/warehouse/integrations/github/views.py b/warehouse/integrations/github/views.py index 21020d3b5f9f..b35e1bc420b4 100644 --- a/warehouse/integrations/github/views.py +++ b/warehouse/integrations/github/views.py @@ -47,6 +47,7 @@ def github_disclose_token(request): verifier = utils.GitHubTokenScanningPayloadVerifier( session=request.http, metrics=metrics, + api_url=request.registry.settings["github.token_scanning_meta_api.url"], api_token=request.registry.settings.get("github.token"), )