Working, prompting in place, not active

bentito · bentito · commit 3a3f663fca49 · 2024-12-07T10:02:59.000-05:00
Signed-off-by: Brett Tofel <btofel@redhat.com>
diff --git a/hack/tools/ownership_tree.py b/hack/tools/ownership_tree.py
@@ -1,32 +1,39 @@
-#!/usr/bin/env python3
+#!/usr/bin/env python3.11
 import json
+import os
+import openai
 import subprocess
 import sys
 import argparse
 from collections import defaultdict
 
-parser = argparse.ArgumentParser(description="Print a tree of ownership for all resources in a namespace, including cluster-scoped ones that reference the namespace.")
+parser = argparse.ArgumentParser(description="Print a tree of ownership for all resources in a namespace, optionally gather cluster extension state.")
 parser.add_argument("namespace", help="The namespace to inspect")
 parser.add_argument("--no-events", action="store_true", help="Do not show Events kind grouping")
 parser.add_argument("--with-event-info", action="store_true", help="Show additional info (message) for Events")
+parser.add_argument("--gather-cluster-extension-state", action="store_true",
+                    help="Gather and save a compressed fingerprint of the cluster extension state to a file.")
+parser.add_argument("--no-tree", action="store_true", help="Do not print the tree output (only used if gather-cluster-extension-state is set).")
 args = parser.parse_args()
 
 NAMESPACE = args.namespace
-SHOW_EVENTS = not args.no_events
-WITH_EVENT_INFO = args.with_event_info
+
+# If gather-cluster-extension-state is used, we want full info regardless of other flags
+if args.gather_cluster_extension_state:
+    SHOW_EVENTS = True
+    WITH_EVENT_INFO = True
+else:
+    SHOW_EVENTS = not args.no_events
+    WITH_EVENT_INFO = args.with_event_info
 
 def parse_api_resources_line(line):
     parts = [p for p in line.split(' ') if p]
     if len(parts) < 3:
         return None
-    # KIND is last
     kind = parts[-1]
-    # NAMESPACED is second-last
     namespaced_str = parts[-2].lower()
     namespaced = (namespaced_str == "true")
-    # NAME is first
     name = parts[0]
-    # We don't need SHORTNAMES/APIVERSION for the tree logic.
     return name, namespaced, kind
 
 kind_to_plural = {}
@@ -39,18 +46,13 @@ def parse_api_resources_line(line):
     for line in lines[1:]:
         if not line.strip():
             continue
-        parts = [p for p in line.split(' ') if p]
-        if len(parts) < 3:
+        parsed = parse_api_resources_line(line)
+        if not parsed:
             continue
-        # Parse from right: kind=last, namespaced=second-last, name=first
-        kind = parts[-1]
-        namespaced_str = parts[-2].lower()
-        namespaced = (namespaced_str == "true")
-        name = parts[0]
-
+        name, is_namespaced, kind = parsed
         if kind not in kind_to_plural:
             kind_to_plural[kind] = name
-        resource_info.append((kind, name, namespaced))
+        resource_info.append((kind, name, is_namespaced))
 except subprocess.CalledProcessError:
     pass
 
@@ -79,7 +81,7 @@ def get_resources_for_type(resource_name, namespaced):
     if namespaced:
         return items
 
-    # Cluster-scoped: filter by namespace reference
+    # cluster-scoped: filter by namespace reference
     filtered = []
     for item in items:
         meta_ns = item.get("metadata", {}).get("namespace")
@@ -90,7 +92,7 @@ def get_resources_for_type(resource_name, namespaced):
     if filtered:
         return filtered
 
-    # Fallback: try get by name if no filtered items
+    # fallback by name
     try:
         single_json = subprocess.check_output(
             ["kubectl", "get", resource_name, NAMESPACE, "-o", "json", "--ignore-not-found"],
@@ -110,18 +112,16 @@ def get_resources_for_type(resource_name, namespaced):
 
 # Collect resources
 for (kind, plural_name, is_namespaced) in resource_info:
-    # Skip events if we don't show them at all
+    # Skip Events unless SHOW_EVENTS is set (it is always set when gathering cluster extension state)
     if kind == "Event" and not SHOW_EVENTS:
         continue
-
     items = get_resources_for_type(plural_name, is_namespaced)
     for item in items:
         uid = item["metadata"]["uid"]
         k = item["kind"]
         nm = item["metadata"]["name"]
         owners = [(o["kind"], o["name"], o["uid"]) for o in item["metadata"].get("ownerReferences", [])]
 
-        # If it's an Event and we don't show events, skip
         if k == "Event" and not SHOW_EVENTS:
             continue
 
@@ -133,7 +133,6 @@ def get_resources_for_type(resource_name, namespaced):
             "owners": owners
         }
 
-        # If it's an Event and we want event info, store the message
         if k == "Event" and WITH_EVENT_INFO:
             res_entry["message"] = item.get("message", "")
 
@@ -142,7 +141,7 @@ def get_resources_for_type(resource_name, namespaced):
 
 owner_to_children = defaultdict(list)
 for uid, res in uid_to_resource.items():
-    for (o_kind, o_name, o_uid) in res["owners"]:
+    for (_, _, o_uid) in res["owners"]:
         owner_to_children[o_uid].append(uid)
 
 # Identify top-level
@@ -167,7 +166,7 @@ def get_resources_for_type(resource_name, namespaced):
     kind_groups[r["kind"]].append(uid)
 
 pseudo_nodes = {}
-for kind, uids in kind_groups.items():
+for kind, uids_ in kind_groups.items():
     if kind == "Event" and not SHOW_EVENTS:
         continue
     plural = kind_to_plural.get(kind, kind.lower() + "s")
@@ -180,7 +179,7 @@ def get_resources_for_type(resource_name, namespaced):
         "uid": pseudo_uid,
         "owners": []
     }
-    for child_uid in uids:
+    for child_uid in uids_:
         owner_to_children[pseudo_uid].append(child_uid)
 
 top_level_kinds = list(pseudo_nodes.values())
@@ -198,20 +197,196 @@ def resource_sort_key(uid):
 def print_tree(uid, prefix="", is_last=True):
     r = uid_to_resource[uid]
     branch = "└── " if is_last else "├── "
-    print(prefix + branch + f"{r['kind']}" + (f"/{r['name']}" if r['name'] else ""))
-
-    # If Event and we want message info
+    if r['name']:
+        print(prefix + branch + f"{r['kind']}/{r['name']}")
+    else:
+        print(prefix + branch + f"{r['kind']}")
     if WITH_EVENT_INFO and r['kind'] == "Event" and "message" in r:
-        # Print event message as a child line
         child_prefix = prefix + ("    " if is_last else "│   ")
-        # message line
         print(child_prefix + "└── message: " + r["message"])
-
     children = owner_to_children.get(uid, [])
     children.sort(key=resource_sort_key)
     child_prefix = prefix + ("    " if is_last else "│   ")
     for i, c_uid in enumerate(children):
         print_tree(c_uid, prefix=child_prefix, is_last=(i == len(children)-1))
 
-for i, uid in enumerate(top_level_kinds):
-    print_tree(uid, prefix="", is_last=(i == len(top_level_kinds)-1))
+
+###############################
+# Code for gathering the fingerprint
+###############################
+def extract_resource_summary(kind, name, namespace):
+    is_namespaced = (namespace is not None and namespace != "")
+    cmd = ["kubectl", "get", kind.lower()+"/"+name]
+    if is_namespaced:
+        cmd.extend(["-n", namespace])
+    cmd.extend(["-o", "json", "--ignore-not-found"])
+
+    try:
+        out = subprocess.check_output(cmd, text=True, stderr=subprocess.DEVNULL)
+        if not out.strip():
+            return {}
+        data = json.loads(out)
+    except subprocess.CalledProcessError:
+        return {}
+
+    summary = {
+        "kind": data.get("kind", kind),
+        "name": data.get("metadata", {}).get("name", name),
+        "namespace": data.get("metadata", {}).get("namespace", namespace)
+    }
+
+    conditions = data.get("status", {}).get("conditions", [])
+    if conditions:
+        summary["conditions"] = [
+            {
+                "type": c.get("type"),
+                "status": c.get("status"),
+                "reason": c.get("reason"),
+                "message": c.get("message")
+            } for c in conditions
+        ]
+
+    # For pods/deployments, extract container images
+    if data.get("kind") in ["Pod", "Deployment"]:
+        images = []
+        if data["kind"] == "Pod":
+            containers = data.get("spec", {}).get("containers", [])
+            for cont in containers:
+                images.append({"name": cont.get("name"), "image": cont.get("image")})
+        elif data["kind"] == "Deployment":
+            containers = data.get("spec", {}).get("template", {}).get("spec", {}).get("containers", [])
+            for cont in containers:
+                images.append({"name": cont.get("name"), "image": cont.get("image")})
+        if images:
+            summary["containers"] = images
+
+    # For Events, show reason and message
+    if data.get("kind") == "Event":
+        summary["reason"] = data.get("reason")
+        summary["message"] = data.get("message")
+
+    metadata = data.get("metadata", {})
+    if metadata.get("labels"):
+        summary["labels"] = metadata["labels"]
+    if metadata.get("annotations"):
+        summary["annotations"] = metadata["annotations"]
+
+    return summary
+
+def load_fingerprint(file_path):
+    """Load the JSON fingerprint file from the specified path."""
+    with open(file_path, 'r') as f:
+        return json.load(f)
+
+def generate_prompt(fingerprint):
+    """Generate the diagnostic prompt by embedding the fingerprint into the request."""
+    prompt = """
+You are an expert in Kubernetes operations and diagnostics. I will provide you with a JSON file that represents a snapshot ("fingerprint") of the entire state of a Kubernetes namespace focusing on a particular ClusterExtension and all related resources. This fingerprint includes:
+
+- The ClusterExtension itself.
+- All resources in the namespace that are either owned by or possibly needed by the ClusterExtension.
+- Key details such as resource conditions, event messages, container images (with references), and minimal metadata.
+
+Your task is:
+1. Analyze the provided fingerprint to determine if there are any issues with the ClusterExtension, its related resources, or its configuration.
+2. If issues are found, provide a diagnosis of what might be wrong and suggest steps to fix them.
+3. If no issues appear, acknowledge that the ClusterExtension and its resources seem healthy.
+4. Keep your answer concise and action-focused, as the output will be used by a human operator to troubleshoot or confirm the health of their cluster.
+
+**Important Details:**
+- The fingerprint might contain events that show what happened in the cluster recently.
+- Check conditions of deployments, pods, and other resources to see if they indicate errors or warnings.
+- Look at event messages for hints about failures, restarts, or other anomalies.
+- Consider if all necessary resources (like ServiceAccounts, ConfigMaps, or other dependencies) are present and seemingly functional.
+
+**BEGIN FINGERPRINT**
+{fingerprint}
+**END FINGERPRINT**
+
+Please provide a summarized diagnosis and suggested fixes below:
+    """.format(fingerprint=json.dumps(fingerprint, indent=2))
+    return prompt
+
+def send_to_openai(prompt, model="gpt-4o"):
+    """Send the prompt to OpenAI's chat completions API and return the reply text (or an error string on failure)."""
+    try:
+        openai.api_key = os.getenv("OPENAI_API_KEY")
+        if not openai.api_key:
+            raise ValueError("OPENAI_API_KEY environment variable is not set.")
+
+        response = openai.ChatCompletion.create(
+            model=model,
+            messages=[{"role": "user", "content": prompt}]
+        )
+
+        # Extract and return the assistant's message
+        message_content = response['choices'][0]['message']['content']
+        return message_content
+
+    except Exception as e:
+        return f"Error communicating with OpenAI API: {e}"
+
+def gather_fingerprint(namespace):
+    # Find cluster extension(s)
+    ce_uids = [uid for uid, res in uid_to_resource.items() if res["kind"] == "ClusterExtension" and res["namespace"] == namespace]
+    if not ce_uids:
+        return []
+
+    all_images = {}
+    image_ref_count = 0
+
+    def process_resource(uid):
+        nonlocal image_ref_count
+        r = uid_to_resource[uid]
+        k = r["kind"]
+        nm = r["name"]
+        ns = r["namespace"]
+        summary = extract_resource_summary(k, nm, ns)
+        # Deduplicate images
+        if "containers" in summary:
+            new_containers = []
+            for c in summary["containers"]:
+                img = c["image"]
+                if img not in all_images:
+                    image_ref_count += 1
+                    ref_name = f"image_ref_{image_ref_count}"
+                    all_images[img] = ref_name
+                c["imageRef"] = all_images[img]
+                del c["image"]
+                new_containers.append(c)
+            summary["containers"] = new_containers
+        return summary
+
+    results = []
+    for ce_uid in ce_uids:
+        fingerprint = {}
+        # Include all discovered resources
+        for uid in uid_to_resource:
+            r = uid_to_resource[uid]
+            key = f"{r['kind']}/{r['name']}"
+            fp = process_resource(uid)
+            fingerprint[key] = fp
+        if all_images:
+            fingerprint["_image_map"] = {v: k for k, v in all_images.items()}
+        ce_name = uid_to_resource[ce_uid]["name"]
+        fname = f"{ce_name}-state.json"
+        with open(fname, "w") as f:
+            json.dump(fingerprint, f, indent=2)
+        results.append(fname)
+    return results
+
+# If gather-cluster-extension-state, generate state file(s)
+state_files = []
+if args.gather_cluster_extension_state:
+    state_files = gather_fingerprint(NAMESPACE)
+
+# Print tree unless --no-tree is given AND we are in gather-cluster-extension-state mode
+if not (args.gather_cluster_extension_state and args.no_tree):
+    for i, uid in enumerate(top_level_kinds):
+        print_tree(uid, prefix="", is_last=(i == len(top_level_kinds)-1))
+
+if args.gather_cluster_extension_state:
+    if not state_files:
+        print("No ClusterExtension found in the namespace, no state file created.", file=sys.stderr)
+    else:
+        print("Created state file(s):", ", ".join(state_files))