# Integuru/integuru/util/har_processing.py at 557fe5e0660c3104e51dc81b00c92fa43e949f9b · Integuru-AI/Integuru · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
import json
import os
from urllib.parse import urlparse
from integuru.models.request import Request
from typing import Tuple, Dict, Optional, Any, List

# Substrings used by get_har_urls to skip entries: a request is left out when
# any of these appears (case-insensitively) in its URL, headers, or body text.
# NOTE: "relic" is currently disabled here and is not part of the tuple.
excluded_keywords = ("google", "taboola", "datadog", "sentry")

# Header-name substrings used by format_request: any header whose name
# contains one of these (case-insensitively) is omitted from the Request.
excluded_header_keywords = (
    # Browser-supplied headers
    "cookie", "sec-", "accept", "user-agent", "referer",
    # Monitoring / error-reporting tooling
    "relic", "sentry", "datadog",
    # Analytics, experimentation and session-replay tooling
    "amplitude", "mixpanel", "segment", "heap", "hotjar", "fullstory",
    "pendo", "optimizely", "adobe",
    # Generic tracking terms
    "analytics", "tracking", "telemetry",
    "clarity",  # Microsoft Clarity
    "matomo", "plausible",
)

# Substrings (matched case-insensitively) that mark a header name or a request
# body as carrying credentials; consumed by filter_sensitive_info.
sensitive_keywords = (
    "Authorization",
    "Token",
    "Auth",
    "Password",
    "Secret",
    "Key",
    "Credential",
    "Session",
    "Bearer",
)


def filter_sensitive_info(request: Dict[str, Any]) -> Dict[str, Any]:
    """
    Filters out sensitive information from the request headers and body.

    Headers whose name contains any entry of ``sensitive_keywords``
    (case-insensitive substring match) are removed. If the ``postData`` text
    contains any of those keywords, the whole text is replaced by
    ``"[FILTERED]"``.

    ``headers`` may be either a HAR-style list of ``{"name": ..., "value": ...}``
    objects or a plain name-to-value mapping; the filtered result keeps the
    same shape as the input. A missing ``headers`` entry is stored as ``{}``.

    The ``request`` dictionary is modified in place and also returned.
    """
    lowered_keywords = tuple(keyword.lower() for keyword in sensitive_keywords)

    def is_sensitive(text: Any) -> bool:
        # None / empty values can never match a keyword.
        haystack = str(text or "").lower()
        return any(keyword in haystack for keyword in lowered_keywords)

    headers = request.get("headers")
    if headers is None:
        headers = {}

    if isinstance(headers, dict):
        request["headers"] = {
            name: value for name, value in headers.items() if not is_sensitive(name)
        }
    else:
        # HAR files store headers as a list of name/value objects; keep that
        # shape so downstream code that iterates the list keeps working.
        request["headers"] = [
            header for header in headers if not is_sensitive(header.get("name"))
        ]

    post_data = request.get("postData")
    if isinstance(post_data, dict) and is_sensitive(post_data.get("text")):
        post_data["text"] = "[FILTERED]"

    return request

def format_request(har_request: Dict[str, Any]) -> Request:
    """
    Formats a HAR request into a Request object.

    The request is first passed through ``filter_sensitive_info``, which
    modifies ``har_request`` in place. Headers whose name contains any entry of
    ``excluded_header_keywords`` (case-insensitive) are then dropped.

    ``headers`` is accepted either as a HAR-style list of
    ``{"name": ..., "value": ...}`` objects or as a name-to-value mapping
    (the shape ``filter_sensitive_info`` may hand back).

    The body is taken from ``postData["text"]``; when the remaining headers
    declare an ``application/json`` content type and the text is valid JSON,
    the decoded object is used instead of the raw string.
    """
    har_request = filter_sensitive_info(har_request)
    method = har_request.get("method", "GET")
    url = har_request.get("url", "")

    # Normalise both supported header shapes to (name, value) pairs.
    raw_headers = har_request.get("headers") or []
    if isinstance(raw_headers, dict):
        header_pairs = list(raw_headers.items())
    else:
        header_pairs = [
            (header.get("name", ""), header.get("value", ""))
            for header in raw_headers
        ]

    # Store headers as a dictionary, excluding headers containing excluded keywords
    headers = {
        name: value
        for name, value in header_pairs
        if not any(keyword.lower() in name.lower()
                   for keyword in excluded_header_keywords)
    }

    query_params_list = har_request.get("queryString", [])
    query_params = (
        {param["name"]: param["value"] for param in query_params_list}
        if query_params_list
        else None
    )

    post_data = har_request.get("postData", {})
    body = post_data.get("text") if post_data else None

    # Try to parse body as JSON if Content-Type is application/json
    if body:
        headers_lower = {k.lower(): v for k, v in headers.items()}
        content_type = headers_lower.get('content-type')
        if content_type and 'application/json' in content_type.lower():
            try:
                body = json.loads(body)
            except json.JSONDecodeError:
                pass  # Keep body as is if not valid JSON

    return Request(
        method=method,
        url=url,
        headers=headers,
        query_params=query_params,
        body=body
    )


def format_response(har_response: Dict[str, Any]) -> Dict[str, str]:
    """
    Pull the body text and MIME type out of a HAR response entry.

    Returns a dictionary with two keys: ``"text"`` (from ``content.text``) and
    ``"type"`` (from ``content.mimeType``). Either falls back to an empty
    string when the field, or the ``content`` object itself, is absent.
    """
    payload = har_response.get("content", {})
    body_text = payload.get("text", "")
    mime_type = payload.get("mimeType", "")
    return dict(text=body_text, type=mime_type)


def parse_har_file(har_file_path: str) -> Dict[Request, Dict[str, str]]:
    """
    Load a HAR file and pair every recorded request with its response.

    Each entry under ``log.entries`` contributes one item: the key is the
    ``Request`` built by ``format_request`` and the value is the dictionary
    built by ``format_response``. A later entry replaces an earlier one when
    their keys compare equal.
    """
    with open(har_file_path, 'r', encoding='utf-8') as har_handle:
        log_section = json.load(har_handle).get("log", {})

    return {
        format_request(item.get("request", {})): format_response(item.get("response", {}))
        for item in log_section.get("entries", [])
    }


def build_url_to_req_res_map(req_res_dict: Dict[Request, Dict[str, str]]) -> Dict[str, Dict[str, Any]]:
    """
    Index request/response pairs by the request URL.

    Every value has the form ``{'request': request, 'response': response}``.
    When several requests share a URL, the last one in iteration order wins.
    """
    return {
        req.url: {'request': req, 'response': res}
        for req, res in req_res_dict.items()
    }


def get_har_urls(har_file_path: str) -> List[Tuple[str, str, str, str]]:
    """
    Extracts and returns a list of tuples containing method, URL, response format, and response preview
    from a HAR file, excluding certain file types and keywords.

    An entry is skipped when it has no URL, when the URL path ends in one of
    the excluded extensions, or when any of ``excluded_keywords`` occurs
    (case-insensitively) in the URL, a header name or value, or the request
    body text. The response preview is the first 30 characters of the response
    text. Missing or ``null`` request/response sub-fields are treated as empty.
    """
    # Define a tuple of file extensions to exclude
    excluded_extensions = (
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".webp",
        ".svg",
        ".ico",  # Image files
        ".css",  # Stylesheets
        # ".js",
        # ".map",  # JavaScript files
        ".woff",
        ".woff2",
        ".ttf",
        ".otf",
        ".eot",  # Font files
        ".mp3",
        ".mp4",
        ".wav",
        ".avi",
        ".mov",
        ".flv",
        ".wmv",
        ".webm",  # Media files
        # ".pdf",
        # ".zip",
        ".rar",
        ".7z",
        ".tar",
        ".gz",
        ".exe",
        ".dmg",  # Other non-text files
    )

    # Read the HAR file
    with open(har_file_path, "r", encoding="utf-8") as file:
        har_data = json.load(file)

    # Lower-case the keywords once instead of once per entry.
    lowered_keywords = tuple(keyword.lower() for keyword in excluded_keywords)

    # List to store tuples of URLs, request methods, response file formats, and response preview
    urls_with_details = []

    for entry in har_data.get("log", {}).get("entries", []):
        request = entry.get("request") or {}
        response = entry.get("response") or {}

        url = request.get("url")
        if not url:
            continue

        _, extension = os.path.splitext(urlparse(url).path.lower())
        if extension in excluded_extensions:
            continue

        # Gather everything the keyword filter should look at: URL, header
        # names/values and the request body. `or ""` guards against nulls.
        text_parts = [url.lower()]
        for header in request.get("headers") or []:
            text_parts.append((header.get("name") or "").lower())
            text_parts.append((header.get("value") or "").lower())
        text_parts.append(((request.get("postData") or {}).get("text") or "").lower())
        request_text = "".join(text_parts)

        # Exclude requests mentioning any keyword;
        # this is done to reduce the number of requests we send to the LLM
        if any(keyword in request_text for keyword in lowered_keywords):
            continue

        content = response.get("content") or {}
        method = request.get("method", "GET")  # Default to 'GET' if method is missing
        response_format = content.get("mimeType", "")
        response_text = content.get("text", "")
        response_preview = response_text[:30] if response_text else ""

        urls_with_details.append((method, url, response_format, response_preview))

    return urls_with_details


def parse_cookie_file_to_dict(cookie_file_path: str) -> Dict[str, Dict[str, Any]]:
    """
    Parses a JSON cookie file and returns a dictionary of cookie data.

    The file is expected to hold a JSON array of cookie objects and is read as
    UTF-8, matching how the HAR files are read elsewhere in this module. The
    result maps each cookie name to a dictionary with the keys ``value``,
    ``domain``, ``path``, ``expires``, ``httpOnly``, ``secure`` and
    ``sameSite``; attributes absent from the file are ``None``. Cookies without
    a name are skipped, and a later cookie replaces an earlier one of the same
    name.
    """
    cookie_fields = (
        "value",
        "domain",
        "path",
        "expires",
        "httpOnly",
        "secure",
        "sameSite",
    )

    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(cookie_file_path, "r", encoding="utf-8") as file:
        cookies = json.load(file)

    return {
        cookie["name"]: {field: cookie.get(field) for field in cookie_fields}
        for cookie in cookies
        if cookie.get("name")
    }