Merge pull request #2448 from github/koesie10/auto-model

koesie10 · web-flow · commit 5a66d6ff2dbc · 2023-05-30T12:19:22.000+02:00
Add initial implementation of auto-modeling
diff --git a/extensions/ql-vscode/src/config.ts b/extensions/ql-vscode/src/config.ts
@@ -711,3 +711,10 @@ const QUERIES_PANEL = new Setting("queriesPanel", ROOT_SETTING);
 export function showQueriesPanel(): boolean { // True when the QUERIES_PANEL setting has a truthy value.
   return !!QUERIES_PANEL.getValue<boolean>();
 }
+
+const DATA_EXTENSIONS = new Setting("dataExtensions", ROOT_SETTING); // "dataExtensions" group; ROOT_SETTING is presumably its parent (Setting is defined outside this hunk)
+const LLM_GENERATION = new Setting("llmGeneration", DATA_EXTENSIONS); // "llmGeneration" setting, created with DATA_EXTENSIONS as its second argument
+
+export function showLlmGeneration(): boolean { // True when the LLM_GENERATION setting has a truthy value.
+  return !!LLM_GENERATION.getValue<boolean>(); // `!!` turns any falsy result (presumably including an unset setting) into false
+}
diff --git a/extensions/ql-vscode/src/data-extensions-editor/auto-model-api.ts b/extensions/ql-vscode/src/data-extensions-editor/auto-model-api.ts
@@ -0,0 +1,54 @@
+import { Credentials } from "../common/authentication";
+import { OctokitResponse } from "@octokit/types";
+
+export enum ClassificationType { // String-valued labels stored in Classification.type.
+  Unknown = "CLASSIFICATION_TYPE_UNKNOWN", // Fallback used by auto-model.ts for modeled-method types it does not map
+  Neutral = "CLASSIFICATION_TYPE_NEUTRAL",
+  Source = "CLASSIFICATION_TYPE_SOURCE",
+  Sink = "CLASSIFICATION_TYPE_SINK", // The only type parsePredictedClassifications currently acts on
+  Summary = "CLASSIFICATION_TYPE_SUMMARY",
+}
+
+export interface Classification { // Classification attached to a Method.
+  type: ClassificationType; // Category of the method (source, sink, summary, neutral or unknown)
+  kind: string; // Copied from/to ModeledMethod.kind by auto-model.ts
+  explanation: string; // auto-model.ts sends "" for the samples it builds
+}
+
+export interface Method { // One method entry: sent as a candidate or sample, or returned as a prediction.
+  package: string; // auto-model.ts fills this from ExternalApiUsage.packageName
+  type: string; // Filled from ExternalApiUsage.typeName
+  name: string; // Filled from ExternalApiUsage.methodName
+  signature: string; // Filled from ExternalApiUsage.methodParameters, e.g. "()" for no parameters
+  usages: string[]; // Usage labels; auto-model.ts sends at most 10 per entry
+  classification?: Classification; // Left undefined for candidates (methods that are not modeled yet)
+  input?: string; // e.g. "Argument[0]"; auto-model.ts emits one entry per argument index
+  output?: string; // Not set by createAutoModelRequest; read back from predictions
+}
+
+export interface ModelRequest { // Payload sent to the auto-model endpoint.
+  language: string; // Language supplied by the caller (the editor view passes databaseItem.language)
+  candidates: Method[]; // Entries for methods without an existing model (type "none" in auto-model.ts)
+  samples: Method[]; // Entries for methods that already have a model, sent with their classification
+}
+
+export interface ModelResponse { // Response shape expected from the auto-model endpoint.
+  language: string;
+  predicted: Method[]; // Consumed by parsePredictedClassifications in auto-model.ts
+}
+
+export async function autoModel( // POSTs the request to the auto-model endpoint and returns the response body.
+  credentials: Credentials, // Used only to obtain the Octokit client
+  request: ModelRequest, // Sent as the `data` option of the request
+): Promise<ModelResponse> {
+  const octokit = await credentials.getOctokit();
+
+  const response: OctokitResponse<ModelResponse> = await octokit.request( // No try/catch and no runtime validation: failures reject, and the annotation only asserts the expected shape
+    "POST /repos/github/codeql/code-scanning/codeql/auto-model", // Route is hard-coded to the github/codeql repository
+    {
+      data: request,
+    },
+  );
+
+  return response.data;
+}
diff --git a/extensions/ql-vscode/src/data-extensions-editor/auto-model.ts b/extensions/ql-vscode/src/data-extensions-editor/auto-model.ts
@@ -0,0 +1,218 @@
+import { ExternalApiUsage } from "./external-api-usage";
+import { ModeledMethod, ModeledMethodType } from "./modeled-method";
+import {
+  Classification,
+  ClassificationType,
+  Method,
+  ModelRequest,
+} from "./auto-model-api";
+
+export function createAutoModelRequest( // Builds the auto-model request with one Method entry per (method, argument index) pair.
+  language: string,
+  externalApiUsages: ExternalApiUsage[], // Caller's array is not mutated: a copy is sorted below
+  modeledMethods: Record<string, ModeledMethod>, // Existing models, looked up by ExternalApiUsage.signature
+): ModelRequest {
+  const request: ModelRequest = {
+    language,
+    samples: [],
+    candidates: [],
+  };
+
+  // Sort by number of usages so we always send the most used methods first
+  externalApiUsages = [...externalApiUsages];
+  externalApiUsages.sort((a, b) => b.usages.length - a.usages.length);
+
+  for (const externalApiUsage of externalApiUsages) {
+    const modeledMethod: ModeledMethod = modeledMethods[
+      externalApiUsage.signature
+    ] ?? { // Methods without an entry are treated as unmodeled
+      type: "none",
+    };
+
+    const numberOfArguments = // NOTE(review): counts commas, so a parameter type that itself contains "," would be over-counted — confirm signatures never contain one
+      externalApiUsage.methodParameters === "()"
+        ? 0
+        : externalApiUsage.methodParameters.split(",").length;
+
+    for ( // Methods with zero parameters produce no entries at all
+      let argumentIndex = 0;
+      argumentIndex < numberOfArguments;
+      argumentIndex++
+    ) {
+      const method: Method = {
+        package: externalApiUsage.packageName,
+        type: externalApiUsage.typeName,
+        name: externalApiUsage.methodName,
+        signature: externalApiUsage.methodParameters,
+        classification:
+          modeledMethod.type === "none"
+            ? undefined
+            : toMethodClassification(modeledMethod),
+        usages: externalApiUsage.usages
+          .slice(0, 10) // Cap at 10 usage labels per entry
+          .map((usage) => usage.label),
+        input: `Argument[${argumentIndex}]`,
+      };
+
+      if (modeledMethod.type === "none") {
+        request.candidates.push(method); // Unmodeled method: candidate
+      } else {
+        request.samples.push(method); // Already modeled: sent as a sample with its classification
+      }
+    }
+  }
+
+  request.candidates = request.candidates.slice(0, 20); // Limits count per-argument entries, not distinct methods
+  request.samples = request.samples.slice(0, 100);
+
+  return request;
+}
+
+/**
+ * Converts predicted methods into modeled methods keyed by full signature (`package.type#name` + signature).
+ * For now, we have a simplified model that only models methods as sinks. It does not model methods as neutral,
+ * so we can't correctly determine that a method is neutral; it could still be a source or summary. However, to
+ * keep this simple and give output to the user, any method for which none of its arguments are predicted as
+ * sinks is modeled as neutral. If multiple arguments are predicted as sinks, only the first one (in
+ * `compareInputOutput` order) is used. Predictions without a classification are ignored.
+ */
+export function parsePredictedClassifications(
+  predicted: Method[],
+): Record<string, ModeledMethod> {
+  const predictedBySignature: Record<string, Method[]> = {}; // Groups the per-argument predictions of each method
+  for (const method of predicted) {
+    if (!method.classification) {
+      continue;
+    }
+
+    const signature = toFullMethodSignature(method);
+
+    if (!(signature in predictedBySignature)) {
+      predictedBySignature[signature] = [];
+    }
+
+    predictedBySignature[signature].push(method);
+  }
+
+  const modeledMethods: Record<string, ModeledMethod> = {};
+
+  for (const signature in predictedBySignature) {
+    const predictedMethods = predictedBySignature[signature];
+
+    const sinks = predictedMethods.filter(
+      (method) => method.classification?.type === ClassificationType.Sink,
+    );
+    if (sinks.length === 0) {
+      // For now, model any method for which none of its arguments are modeled as sinks as neutral
+      modeledMethods[signature] = {
+        type: "neutral",
+        kind: "",
+        input: "",
+        output: "",
+      };
+      continue;
+    }
+
+    // Order the sinks with compareInputOutput (numeric argument index, not alphabetical order), so the
+    // lowest-numbered argument comes first: "Argument[2]" sorts before "Argument[10]".
+    // If we get back "Argument[1]" and "Argument[3]", "Argument[1]" should always be first
+    sinks.sort((a, b) => compareInputOutput(a.input ?? "", b.input ?? ""));
+
+    const sink = sinks[0]; // Only the first sink is kept; the others are dropped
+
+    modeledMethods[signature] = {
+      type: "sink",
+      kind: sink.classification?.kind ?? "",
+      input: sink.input ?? "",
+      output: sink.output ?? "",
+    };
+  }
+
+  return modeledMethods;
+}
+
+function toMethodClassificationType( // Maps a ModeledMethodType onto the API's ClassificationType.
+  type: ModeledMethodType,
+): ClassificationType {
+  switch (type) {
+    case "source":
+      return ClassificationType.Source;
+    case "sink":
+      return ClassificationType.Sink;
+    case "summary":
+      return ClassificationType.Summary;
+    case "neutral":
+      return ClassificationType.Neutral;
+    default: // Any other type (e.g. "none") maps to Unknown
+      return ClassificationType.Unknown;
+  }
+}
+
+function toMethodClassification(modeledMethod: ModeledMethod): Classification { // Builds the Classification sent for an already-modeled method.
+  return {
+    type: toMethodClassificationType(modeledMethod.type),
+    kind: modeledMethod.kind,
+    explanation: "", // No explanation is available for existing models, so an empty string is sent
+  };
+}
+
+function toFullMethodSignature(method: Method): string { // Key used to group predictions: package, type, name and parameter signature.
+  return `${method.package}.${method.type}#${method.name}${method.signature}`;
+}
+
+const argumentRegex = /^Argument\[(\d+)]$/; // Matches exactly "Argument[<digits>]" and captures the digits
+
+// Argument[this] is before ReturnValue
+const nonNumericArgumentOrder = ["Argument[this]", "ReturnValue"]; // A value's index in this array is its sort rank
+
+/**
+ * Compare two inputs or outputs such as `Argument[<number>]`, `Argument[this]`, or `ReturnValue`. Returns 0 if
+ * equal, a negative number if a sorts before b, and a positive number otherwise. Order: numeric arguments by
+ * index, then `Argument[this]`, then `ReturnValue`, then any other strings ordered by `localeCompare`.
+ */
+export function compareInputOutput(a: string, b: string): number {
+  if (a === b) {
+    return 0;
+  }
+
+  const aMatch = a.match(argumentRegex);
+  const bMatch = b.match(argumentRegex);
+
+  // Numeric arguments are always first
+  if (aMatch && !bMatch) {
+    return -1;
+  }
+  if (!aMatch && bMatch) {
+    return 1;
+  }
+
+  // Neither is an argument
+  if (!aMatch && !bMatch) {
+    const aIndex = nonNumericArgumentOrder.indexOf(a);
+    const bIndex = nonNumericArgumentOrder.indexOf(b);
+
+    // If either one is unknown, it is sorted last
+    if (aIndex === -1 && bIndex === -1) {
+      return a.localeCompare(b); // Both unknown: fall back to locale string order
+    }
+    if (aIndex === -1) {
+      return 1;
+    }
+    if (bIndex === -1) {
+      return -1;
+    }
+
+    return aIndex - bIndex;
+  }
+
+  // This case shouldn't happen, but makes TypeScript happy
+  if (!aMatch || !bMatch) {
+    return 0;
+  }
+
+  // Both are arguments
+  const aIndex = parseInt(aMatch[1]); // The capture is digits only, so it parses as a decimal integer
+  const bIndex = parseInt(bMatch[1]);
+
+  return aIndex - bIndex;
+}
diff --git a/extensions/ql-vscode/src/data-extensions-editor/data-extensions-editor-view.ts b/extensions/ql-vscode/src/data-extensions-editor/data-extensions-editor-view.ts
@@ -39,6 +39,12 @@ import { createDataExtensionYaml, loadDataExtensionYaml } from "./yaml";
 import { ExternalApiUsage } from "./external-api-usage";
 import { ModeledMethod } from "./modeled-method";
 import { ExtensionPackModelFile } from "./shared/extension-pack";
+import { autoModel } from "./auto-model-api";
+import {
+  createAutoModelRequest,
+  parsePredictedClassifications,
+} from "./auto-model";
+import { showLlmGeneration } from "../config";
 
 function getQlSubmoduleFolder(): WorkspaceFolder | undefined {
   const workspaceFolder = workspace.workspaceFolders?.find(
@@ -127,6 +133,13 @@ export class DataExtensionsEditorView extends AbstractWebview<
       case "generateExternalApi":
         await this.generateModeledMethods();
 
+        break;
+      case "generateExternalApiFromLlm":
+        await this.generateModeledMethodsFromLlm(
+          msg.externalApiUsages,
+          msg.modeledMethods,
+        );
+
         break;
       default:
         assertNever(msg);
@@ -149,6 +162,7 @@ export class DataExtensionsEditorView extends AbstractWebview<
       viewState: {
         extensionPackModelFile: this.modelFile,
         modelFileExists: await pathExists(this.modelFile.filename),
+        showLlmButton: showLlmGeneration(),
       },
     });
   }
@@ -367,6 +381,29 @@ export class DataExtensionsEditorView extends AbstractWebview<
     await this.clearProgress();
   }
 
+  private async generateModeledMethodsFromLlm( // Requests predictions from the auto-model API and posts the resulting models to the webview.
+    externalApiUsages: ExternalApiUsage[],
+    modeledMethods: Record<string, ModeledMethod>,
+  ): Promise<void> {
+    const request = createAutoModelRequest(
+      this.databaseItem.language,
+      externalApiUsages,
+      modeledMethods,
+    );
+
+    const response = await autoModel(this.app.credentials, request); // No try/catch here: a failed request rejects this method's promise
+
+    const predictedModeledMethods = parsePredictedClassifications(
+      response.predicted,
+    );
+
+    await this.postMessage({
+      t: "addModeledMethods",
+      modeledMethods: predictedModeledMethods,
+      overrideNone: true, // NOTE(review): presumably lets predictions replace entries whose type is "none" — confirm in the webview handler
+    });
+  }
+
   /*
    * Progress in this class is a bit weird. Most of the progress is based on running the query.
    * Query progress is always between 0 and 1000. However, we still have some steps that need
diff --git a/extensions/ql-vscode/src/data-extensions-editor/shared/view-state.ts b/extensions/ql-vscode/src/data-extensions-editor/shared/view-state.ts
@@ -3,4 +3,5 @@ import { ExtensionPackModelFile } from "./extension-pack";
 export interface DataExtensionEditorViewState { // View state sent to the data extensions editor webview.
   extensionPackModelFile: ExtensionPackModelFile;
   modelFileExists: boolean;
+  showLlmButton: boolean; // The editor view sets this from showLlmGeneration()
 }
diff --git a/extensions/ql-vscode/src/pure/interface-types.ts b/extensions/ql-vscode/src/pure/interface-types.ts
@@ -544,6 +544,12 @@ export interface GenerateExternalApiMessage {
   t: "generateExternalApi";
 }
 
+export interface GenerateExternalApiFromLlmMessage { // Webview-to-extension message asking for LLM-generated models.
+  t: "generateExternalApiFromLlm"; // Discriminant matched in the editor view's message switch
+  externalApiUsages: ExternalApiUsage[]; // Forwarded to generateModeledMethodsFromLlm
+  modeledMethods: Record<string, ModeledMethod>; // Current models, forwarded alongside the usages
+}
+
 export type ToDataExtensionsEditorMessage =
   | SetExtensionPackStateMessage
   | SetExternalApiUsagesMessage
@@ -556,4 +562,5 @@ export type FromDataExtensionsEditorMessage =
   | OpenExtensionPackMessage
   | JumpToUsageMessage
   | SaveModeledMethods
-  | GenerateExternalApiMessage;
+  | GenerateExternalApiMessage
+  | GenerateExternalApiFromLlmMessage;
diff --git a/extensions/ql-vscode/src/stories/data-extensions-editor/DataExtensionsEditor.stories.tsx b/extensions/ql-vscode/src/stories/data-extensions-editor/DataExtensionsEditor.stories.tsx
@@ -30,6 +30,7 @@ DataExtensionsEditor.args = {
         "/home/user/vscode-codeql-starter/codeql-custom-queries-java/sql2o/models/sql2o.yml",
     },
     modelFileExists: true,
+    showLlmButton: true,
   },
   initialExternalApiUsages: [
     {
diff --git a/extensions/ql-vscode/src/view/data-extensions-editor/DataExtensionsEditor.tsx b/extensions/ql-vscode/src/view/data-extensions-editor/DataExtensionsEditor.tsx
diff --git a/extensions/ql-vscode/test/unit-tests/data-extensions-editor/auto-model.test.ts b/extensions/ql-vscode/test/unit-tests/data-extensions-editor/auto-model.test.ts

Original file line number	Diff line number	Diff line change
`@@ -3,4 +3,5 @@ import { ExtensionPackModelFile } from "./extension-pack";`
`3`	`3`	`export interface DataExtensionEditorViewState {`
`4`	`4`	`extensionPackModelFile: ExtensionPackModelFile;`
`5`	`5`	`modelFileExists: boolean;`
	`6`	`+ showLlmButton: boolean;`
`6`	`7`	`}`
Original file line number	Diff line number	Diff line change
`@@ -30,6 +30,7 @@ DataExtensionsEditor.args = {`
`30`	`30`	`"/home/user/vscode-codeql-starter/codeql-custom-queries-java/sql2o/models/sql2o.yml",`
`31`	`31`	`},`
`32`	`32`	`modelFileExists: true,`
	`33`	`+ showLlmButton: true,`
`33`	`34`	`},`
`34`	`35`	`initialExternalApiUsages: [`
`35`	`36`	`{`