Use filtering queries to do batched AI querying.

starcke · starcke · commit f6c492dca47d · 2023-08-04T13:09:59.000+02:00
diff --git a/extensions/ql-vscode/src/data-extensions-editor/auto-model-codeml-queries.ts b/extensions/ql-vscode/src/data-extensions-editor/auto-model-codeml-queries.ts
@@ -17,6 +17,10 @@ import { redactableError } from "../common/errors";
 import { interpretResultsSarif } from "../query-results";
 import { join } from "path";
 import { assertNever } from "../common/helpers-pure";
+import { dir } from "tmp-promise";
+import { writeFile, outputFile } from "fs-extra";
+import { dump as dumpYaml } from "js-yaml";
+import { MethodSignature } from "./external-api-usage";
 
 type AutoModelQueryOptions = {
   queryTag: string;
@@ -26,6 +30,7 @@ type AutoModelQueryOptions = {
   databaseItem: DatabaseItem;
   qlpack: QlPacksForLanguage;
   sourceInfo: SourceInfo | undefined;
+  additionalPacks: string[];
   extensionPacks: string[];
   queryStorageDir: string;
 
@@ -52,6 +57,7 @@ async function runAutoModelQuery({
   databaseItem,
   qlpack,
   sourceInfo,
+  additionalPacks,
   extensionPacks,
   queryStorageDir,
   progress,
@@ -99,7 +105,7 @@ async function runAutoModelQuery({
       quickEvalCountOnly: false,
     },
     false,
-    getOnDiskWorkspaceFolders(),
+    additionalPacks,
     extensionPacks,
     queryStorageDir,
     undefined,
@@ -147,6 +153,7 @@ async function runAutoModelQuery({
 
 type AutoModelQueriesOptions = {
   mode: Mode;
+  candidateMethods: MethodSignature[];
   cliServer: CodeQLCliServer;
   queryRunner: QueryRunner;
   databaseItem: DatabaseItem;
@@ -161,6 +168,7 @@ export type AutoModelQueriesResult = {
 
 export async function runAutoModelQueries({
   mode,
+  candidateMethods,
   cliServer,
   queryRunner,
   databaseItem,
@@ -189,7 +197,13 @@ export async function runAutoModelQueries({
           sourceLocationPrefix,
         };
 
-  const additionalPacks = getOnDiskWorkspaceFolders();
+  // Generate a pack containing the candidate filters
+  const filterPackDir = await generateCandidateFilterPack(
+    databaseItem.language,
+    candidateMethods,
+  );
+
+  const additionalPacks = [...getOnDiskWorkspaceFolders(), filterPackDir];
   const extensionPacks = Object.keys(
     await cliServer.resolveQlpacks(additionalPacks, true),
   );
@@ -208,6 +222,7 @@ export async function runAutoModelQueries({
     databaseItem,
     qlpack,
     sourceInfo,
+    additionalPacks,
     extensionPacks,
     queryStorageDir,
     progress: (update) => {
@@ -228,3 +243,59 @@ export async function runAutoModelQueries({
     candidates,
   };
 }
+
+/**
+ * generateCandidateFilterPack will create a temporary extension pack.
+ * This pack will contain a filter that will restrict the automodel queries
+ * to the specified candidate methods only.
+ * This is done using the `extensible` predicate "automodelCandidateFilter".
+ * @param language Language name, interpolated into the pack names `codeql/<language>-all` and `codeql/<language>-queries`.
+ * @param candidateMethods Methods to keep; each one becomes a (packageName, typeName, methodName, methodParameters) data row.
+ * @returns Path of the temporary directory containing the generated `codeql-pack.yml` and `filter.yml`.
+ */
+export async function generateCandidateFilterPack(
+  language: string,
+  candidateMethods: MethodSignature[],
+): Promise<string> {
+  // Pack resides in a temporary directory, to not pollute the workspace. Only the path is kept; the cleanup handle returned by dir() is dropped.
+  const packDir = (await dir({ unsafeCleanup: true })).path;
+
+  const syntheticConfigPack = {
+    name: "codeql/automodel-filter",
+    version: "0.0.0",
+    library: true,
+    extensionTargets: {
+      [`codeql/${language}-all`]: "*",
+    },
+    dataExtensions: ["filter.yml"],
+  };
+
+  const qlpackFile = join(packDir, "codeql-pack.yml");
+  await outputFile(qlpackFile, dumpYaml(syntheticConfigPack), "utf8");
+
+  // The predicate has the following definition:
+  // extensible predicate automodelCandidateFilter(string package, string type, string name, string signature)
+  const dataRows = candidateMethods.map((method) => [
+    method.packageName,
+    method.typeName,
+    method.methodName,
+    method.methodParameters,
+  ]);
+
+  const filter = {
+    extensions: [
+      {
+        addsTo: {
+          pack: `codeql/${language}-queries`, // NOTE(review): extensionTargets above names codeql/<language>-all, not -queries; confirm both are intended
+          extensible: "automodelCandidateFilter",
+        },
+        data: dataRows,
+      },
+    ],
+  };
+
+  const filterFile = join(packDir, "filter.yml");
+  await writeFile(filterFile, dumpYaml(filter), "utf8");
+
+  return packDir;
+}
diff --git a/extensions/ql-vscode/src/data-extensions-editor/auto-model-v2.ts b/extensions/ql-vscode/src/data-extensions-editor/auto-model-v2.ts
@@ -4,6 +4,63 @@ import { AutoModelQueriesResult } from "./auto-model-codeml-queries";
 import { assertNever } from "../common/helpers-pure";
 import * as Sarif from "sarif";
 import { gzipEncode } from "../common/zlib";
+import { ExternalApiUsage, MethodSignature } from "./external-api-usage";
+import { ModeledMethod } from "./modeled-method";
+import { groupMethods, sortGroupNames, sortMethods } from "./shared/sorting";
+
+// Soft limit on the number of candidates to send to the model; getCandidates stops collecting once it is reached.
+// Note that the model may return fewer than this number of candidates.
+const candidateLimit = 20;
+/**
+ * Return the candidates that the model should be run on: usages that are neither modeled nor
+ * supported, taken in the sorted order produced by groupMethods/sortGroupNames/sortMethods and
+ * capped at `candidateLimit` entries.
+ * @param mode Application or framework mode; passed to groupMethods when grouping the usages.
+ * @param externalApiUsages all external API usages.
+ * @param modeledMethods the currently modeled methods, looked up by method signature.
+ * @returns the unmodeled, unsupported usages (typed as MethodSignature) that are candidates for modeling.
+ */
+export function getCandidates(
+  mode: Mode,
+  externalApiUsages: ExternalApiUsage[],
+  modeledMethods: Record<string, ModeledMethod>,
+): MethodSignature[] {
+  // Sort the same way as the UI so we send the first ones listed in the UI first
+  const grouped = groupMethods(externalApiUsages, mode);
+  const sortedGroupNames = sortGroupNames(grouped);
+  const sortedExternalApiUsages = sortedGroupNames.flatMap((name) =>
+    sortMethods(grouped[name]),
+  );
+
+  const candidates: MethodSignature[] = [];
+
+  for (const externalApiUsage of sortedExternalApiUsages) {
+    const modeledMethod: ModeledMethod = modeledMethods[
+      externalApiUsage.signature
+    ] ?? {
+      type: "none",
+    };
+
+    // If we have reached the max number of candidates then stop
+    if (candidates.length >= candidateLimit) {
+      break;
+    }
+
+    // Anything that is modeled is not a candidate (a signature with no entry falls back to type "none" above)
+    if (modeledMethod.type !== "none") {
+      continue;
+    }
+
+    // A method that is supported is modeled outside of the model file, so it is not a candidate.
+    if (externalApiUsage.supported) {
+      continue;
+    }
+
+    // The rest are candidates; the usage object itself is pushed, not a copy
+    candidates.push(externalApiUsage);
+  }
+  return candidates;
+}
 
 /**
  * Encode a SARIF log to the format expected by the server: JSON, GZIP-compressed, base64-encoded
diff --git a/extensions/ql-vscode/src/data-extensions-editor/data-extensions-editor-view.ts b/extensions/ql-vscode/src/data-extensions-editor/data-extensions-editor-view.ts
@@ -56,9 +56,10 @@ import { join } from "path";
 import { pickExtensionPack } from "./extension-pack-picker";
 import { getLanguageDisplayName } from "../common/query-language";
 import { runAutoModelQueries } from "./auto-model-codeml-queries";
-import { createAutoModelV2Request } from "./auto-model-v2";
+import { createAutoModelV2Request, getCandidates } from "./auto-model-v2";
 import { load as loadYaml } from "js-yaml";
 import { loadDataExtensionYaml } from "./yaml";
+import { extLogger } from "../common/logging/vscode";
 
 export class DataExtensionsEditorView extends AbstractWebview<
   ToDataExtensionsEditorMessage,
@@ -380,8 +381,22 @@ export class DataExtensionsEditorView extends AbstractWebview<
       let predictedModeledMethods: Record<string, ModeledMethod>;
 
       if (useLlmGenerationV2()) {
+        // Fetch the candidates to send to the model
+        const candidateMethods = getCandidates(
+          this.mode,
+          externalApiUsages,
+          modeledMethods,
+        );
+
+        // If there are no candidates, there is nothing to model and we just return
+        if (candidateMethods.length === 0) {
+          void extLogger.log("No candidates to model. Stopping.");
+          return;
+        }
+
         const usages = await runAutoModelQueries({
           mode: this.mode,
+          candidateMethods,
           cliServer: this.cliServer,
           queryRunner: this.queryRunner,
           queryStorageDir: this.queryStorageDir,
@@ -421,12 +436,33 @@ export class DataExtensionsEditorView extends AbstractWebview<
           filename: "auto-model.yml",
         });
 
-        const modeledMethods = loadDataExtensionYaml(models);
-        if (!modeledMethods) {
+        const loadedMethods = loadDataExtensionYaml(models);
+        if (!loadedMethods) {
           return;
         }
 
-        predictedModeledMethods = modeledMethods;
+        // Any candidate that was not part of the response is a negative result
+        // meaning that the candidate is not a sink for the kinds that the LLM is checking for.
+        // For now we model this as a sink neutral method, however this is subject
+        // to discussion.
+        for (const candidate of candidateMethods) {
+          if (!(candidate.signature in loadedMethods)) {
+            loadedMethods[candidate.signature] = {
+              type: "neutral",
+              kind: "sink",
+              input: "",
+              output: "",
+              provenance: "ai-generated",
+              signature: candidate.signature,
+              packageName: candidate.packageName,
+              typeName: candidate.typeName,
+              methodName: candidate.methodName,
+              methodParameters: candidate.methodParameters,
+            };
+          }
+        }
+
+        predictedModeledMethods = loadedMethods;
       } else {
         const usages = await getAutoModelUsages({
           cliServer: this.cliServer,
diff --git a/extensions/ql-vscode/test/unit-tests/data-extensions-editor/auto-model-v2.test.ts b/extensions/ql-vscode/test/unit-tests/data-extensions-editor/auto-model-v2.test.ts
@@ -1,12 +1,15 @@
 import {
   createAutoModelV2Request,
   encodeSarif,
+  getCandidates,
 } from "../../../src/data-extensions-editor/auto-model-v2";
 import { Mode } from "../../../src/data-extensions-editor/shared/mode";
 import { AutomodelMode } from "../../../src/data-extensions-editor/auto-model-api-v2";
 import { AutoModelQueriesResult } from "../../../src/data-extensions-editor/auto-model-codeml-queries";
 import * as sarif from "sarif";
 import { gzipDecode } from "../../../src/common/zlib";
+import { ExternalApiUsage } from "../../../src/data-extensions-editor/external-api-usage";
+import { ModeledMethod } from "../../../src/data-extensions-editor/modeled-method";
 
 describe("createAutoModelV2Request", () => {
   const createSarifLog = (queryId: string): sarif.Log => {
@@ -80,3 +83,106 @@ describe("createAutoModelV2Request", () => {
     expect(parsed).toEqual(result.candidates);
   });
 });
+
+describe("getCandidates", () => {
+  it("doesnt return methods that are already modelled", () => {
+    const externalApiUsages: ExternalApiUsage[] = [];
+    externalApiUsages.push({
+      library: "my.jar",
+      signature: "org.my.A#x()",
+      packageName: "org.my",
+      typeName: "A",
+      methodName: "x",
+      methodParameters: "()",
+      supported: false,
+      supportedType: "none",
+      usages: [],
+    });
+    const modeledMethods: Record<string, ModeledMethod> = {
+      "org.my.A#x()": {
+        type: "neutral", // any type other than "none" makes getCandidates skip the method
+        kind: "",
+        input: "",
+        output: "",
+        provenance: "manual",
+        signature: "org.my.A#x()",
+        packageName: "org.my",
+        typeName: "A",
+        methodName: "x",
+        methodParameters: "()",
+      },
+    };
+    const candidates = getCandidates(
+      Mode.Application,
+      externalApiUsages,
+      modeledMethods,
+    );
+    expect(candidates.length).toEqual(0);
+  });
+  it("doesnt return methods that are supported from other sources", () => {
+    const externalApiUsages: ExternalApiUsage[] = [];
+    externalApiUsages.push({
+      library: "my.jar",
+      signature: "org.my.A#x()",
+      packageName: "org.my",
+      typeName: "A",
+      methodName: "x",
+      methodParameters: "()",
+      supported: true, // the only field that differs from the positive case below
+      supportedType: "none",
+      usages: [],
+    });
+    const modeledMethods = {};
+    const candidates = getCandidates(
+      Mode.Application,
+      externalApiUsages,
+      modeledMethods,
+    );
+    expect(candidates.length).toEqual(0);
+  });
+  it("return methods that neither modeled nor supported from other sources", () => {
+    const externalApiUsages: ExternalApiUsage[] = [];
+    externalApiUsages.push({
+      library: "my.jar",
+      signature: "org.my.A#x()",
+      packageName: "org.my",
+      typeName: "A",
+      methodName: "x",
+      methodParameters: "()",
+      supported: false,
+      supportedType: "none",
+      usages: [],
+    });
+    const modeledMethods = {}; // no entry for the signature, so the method counts as unmodeled
+    const candidates = getCandidates(
+      Mode.Application,
+      externalApiUsages,
+      modeledMethods,
+    );
+    expect(candidates.length).toEqual(1);
+  });
+  it("respects the limit", () => {
+    const externalApiUsages: ExternalApiUsage[] = [];
+    for (let i = 0; i < 30; i++) { // 30 eligible usages, more than the 20 asserted below
+      externalApiUsages.push({
+        library: "my.jar",
+        signature: `org.my.A#x${i}()`,
+
+        packageName: "org.my",
+        typeName: "A",
+        methodName: `x${i}`,
+        methodParameters: "()",
+        supported: false,
+        supportedType: "none",
+        usages: [],
+      });
+    }
+    const modeledMethods = {};
+    const candidates = getCandidates(
+      Mode.Application,
+      externalApiUsages,
+      modeledMethods,
+    );
+    expect(candidates.length).toEqual(20);
+  });
+});
diff --git a/extensions/ql-vscode/test/vscode-tests/no-workspace/data-extensions-editor/auto-model-codeml-queries.test.ts b/extensions/ql-vscode/test/vscode-tests/no-workspace/data-extensions-editor/auto-model-codeml-queries.test.ts