|
| 1 | +import { ExternalApiUsage } from "./external-api-usage"; |
| 2 | +import { ModeledMethod, ModeledMethodType } from "./modeled-method"; |
| 3 | +import { |
| 4 | + Classification, |
| 5 | + ClassificationType, |
| 6 | + Method, |
| 7 | + ModelRequest, |
| 8 | +} from "./auto-model-api"; |
| 9 | + |
/**
 * Builds the request sent to the auto-model API from the external API usages
 * and the models that already exist for them.
 *
 * One entry is produced per positional argument of each method
 * (`Argument[0]`, `Argument[1]`, ...). A method whose parameter list is `()`
 * therefore produces no entries. Entries for methods without an existing model
 * become candidates; entries for already-modeled methods become samples.
 *
 * @param language Language identifier, copied into the request unchanged.
 * @param externalApiUsages Usages to include. The array is not mutated.
 * @param modeledMethods Existing models, looked up by `ExternalApiUsage.signature`.
 * @returns A request holding at most 20 candidates and at most 100 samples,
 *   with the most used methods first.
 */
export function createAutoModelRequest(
  language: string,
  externalApiUsages: ExternalApiUsage[],
  modeledMethods: Record<string, ModeledMethod>,
): ModelRequest {
  // Upper bounds on what is sent in a single request.
  const maxCandidates = 20;
  const maxSamples = 100;
  const maxUsagesPerMethod = 10;

  const request: ModelRequest = {
    language,
    samples: [],
    candidates: [],
  };

  // Sort a copy by number of usages so we always send the most used methods first
  const usagesByFrequency = [...externalApiUsages].sort(
    (a, b) => b.usages.length - a.usages.length,
  );

  for (const externalApiUsage of usagesByFrequency) {
    // Once both lists are full, nothing later in the ordering can be included.
    if (
      request.candidates.length >= maxCandidates &&
      request.samples.length >= maxSamples
    ) {
      break;
    }

    const modeledMethod: ModeledMethod = modeledMethods[
      externalApiUsage.signature
    ] ?? {
      type: "none",
    };

    // Methods without a model are what we ask about; modeled ones serve as samples.
    const isCandidate = modeledMethod.type === "none";
    const destination = isCandidate ? request.candidates : request.samples;
    const limit = isCandidate ? maxCandidates : maxSamples;

    // The parameter list is a parenthesized, comma-separated string; "()" means no parameters.
    const numberOfArguments =
      externalApiUsage.methodParameters === "()"
        ? 0
        : externalApiUsage.methodParameters.split(",").length;

    // Only build entries that will actually be kept, instead of building
    // everything and truncating afterwards.
    for (
      let argumentIndex = 0;
      argumentIndex < numberOfArguments && destination.length < limit;
      argumentIndex++
    ) {
      const method: Method = {
        package: externalApiUsage.packageName,
        type: externalApiUsage.typeName,
        name: externalApiUsage.methodName,
        signature: externalApiUsage.methodParameters,
        classification: isCandidate
          ? undefined
          : toMethodClassification(modeledMethod),
        usages: externalApiUsage.usages
          .slice(0, maxUsagesPerMethod)
          .map((usage) => usage.label),
        input: `Argument[${argumentIndex}]`,
      };

      destination.push(method);
    }
  }

  return request;
}
| 70 | + |
/**
 * Turns per-argument predictions into one modeled method per full method signature.
 *
 * The model is deliberately simplified for now: only sinks are recognized. Since
 * neutrals are not modeled, we cannot really tell that a method is neutral (it might
 * still be a source or a summary). To keep things simple and still show something to
 * the user, a method with classified predictions but no argument predicted as a sink
 * is reported as neutral.
 *
 * When several arguments of the same method are predicted as sinks, only the one
 * that sorts first according to `compareInputOutput` is kept.
 *
 * Predictions without a classification are ignored entirely.
 */
export function parsePredictedClassifications(
  predicted: Method[],
): Record<string, ModeledMethod> {
  // Bucket the classified predictions by full signature, in order of first appearance.
  const groups = new Map<string, Method[]>();
  for (const prediction of predicted) {
    if (!prediction.classification) {
      continue;
    }

    const key = toFullMethodSignature(prediction);
    const bucket = groups.get(key);
    if (bucket === undefined) {
      groups.set(key, [prediction]);
    } else {
      bucket.push(prediction);
    }
  }

  const result: Record<string, ModeledMethod> = {};

  for (const [key, bucket] of groups) {
    const sinkPredictions = bucket.filter(
      (candidate) => candidate.classification?.type === ClassificationType.Sink,
    );

    // Order by input so the choice is deterministic: if we get back both
    // "Argument[1]" and "Argument[3]", "Argument[1]" always wins.
    sinkPredictions.sort((left, right) =>
      compareInputOutput(left.input ?? "", right.input ?? ""),
    );

    const chosen = sinkPredictions[0];

    if (chosen === undefined) {
      // No argument was predicted as a sink: report the method as neutral for now.
      result[key] = {
        type: "neutral",
        kind: "",
        input: "",
        output: "",
      };
    } else {
      result[key] = {
        type: "sink",
        kind: chosen.classification?.kind ?? "",
        input: chosen.input ?? "",
        output: chosen.output ?? "",
      };
    }
  }

  return result;
}
| 133 | + |
/**
 * Maps a modeled-method type onto the corresponding `ClassificationType`.
 *
 * "source", "sink", "summary" and "neutral" each have a dedicated counterpart;
 * every other value falls back to `ClassificationType.Unknown`.
 */
function toMethodClassificationType(
  type: ModeledMethodType,
): ClassificationType {
  if (type === "source") {
    return ClassificationType.Source;
  }
  if (type === "sink") {
    return ClassificationType.Sink;
  }
  if (type === "summary") {
    return ClassificationType.Summary;
  }
  if (type === "neutral") {
    return ClassificationType.Neutral;
  }
  return ClassificationType.Unknown;
}
| 150 | + |
/**
 * Builds the `Classification` for an existing modeled method: the type is
 * converted with `toMethodClassificationType`, the kind is carried over, and
 * the explanation is left empty.
 */
function toMethodClassification(modeledMethod: ModeledMethod): Classification {
  const classificationType = toMethodClassificationType(modeledMethod.type);
  const classification: Classification = {
    type: classificationType,
    kind: modeledMethod.kind,
    explanation: "",
  };
  return classification;
}
| 158 | + |
/**
 * Formats a method as `<package>.<type>#<name><signature>`, the key used to
 * group predictions that belong to the same method.
 */
function toFullMethodSignature(method: Method): string {
  const { package: packageName, type: typeName, name, signature } = method;
  const qualifiedType = `${packageName}.${typeName}`;
  return `${qualifiedType}#${name}${signature}`;
}
| 162 | + |
// Matches a positional argument such as `Argument[0]`; the capture group is the index.
const argumentRegex = /^Argument\[(\d+)]$/;

// Argument[this] is before ReturnValue
const nonNumericArgumentOrder = ["Argument[this]", "ReturnValue"];

/**
 * Compares two inputs or outputs of the form `Argument[<number>]`, `Argument[this]`,
 * or `ReturnValue`, for use as a sort comparator.
 *
 * Ordering rules:
 * 1. Numeric arguments come first, ordered by their numeric index
 *    (so `Argument[2]` is before `Argument[10]`).
 * 2. Then `Argument[this]`, then `ReturnValue`.
 * 3. Any other value is sorted last; two such values are ordered with `localeCompare`.
 *
 * @param a The first input/output string.
 * @param b The second input/output string.
 * @returns 0 if they are the same, a negative number if `a` sorts before `b`,
 *   and a positive number if `a` sorts after `b`.
 */
export function compareInputOutput(a: string, b: string): number {
  if (a === b) {
    return 0;
  }

  const aMatch = a.match(argumentRegex);
  const bMatch = b.match(argumentRegex);

  // Both are numeric arguments: order by index. Handling this case first lets
  // the compiler narrow both matches, so no unreachable fallback is needed.
  if (aMatch && bMatch) {
    return Number.parseInt(aMatch[1], 10) - Number.parseInt(bMatch[1], 10);
  }

  // Exactly one is a numeric argument: numeric arguments are always first.
  if (aMatch) {
    return -1;
  }
  if (bMatch) {
    return 1;
  }

  // Neither is a numeric argument.
  const aIndex = nonNumericArgumentOrder.indexOf(a);
  const bIndex = nonNumericArgumentOrder.indexOf(b);

  // If either one is unknown, it is sorted last
  if (aIndex === -1 && bIndex === -1) {
    return a.localeCompare(b);
  }
  if (aIndex === -1) {
    return 1;
  }
  if (bIndex === -1) {
    return -1;
  }

  return aIndex - bIndex;
}
0 commit comments