Skip to content

Commit

Permalink
Adds Structured Path Based Rules (#489)
Browse files Browse the repository at this point in the history
* Adds Experimental XML path parsing

* Add JSON support for the Structured queries

* Fixes

* Add logging to textcontainer

* Implement XPath and JsonPath for string matching rules.

* Removed unused.

* Fix potential double return.

* Remove unused imports

* Remove redundant and non-cached regex builder

* Null checking

* Nit: Typos in test data

Also move the String clause tests to the correct file.

* Allow specification of both a JsonPath and an XML Path

A user potentially would want to detect a similar pattern at a location inside both an XML and a JSON file, so they can provide separate path specifications for each rather than having two nearly identical rules.

* Adds test case for a rule that targets both Json and XML

* Adds validation for JsonPath and XPath arguments
  • Loading branch information
gfs authored Aug 4, 2022
1 parent 76476ba commit 41d733a
Show file tree
Hide file tree
Showing 14 changed files with 646 additions and 50 deletions.
6 changes: 3 additions & 3 deletions AppInspector.RulesEngine/AbstractRuleSet.cs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ public IEnumerable<ConvertedOatRule> GetUniversalRules()
var modifiers = pattern.Modifiers?.ToList() ?? new List<string>();
if (pattern.PatternType is PatternType.String or PatternType.Substring)
{
clauses.Add(new OatSubstringIndexClause(scopes, useWordBoundaries: pattern.PatternType == PatternType.String)
clauses.Add(new OatSubstringIndexClause(scopes, useWordBoundaries: pattern.PatternType == PatternType.String, xPath: pattern.XPath, jsonPath:pattern.JsonPath)
{
Label = clauseNumber.ToString(CultureInfo.InvariantCulture),//important to pattern index identification
Data = new List<string>() { pattern.Pattern },
Expand All @@ -95,7 +95,7 @@ public IEnumerable<ConvertedOatRule> GetUniversalRules()
}
else if (pattern.PatternType == PatternType.Regex)
{
clauses.Add(new OatRegexWithIndexClause(scopes)
clauses.Add(new OatRegexWithIndexClause(scopes, null, pattern.XPath, pattern.JsonPath)
{
Label = clauseNumber.ToString(CultureInfo.InvariantCulture),//important to pattern index identification
Data = new List<string>() { pattern.Pattern },
Expand All @@ -112,7 +112,7 @@ public IEnumerable<ConvertedOatRule> GetUniversalRules()
}
else if (pattern.PatternType == PatternType.RegexWord)
{
clauses.Add(new OatRegexWithIndexClause(scopes)
clauses.Add(new OatRegexWithIndexClause(scopes, null, pattern.XPath, pattern.JsonPath)
{
Label = clauseNumber.ToString(CultureInfo.InvariantCulture),//important to pattern index identification
Data = new List<string>() { $"\\b({pattern.Pattern})\\b" },
Expand Down
1 change: 1 addition & 0 deletions AppInspector.RulesEngine/AppInspector.RulesEngine.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="JsonCons.JsonPath" Version="1.1.0" />
<PackageReference Include="Microsoft.CST.OAT" Version="1.2.19" />
<PackageReference Include="Microsoft.CST.RecursiveExtractor" Version="1.1.11" />
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="6.0.1" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,18 @@ namespace Microsoft.ApplicationInspector.RulesEngine.OatExtensions
{
public class OatRegexWithIndexClause : Clause
{
public OatRegexWithIndexClause(PatternScope[] scopes, string? field = null) : base(Operation.Custom, field)
public OatRegexWithIndexClause(PatternScope[] scopes, string? field = null, string? xPath = null, string? jsonPath = null) : base(Operation.Custom, field)
{
Scopes = scopes;
CustomOperation = "RegexWithIndex";
XPath = xPath;
JsonPath = jsonPath;
}

public string? JsonPath { get; }

public string? XPath { get; }

public PatternScope[] Scopes { get; }
}
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
using System;
using System.Collections;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text.RegularExpressions;
using Microsoft.CST.OAT;
using Microsoft.CST.OAT.Operations;
Expand Down Expand Up @@ -34,7 +36,7 @@ public OatRegexWithIndexOperation(Analyzer analyzer, ILoggerFactory? loggerFacto
OperationDelegate = RegexWithIndexOperationDelegate;
ValidationDelegate = RegexWithIndexValidationDelegate;
}

private static IEnumerable<Violation> RegexWithIndexValidationDelegate(CST.OAT.Rule rule, Clause clause)
{
if (clause.Data?.Count is null or 0)
Expand Down Expand Up @@ -85,30 +87,41 @@ private OperationResult RegexWithIndexOperationDelegate(Clause clause, object? s

if (Analyzer != null)
{
var regex = StringToRegex(string.Join('|', RegexList), regexOpts);

if (regex != null)
foreach (var regexString in RegexList)
{
foreach (var match in regex.Matches(tc.FullContent))
if (StringToRegex(regexString, regexOpts) is { } regex)
{
if (match is Match m)
if (src.XPath is not null)
{
Boundary translatedBoundary = new()
var targets = tc.GetStringFromXPath(src.XPath);
foreach (var target in targets)
{
Length = m.Length,
Index = m.Index
};

//regex patterns will be indexed off data while string patterns result in N clauses
int patternIndex = Convert.ToInt32(clause.Label);

// Should return only scoped matches
if (tc.ScopeMatch(src.Scopes, translatedBoundary))
var matches = GetMatches(regex, target.Item1, tc, clause, src.Scopes);
foreach (var match in matches)
{
match.Item2.Index += target.Item2.Index;
outmatches.Add(match);
}
}
}
if (src.JsonPath is not null)
{
var targets = tc.GetStringFromJsonPath(src.JsonPath);
foreach (var target in targets)
{
outmatches.Add((patternIndex, translatedBoundary));
var matches = GetMatches(regex, target.Item1, tc, clause, src.Scopes);
foreach (var match in matches)
{
match.Item2.Index += target.Item2.Index;
outmatches.Add(match);
}
}
}
}
if (src.JsonPath is null && src.XPath is null)
{
outmatches.AddRange(GetMatches(regex, tc.FullContent, tc, clause, src.Scopes));
}
}
}

var result = src.Invert ? outmatches.Count == 0 : outmatches.Count > 0;
Expand All @@ -117,6 +130,31 @@ private OperationResult RegexWithIndexOperationDelegate(Clause clause, object? s
}
return new OperationResult(false, null);
}

/// <summary>
/// Runs <paramref name="regex"/> over <paramref name="content"/> and yields every match that
/// passes the scope filter of <paramref name="tc"/>.
/// </summary>
/// <param name="regex">The regex to evaluate.</param>
/// <param name="content">The text to search. Yielded boundaries are indexed relative to the start of this string.</param>
/// <param name="tc">The container whose <c>ScopeMatch</c> decides whether a match is kept.</param>
/// <param name="clause">The clause being evaluated; its <c>Label</c> is parsed as the pattern index.</param>
/// <param name="scopes">The scopes handed to <c>ScopeMatch</c>.</param>
/// <returns>Lazily evaluated tuples of (pattern index, match boundary).</returns>
private IEnumerable<(int, Boundary)> GetMatches(Regex regex, string content, TextContainer tc, Clause clause, PatternScope[] scopes)
{
    // Regex patterns are indexed off data while string patterns result in N clauses, so the
    // pattern index comes from the clause label. It is the same for every match, so it is
    // parsed at most once - and only when a match exists, as before.
    int? patternIndex = null;

    foreach (Match m in regex.Matches(content))
    {
        // The label is written with CultureInfo.InvariantCulture, so read it back the same way.
        patternIndex ??= Convert.ToInt32(clause.Label, CultureInfo.InvariantCulture);

        Boundary translatedBoundary = new()
        {
            Length = m.Length,
            Index = m.Index
        };

        // Should return only scoped matches.
        // NOTE(review): the boundary passed here is relative to `content`. When `content` is a
        // fragment rather than the full text, confirm ScopeMatch is given a position it can interpret.
        if (tc.ScopeMatch(scopes, translatedBoundary))
        {
            yield return (patternIndex.Value, translatedBoundary);
        }
    }
}

/// <summary>
/// Converts a string to a compiled regex.
/// Uses an internal cache.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,19 @@ namespace Microsoft.ApplicationInspector.RulesEngine.OatExtensions
{
public class OatSubstringIndexClause : Clause
{
public OatSubstringIndexClause(PatternScope[] scopes, string? field = null, bool useWordBoundaries = false) : base(Operation.Custom, field)
public OatSubstringIndexClause(PatternScope[] scopes, string? field = null, bool useWordBoundaries = false, string? xPath = null, string? jsonPath = null) : base(Operation.Custom, field)
{
Scopes = scopes;
CustomOperation = "SubstringIndex";
UseWordBoundaries = useWordBoundaries;
XPath = xPath;
JsonPath = jsonPath;
}

public string? JsonPath { get; }

public string? XPath { get; }

public PatternScope[] Scopes { get; }

public bool UseWordBoundaries {get;}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using Microsoft.CST.OAT;
using Microsoft.CST.OAT.Operations;
using Microsoft.CST.OAT.Utils;
Expand Down Expand Up @@ -29,7 +30,7 @@ public OatSubstringIndexOperation(Analyzer analyzer, ILoggerFactory? loggerFacto
OperationDelegate = SubstringIndexOperationDelegate;
ValidationDelegate = SubstringIndexValidationDelegate;
}

public static IEnumerable<Violation> SubstringIndexValidationDelegate(CST.OAT.Rule rule, Clause clause)
{
if (clause.Data?.Count is null or 0)
Expand All @@ -51,45 +52,47 @@ public static IEnumerable<Violation> SubstringIndexValidationDelegate(CST.OAT.Ru
/// <param name="state2"></param>
/// <param name="captures"></param>
/// <returns></returns>
public static OperationResult SubstringIndexOperationDelegate(Clause clause, object? state1, object? state2, IEnumerable<ClauseCapture>? captures)
private OperationResult SubstringIndexOperationDelegate(Clause clause, object? state1, object? state2, IEnumerable<ClauseCapture>? captures)
{
var comparisonType = clause.Arguments.Contains("i") ? StringComparison.InvariantCultureIgnoreCase : StringComparison.InvariantCulture;
if (state1 is TextContainer tc && clause is OatSubstringIndexClause src)
{
if (clause.Data is List<string> stringList && stringList.Count > 0)
if (clause.Data is { Count: > 0 } stringList)
{
var outmatches = new List<(int, Boundary)>();//tuple results i.e. pattern index and where

for (int i = 0; i < stringList.Count; i++)
{
var idx = tc.FullContent.IndexOf(stringList[i], comparisonType);
while (idx != -1)
if (src.XPath is not null)
{
bool skip = false;
if (src.UseWordBoundaries)
var targets = tc.GetStringFromXPath(src.XPath);
foreach (var target in targets)
{
if (idx > 0 && char.IsLetterOrDigit(tc.FullContent[idx - 1]))
{
skip = true;
}
if (idx + stringList[i].Length < tc.FullContent.Length && char.IsLetterOrDigit(tc.FullContent[idx + stringList[i].Length]))
var matches = GetMatches(target.Item1, stringList[i], comparisonType, tc, src);
foreach (var match in matches)
{
skip = true;
match.Index += target.Item2.Index;
outmatches.Add((i,match));
}
}
if (!skip)
}
if (src.JsonPath is not null)
{
var targets = tc.GetStringFromJsonPath(src.JsonPath);
foreach (var target in targets)
{
Boundary newBoundary = new()
{
Length = stringList[i].Length,
Index = idx
};
if (tc.ScopeMatch(src.Scopes, newBoundary))
var matches = GetMatches(target.Item1, stringList[i], comparisonType, tc, src);
foreach (var match in matches)
{
outmatches.Add((i, newBoundary));
match.Index += target.Item2.Index;
outmatches.Add((i,match));
}
}
idx = tc.FullContent.IndexOf(stringList[i], idx + stringList[i].Length, comparisonType);
}
if (src.JsonPath is null && src.XPath is null)
{
var matches = GetMatches(tc.FullContent, stringList[i], comparisonType, tc, src);
outmatches.AddRange(matches.Select(x => (i, x)));
}
}

Expand All @@ -99,5 +102,38 @@ public static OperationResult SubstringIndexOperationDelegate(Clause clause, obj
}
return new OperationResult(false, null);
}

/// <summary>
/// Finds every occurrence of <paramref name="query"/> in <paramref name="target"/> and yields a
/// boundary for each occurrence that passes the word-boundary and scope filters.
/// </summary>
/// <param name="target">The text to search. Yielded boundaries are indexed relative to the start of this string.</param>
/// <param name="query">The substring to look for. An empty query yields no matches.</param>
/// <param name="comparisonType">The comparison used for every <c>IndexOf</c> call.</param>
/// <param name="tc">The container whose <c>ScopeMatch</c> decides whether a match is kept.</param>
/// <param name="src">The clause supplying <c>UseWordBoundaries</c> and <c>Scopes</c>.</param>
/// <returns>Lazily evaluated boundaries of the accepted occurrences.</returns>
private static IEnumerable<Boundary> GetMatches(string target, string query, StringComparison comparisonType, TextContainer tc, OatSubstringIndexClause src)
{
    // An empty query is "found" at every start index and the search position would never
    // advance (idx + 0), so the loop below would never terminate. Treat it as no match.
    if (string.IsNullOrEmpty(query))
    {
        yield break;
    }

    var idx = target.IndexOf(query, comparisonType);
    while (idx != -1)
    {
        var end = idx + query.Length;
        var accepted = true;
        if (src.UseWordBoundaries)
        {
            // Reject the occurrence when a letter or digit touches it on either side.
            var letterBefore = idx > 0 && char.IsLetterOrDigit(target[idx - 1]);
            var letterAfter = end < target.Length && char.IsLetterOrDigit(target[end]);
            accepted = !(letterBefore || letterAfter);
        }

        if (accepted)
        {
            Boundary newBoundary = new()
            {
                Length = query.Length,
                Index = idx
            };
            // NOTE(review): the boundary passed here is relative to `target`. When `target` is a
            // fragment rather than the full text, confirm ScopeMatch is given a position it can interpret.
            if (tc.ScopeMatch(src.Scopes, newBoundary))
            {
                yield return newBoundary;
            }
        }

        // Continue after the current occurrence; overlapping occurrences are not reported.
        idx = target.IndexOf(query, end, comparisonType);
    }
}
}
}
5 changes: 5 additions & 0 deletions AppInspector.RulesEngine/Resources/languages.json
Original file line number Diff line number Diff line change
Expand Up @@ -253,5 +253,10 @@
"name": "Package.appxmanifest",
"extensions": [ ".appxmanifest" ],
"type": "code"
},
{
"name": "XML",
"extensions": [ ".xml" ],
"type": "build"
}
]
4 changes: 2 additions & 2 deletions AppInspector.RulesEngine/RuleProcessor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ List<MatchRecord> ProcessBoundary(ClauseCapture cap)
/// <returns>A List of the matches against the Rules the processor is configured with.</returns>
public List<MatchRecord> AnalyzeFile(string contents, FileEntry fileEntry, LanguageInfo languageInfo, IEnumerable<string>? tagsToIgnore = null, int numLinesContext = 3)
{
TextContainer textContainer = new(contents, languageInfo.Name, _languages);
TextContainer textContainer = new(contents, languageInfo.Name, _languages, _opts.LoggerFactory?.CreateLogger<TextContainer>() ?? NullLogger<TextContainer>.Instance);
return AnalyzeFile(textContainer, fileEntry, languageInfo, tagsToIgnore, numLinesContext);
}

Expand Down Expand Up @@ -291,7 +291,7 @@ public async Task<List<MatchRecord>> AnalyzeFileAsync(FileEntry fileEntry, Langu

using var sr = new StreamReader(fileEntry.Content);

TextContainer textContainer = new(await sr.ReadToEndAsync().ConfigureAwait(false), languageInfo.Name, _languages);
TextContainer textContainer = new(await sr.ReadToEndAsync().ConfigureAwait(false), languageInfo.Name, _languages, _opts.LoggerFactory?.CreateLogger<TextContainer>() ?? NullLogger<TextContainer>.Instance);
foreach (var ruleCapture in analyzer.GetCaptures(rules, textContainer))
{
if (cancellationToken?.IsCancellationRequested is true)
Expand Down
28 changes: 28 additions & 0 deletions AppInspector.RulesEngine/RulesVerifier.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using System.Xml.XPath;
using JsonCons.JsonPath;
using Microsoft.ApplicationInspector.Common;
using Microsoft.ApplicationInspector.RulesEngine.OatExtensions;
using Microsoft.CST.OAT;
Expand Down Expand Up @@ -152,6 +154,32 @@ public RuleStatus CheckIntegrity(ConvertedOatRule convertedOatRule)
errors.Add(MsgHelp.FormatString(MsgHelp.ID.VERIFY_RULES_REGEX_FAIL, rule.Id ?? "", searchPattern.Pattern ?? "", e.Message));
}
}

// Validate the optional JsonPath by parsing it; the parsed selector itself is discarded.
if (searchPattern.JsonPath is not null)
{
try
{
_ = JsonSelector.Parse(searchPattern.JsonPath);
}
catch (Exception e)
{
// Any exception from parsing is treated as an invalid path: it is logged and recorded in errors.
_logger?.LogError("The provided JsonPath '{JsonPath}' value was not valid in Rule {Id} : {message}", searchPattern.JsonPath, rule.Id, e.Message);
errors.Add(string.Format("The provided JsonPath '{0}' value was not valid in Rule {1} : {2}", searchPattern.JsonPath, rule.Id, e.Message));
}
}

// Validate the optional XPath by compiling it; the compiled expression itself is discarded.
if (searchPattern.XPath is not null)
{
    try
    {
        XPathExpression.Compile(searchPattern.XPath);
    }
    catch (Exception e)
    {
        // Any exception from compiling is treated as an invalid path: it is logged and recorded in errors.
        _logger?.LogError("The provided XPath '{XPath}' value was not valid in Rule {Id} : {message}", searchPattern.XPath, rule.Id, e.Message);
        // Report the XPath that failed (previously the JsonPath was formatted here by mistake).
        errors.Add(string.Format("The provided XPath '{0}' value was not valid in Rule {1} : {2}", searchPattern.XPath, rule.Id, e.Message));
    }
}
}

// validate conditions
Expand Down
Loading

0 comments on commit 41d733a

Please sign in to comment.