feat: Add a validate() method.
vxern committed Jan 8, 2023
1 parent 1da130b commit 6b8913a
Showing 10 changed files with 221 additions and 28 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
@@ -1,3 +1,8 @@
## 2.1.0

- Added a method `.validate()` for validating files.
- Renamed `parser.dart` to `robots.dart`.

## 2.0.1

- Converted the `onlyApplicableTo` parameter in `Robots.parse()` from a `String`
@@ -8,8 +13,9 @@
## 2.0.0

- Additions:
- Added developer dependencies:
- Added dependencies:
- `meta` for static analysis.
- Added developer dependencies:
- `test` for testing.
- Added support for the 'Sitemap' field.
- Added support for specifying:
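To give the 2.1.0 entry above some context, here is a minimal sketch of how the new `validate()` method can be combined with the existing `parse()` API. The `robots.txt` contents are placeholders invented for illustration and are not part of this commit.

import 'package:robots_txt/robots_txt.dart';

void main() {
  // Placeholder file contents, used only for illustration.
  const contents = '''
User-agent: *
Disallow: /private/
Allow: /private/overview.txt
''';

  // `validate()` throws a `FormatException` on malformed input, whereas
  // `parse()` never throws and simply skips anything it cannot read.
  try {
    Robots.validate(contents);
  } on FormatException catch (exception) {
    print('Invalid file: $exception');
    return;
  }

  final robots = Robots.parse(contents);
  print(robots.verifyCanAccess('/private/overview.txt', userAgent: '*'));
}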
12 changes: 9 additions & 3 deletions example/example.dart → example/parse_example.dart
@@ -12,7 +12,7 @@ Future<void> main() async {
// Print the rulesets.
for (final ruleset in robots.rulesets) {
// Print the user-agent this ruleset applies to.
print(ruleset.userAgent);
print('User-agent: ${ruleset.userAgent}');

if (ruleset.allows.isNotEmpty) {
print('Allowed:');
@@ -33,10 +33,16 @@ Future<void> main() async {
}
}

const userAgent = 'wordcollector';

// False: it cannot.
print(robots.verifyCanAccess('/gist/', userAgent: '*'));
print(
"Can '$userAgent' access /gist/? ${robots.verifyCanAccess('/gist/', userAgent: userAgent)}",
);
// True: it can.
print(robots.verifyCanAccess('/wordcollector/robots_txt', userAgent: '*'));
print(
"Can '$userAgent' access /wordcollector/robots_txt/? ${robots.verifyCanAccess('/wordcollector/robots_txt/', userAgent: userAgent)}",
);
}

Future<String> fetchFileContents({required String host}) async {
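The body of `fetchFileContents()` is collapsed in the diff above. Purely as a hypothetical sketch (not the author's implementation), a helper with that signature could be written with `dart:io` along these lines:

import 'dart:convert';
import 'dart:io';

// Hypothetical helper for illustration only; the real body is collapsed in
// the diff above.
Future<String> fetchFileContents({required String host}) async {
  final client = HttpClient();
  try {
    // Fetch `https://<host>/robots.txt` and decode the response as UTF-8.
    final request = await client.getUrl(Uri.https(host, '/robots.txt'));
    final response = await request.close();
    return response.transform(utf8.decoder).join();
  } finally {
    client.close();
  }
}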
50 changes: 50 additions & 0 deletions example/validate_example.dart
@@ -0,0 +1,50 @@
import 'package:robots_txt/robots_txt.dart';

Future<void> main() async {
// Validating an invalid file will throw a `FormatException`.
try {
Robots.validate('This is obviously an invalid robots.txt file.');
} on FormatException {
print('As expected, the first file is flagged as invalid.');
}

// Validating an already valid file.
try {
Robots.validate('''
User-agent: *
Disallow: /
Allow: /file.txt
Sitemap: https://example.com/sitemap.xml
''');
print('As expected also, the second file is not flagged as invalid.');
} on FormatException {
print('Welp, this was not supposed to happen.');
}

late final String contentsFromBefore;

// Validating a file with unsupported fields.
try {
Robots.validate(
contentsFromBefore = '''
User-agent: *
Some-field: abcd.txt
''',
);
} on FormatException {
print(
'This file is invalid on the grounds that it contains fields we did not '
'expect it to have.',
);
print(
"Let's fix that by including the custom field in the call to validate().",
);
try {
Robots.validate(contentsFromBefore, allowedFieldNames: {'Some-field'});
print('Aha! Now there are no issues.');
} on FormatException {
print('Welp, this also was not supposed to happen.');
}
}
}
2 changes: 1 addition & 1 deletion lib/robots_txt.dart
@@ -1,6 +1,6 @@
/// Lightweight, fully documented `robots.txt` file parser.
library robots_txt;

export 'src/parser.dart' show Robots, PrecedentRuleType, FieldType;
export 'src/robots.dart' show Robots, PrecedentRuleType, FieldType;
export 'src/rule.dart' show Rule, FindRule, Precedence, PrecedenceStrategy;
export 'src/ruleset.dart' show Ruleset, FindRuleInRuleset;
114 changes: 95 additions & 19 deletions lib/src/parser.dart → lib/src/robots.dart
@@ -3,7 +3,20 @@ import 'package:meta/meta.dart';
import 'package:robots_txt/src/rule.dart';
import 'package:robots_txt/src/ruleset.dart';

/// Taking a set of [allowedFieldNames], builds a regular expression that
/// matches only valid `robots.txt` files.
@internal
RegExp buildValidFilePattern({Set<String> allowedFieldNames = const {}}) {
final fieldNameExpression =
[FieldType.defaultFieldNameExpression, ...allowedFieldNames].join('|');

return RegExp(
'^(?:(?:(?:$fieldNameExpression):(?:.+))(?:\\s*(?:#.*)?)\n?){0,}\$',
);
}

/// Defines a regular expression pattern that matches comments.
@internal
final commentPattern = RegExp('#.*');

/// Stores information about a `robots.txt` file, exposing a simple and concise
@@ -25,31 +38,67 @@ class Robots {
const Robots._construct({required this.rulesets, required this.sitemaps});

/// Parses the contents of a `robots.txt` file, creating an instance of
/// `Robots`. If [onlyApplicableTo] is specified, the parser will ignore any
/// rulesets that do not apply to it.
/// `Robots`.
///
/// If [onlyApplicableTo] is specified, the parser will ignore user-agents
/// that are not included within it.
///
/// This function will never throw an exception.
  /// This function will never throw an exception. If you wish to validate a
  /// file, use `Robots.validate()` instead.
factory Robots.parse(
String contents, {
Set<String>? onlyApplicableTo,
}) {
contents = contents.replaceAll(commentPattern, '');
}) =>
Robots._parse(contents, onlyApplicableTo: onlyApplicableTo);

if (contents.trim().isEmpty) {
return Robots._empty;
  /// Taking the contents of a `robots.txt` file, ensures that the file is
  /// valid, and throws a `FormatException` if not.
///
/// By default, this function will only accept the following fields:
/// - User-agent
/// - Allow
/// - Disallow
/// - Sitemap
///
/// To accept custom fields, simply specify them in the [allowedFieldNames]
/// parameter.
static void validate(
String contents, {
Set<String> allowedFieldNames = const {},
}) {
final validFilePattern =
buildValidFilePattern(allowedFieldNames: allowedFieldNames);
if (!validFilePattern.hasMatch(contents)) {
throw const FormatException(
'The file is not a valid `robots.txt` file.',
);
}

final lines = contents.split('\n').where((line) => line.isNotEmpty);

return Robots._fromLines(lines, onlyApplicableTo: onlyApplicableTo);
Robots._parse(contents, throwOnError: true);
}

/// Iterates over [lines] and sequentially parses each ruleset, optionally
/// ignoring those rulesets which are not relevant to [onlyApplicableTo].
factory Robots._fromLines(
Iterable<String> lines, {
/// Splits [contents] into lines and iterates over them, sequentially parsing
/// each field, optionally ignoring those user-agents that are not found in
/// [onlyApplicableTo].
///
/// If [throwOnError] is `true`, this function will re-throw errors caught
/// during parsing.
factory Robots._parse(
String contents, {
Set<String>? onlyApplicableTo,
bool throwOnError = false,
}) {
final List<String> lines;
{
contents = contents.replaceAll(commentPattern, '');

if (contents.trim().isEmpty) {
return Robots._empty;
}

lines = contents.split('\n').where((line) => line.isNotEmpty).toList();
}

final rulesets = <Ruleset>[];
final sitemaps = <String>[];

@@ -80,7 +129,7 @@ class Robots {

late FieldType previousType;
for (var index = 0; index < lines.length; index++) {
final field = _getFieldFromLine(lines.elementAt(index));
final field = _getFieldFromLine(lines[index]);
if (field == null) {
continue;
}
@@ -112,8 +161,12 @@
final RegExp pattern;
try {
pattern = _convertPathToRegExp(field.value);
} on FormatException {
break;
} on FormatException catch (exception) {
if (throwOnError) {
throw wrapFormatException(exception, field.value, index);
} else {
break;
}
}
disallows.add(
Rule(
@@ -131,8 +184,12 @@
final RegExp pattern;
try {
pattern = _convertPathToRegExp(field.value);
} on FormatException {
break;
} on FormatException catch (exception) {
if (throwOnError) {
throw wrapFormatException(exception, field.value, index);
} else {
break;
}
}
allows.add(
Rule(
@@ -221,6 +278,21 @@
}
}

/// Taking an [exception], a [line] and the [index] of that line, creates a
/// more informative `FormatException`.
@internal
FormatException wrapFormatException(
Exception exception,
String line,
int index,
) =>
FormatException('''
Line $index of the file, defined as
$line
is invalid:
$exception
''');

/// Describes the type of a rule.
@internal
enum RuleType {
@@ -271,6 +343,10 @@ enum FieldType {
/// Contains the field types that introduce rules.
static const rules = [FieldType.allow, FieldType.disallow];

/// A partial regular expression defining a union of the default field names.
static final defaultFieldNameExpression =
FieldType.values.map((value) => value.key).join('|');

/// Constructs a `FieldType`.
const FieldType({required this.key, required this.example});

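To illustrate the expression that `buildValidFilePattern()` produces above, the following self-contained sketch re-creates the same pattern by hand for the default field names (assumed here to be User-agent, Allow, Disallow and Sitemap, as listed in the `validate()` documentation) rather than calling the internal helper:

void main() {
  // Hand-written equivalent of the pattern built by `buildValidFilePattern()`
  // for the assumed default field names; shown for illustration only.
  const fieldNameExpression = 'User-agent|Allow|Disallow|Sitemap';
  final validFilePattern = RegExp(
    '^(?:(?:(?:$fieldNameExpression):(?:.+))(?:\\s*(?:#.*)?)\n?){0,}\$',
  );

  // Matches: every line uses a recognised field.
  print(validFilePattern.hasMatch('User-agent: *\nDisallow: /\n'));

  // Does not match: 'Some-field' is not among the recognised field names.
  print(validFilePattern.hasMatch('User-agent: *\nSome-field: abcd.txt\n'));
}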
2 changes: 1 addition & 1 deletion lib/src/ruleset.dart
@@ -1,6 +1,6 @@
import 'package:meta/meta.dart';

import 'package:robots_txt/src/parser.dart';
import 'package:robots_txt/src/robots.dart';
import 'package:robots_txt/src/rule.dart';

/// A collection of `Rule`s, and the `user-agent` they are relevant to inside
2 changes: 1 addition & 1 deletion pubspec.yaml
@@ -1,5 +1,5 @@
name: robots_txt
version: 2.0.1
version: 2.1.0

description: A complete, dependency-less and fully documented `robots.txt` ruleset parser.

7 changes: 7 additions & 0 deletions test/contents_definitions.dart → test/samples.dart
@@ -6,6 +6,13 @@ const emptyContents = '';
/// Invalid `robots.txt` contents.
const invalidContents = 'This is an invalid robots.txt file.';

/// Valid `robots.txt` file containing all supported fields with example values,
/// plus an extra field named 'Field'.
final validContentsWithCustomFieldName = [
...FieldType.values.map((value) => value.toField()),
'Field: unknown',
].join('\n');

/// Valid `robots.txt` file with an invalid disallow field.
final validContentsInvalidPattern = '''
${FieldType.userAgent.toField('A')}
4 changes: 2 additions & 2 deletions test/parser_test.dart → test/tests/parser_test.dart
@@ -2,7 +2,7 @@ import 'package:test/test.dart';

import 'package:robots_txt/robots_txt.dart';

import 'contents_definitions.dart';
import '../samples.dart';

void main() {
late Robots robots;
@@ -49,7 +49,7 @@ void main() {
});
});

group('logical rules', () {
group('rules with logical applicability', () {
test('defined without a user agent.', () {
expect(
() => robots = Robots.parse(rulesWithoutUserAgent),
48 changes: 48 additions & 0 deletions test/tests/validator_test.dart
@@ -0,0 +1,48 @@
import 'package:test/test.dart';

import 'package:robots_txt/robots_txt.dart';

import '../samples.dart';

void main() {
group('The validator correctly deals with', () {
test('an empty file.', () {
expect(() => Robots.validate(emptyContents), returnsNormally);
});

test('an invalid file.', () {
expect(() => Robots.validate(invalidContents), throwsFormatException);
});

test('a valid file with a custom field name not accounted for.', () {
expect(
() => Robots.validate(validContentsWithCustomFieldName),
throwsFormatException,
);
});

test('a valid file with a custom field name.', () {
expect(
() => Robots.validate(
validContentsWithCustomFieldName,
allowedFieldNames: {'Field'},
),
returnsNormally,
);
});

test('a valid file with an invalid path definition.', () {
expect(
() => Robots.validate(validContentsInvalidPattern),
throwsFormatException,
);
});

test('a valid file.', () {
expect(
() => Robots.validate(validContentsValidPattern),
returnsNormally,
);
});
});
}
