diff --git a/README.md b/README.md index 2ed8398..091ff70 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,9 @@ The most common source of these addresses outside of Terraform Core is JSON representation of state, plan, or schemas as obtained via [`hashicorp/terraform-exec`](https://github.com/hashicorp/terraform-exec). -## Example +## Parsing Provider Addresses + +### Example ```go p, err := ParseRawProviderSourceString("hashicorp/aws") @@ -23,7 +25,7 @@ if err != nil { // } ``` -## Legacy address +### Legacy address A legacy address is by itself (without more context) ambiguous. For example `aws` may represent either the official `hashicorp/aws` @@ -36,7 +38,7 @@ the address was produced by an affected version. If you do not have that context you should parse the string via `ParseRawProviderSourceString` and then check `addr.IsLegacy()`. -### What to do with a legacy address? +#### What to do with a legacy address? Ask the Registry API whether and where the provider was moved to @@ -70,7 +72,7 @@ If you cache results (which you should), ensure you have invalidation mechanism in place because target (migrated) namespace may change. Hard-coding migrations anywhere in code is strongly discouraged. -### `terraform` provider +#### `terraform` provider Like any other legacy address `terraform` is also ambiguous. Such address may (most unlikely) represent a custom-built provider called `terraform`, @@ -86,3 +88,24 @@ i.e. assume all of its logic including schema is contained within Terraform Core. In such case you should just use `NewBuiltInProvider("terraform")`. 
+ +## Parsing Module Addresses + +### Example + +```go +registry, err := ParseRawModuleSourceRegistry("hashicorp/subnets/cidr") +if err != nil { + // deal with error +} + +// registry == ModuleSourceRegistry{ +// PackageAddr: ModuleRegistryPackage{ +// Host: svchost.Hostname("registry.terraform.io"), +// Namespace: "hashicorp", +// Name: "subnets", +// TargetSystem: "cidr", +// }, +// Subdir: "", +// } +``` diff --git a/module.go b/module.go new file mode 100644 index 0000000..f90279e --- /dev/null +++ b/module.go @@ -0,0 +1,241 @@ +package tfaddr + +import ( + "fmt" + "path" + "regexp" + "strings" + + svchost "github.com/hashicorp/terraform-svchost" +) + +// ModuleSourceRegistry represents a module listed in a Terraform module +// registry. +type ModuleSourceRegistry struct { + // PackageAddr is the registry package that the target module belongs to. + // The module installer must translate this into a ModuleSourceRemote + // using the registry API and then take that underlying address's + // PackageAddr in order to find the actual package location. + PackageAddr ModuleRegistryPackage + + // If Subdir is non-empty then it represents a sub-directory within the + // remote package that the registry address eventually resolves to. + // This will ultimately become the suffix of the Subdir of the + // ModuleSourceRemote that the registry address translates to. + // + // Subdir uses a normalized forward-slash-based path syntax within the + // virtual filesystem represented by the final package. It will never + // include `../` or `./` sequences. + Subdir string +} + +// DefaultModuleRegistryHost is the hostname used for registry-based module +// source addresses that do not have an explicit hostname. 
+const DefaultModuleRegistryHost = svchost.Hostname("registry.terraform.io") + +var moduleRegistryNamePattern = regexp.MustCompile("^[0-9A-Za-z](?:[0-9A-Za-z-_]{0,62}[0-9A-Za-z])?$") +var moduleRegistryTargetSystemPattern = regexp.MustCompile("^[0-9a-z]{1,64}$") + +// ParseRawModuleSourceRegistry only accepts module registry addresses, and +// will reject any other address type or a subdirectory escaping the package. +func ParseRawModuleSourceRegistry(raw string) (ModuleSourceRegistry, error) { + var err error + + var subDir string + raw, subDir = splitPackageSubdir(raw) + if subDir == ".." || strings.HasPrefix(subDir, "../") { // path.Clean can yield a bare ".." + return ModuleSourceRegistry{}, fmt.Errorf("subdirectory path %q leads outside of the module package", subDir) + } + + parts := strings.Split(raw, "/") + // A valid registry address has either three or four parts, because the + // leading hostname part is optional. + if len(parts) != 3 && len(parts) != 4 { + return ModuleSourceRegistry{}, fmt.Errorf("a module registry source address must have either three or four slash-separated components") + } + + host := DefaultModuleRegistryHost + if len(parts) == 4 { + host, err = svchost.ForComparison(parts[0]) + if err != nil { + // The svchost library doesn't produce very good error messages to + // return to an end-user, so we'll use some custom ones here. + switch { + case strings.Contains(parts[0], "--"): + // Looks like possibly punycode, which we don't allow here + // to ensure that source addresses are written readably. 
+ return ModuleSourceRegistry{}, fmt.Errorf("invalid module registry hostname %q; internationalized domain names must be given as direct unicode characters, not in punycode", parts[0]) + default: + return ModuleSourceRegistry{}, fmt.Errorf("invalid module registry hostname %q", parts[0]) + } + } + if !strings.Contains(host.String(), ".") { + return ModuleSourceRegistry{}, fmt.Errorf("invalid module registry hostname: must contain at least one dot") + } + // Discard the hostname prefix now that we've processed it + parts = parts[1:] + } + + ret := ModuleSourceRegistry{ + PackageAddr: ModuleRegistryPackage{ + Host: host, + }, + + Subdir: subDir, + } + + if host == svchost.Hostname("github.com") || host == svchost.Hostname("bitbucket.org") { + return ret, fmt.Errorf("can't use %q as a module registry host, because it's reserved for installing directly from version control repositories", host) + } + + if ret.PackageAddr.Namespace, err = parseModuleRegistryName(parts[0]); err != nil { + if strings.Contains(parts[0], ".") { + // Seems like the user omitted one of the latter components in + // an address with an explicit hostname. + return ret, fmt.Errorf("source address must have three more components after the hostname: the namespace, the name, and the target system") + } + return ret, fmt.Errorf("invalid namespace %q: %s", parts[0], err) + } + if ret.PackageAddr.Name, err = parseModuleRegistryName(parts[1]); err != nil { + return ret, fmt.Errorf("invalid module name %q: %s", parts[1], err) + } + if ret.PackageAddr.TargetSystem, err = parseModuleRegistryTargetSystem(parts[2]); err != nil { + if strings.Contains(parts[2], "?") { + // The user was trying to include a query string, probably? 
+ return ret, fmt.Errorf("module registry addresses may not include a query string portion") + } + return ret, fmt.Errorf("invalid target system %q: %s", parts[2], err) + } + + return ret, nil +} + +// parseModuleRegistryName validates and normalizes a string in either the +// "namespace" or "name" position of a module registry source address. +func parseModuleRegistryName(given string) (string, error) { + // Similar to the names in provider source addresses, we defined these + // to be compatible with what filesystems and typical remote systems + // like GitHub allow in names. Unfortunately we didn't end up defining + // these exactly equivalently: provider names can only use dashes as + // punctuation, whereas module names can use underscores. So here we're + // using some regular expressions from the original module source + // implementation, rather than using the IDNA rules as we do in + // ParseProviderPart. + + if !moduleRegistryNamePattern.MatchString(given) { + return "", fmt.Errorf("must be between one and 64 characters, including ASCII letters, digits, dashes, and underscores, where dashes and underscores may not be the prefix or suffix") + } + + // We also skip normalizing the name to lowercase, because we historically + // didn't do that and so existing module registries might be doing + // case-sensitive matching. + return given, nil +} + +// parseModuleRegistryTargetSystem validates and normalizes a string in the +// "target system" position of a module registry source address. This is +// what we historically called "provider" but never actually enforced as +// being a provider address, and now _cannot_ be a provider address because +// provider addresses have three slash-separated components of their own. +func parseModuleRegistryTargetSystem(given string) (string, error) { + // Similar to the names in provider source addresses, we defined these + // to be compatible with what filesystems and typical remote systems + // like GitHub allow in names. 
Unfortunately we didn't end up defining + // these exactly equivalently: provider names can't use dashes or + // underscores. So here we're using some regular expressions from the + // original module source implementation, rather than using the IDNA rules + // as we do in ParseProviderPart. + + if !moduleRegistryTargetSystemPattern.MatchString(given) { + return "", fmt.Errorf("must be between one and 64 ASCII letters or digits") + } + + // We also skip normalizing the name to lowercase, because we historically + // didn't do that and so existing module registries might be doing + // case-sensitive matching. + return given, nil +} + +// String returns a full representation of the address, including any +// additional components that are typically implied by omission in +// user-written addresses. +// +// We typically use this longer representation in error messages, in case +// the inclusion of normally-omitted components is helpful in debugging +// unexpected behavior. +func (s ModuleSourceRegistry) String() string { + if s.Subdir != "" { + return s.PackageAddr.String() + "//" + s.Subdir + } + return s.PackageAddr.String() +} + +// ForDisplay is similar to String but instead returns a representation of +// the idiomatic way to write the address in configuration, omitting +// components that are commonly just implied in addresses written by +// users. +// +// We typically use this shorter representation in informational messages, +// such as the note that we're about to start downloading a package. +func (s ModuleSourceRegistry) ForDisplay() string { + if s.Subdir != "" { + return s.PackageAddr.ForDisplay() + "//" + s.Subdir + } + return s.PackageAddr.ForDisplay() +} + +// splitPackageSubdir detects whether the given address string has a +// subdirectory portion, and if so returns a non-empty subDir string +// along with the trimmed package address. 
+// +// If the given string doesn't have a subdirectory portion then it'll +// just be returned verbatim in packageAddr, with an empty subDir value. +func splitPackageSubdir(given string) (packageAddr, subDir string) { + packageAddr, subDir = sourceDirSubdir(given) + if subDir != "" { + subDir = path.Clean(subDir) + } + return packageAddr, subDir +} + +// sourceDirSubdir takes a source URL and returns a tuple of the URL without +// the subdir and the subdir. +// +// ex: +// dom.com/path/?q=p => dom.com/path/?q=p, "" +// proto://dom.com/path//*?q=p => proto://dom.com/path?q=p, "*" +// proto://dom.com/path//path2?q=p => proto://dom.com/path?q=p, "path2" +func sourceDirSubdir(src string) (string, string) { + // URL might contain another URL in query parameters + stop := len(src) + if idx := strings.Index(src, "?"); idx > -1 { + stop = idx + } + + // Calculate an offset to avoid accidentally marking the scheme + // as the dir. + var offset int + if idx := strings.Index(src[:stop], "://"); idx > -1 { + offset = idx + 3 + } + + // First see if we even have an explicit subdir + idx := strings.Index(src[offset:stop], "//") + if idx == -1 { + return src, "" + } + + idx += offset + subdir := src[idx+2:] + src = src[:idx] + + // Next, check if we have query parameters and push them onto the + // URL. + if idx = strings.Index(subdir, "?"); idx > -1 { + query := subdir[idx:] + subdir = subdir[:idx] + src += query + } + + return src, subdir +} diff --git a/module_package.go b/module_package.go new file mode 100644 index 0000000..cba03a7 --- /dev/null +++ b/module_package.go @@ -0,0 +1,87 @@ +package tfaddr + +import ( + "strings" + + svchost "github.com/hashicorp/terraform-svchost" +) + +// A ModulePackage represents a physical location where Terraform can retrieve +// a module package, which is an archive, repository, or other similar +// container which delivers the source code for one or more Terraform modules. 
+// +// A ModulePackage is a string in go-getter's address syntax. By convention, +// we use ModulePackage-typed values only for the result of successfully +// running the go-getter "detectors", which produces an address string which +// includes an explicit installation method prefix along with an address +// string in the format expected by that installation method. +// +// Note that although the "detector" phase of go-getter does do some simple +// normalization in certain cases, it isn't generally possible to compare +// two ModulePackage values to decide if they refer to the same package. Two +// equal ModulePackage values represent the same package, but there might be +// other non-equal ModulePackage values that also refer to that package, and +// there is no reliable way to determine that. +// +// Don't convert a user-provided string directly to ModulePackage. Instead, +// use ParseModuleSource with a remote module address and then access the +// ModulePackage value from the result, making sure to also handle the +// selected subdirectory if any. You should convert directly to ModulePackage +// only for a string that is hard-coded into the program (e.g. in a unit test) +// where you've ensured that it's already in the expected syntax. +type ModulePackage string + +func (p ModulePackage) String() string { + return string(p) +} + +// A ModuleRegistryPackage is an extra indirection over a ModulePackage where +// we use a module registry to translate a more symbolic address (and +// associated version constraint given out of band) into a physical source +// location. +// +// ModuleRegistryPackage is distinct from ModulePackage because they have +// disjoint use-cases: registry package addresses are only used to query a +// registry in order to find a real module package address. These being +// distinct is intended to help future maintainers more easily follow the +// series of steps in the module installer, with the help of the type checker. 
+type ModuleRegistryPackage struct { + Host svchost.Hostname + Namespace string + Name string + TargetSystem string +} + +func (s ModuleRegistryPackage) String() string { + // Note: we're using the "display" form of the hostname here because + // for our service hostnames "for display" means something different: + // it means to render non-ASCII characters directly as Unicode + // characters, rather than using the "punycode" representation we + // use for internal processing, and so the "display" representation + // is actually what users would write in their configurations. + return s.Host.ForDisplay() + "/" + s.ForRegistryProtocol() +} + +func (s ModuleRegistryPackage) ForDisplay() string { + if s.Host == DefaultModuleRegistryHost { + return s.ForRegistryProtocol() + } + return s.Host.ForDisplay() + "/" + s.ForRegistryProtocol() +} + +// ForRegistryProtocol returns a string representation of just the namespace, +// name, and target system portions of the address, always omitting the +// registry hostname and the subdirectory portion, if any. +// +// This is primarily intended for generating addresses to send to the +// registry in question via the registry protocol, since the protocol +// skips sending the registry its own hostname as part of identifiers. 
+func (s ModuleRegistryPackage) ForRegistryProtocol() string { + var buf strings.Builder + buf.WriteString(s.Namespace) + buf.WriteByte('/') + buf.WriteString(s.Name) + buf.WriteByte('/') + buf.WriteString(s.TargetSystem) + return buf.String() +} diff --git a/module_test.go b/module_test.go new file mode 100644 index 0000000..0ca9155 --- /dev/null +++ b/module_test.go @@ -0,0 +1,239 @@ +package tfaddr + +import ( + "testing" + + "github.com/google/go-cmp/cmp" + svchost "github.com/hashicorp/terraform-svchost" +) + +func TestParseRawModuleSourceRegistry_Simple(t *testing.T) { + tests := map[string]struct { + input string + want ModuleSourceRegistry + wantErr string + }{ + "main registry implied": { + input: "hashicorp/subnets/cidr", + want: ModuleSourceRegistry{ + PackageAddr: ModuleRegistryPackage{ + Host: svchost.Hostname("registry.terraform.io"), + Namespace: "hashicorp", + Name: "subnets", + TargetSystem: "cidr", + }, + Subdir: "", + }, + }, + "main registry implied, subdir": { + input: "hashicorp/subnets/cidr//examples/foo", + want: ModuleSourceRegistry{ + PackageAddr: ModuleRegistryPackage{ + Host: svchost.Hostname("registry.terraform.io"), + Namespace: "hashicorp", + Name: "subnets", + TargetSystem: "cidr", + }, + Subdir: "examples/foo", + }, + }, + "custom registry": { + input: "example.com/awesomecorp/network/happycloud", + want: ModuleSourceRegistry{ + PackageAddr: ModuleRegistryPackage{ + Host: svchost.Hostname("example.com"), + Namespace: "awesomecorp", + Name: "network", + TargetSystem: "happycloud", + }, + Subdir: "", + }, + }, + "custom registry, subdir": { + input: "example.com/awesomecorp/network/happycloud//examples/foo", + want: ModuleSourceRegistry{ + PackageAddr: ModuleRegistryPackage{ + Host: svchost.Hostname("example.com"), + Namespace: "awesomecorp", + Name: "network", + TargetSystem: "happycloud", + }, + Subdir: "examples/foo", + }, + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + addr, err := 
ParseRawModuleSourceRegistry(test.input) + + if test.wantErr != "" { + switch { + case err == nil: + t.Errorf("unexpected success\nwant error: %s", test.wantErr) + case err.Error() != test.wantErr: + t.Errorf("wrong error messages\ngot: %s\nwant: %s", err.Error(), test.wantErr) + } + return + } + + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if diff := cmp.Diff(addr, test.want); diff != "" { + t.Errorf("wrong result\n%s", diff) + } + }) + } + +} + +func TestParseRawModuleSourceRegistry(t *testing.T) { + tests := map[string]struct { + input string + wantString string + wantForDisplay string + wantForProtocol string + wantErr string + }{ + "public registry": { + input: `hashicorp/consul/aws`, + wantString: `registry.terraform.io/hashicorp/consul/aws`, + wantForDisplay: `hashicorp/consul/aws`, + wantForProtocol: `hashicorp/consul/aws`, + }, + "public registry with subdir": { + input: `hashicorp/consul/aws//foo`, + wantString: `registry.terraform.io/hashicorp/consul/aws//foo`, + wantForDisplay: `hashicorp/consul/aws//foo`, + wantForProtocol: `hashicorp/consul/aws`, + }, + "public registry using explicit hostname": { + input: `registry.terraform.io/hashicorp/consul/aws`, + wantString: `registry.terraform.io/hashicorp/consul/aws`, + wantForDisplay: `hashicorp/consul/aws`, + wantForProtocol: `hashicorp/consul/aws`, + }, + "public registry with mixed case names": { + input: `HashiCorp/Consul/aws`, + wantString: `registry.terraform.io/HashiCorp/Consul/aws`, + wantForDisplay: `HashiCorp/Consul/aws`, + wantForProtocol: `HashiCorp/Consul/aws`, + }, + "private registry with non-standard port": { + input: `Example.com:1234/HashiCorp/Consul/aws`, + wantString: `example.com:1234/HashiCorp/Consul/aws`, + wantForDisplay: `example.com:1234/HashiCorp/Consul/aws`, + wantForProtocol: `HashiCorp/Consul/aws`, + }, + "private registry with IDN hostname": { + input: `Испытание.com/HashiCorp/Consul/aws`, + wantString: `испытание.com/HashiCorp/Consul/aws`, + 
wantForDisplay: `испытание.com/HashiCorp/Consul/aws`, + wantForProtocol: `HashiCorp/Consul/aws`, + }, + "private registry with IDN hostname and non-standard port": { + input: `Испытание.com:1234/HashiCorp/Consul/aws//Foo`, + wantString: `испытание.com:1234/HashiCorp/Consul/aws//Foo`, + wantForDisplay: `испытание.com:1234/HashiCorp/Consul/aws//Foo`, + wantForProtocol: `HashiCorp/Consul/aws`, + }, + "invalid hostname": { + input: `---.com/HashiCorp/Consul/aws`, + wantErr: `invalid module registry hostname "---.com"; internationalized domain names must be given as direct unicode characters, not in punycode`, + }, + "hostname with only one label": { + // This was historically forbidden in our initial implementation, + // so we keep it forbidden to avoid newly interpreting such + // addresses as registry addresses rather than remote source + // addresses. + input: `foo/var/baz/qux`, + wantErr: `invalid module registry hostname: must contain at least one dot`, + }, + "invalid target system characters": { + input: `foo/var/no-no-no`, + wantErr: `invalid target system "no-no-no": must be between one and 64 ASCII letters or digits`, + }, + "invalid target system length": { + input: `foo/var/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaah`, + wantErr: `invalid target system "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaah": must be between one and 64 ASCII letters or digits`, + }, + "invalid namespace": { + input: `boop!/var/baz`, + wantErr: `invalid namespace "boop!": must be between one and 64 characters, including ASCII letters, digits, dashes, and underscores, where dashes and underscores may not be the prefix or suffix`, + }, + "missing part with explicit hostname": { + input: `foo.com/var/baz`, + wantErr: `source address must have three more components after the hostname: the namespace, the name, and the target system`, + }, + 
"errant query string": { + input: `foo/var/baz?otherthing`, + wantErr: `module registry addresses may not include a query string portion`, + }, + "github.com": { + // We don't allow using github.com like a module registry because + // that conflicts with the historically-supported shorthand for + // installing directly from GitHub-hosted git repositories. + input: `github.com/HashiCorp/Consul/aws`, + wantErr: `can't use "github.com" as a module registry host, because it's reserved for installing directly from version control repositories`, + }, + "bitbucket.org": { + // We don't allow using bitbucket.org like a module registry because + // that conflicts with the historically-supported shorthand for + // installing directly from BitBucket-hosted git repositories. + input: `bitbucket.org/HashiCorp/Consul/aws`, + wantErr: `can't use "bitbucket.org" as a module registry host, because it's reserved for installing directly from version control repositories`, + }, + "local path from current dir": { + // Can't use a local path when we're specifically trying to parse + // a _registry_ source address. + input: `./boop`, + wantErr: `a module registry source address must have either three or four slash-separated components`, + }, + "local path from parent dir": { + // Can't use a local path when we're specifically trying to parse + // a _registry_ source address. 
+ input: `../boop`, + wantErr: `a module registry source address must have either three or four slash-separated components`, + }, + "main registry implied, escaping subdir": { + input: "hashicorp/subnets/cidr//../nope", + wantErr: `subdirectory path "../nope" leads outside of the module package`, + }, + "relative path without the needed prefix": { + input: "boop/bloop", + wantErr: "a module registry source address must have either three or four slash-separated components", + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + addr, err := ParseRawModuleSourceRegistry(test.input) + + if test.wantErr != "" { + switch { + case err == nil: + t.Errorf("unexpected success\nwant error: %s", test.wantErr) + case err.Error() != test.wantErr: + t.Errorf("wrong error messages\ngot: %s\nwant: %s", err.Error(), test.wantErr) + } + return + } + + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if got, want := addr.String(), test.wantString; got != want { + t.Errorf("wrong String() result\ngot: %s\nwant: %s", got, want) + } + if got, want := addr.ForDisplay(), test.wantForDisplay; got != want { + t.Errorf("wrong ForDisplay() result\ngot: %s\nwant: %s", got, want) + } + if got, want := addr.PackageAddr.ForRegistryProtocol(), test.wantForProtocol; got != want { + t.Errorf("wrong ForRegistryProtocol() result\ngot: %s\nwant: %s", got, want) + } + }) + } +}