From 7571b0fde4960e7fcfbbd3d38071f5171b98dfcd Mon Sep 17 00:00:00 2001
From: "Jason T. Brown"
Date: Wed, 16 Oct 2019 14:35:06 -0400
Subject: [PATCH 01/15] Add rf_local_is_in function

Signed-off-by: Jason T. Brown
---
 .../rasterframes/RasterFunctions.scala        |  3 +
 .../expressions/localops/IsIn.scala           | 88 +++++++++++++++++++
 .../rasterframes/expressions/package.scala    |  1 +
 .../rasterframes/RasterFunctionsSpec.scala    | 21 +++++
 4 files changed, 113 insertions(+)
 create mode 100644 core/src/main/scala/org/locationtech/rasterframes/expressions/localops/IsIn.scala

diff --git a/core/src/main/scala/org/locationtech/rasterframes/RasterFunctions.scala b/core/src/main/scala/org/locationtech/rasterframes/RasterFunctions.scala
index 213f0f77d..6107f9903 100644
--- a/core/src/main/scala/org/locationtech/rasterframes/RasterFunctions.scala
+++ b/core/src/main/scala/org/locationtech/rasterframes/RasterFunctions.scala
@@ -389,6 +389,9 @@ trait RasterFunctions {
   /** Cellwise inequality comparison between a tile and a scalar. */
   def rf_local_unequal[T: Numeric](tileCol: Column, value: T): Column = Unequal(tileCol, value)
 
+  /** Test if each cell value is in the provided array */
+  def rf_local_is_in(tileCol: Column, arrayCol: Column) = IsIn(tileCol, arrayCol)
+
   /** Return a tile with ones where the input is NoData, otherwise zero */
   def rf_local_no_data(tileCol: Column): Column = Undefined(tileCol)
 
diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/localops/IsIn.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/localops/IsIn.scala
new file mode 100644
index 000000000..fc96fa7fd
--- /dev/null
+++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/localops/IsIn.scala
@@ -0,0 +1,88 @@
+/*
+ * This software is licensed under the Apache 2 license, quoted below.
+ *
+ * Copyright 2019 Astraea, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * [http://www.apache.org/licenses/LICENSE-2.0]
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License. 
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ */
+
+package org.locationtech.rasterframes.expressions.localops
+
+import geotrellis.raster.Tile
+import geotrellis.raster.mapalgebra.local.IfCell
+import org.apache.spark.sql.Column
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
+import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess}
+import org.apache.spark.sql.types.{ArrayType, DataType}
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, Expression, ExpressionDescription}
+import org.apache.spark.sql.catalyst.util.ArrayData
+import org.apache.spark.sql.rf.TileUDT
+import org.locationtech.rasterframes.encoders.CatalystSerializer._
+import org.locationtech.rasterframes.expressions.DynamicExtractors._
+import org.locationtech.rasterframes.expressions._
+
+@ExpressionDescription(
+  usage = "_FUNC_(tile, rhs) - In each cell of `tile`, return true if the value is in rhs.",
+  arguments = """
+    Arguments:
+      * tile - tile column to test
+      * rhs - array to test against
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_(tile, array);
+         ..."""
+)
+case class IsIn(left: Expression, right: Expression) extends BinaryExpression with CodegenFallback {
+  override val nodeName: String = "rf_local_is_in"
+
+  override def dataType: DataType = left.dataType
+
+  @transient private lazy val elementType: DataType = left.dataType.asInstanceOf[ArrayType].elementType
+
+  override def checkInputDataTypes(): TypeCheckResult =
+    if(!tileExtractor.isDefinedAt(left.dataType)) {
+      TypeCheckFailure(s"Input type '${left.dataType}' does not conform to a raster type.")
+    } else right.dataType match {
+      case _: ArrayType ⇒ TypeCheckSuccess
+      case _ ⇒ TypeCheckFailure(s"Input type '${right.dataType}' does not conform to ArrayType.")
+    }
+
+  override protected def nullSafeEval(input1: Any, input2: Any): Any = {
+    implicit val tileSer = TileUDT.tileSerializer
+    val (childTile, childCtx) = tileExtractor(left.dataType)(row(input1))
+
+    val arr = input2.asInstanceOf[ArrayData].toArray[AnyRef](elementType)
+
+    childCtx match {
+      case Some(ctx) => ctx.toProjectRasterTile(op(childTile, arr)).toInternalRow
+      case None => op(childTile, arr).toInternalRow
+    }
+
+  }
+
+  protected def op(left: Tile, right: IndexedSeq[AnyRef]): Tile = {
+    def fn(i: Int): Boolean = right.contains(i)
+    IfCell(left, fn(_), 1, 0)
+  }
+
+}
+
+object IsIn {
+  def apply(left: Column, right: Column): Column =
+    new Column(IsIn(left.expr, right.expr))
+}
diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/package.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/package.scala
index ef614a9a3..7e4cb2dfb 100644
--- a/core/src/main/scala/org/locationtech/rasterframes/expressions/package.scala
+++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/package.scala
@@ -87,6 +87,7 @@ package object expressions {
     registry.registerExpression[GreaterEqual]("rf_local_greater_equal")
     registry.registerExpression[Equal]("rf_local_equal")
     registry.registerExpression[Unequal]("rf_local_unequal")
+    registry.registerExpression[IsIn]("rf_local_is_in")
     registry.registerExpression[Undefined]("rf_local_no_data")
     registry.registerExpression[Defined]("rf_local_data")
     registry.registerExpression[Sum]("rf_tile_sum")
diff --git a/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala b/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala 
index f5256a32f..785dbfd36 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala @@ -972,4 +972,25 @@ class RasterFunctionsSpec extends TestEnvironment with RasterMatchers { val dResult = df.select($"ld").as[Tile].first() dResult should be (randNDPRT.localDefined()) } + + it("should check values isin"){ + checkDocs("rf_local_is_in") + + // tile is 3 by 3 with values, 1 to 9 + val df = Seq((byteArrayTile, lit(1), lit(5), lit(10))).toDF("t", "one", "five", "ten") + .withColumn("in_expect_2", rf_local_is_in($"t", array($"one", $"five"))) + .withColumn("in_expect_1", rf_local_is_in($"t", array($"ten", $"five"))) + .withColumn("in_expect_0", rf_local_is_in($"t", array($"ten"))) + + val e2Result = df.select(rf_tile_sum($"in_expect_2")).as[Double].first() + e2Result should be (2.0) + + val e1Result = df.select(rf_tile_sum($"in_expect_1")).as[Double].first() + e1Result should be (1.0) + + val e0Result = df.select($"in_expect_1").as[Tile].first() + e0Result.toArray() should contain only (0) + +// lazy val invalid = df.select(rf_local_is_in($"t", lit("foobar"))).as[Tile].first() + } } From 0d03a6ea0b159af2c6e6ae2d7557cfdd0ebaee34 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Thu, 17 Oct 2019 15:31:12 -0400 Subject: [PATCH 02/15] Attempting to keep TravisCI from timing out by using jobs. --- .travis.yml | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index 12cad75b7..7e1b64d73 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,3 @@ -sudo: false dist: xenial language: python @@ -28,11 +27,10 @@ install: - pip install rasterio shapely pandas numpy pweave - wget -O - https://piccolo.link/sbt-1.2.8.tgz | tar xzf - -script: - - sbt/bin/sbt -java-home $JAVA_HOME -batch test - - sbt/bin/sbt -java-home $JAVA_HOME -batch it:test - # - sbt -Dfile.encoding=UTF8 clean coverage test coverageReport - # Tricks to avoid unnecessary cache updates - - find $HOME/.sbt -name "*.lock" | xargs rm - - find $HOME/.ivy2 -name "ivydata-*.properties" | xargs rm +jobs: + include: + - stage: "Unit Tests" + script: sbt/bin/sbt -java-home $JAVA_HOME -batch test + - stage: "Integration" + script: sbt/bin/sbt -java-home $JAVA_HOME -batch it:test From 137d0d16483269f331d498bd92daf4f94ecd3717 Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Thu, 17 Oct 2019 15:55:01 -0400 Subject: [PATCH 03/15] Fix unit tests for rf_local_is_in Signed-off-by: Jason T. 
Brown --- .../rasterframes/expressions/localops/IsIn.scala | 2 +- .../locationtech/rasterframes/RasterFunctionsSpec.scala | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/localops/IsIn.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/localops/IsIn.scala index fc96fa7fd..9080243e1 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/localops/IsIn.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/localops/IsIn.scala @@ -52,7 +52,7 @@ case class IsIn(left: Expression, right: Expression) extends BinaryExpression wi override def dataType: DataType = left.dataType - @transient private lazy val elementType: DataType = left.dataType.asInstanceOf[ArrayType].elementType + @transient private lazy val elementType: DataType = right.dataType.asInstanceOf[ArrayType].elementType override def checkInputDataTypes(): TypeCheckResult = if(!tileExtractor.isDefinedAt(left.dataType)) { diff --git a/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala b/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala index 785dbfd36..b424a730f 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/RasterFunctionsSpec.scala @@ -977,7 +977,10 @@ class RasterFunctionsSpec extends TestEnvironment with RasterMatchers { checkDocs("rf_local_is_in") // tile is 3 by 3 with values, 1 to 9 - val df = Seq((byteArrayTile, lit(1), lit(5), lit(10))).toDF("t", "one", "five", "ten") + val df = Seq(byteArrayTile).toDF("t") + .withColumn("one", lit(1)) + .withColumn("five", lit(5)) + .withColumn("ten", lit(10)) .withColumn("in_expect_2", rf_local_is_in($"t", array($"one", $"five"))) .withColumn("in_expect_1", rf_local_is_in($"t", array($"ten", $"five"))) .withColumn("in_expect_0", rf_local_is_in($"t", array($"ten"))) @@ -988,7 +991,7 @@ class RasterFunctionsSpec extends TestEnvironment with RasterMatchers { val e1Result = df.select(rf_tile_sum($"in_expect_1")).as[Double].first() e1Result should be (1.0) - val e0Result = df.select($"in_expect_1").as[Tile].first() + val e0Result = df.select($"in_expect_0").as[Tile].first() e0Result.toArray() should contain only (0) // lazy val invalid = df.select(rf_local_is_in($"t", lit("foobar"))).as[Tile].first() From 7ca1a04d56871883a4846274aa1b1fd17f8444c8 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Fri, 18 Oct 2019 10:44:36 -0400 Subject: [PATCH 04/15] Expanded RasterRefSpec to ensure lazy tiles provide metadata without I/O. 
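
The property under test, restated as an editorial Python-side sketch (not part of the patch; `uri` is a placeholder, and a RasterFrames-enabled SparkSession is assumed): with lazy tiles, metadata queries should be answerable from cached header information alone, without reading cell data.

```python
from pyrasterframes.rasterfunctions import rf_dimensions, rf_extent

# With lazy_tiles=True (the default), tile metadata comes from the raster
# header; only realizing cell values should trigger actual I/O.
rf = spark.read.raster(uri, lazy_tiles=True)  # `uri` is a placeholder
rf.select(rf_dimensions('proj_raster'), rf_extent('proj_raster')).show()
```
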
--- .travis.yml | 2 +- .../rasterframes/ref/RasterRefSpec.scala | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7e1b64d73..9b6f44ea2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -32,5 +32,5 @@ jobs: include: - stage: "Unit Tests" script: sbt/bin/sbt -java-home $JAVA_HOME -batch test - - stage: "Integration" + - stage: "Integration Tests" script: sbt/bin/sbt -java-home $JAVA_HOME -batch it:test diff --git a/core/src/test/scala/org/locationtech/rasterframes/ref/RasterRefSpec.scala b/core/src/test/scala/org/locationtech/rasterframes/ref/RasterRefSpec.scala index 80f0a7082..51e3338d2 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/ref/RasterRefSpec.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/ref/RasterRefSpec.scala @@ -253,14 +253,18 @@ class RasterRefSpec extends TestEnvironment with TestData { } } - it("should construct a RasterRefTile without I/O") { + it("should construct and inspect a RasterRefTile without I/O") { new Fixture { // SimpleRasterInfo is a proxy for header data requests. - val start = SimpleRasterInfo.cacheStats.hitCount() + val startStats = SimpleRasterInfo.cacheStats val t: ProjectedRasterTile = RasterRefTile(subRaster) - val result = Seq(t, subRaster.tile).toDF("tile").first() - val end = SimpleRasterInfo.cacheStats.hitCount() - end should be(start) + val df = Seq(t, subRaster.tile).toDF("tile") + val result = df.first() + SimpleRasterInfo.cacheStats.hitCount() should be(startStats.hitCount()) + SimpleRasterInfo.cacheStats.missCount() should be(startStats.missCount()) + val info = df.select(rf_dimensions($"tile"), rf_extent($"tile")).first() + SimpleRasterInfo.cacheStats.hitCount() should be(startStats.hitCount() + 2) + SimpleRasterInfo.cacheStats.missCount() should be(startStats.missCount()) } } } From 96aa7dc4a6af5e67cbce25ec0424da3d626c44d4 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Fri, 18 Oct 2019 11:47:52 -0400 Subject: [PATCH 05/15] Applying pre-partitioning to DataSources. 
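
For context, an editorial sketch (not part of the patch) of what the pre-partitioning below means for users: the new default partition count follows Spark's shuffle-partition setting, so catalog-read parallelism can be tuned with ordinary Spark configuration.

```python
# Hypothetical tuning example: `catalog_df` and `cols` stand in for a raster
# catalog DataFrame and its URI column names, as in the project docs.
spark.conf.set("spark.sql.shuffle.partitions", "200")  # drives the new default partitioning
df = spark.read.raster(catalog_df, catalog_col_names=cols)
print(df.rdd.getNumPartitions())
```
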
--- .../datasource/raster/RasterSourceRelation.scala | 8 ++++++-- .../experimental/datasource/CachedDatasetRelation.scala | 2 ++ .../datasource/awspds/L8CatalogRelation.scala | 4 +++- .../datasource/awspds/MODISCatalogRelation.scala | 2 +- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceRelation.scala b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceRelation.scala index 6af519f56..9b381d3a6 100644 --- a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceRelation.scala +++ b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceRelation.scala @@ -69,6 +69,9 @@ case class RasterSourceRelation( catalog.schema.fields.filter(f => !catalogTable.bandColumnNames.contains(f.name)) } + protected def defaultNumPartitions: Int = + sqlContext.sparkSession.sessionState.conf.numShufflePartitions + override def schema: StructType = { val tileSchema = schemaOf[ProjectedRasterTile] val paths = for { @@ -84,10 +87,11 @@ case class RasterSourceRelation( override def buildScan(): RDD[Row] = { import sqlContext.implicits._ - // The general transformaion is: + // The general transformation is: // input -> path -> src -> ref -> tile // Each step is broken down for readability val inputs: DataFrame = sqlContext.table(catalogTable.tableName) + .repartition(defaultNumPartitions) // Basically renames the input columns to have the '_path' suffix val pathsAliasing = for { @@ -112,7 +116,7 @@ case class RasterSourceRelation( val df = if (lazyTiles) { // Expand RasterSource into multiple columns per band, and multiple rows per tile - // There's some unintentional fragililty here in that the structure of the expression + // There's some unintentional fragility here in that the structure of the expression // is expected to line up with our column structure here. 
val refs = RasterSourceToRasterRefs(subtileDims, bandIndexes, srcs: _*) as refColNames diff --git a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/CachedDatasetRelation.scala b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/CachedDatasetRelation.scala index 1fac7699a..06947080d 100644 --- a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/CachedDatasetRelation.scala +++ b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/CachedDatasetRelation.scala @@ -33,6 +33,8 @@ import org.locationtech.rasterframes.util._ * @since 8/24/18 */ trait CachedDatasetRelation extends ResourceCacheSupport { self: BaseRelation ⇒ + protected def defaultNumPartitions: Int = + sqlContext.sparkSession.sessionState.conf.numShufflePartitions protected def cacheFile: HadoopPath protected def constructDataset: Dataset[Row] diff --git a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelation.scala b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelation.scala index 9a14c86f3..049617de6 100644 --- a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelation.scala +++ b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/L8CatalogRelation.scala @@ -68,7 +68,9 @@ case class L8CatalogRelation(sqlContext: SQLContext, sceneListPath: HadoopPath) .select(schema.map(f ⇒ col(f.name)): _*) .orderBy(ACQUISITION_DATE.name, PATH.name, ROW.name) .distinct() // The scene file contains duplicates. - .repartition(8, col(PATH.name), col(ROW.name)) + .repartition(defaultNumPartitions, col(PATH.name), col(ROW.name)) + + } } diff --git a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/MODISCatalogRelation.scala b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/MODISCatalogRelation.scala index 30b3ba234..6e76acc36 100644 --- a/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/MODISCatalogRelation.scala +++ b/experimental/src/main/scala/org/locationtech/rasterframes/experimental/datasource/awspds/MODISCatalogRelation.scala @@ -64,7 +64,7 @@ case class MODISCatalogRelation(sqlContext: SQLContext, sceneList: HadoopPath) $"${GID.name}") ++ bandCols: _* ) .orderBy(ACQUISITION_DATE.name, GID.name) - .repartition(8, col(GRANULE_ID.name)) + .repartition(defaultNumPartitions, col(GRANULE_ID.name)) } } From e4e8bcb795e968135b46bfd8dbf7125e11a67cf6 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Sun, 20 Oct 2019 20:03:31 -0400 Subject: [PATCH 06/15] Updated intro section. Added additional raster-read section. --- pyrasterframes/src/main/python/docs/index.md | 16 ++- .../src/main/python/docs/raster-read.pymd | 103 ++++++++++-------- 2 files changed, 71 insertions(+), 48 deletions(-) diff --git a/pyrasterframes/src/main/python/docs/index.md b/pyrasterframes/src/main/python/docs/index.md index e3a37274b..f3be57721 100644 --- a/pyrasterframes/src/main/python/docs/index.md +++ b/pyrasterframes/src/main/python/docs/index.md @@ -2,15 +2,21 @@ RasterFrames® brings together Earth-observation (EO) data access, cloud computing, and DataFrame-based data science. The recent explosion of EO data from public and private satellite operators presents both a huge opportunity and a huge challenge to the data analysis community. 
It is _Big Data_ in the truest sense, and its footprint is rapidly getting bigger.
 
-RasterFrames provides a DataFrame-centric view over arbitrary raster data, enabling spatiotemporal queries, map algebra raster operations, and compatibility with the ecosystem of Spark ML algorithms. By using DataFrames as the core cognitive and compute data model, it is able to deliver these features in a form that is both accessible to general analysts and scalable along with the rapidly growing data footprint.
+RasterFrames provides a DataFrame-centric view over arbitrary geospatial raster data, enabling spatiotemporal queries, map algebra raster operations, and interoperability with Spark ML. By using the DataFrame as the core cognitive and compute data model, RasterFrames is able to deliver an extensive set of functionality in a form that is both horizontally scalable and familiar to general analysts and data scientists. It provides APIs for Python, SQL, and Scala.
 
-To learn more, please see the @ref:[Getting Started](getting-started.md) section of this manual.
+![RasterFrames](static/rasterframes-pipeline-nologo.png)
 
-The source code can be found on GitHub at [locationtech/rasterframes](https://github.com/locationtech/rasterframes).
+Through its custom [Spark DataSource](https://rasterframes.io/raster-read.html), RasterFrames can read various raster formats -- including GeoTIFF, JP2000, MRF, and HDF -- and from an [array of services](https://rasterframes.io/raster-read.html#uri-formats), such as HTTP, FTP, HDFS, S3 and WASB. It also supports reading the vector formats GeoJSON and WKT/WKB. RasterFrame contents can be filtered, transformed, summarized, resampled, and rasterized through [200+ raster and vector functions](https://rasterframes.io/reference.html).
+
+As part of the LocationTech family of projects, RasterFrames builds upon the strong foundations provided by GeoMesa (spatial operations), GeoTrellis (raster operations), JTS (geometry modeling), and SFCurve (spatiotemporal indexing), integrating various aspects of these projects into a unified, DataFrame-centric analytics package.
+
+![](static/rasterframes-locationtech-stack.png)
 
-RasterFrames is released under the [Apache 2.0 License](https://github.com/locationtech/rasterframes/blob/develop/LICENSE).
+RasterFrames is released under the commercial-friendly [Apache 2.0](https://github.com/locationtech/rasterframes/blob/develop/LICENSE) open source license.
 
-![RasterFrames](static/rasterframes-pipeline.png)
+To learn more, please see the @ref:[Getting Started](getting-started.md) section of this manual.
+
+The source code can be found on GitHub at [locationtech/rasterframes](https://github.com/locationtech/rasterframes).
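
As a concrete illustration of the `raster` DataSource described above (an editorial sketch, not part of the patch; the URI is a placeholder):

```python
import pyrasterframes
from pyrasterframes.utils import create_rf_spark_session

spark = create_rf_spark_session()
# Reading a single GeoTIFF into a DataFrame of tiles; any supported
# scheme (HTTP, S3, HDFS, ...) may be used here.
df = spark.read.raster('https://example.com/scene.tif')  # placeholder URI
df.printSchema()
```
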
diff --git a/pyrasterframes/src/main/python/docs/raster-read.pymd b/pyrasterframes/src/main/python/docs/raster-read.pymd
index 53f3a96e6..30befbabd 100644
--- a/pyrasterframes/src/main/python/docs/raster-read.pymd
+++ b/pyrasterframes/src/main/python/docs/raster-read.pymd
@@ -14,7 +14,7 @@ RasterFrames registers a DataSource named `raster` that enables reading of GeoTI
 
 RasterFrames can also read from @ref:[GeoTrellis catalogs and layers](raster-read.md#geotrellis).
 
-## Single Raster
+## Single Rasters
 
 The simplest way to use the `raster` reader is with a single raster from a single URI or file. In the examples that follow we'll be reading from a Sentinel-2 scene stored in an AWS S3 bucket.
 
@@ -33,14 +33,12 @@ print("CRS", crs.value.crsProj4)
 ```
 
 ```python, raster_parts
-parts = rf.select(
+rf.select(
     rf_extent("proj_raster").alias("extent"),
     rf_tile("proj_raster").alias("tile")
 )
-parts
 ```
 
-
 You can also see that the single raster has been broken out into many arbitrary non-overlapping regions. Doing so takes advantage of parallel in-memory reads from the cloud hosted data source and allows Spark to work on manageable amounts of data per task. The following code fragment shows us how many subtiles were created from a single source image.
 
 ```python, count_by_uri
@@ -55,6 +53,64 @@ tile = rf.select(rf_tile("proj_raster")).first()[0]
 display(tile)
 ```
 
+## Multiple Singleband Rasters
+
+In this example, we show reading [two bands](https://en.wikipedia.org/wiki/Multispectral_image) of [Landsat 8](https://landsat.gsfc.nasa.gov/landsat-8/) imagery (red and near-infrared), combining them with `rf_normalized_difference` to compute NDVI. As described in the section on @ref:[catalogs](raster-catalogs.md), image URIs in a single row are assumed to be from the same scene/granule, and therefore compatible. This pattern is commonly used when multiple bands are stored in separate files.
+
+```python, multi_singleband
+bands = [f'B{b}' for b in [4, 5]]
+uris = [f'https://landsat-pds.s3.us-west-2.amazonaws.com/c1/L8/014/032/LC08_L1TP_014032_20190720_20190731_01_T1/LC08_L1TP_014032_20190720_20190731_01_T1_{b}.TIF' for b in bands]
+catalog = ','.join(bands) + '\n' + ','.join(uris)
+
+rf = spark.read.raster(catalog, bands) \
+    .withColumnRenamed('B4', 'red').withColumnRenamed('B5', 'NIR') \
+    .withColumn('longitude_latitude', st_reproject(st_centroid(rf_geometry('red')), rf_crs('red'), lit('EPSG:4326'))) \
+    .withColumn('NDVI', rf_normalized_difference('NIR', 'red')) \
+    .where(rf_tile_sum('NDVI') > 10000) \
+    .select('longitude_latitude', 'red', 'NIR', 'NDVI')
+display(rf)
+```
+
+## Multiband Rasters
+
+A multiband raster is represented by a three-dimensional numeric array stored in a single file. The first two dimensions are spatial, and the third dimension is typically designated for different spectral @ref:[bands](concepts.md#band). The bands could represent intensity of different wavelengths of light (or other electromagnetic radiation), or they could measure other phenomena such as time, quality indications, or additional gas concentrations, etc.
+
+Multiband raster files have a strictly ordered set of bands, which are typically indexed from 1. Some files have metadata tags associated with each band. Some files have a color interpretation metadata tag indicating how to interpret the bands.
+
+When reading a multiband raster or a @ref:[_catalog_](#raster-catalogs) describing multiband rasters, you will need to know ahead of time which bands you want to read. 
You will specify the bands to read, **indexed from zero**, as a list of integers into the `band_indexes` parameter of the `raster` reader. + +For example, we can read a four-band (red, green, blue, and near-infrared) image as follows. The individual rows of the resulting DataFrame still represent distinct spatial extents, with a projected raster column for each band specified by `band_indexes`. + +```python, multiband +mb = spark.read.raster( + 's3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif', + band_indexes=[0, 1, 2, 3], +) +display(mb) +``` + +If a band is passed into `band_indexes` that exceeds the number of bands in the raster, a projected raster column will still be generated in the schema but the column will be full of `null` values. + +You can also pass a _catalog_ and `band_indexes` together into the `raster` reader. This will create a projected raster column for the combination of all items in `catalog_col_names` and `band_indexes`. Again if a band in `band_indexes` exceeds the number of bands in a raster, it will have a `null` value for the corresponding column. + +Here is a trivial example with a _catalog_ over multiband rasters. We specify two columns containing URIs and two bands, resulting in four projected raster columns. + +```python, multiband_catalog +import pandas as pd +mb_cat = pd.DataFrame([ + {'foo': 's3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif', + 'bar': 's3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif' + }, +]) +mb2 = spark.read.raster( + spark.createDataFrame(mb_cat), + catalog_col_names=['foo', 'bar'], + band_indexes=[0, 1], + tile_dimensions=(64,64) +) +mb2.printSchema() +``` + ## URI Formats RasterFrames relies on three different I/O drivers, selected based on a combination of scheme, file extentions, and library availability. GDAL is used by default if a compatible version of GDAL (>= 2.4) is installed, and if GDAL supports the specified scheme. If GDAL is not available, either the _Java I/O_ or _Hadoop_ driver will be selected, depending on scheme. @@ -154,45 +210,6 @@ non_lazy In the initial examples on this page, you may have noticed that the realized (non-lazy) _tiles_ are shown, but we did not change `lazy_tiles`. Instead, we used @ref:[`rf_tile`](reference.md#rf-tile) to explicitly request the realized _tile_ from the lazy representation. -## Multiband Rasters - -A multiband raster represents a three dimensional numeric array. The first two dimensions are spatial, and the third dimension is typically designated for different spectral @ref:[bands](concepts.md#band). The bands could represent intensity of different wavelengths of light (or other electromagnetic radiation), or they could measure other phenomena such as time, quality indications, or additional gas concentrations, etc. - -Multiband rasters files have a strictly ordered set of bands, which are typically indexed from 1. Some files have metadata tags associated with each band. Some files have a color interpetation metadata tag indicating how to interpret the bands. - -When reading a multiband raster or a _catalog_ describing multiband rasters, you will need to know ahead of time which bands you want to read. You will specify the bands to read, **indexed from zero**, as a list of integers into the `band_indexes` parameter of the `raster` reader. - -For example, we can read a four-band (red, green, blue, and near-infrared) image as follows. 
The individual rows of the resulting DataFrame still represent distinct spatial extents, with a projected raster column for each band specified by `band_indexes`.
-
-```python, multiband
-mb = spark.read.raster(
-    's3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif',
-    band_indexes=[0, 1, 2, 3],
-)
-mb.printSchema()
-```
-
-If a band is passed into `band_indexes` that exceeds the number of bands in the raster, a projected raster column will still be generated in the schema but the column will be full of `null` values.
-
-You can also pass a _catalog_ and `band_indexes` together into the `raster` reader. This will create a projected raster column for the combination of all items in `catalog_col_names` and `band_indexes`. Again if a band in `band_indexes` exceeds the number of bands in a raster, it will have a `null` value for the corresponding column.
-
-Here is a trivial example with a _catalog_ over multiband rasters. We specify two columns containing URIs and two bands, resulting in four projected raster columns.
-
-```python, multiband_catalog
-import pandas as pd
-mb_cat = pd.DataFrame([
-    {'foo': 's3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif',
-     'bar': 's3://s22s-test-geotiffs/naip/m_3807863_nw_17_1_20160620.tif'
-    },
-])
-mb2 = spark.read.raster(
-    spark.createDataFrame(mb_cat),
-    catalog_col_names=['foo', 'bar'],
-    band_indexes=[0, 1],
-    tile_dimensions=(64,64)
-)
-mb2.printSchema()
-```
 
 ## GeoTrellis
 
From eb899decbdb3c2c28b189e62cadc9d21fa649386 Mon Sep 17 00:00:00 2001
From: "Jason T. Brown"
Date: Mon, 21 Oct 2019 10:52:39 -0400
Subject: [PATCH 07/15] rf_local_is_in python implementation

Signed-off-by: Jason T. Brown
---
 .../expressions/localops/IsIn.scala           |  2 +-
 docs/src/main/paradox/release-notes.md        |  1 +
 .../src/main/python/docs/reference.pymd       | 14 +++++--
 .../python/pyrasterframes/rasterfunctions.py  | 10 +++++
 .../main/python/tests/PyRasterFramesTests.py  |  2 +-
 .../main/python/tests/RasterFunctionsTests.py | 38 ++++++++++++++++++-
 6 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/localops/IsIn.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/localops/IsIn.scala
index 9080243e1..84008acbd 100644
--- a/core/src/main/scala/org/locationtech/rasterframes/expressions/localops/IsIn.scala
+++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/localops/IsIn.scala
@@ -44,7 +44,7 @@ import org.locationtech.rasterframes.expressions._
   """,
   examples = """
     Examples:
-      > SELECT _FUNC_(tile, array);
+      > SELECT _FUNC_(tile, array(33, 66, 99));
          ..."""
 )
diff --git a/docs/src/main/paradox/release-notes.md b/docs/src/main/paradox/release-notes.md
index 391e453d4..b206df37a 100644
--- a/docs/src/main/paradox/release-notes.md
+++ b/docs/src/main/paradox/release-notes.md
@@ -6,6 +6,7 @@
 * _Breaking_ (potentially): removed `GeoTiffCollectionRelation` due to usage limitation and overlap with `RasterSourceDataSource` functionality. 
 * Upgraded to Spark 2.4.4
+ * Add `rf_local_is_in` raster function
 
 ### 0.8.3
 
diff --git a/pyrasterframes/src/main/python/docs/reference.pymd b/pyrasterframes/src/main/python/docs/reference.pymd
index 195b7e5e0..53c70e47b 100644
--- a/pyrasterframes/src/main/python/docs/reference.pymd
+++ b/pyrasterframes/src/main/python/docs/reference.pymd
@@ -183,7 +183,7 @@ Parameters `tile_columns` and `tile_rows` are literals, not column expressions.
 
     Tile rf_array_to_tile(Array arrayCol, Int numCols, Int numRows)
 
-Python only. Create a `tile` from a Spark SQL [Array](http://spark.apache.org/docs/2.3.2/api/python/pyspark.sql.html#pyspark.sql.types.ArrayType), filling values in row-major order.
+Python only. Create a `tile` from a Spark SQL [Array][Array], filling values in row-major order.
 
 ### rf_assemble_tile
 
@@ -374,6 +374,13 @@ Returns a `tile` column containing the element-wise equality of `tile1` and `rhs
 
 Returns a `tile` column containing the element-wise inequality of `tile1` and `rhs`.
 
+### rf_local_is_in
+
+    Tile rf_local_is_in(Tile tile, Array array)
+    Tile rf_local_is_in(Tile tile, list l)
+
+Returns a `tile` column with cell values of 1 where the `tile` cell value is in the provided array or list. The `array` is a Spark SQL [Array][Array]. A Python `list` of numeric values can also be passed.
+
 ### rf_round
 
     Tile rf_round(Tile tile)
@@ -621,13 +628,13 @@ Python only. As with @ref:[`rf_explode_tiles`](reference.md#rf-explode-tiles), b
 
     Array rf_tile_to_array_int(Tile tile)
 
-Convert Tile column to Spark SQL [Array](http://spark.apache.org/docs/2.3.2/api/python/pyspark.sql.html#pyspark.sql.types.ArrayType), in row-major order. Float cell types will be coerced to integral type by flooring.
+Convert Tile column to Spark SQL [Array][Array], in row-major order. Float cell types will be coerced to integral type by flooring.
 
 ### rf_tile_to_array_double
 
     Array rf_tile_to_arry_double(Tile tile)
 
-Convert tile column to Spark [Array](http://spark.apache.org/docs/2.3.2/api/python/pyspark.sql.html#pyspark.sql.types.ArrayType), in row-major order. Integral cell types will be coerced to floats.
+Convert tile column to Spark [Array][Array], in row-major order. Integral cell types will be coerced to floats. 
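
A brief usage sketch tying together the functions documented above (editorial addition; `df` is a hypothetical DataFrame with a tile column `t`):

```python
from pyspark.sql.functions import array, lit
from pyrasterframes.rasterfunctions import rf_local_is_in, rf_tile_to_array_int

result = df.select(
    rf_local_is_in('t', [1, 5, 10]).alias('in_list'),              # Python list form
    rf_local_is_in('t', array(lit(1), lit(5))).alias('in_array'),  # Spark SQL Array form
    rf_tile_to_array_int('t').alias('cells')                       # row-major cell values
)
```
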
### rf_render_ascii @@ -657,3 +664,4 @@ Runs [`rf_rgb_composite`](reference.md#rf-rgb-composite) on the given tile colum [RasterFunctions]: org.locationtech.rasterframes.RasterFunctions [scaladoc]: latest/api/index.html +[Array]: http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.types.ArrayType diff --git a/pyrasterframes/src/main/python/pyrasterframes/rasterfunctions.py b/pyrasterframes/src/main/python/pyrasterframes/rasterfunctions.py index 9c0e52f09..86adbb41b 100644 --- a/pyrasterframes/src/main/python/pyrasterframes/rasterfunctions.py +++ b/pyrasterframes/src/main/python/pyrasterframes/rasterfunctions.py @@ -260,14 +260,24 @@ def rf_local_unequal_int(tile_col, scalar): """Return a Tile with values equal 1 if the cell is not equal to a scalar, otherwise 0""" return _apply_scalar_to_tile('rf_local_unequal_int', tile_col, scalar) + def rf_local_no_data(tile_col): """Return a tile with ones where the input is NoData, otherwise zero.""" return _apply_column_function('rf_local_no_data', tile_col) + def rf_local_data(tile_col): """Return a tile with zeros where the input is NoData, otherwise one.""" return _apply_column_function('rf_local_data', tile_col) +def rf_local_is_in(tile_col, array): + """Return a tile with cell values of 1 where the `tile_col` cell is in the provided array.""" + from pyspark.sql.functions import array as sql_array, lit + if isinstance(array, list): + array = sql_array([lit(v) for v in array]) + + return _apply_column_function('rf_local_is_in', tile_col, array) + def _apply_column_function(name, *args): jfcn = RFContext.active().lookup(name) jcols = [_to_java_column(arg) for arg in args] diff --git a/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py b/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py index 7cda3b997..3bb2ce491 100644 --- a/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py +++ b/pyrasterframes/src/main/python/tests/PyRasterFramesTests.py @@ -131,7 +131,7 @@ def test_tile_udt_serialization(self): cells[1][1] = nd a_tile = Tile(cells, ct.with_no_data_value(nd)) round_trip = udt.fromInternal(udt.toInternal(a_tile)) - self.assertEquals(a_tile, round_trip, "round-trip serialization for " + str(ct)) + self.assertEqual(a_tile, round_trip, "round-trip serialization for " + str(ct)) schema = StructType([StructField("tile", TileUDT(), False)]) df = self.spark.createDataFrame([{"tile": a_tile}], schema) diff --git a/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py b/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py index ca17dc325..70fec0504 100644 --- a/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py +++ b/pyrasterframes/src/main/python/tests/RasterFunctionsTests.py @@ -347,8 +347,11 @@ def test_rf_local_data_and_no_data(self): import numpy as np from numpy.testing import assert_equal - t = Tile(np.array([[1, 3, 4], [5, 0, 3]]), CellType.uint8().with_no_data_value(5)) - #note the convert is due to issue #188 + nd = 5 + t = Tile( + np.array([[1, 3, 4], [nd, 0, 3]]), + CellType.uint8().with_no_data_value(nd)) + # note the convert is due to issue #188 df = self.spark.createDataFrame([Row(t=t)])\ .withColumn('lnd', rf_convert_cell_type(rf_local_no_data('t'), 'uint8')) \ .withColumn('ld', rf_convert_cell_type(rf_local_data('t'), 'uint8')) @@ -359,3 +362,34 @@ def test_rf_local_data_and_no_data(self): result_d = result['ld'] assert_equal(result_d.cells, np.invert(t.cells.mask)) + + def test_rf_local_is_in(self): + from pyspark.sql.functions import lit, array, col + from 
pyspark.sql import Row + import numpy as np + from numpy.testing import assert_equal + + nd = 5 + t = Tile( + np.array([[1, 3, 4], [nd, 0, 3]]), + CellType.uint8().with_no_data_value(nd)) + # note the convert is due to issue #188 + df = self.spark.createDataFrame([Row(t=t)]) \ + .withColumn('a', array(lit(3), lit(4))) \ + .withColumn('in2', rf_convert_cell_type( + rf_local_is_in(col('t'), array(lit(0), lit(4))), + 'uint8')) \ + .withColumn('in3', rf_convert_cell_type(rf_local_is_in('t', 'a'), 'uint8')) \ + .withColumn('in4', rf_convert_cell_type( + rf_local_is_in('t', array(lit(0), lit(4), lit(3))), + 'uint8')) \ + .withColumn('in_list', rf_convert_cell_type(rf_local_is_in(col('t'), [4, 1]), 'uint8')) + + result = df.first() + self.assertEqual(result['in2'].cells.sum(), 2) + assert_equal(result['in2'].cells, np.isin(t.cells, np.array([0, 4]))) + self.assertEqual(result['in3'].cells.sum(), 3) + self.assertEqual(result['in4'].cells.sum(), 4) + self.assertEqual(result['in_list'].cells.sum(), 2, + "Tile value {} should contain two 1s as: [[1, 0, 1],[0, 0, 0]]" + .format(result['in_list'].cells)) From e7b3b90bb9a35a1f2365f477405604c6241c6223 Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Mon, 21 Oct 2019 11:10:48 -0400 Subject: [PATCH 08/15] Close #310 move reference to static Signed-off-by: Jason T. Brown --- .../docs/reference.pymd => docs/src/main/paradox/reference.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pyrasterframes/src/main/python/docs/reference.pymd => docs/src/main/paradox/reference.md (100%) diff --git a/pyrasterframes/src/main/python/docs/reference.pymd b/docs/src/main/paradox/reference.md similarity index 100% rename from pyrasterframes/src/main/python/docs/reference.pymd rename to docs/src/main/paradox/reference.md From 52983e33990b7d30f103b583bebba162e638c19d Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Wed, 23 Oct 2019 14:21:44 -0400 Subject: [PATCH 09/15] Update doc to use rf_local_is_in when masking; fix #351 Signed-off-by: Jason T. Brown --- .../src/main/python/docs/nodata-handling.pymd | 24 +++-------- .../main/python/docs/supervised-learning.pymd | 43 ++++++++----------- 2 files changed, 26 insertions(+), 41 deletions(-) diff --git a/pyrasterframes/src/main/python/docs/nodata-handling.pymd b/pyrasterframes/src/main/python/docs/nodata-handling.pymd index c9fffe390..84af2cbd0 100644 --- a/pyrasterframes/src/main/python/docs/nodata-handling.pymd +++ b/pyrasterframes/src/main/python/docs/nodata-handling.pymd @@ -105,32 +105,23 @@ Drawing on @ref:[local map algebra](local-algebra.md) techniques, we will create ```python, def_mask from pyspark.sql.functions import lit -mask_part = unmasked.withColumn('nodata', rf_local_equal('scl', lit(0))) \ - .withColumn('defect', rf_local_equal('scl', lit(1))) \ - .withColumn('cloud8', rf_local_equal('scl', lit(8))) \ - .withColumn('cloud9', rf_local_equal('scl', lit(9))) \ - .withColumn('cirrus', rf_local_equal('scl', lit(10))) - -one_mask = mask_part.withColumn('mask', rf_local_add('nodata', 'defect')) \ - .withColumn('mask', rf_local_add('mask', 'cloud8')) \ - .withColumn('mask', rf_local_add('mask', 'cloud9')) \ - .withColumn('mask', rf_local_add('mask', 'cirrus')) - -cell_types = one_mask.select(rf_cell_type('mask')).distinct() +mask = unmasked.withColumn('mask', rf_local_is_in('scl', [0, 1, 8, 9, 10]) + +cell_types = mask.select(rf_cell_type('mask')).distinct() cell_types ``` Because there is not a NoData already defined, we will choose one. 
In this particular example, the minimum value is greater than zero, so we can use 0 as the NoData value. ```python, pick_nd -blue_min = one_mask.agg(rf_agg_stats('blue').min.alias('blue_min')) +blue_min = mask.agg(rf_agg_stats('blue').min.alias('blue_min')) blue_min ``` We can now construct the cell type string for our blue band's cell type, designating 0 as NoData. ```python, get_ct_string -blue_ct = one_mask.select(rf_cell_type('blue')).distinct().first()[0][0] +blue_ct = mask.select(rf_cell_type('blue')).distinct().first()[0][0] masked_blue_ct = CellType(blue_ct).with_no_data_value(0) masked_blue_ct.cell_type_name ``` @@ -139,9 +130,8 @@ Now we will use the @ref:[`rf_mask_by_value`](reference.md#rf-mask-by-value) to ```python, mask_blu with_nd = rf_convert_cell_type('blue', masked_blue_ct) -masked = one_mask.withColumn('blue_masked', - rf_mask_by_value(with_nd, 'mask', lit(1))) \ - .drop('nodata', 'defect', 'cloud8', 'cloud9', 'cirrus', 'blue') +masked = mask.withColumn('blue_masked', + rf_mask_by_value(with_nd, 'mask', lit(1))) ``` We can verify that the number of NoData cells in the resulting `blue_masked` column matches the total of the boolean `mask` _tile_ to ensure our logic is correct. diff --git a/pyrasterframes/src/main/python/docs/supervised-learning.pymd b/pyrasterframes/src/main/python/docs/supervised-learning.pymd index c66697032..99f095b1a 100644 --- a/pyrasterframes/src/main/python/docs/supervised-learning.pymd +++ b/pyrasterframes/src/main/python/docs/supervised-learning.pymd @@ -32,7 +32,7 @@ catalog_df = pd.DataFrame([ {b: uri_base.format(b) for b in cols} ]) -df = spark.read.raster(catalog_df, catalog_col_names=cols, tile_dimensions=(128, 128)) \ +df = spark.read.raster(catalog_df, catalog_col_names=cols, tile_dimensions=(256, 256)) \ .repartition(100) df = df.select( @@ -91,23 +91,12 @@ To filter only for good quality pixels, we follow roughly the same procedure as ```python, make_mask from pyspark.sql.functions import lit -mask_part = df_labeled \ - .withColumn('nodata', rf_local_equal('scl', lit(0))) \ - .withColumn('defect', rf_local_equal('scl', lit(1))) \ - .withColumn('cloud8', rf_local_equal('scl', lit(8))) \ - .withColumn('cloud9', rf_local_equal('scl', lit(9))) \ - .withColumn('cirrus', rf_local_equal('scl', lit(10))) - -df_mask_inv = mask_part \ - .withColumn('mask', rf_local_add('nodata', 'defect')) \ - .withColumn('mask', rf_local_add('mask', 'cloud8')) \ - .withColumn('mask', rf_local_add('mask', 'cloud9')) \ - .withColumn('mask', rf_local_add('mask', 'cirrus')) \ - .drop('nodata', 'defect', 'cloud8', 'cloud9', 'cirrus') - +df_labeled = df_labeled \ + .withColumn('mask', rf_local_is_in('scl', [0, 1, 8, 9, 10])) + # at this point the mask contains 0 for good cells and 1 for defect, etc # convert cell type and set value 1 to NoData -df_mask = df_mask_inv.withColumn('mask', +df_mask = df_labeled.withColumn('mask', rf_with_no_data(rf_convert_cell_type('mask', 'uint8'), 1.0) ) @@ -213,20 +202,26 @@ retiled.printSchema() ``` Take a look at a sample of the resulting output and the corresponding area's red-green-blue composite image. +Recall the label coding: 1 is forest (purple), 2 is cropland (green) and 3 is developed areas(yellow). 
```python, display_rgb sample = retiled \ - .select('prediction', rf_rgb_composite('red', 'grn', 'blu').alias('rgb')) \ + .select('prediction', 'red', 'grn', 'blu') \ .sort(-rf_tile_sum(rf_local_equal('prediction', lit(3.0)))) \ .first() -sample_rgb = sample['rgb'] -mins = np.nanmin(sample_rgb.cells, axis=(0,1)) -plt.imshow((sample_rgb.cells - mins) / (np.nanmax(sample_rgb.cells, axis=(0,1)) - mins)) -``` +sample_rgb = np.concatenate([sample['red'].cells[:, :, None], + sample['grn'].cells[ :, :, None], + sample['blu'].cells[ :, :, None]], axis=2) +# plot scaled RGB +scaling_quantiles = np.nanpercentile(sample_rgb, [3.00, 97.00], axis=(0,1)) +scaled = np.clip(sample_rgb, scaling_quantiles[0, :], scaling_quantiles[1, :]) +scaled -= scaling_quantiles[0, :] +scaled /= (scaling_quantiles[1, : ] - scaling_quantiles[0, :]) -Recall the label coding: 1 is forest (purple), 2 is cropland (green) and 3 is developed areas(yellow). +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5)) +ax1.imshow(scaled) -```python, display_prediction -display(sample['prediction']) +# display prediction +ax2.imshow(sample['prediction'].cells) ``` From 49cc14bb83d7ab0ffb983575530a4cffce8f5f14 Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Wed, 23 Oct 2019 14:37:32 -0400 Subject: [PATCH 10/15] Doc supervised, set tile size to 256 for visual Signed-off-by: Jason T. Brown --- .../src/main/python/docs/supervised-learning.pymd | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pyrasterframes/src/main/python/docs/supervised-learning.pymd b/pyrasterframes/src/main/python/docs/supervised-learning.pymd index 99f095b1a..81a81f634 100644 --- a/pyrasterframes/src/main/python/docs/supervised-learning.pymd +++ b/pyrasterframes/src/main/python/docs/supervised-learning.pymd @@ -32,7 +32,8 @@ catalog_df = pd.DataFrame([ {b: uri_base.format(b) for b in cols} ]) -df = spark.read.raster(catalog_df, catalog_col_names=cols, tile_dimensions=(256, 256)) \ +tile_size = 256 +df = spark.read.raster(catalog_df, catalog_col_names=cols, tile_dimensions=(tile_size, tile_size)) \ .repartition(100) df = df.select( @@ -193,10 +194,10 @@ scored = model.transform(df_mask.drop('label')) retiled = scored \ .groupBy('extent', 'crs') \ .agg( - rf_assemble_tile('column_index', 'row_index', 'prediction', 128, 128).alias('prediction'), - rf_assemble_tile('column_index', 'row_index', 'B04', 128, 128).alias('red'), - rf_assemble_tile('column_index', 'row_index', 'B03', 128, 128).alias('grn'), - rf_assemble_tile('column_index', 'row_index', 'B02', 128, 128).alias('blu') + rf_assemble_tile('column_index', 'row_index', 'prediction', tile_size, tile_size).alias('prediction'), + rf_assemble_tile('column_index', 'row_index', 'B04', tile_size, tile_size).alias('red'), + rf_assemble_tile('column_index', 'row_index', 'B03', tile_size, tile_size).alias('grn'), + rf_assemble_tile('column_index', 'row_index', 'B02', tile_size, tile_size).alias('blu') ) retiled.printSchema() ``` From 73be68cc0b6e0516fdbae3c42c8a256302e7af77 Mon Sep 17 00:00:00 2001 From: "Jason T. Brown" Date: Wed, 23 Oct 2019 14:58:44 -0400 Subject: [PATCH 11/15] Fix nodata doc Signed-off-by: Jason T. 
Brown --- pyrasterframes/src/main/python/docs/nodata-handling.pymd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyrasterframes/src/main/python/docs/nodata-handling.pymd b/pyrasterframes/src/main/python/docs/nodata-handling.pymd index 84af2cbd0..df7c30804 100644 --- a/pyrasterframes/src/main/python/docs/nodata-handling.pymd +++ b/pyrasterframes/src/main/python/docs/nodata-handling.pymd @@ -105,7 +105,7 @@ Drawing on @ref:[local map algebra](local-algebra.md) techniques, we will create ```python, def_mask from pyspark.sql.functions import lit -mask = unmasked.withColumn('mask', rf_local_is_in('scl', [0, 1, 8, 9, 10]) +mask = unmasked.withColumn('mask', rf_local_is_in('scl', [0, 1, 8, 9, 10])) cell_types = mask.select(rf_cell_type('mask')).distinct() cell_types From 45ed7e96c3435680cee6baf152e2724124c59df9 Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Tue, 5 Nov 2019 14:35:17 -0500 Subject: [PATCH 12/15] Ensure default tile size is applied to `raster` reader. --- .../rasterframes/datasource/raster/RasterSourceDataSource.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala index 061e9fb56..03b2fd0da 100644 --- a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala +++ b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/raster/RasterSourceDataSource.scala @@ -37,7 +37,7 @@ class RasterSourceDataSource extends DataSourceRegister with RelationProvider { override def shortName(): String = SHORT_NAME override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val bands = parameters.bandIndexes - val tiling = parameters.tileDims + val tiling = parameters.tileDims.orElse(Some(NOMINAL_TILE_DIMS)) val lazyTiles = parameters.lazyTiles val spec = parameters.pathSpec val catRef = spec.fold(_.registerAsTable(sqlContext), identity) From 4c398bb98eb43c59bed807608e2bd84ec772717f Mon Sep 17 00:00:00 2001 From: "Simeon H.K. Fitch" Date: Tue, 5 Nov 2019 16:57:26 -0500 Subject: [PATCH 13/15] Added forced truncation of WKT types in Markdown/HTML rendering. 
--- .../rasterframes/util/DataFrameRenderers.scala | 7 +++++-- .../locationtech/rasterframes/ExtensionMethodSpec.scala | 8 ++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/locationtech/rasterframes/util/DataFrameRenderers.scala b/core/src/main/scala/org/locationtech/rasterframes/util/DataFrameRenderers.scala index ae57edcf3..36872332f 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/util/DataFrameRenderers.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/util/DataFrameRenderers.scala @@ -24,12 +24,14 @@ package org.locationtech.rasterframes.util import geotrellis.raster.render.ColorRamps import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions.{base64, concat, concat_ws, length, lit, substring, when} +import org.apache.spark.sql.jts.JTSTypes import org.apache.spark.sql.types.{StringType, StructField} import org.locationtech.rasterframes.expressions.DynamicExtractors import org.locationtech.rasterframes.{rfConfig, rf_render_png, rf_resample} +import org.apache.spark.sql.rf.WithTypeConformity /** - * DataFrame extensiosn for rendering sample content in a number of ways + * DataFrame extension for rendering sample content in a number of ways */ trait DataFrameRenderers { private val truncateWidth = rfConfig.getInt("max-truncate-row-element-length") @@ -47,8 +49,9 @@ trait DataFrameRenderers { lit("\">") ) else { + val isGeom = WithTypeConformity(c.dataType).conformsTo(JTSTypes.GeometryTypeInstance) val str = resolved.cast(StringType) - if (truncate) + if (truncate || isGeom) when(length(str) > lit(truncateWidth), concat(substring(str, 1, truncateWidth), lit("...")) ) diff --git a/core/src/test/scala/org/locationtech/rasterframes/ExtensionMethodSpec.scala b/core/src/test/scala/org/locationtech/rasterframes/ExtensionMethodSpec.scala index 4f5fe3591..bb3894162 100644 --- a/core/src/test/scala/org/locationtech/rasterframes/ExtensionMethodSpec.scala +++ b/core/src/test/scala/org/locationtech/rasterframes/ExtensionMethodSpec.scala @@ -39,7 +39,7 @@ import scala.xml.parsing.XhtmlParser class ExtensionMethodSpec extends TestEnvironment with TestData with SubdivideSupport { lazy val rf = sampleTileLayerRDD.toLayer - describe("DataFrame exention methods") { + describe("DataFrame extension methods") { it("should maintain original type") { val df = rf.withPrefixedColumnNames("_foo_") "val rf2: RasterFrameLayer = df" should compile @@ -49,7 +49,7 @@ class ExtensionMethodSpec extends TestEnvironment with TestData with SubdivideSu "val Some(col) = df.spatialKeyColumn" should compile } } - describe("RasterFrameLayer exention methods") { + describe("RasterFrameLayer extension methods") { it("should provide spatial key column") { noException should be thrownBy { rf.spatialKeyColumn @@ -124,6 +124,10 @@ class ExtensionMethodSpec extends TestEnvironment with TestData with SubdivideSu val md3 = rf.toMarkdown(truncate=true, renderTiles = false) md3 shouldNot include(" Date: Fri, 8 Nov 2019 12:09:15 -0500 Subject: [PATCH 14/15] PR feedback. Regression fix. Release notes update. 
--- .../aggregates/TileRasterizerAggregate.scala | 2 +- ...11.h30v06.006.2019120033434_01.mrf.aux.xml | 92 ------------------- .../geotiff/GeoTiffDataSource.scala | 4 +- .../geotiff/GeoTiffDataSourceSpec.scala | 26 +++--- docs/src/main/paradox/release-notes.md | 7 +- 5 files changed, 23 insertions(+), 108 deletions(-) delete mode 100644 core/src/test/resources/MCD43A4.A2019111.h30v06.006.2019120033434_01.mrf.aux.xml diff --git a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala index 360ef93dd..6647f4258 100644 --- a/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala +++ b/core/src/main/scala/org/locationtech/rasterframes/expressions/aggregates/TileRasterizerAggregate.scala @@ -138,7 +138,7 @@ object TileRasterizerAggregate { } } - // Scan table and constuct what the TileLayerMetadata would be in the specified destination CRS. + // Scan table and construct what the TileLayerMetadata would be in the specified destination CRS. val tlm: TileLayerMetadata[SpatialKey] = df .select( ProjectedLayerMetadataAggregate( diff --git a/core/src/test/resources/MCD43A4.A2019111.h30v06.006.2019120033434_01.mrf.aux.xml b/core/src/test/resources/MCD43A4.A2019111.h30v06.006.2019120033434_01.mrf.aux.xml deleted file mode 100644 index 5a18f6944..000000000 --- a/core/src/test/resources/MCD43A4.A2019111.h30v06.006.2019120033434_01.mrf.aux.xml +++ /dev/null @@ -1,92 +0,0 @@ - - - LERC - PIXEL - - - 06121997 - MODIS - MODIS - Terra - Aqua - MODIS - MODIS - Passed - Passed was set as a default value. More algorithm will be developed - 0 - AMBRALS_V4.0R1 - v1.0500m - 15.0 - 463.312716527778 - volume - 2400 - 2400 - Day - Mandatory QA: - 0 = processed, good quality (full BRDF inversions) - 1 = processed, see other QA (magnitude BRDF inversions) - - 6.1 - 150.120692476232 - N - False - 75.0 - 86400 - 43200 - 19.9448109058663, 30.0666177912155, 29.9990071837477, 19.8789125843729 - 127.31379517564, 138.161359988435, 150.130532080915, 138.321766284772 - 1, 2, 3, 4 - HDFEOS_V2.19 - 30 - 10.5067/MODIS/MCD43A4.006 - 10.5067/MODIS/MCD43A4.006 - http://dx.doi.org - http://dx.doi.org - MYD09GA.A2019113.h30v06.006.2019115025936.hdf, MYD09GA.A2019114.h30v06.006.2019117021858.hdf, MYD09GA.A2019115.h30v06.006.2019117044251.hdf, MYD09GA.A2019116.h30v06.006.2019118031111.hdf, MYD09GA.A2019117.h30v06.006.2019119025916.hdf, MYD09GA.A2019118.h30v06.006.2019120030848.hdf, MOD09GA.A2019113.h30v06.006.2019115032521.hdf, MOD09GA.A2019114.h30v06.006.2019116030646.hdf, MOD09GA.A2019115.h30v06.006.2019117050730.hdf, MOD09GA.A2019116.h30v06.006.2019118032616.hdf, MOD09GA.A2019117.h30v06.006.2019119032020.hdf, MOD09GA.A2019118.h30v06.006.2019120032257.hdf, MCD43DB.A2019110.6.h30v06.hdf - MCD43A4.A2019111.h30v06.006.2019120033434.hdf - 6.1.34 - MODIS/Terra+Aqua BRDF/Albedo Nadir BRDF-Adjusted Ref Daily L3 Global - 500m - BRDF_Albedo_Band_Mandatory_Quality_Band1 - 0 - 500m - 29.9999999973059 - 1 - NOT SET - 0 - 0 - 0 - 100 - 0 - 6.0.42 - MODAPS - Linux minion7043 3.10.0-957.5.1.el7.x86_64 #1 SMP Fri Feb 1 14:54:57 UTC 2019 x86_64 x86_64 x86_64 GNU/Linux - 2019-04-30T03:34:48.000Z - 0 - 0 - 99 - 0 - 2019-04-13 - 00:00:00.000000 - 2019-04-28 - 23:59:59.999999 - processed once - further update is anticipated - Not Investigated - See http://landweb.nascom/nasa.gov/cgi-bin/QA_WWW/qaFlagPage.cgi?sat=aqua the product Science Quality status. 
- 06121997 - MCD43A4 - 19.9999999982039 - 2015 - 51030006 - concatenated flags - 0, 254 - 6 - 6 - 127.701332684185 - 255 - - - BRDF_Albedo_Band_Mandatory_Quality_Band1 - concatenated flags - - diff --git a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala index 256d6b38b..d236449ed 100644 --- a/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala +++ b/datasource/src/main/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSource.scala @@ -49,6 +49,7 @@ class GeoTiffDataSource def shortName() = GeoTiffDataSource.SHORT_NAME + /** Read single geotiff as a relation. */ def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) = { require(parameters.path.isDefined, "Valid URI 'path' parameter required.") sqlContext.withRasterFrames @@ -57,6 +58,7 @@ class GeoTiffDataSource GeoTiffRelation(sqlContext, p) } + /** Write dataframe containing bands into a single geotiff. Note: performs a driver collect, and is not "big data" friendly. */ override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { require(parameters.path.isDefined, "Valid URI 'path' parameter required.") val path = parameters.path.get @@ -67,8 +69,6 @@ class GeoTiffDataSource require(tileCols.nonEmpty, "Could not find any tile columns.") - - val destCRS = parameters.crs.orElse(df.asLayerSafely.map(_.crs)).getOrElse( throw new IllegalArgumentException("A destination CRS must be provided") ) diff --git a/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSourceSpec.scala b/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSourceSpec.scala index 817d7d5bf..c57737118 100644 --- a/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSourceSpec.scala +++ b/datasource/src/test/scala/org/locationtech/rasterframes/datasource/geotiff/GeoTiffDataSourceSpec.scala @@ -192,29 +192,36 @@ class GeoTiffDataSourceSpec } it("should write GeoTIFF without layer") { - val pr = col("proj_raster_b0") - val rf = spark.read.raster.withBandIndexes(0, 1, 2).load(rgbCogSamplePath.toASCIIString) - val out = Paths.get("target", "example2-geotiff.tif") - logger.info(s"Writing to $out") + val sample = rgbCogSample + val expectedExtent = sample.extent + val (expCols, expRows) = sample.tile.dimensions - withClue("explicit extent/crs") { + val rf = spark.read.raster.withBandIndexes(0, 1, 2).load(rgbCogSamplePath.toASCIIString) + + withClue("extent/crs columns provided") { + val out = Paths.get("target", "example2a-geotiff.tif") noException shouldBe thrownBy { rf .withColumn("extent", rf_extent(pr)) .withColumn("crs", rf_crs(pr)) - .write.geotiff.withCRS(LatLng).save(out.toString) + .write.geotiff.withCRS(sample.crs).save(out.toString) + checkTiff(out, expCols, expRows, expectedExtent, Some(sample.cellType)) } } - withClue("without explicit extent/crs") { + withClue("without extent/crs columns") { + val out = Paths.get("target", "example2b-geotiff.tif") noException shouldBe thrownBy { rf - .write.geotiff.withCRS(LatLng).save(out.toString) + .write.geotiff.withCRS(sample.crs).save(out.toString) + checkTiff(out, expCols, expRows, expectedExtent, Some(sample.cellType)) } } + withClue("with downsampling") { + val out = Paths.get("target", "example2c-geotiff.tif") noException 
+          rf
+            .write.geotiff
@@ -223,9 +230,6 @@ class GeoTiffDataSourceSpec
             .save(out.toString)
         }
       }
-
-      checkTiff(out, 128, 128,
-        Extent(-76.52586750038186, 36.85907177863949, -76.17461216980891, 37.1303690755922))
     }

     it("should produce the correct subregion from layer") {
diff --git a/docs/src/main/paradox/release-notes.md b/docs/src/main/paradox/release-notes.md
index 5a9c70c5b..c07c66536 100644
--- a/docs/src/main/paradox/release-notes.md
+++ b/docs/src/main/paradox/release-notes.md
@@ -4,9 +4,12 @@

 ### 0.8.4

-* _Breaking_ (potentially): removed `GeoTiffCollectionRelation` due to usage limitation and overlap with `RasterSourceDataSource` functionality.
 * Upgraded to Spark 2.4.4
-    * Add `rf_local_is_in` raster function
+* Added forced truncation of WKT types in Markdown/HTML rendering. ([#408](https://github.com/locationtech/rasterframes/pull/408))
+* Added `rf_local_is_in` raster function. ([#400](https://github.com/locationtech/rasterframes/pull/400))
+* Added partitioning to catalogs before processing in RasterSourceDataSource. ([#397](https://github.com/locationtech/rasterframes/pull/397))
+* Fixed bug where `rf_tile_dimensions` would cause unnecessary reading of tiles. ([#394](https://github.com/locationtech/rasterframes/pull/394))
+* _Breaking_ (potentially): removed `GeoTiffCollectionRelation` due to usage limitation and overlap with `RasterSourceDataSource` functionality.

 ### 0.8.3

From ba2848e08d79998163a092b26ddd42fc987c0e43 Mon Sep 17 00:00:00 2001
From: "Simeon H.K. Fitch"
Date: Fri, 8 Nov 2019 12:52:04 -0500
Subject: [PATCH 15/15] PR feedback.

---
 .../src/main/python/docs/raster-read.pymd     | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/pyrasterframes/src/main/python/docs/raster-read.pymd b/pyrasterframes/src/main/python/docs/raster-read.pymd
index 30befbabd..443ee0d96 100644
--- a/pyrasterframes/src/main/python/docs/raster-read.pymd
+++ b/pyrasterframes/src/main/python/docs/raster-read.pymd
@@ -55,19 +55,24 @@ display(tile)

 ## Multiple Singleband Rasters

-In this example, we show reading [two bands](https://en.wikipedia.org/wiki/Multispectral_image) of [Landsat 8](https://landsat.gsfc.nasa.gov/landsat-8/) imagery (red and near-infrared), combining them with `rf_normalized_difference` to compute NDVI. As described in the section on @ref:[catalogs](raster-catalogs.md), image URIs in a single row are assumed to be from the same scene/granule, and therefore compatible. This pattern is commonly used when multiple bands are stored in separate files.
+In this example, we show reading @ref:[two bands](concepts.md#band) of [Landsat 8](https://landsat.gsfc.nasa.gov/landsat-8/) imagery (red and near-infrared), combining them with `rf_normalized_difference` to compute [NDVI](https://en.wikipedia.org/wiki/Normalized_difference_vegetation_index), a common measure of vegetation health. As described in the section on @ref:[catalogs](raster-catalogs.md), image URIs in a single row are assumed to be from the same scene/granule, and therefore compatible. This pattern is commonly used when multiple bands are stored in separate files.
 ```python, multi_singleband
 bands = [f'B{b}' for b in [4, 5]]
 uris = [f'https://landsat-pds.s3.us-west-2.amazonaws.com/c1/L8/014/032/LC08_L1TP_014032_20190720_20190731_01_T1/LC08_L1TP_014032_20190720_20190731_01_T1_{b}.TIF' for b in bands]
 catalog = ','.join(bands) + '\n' + ','.join(uris)

-rf = spark.read.raster(catalog, bands) \
-    .withColumnRenamed('B4', 'red').withColumnRenamed('B5', 'NIR') \
-    .withColumn('longitude_latitude', st_reproject(st_centroid(rf_geometry('red')), rf_crs('red'), lit('EPSG:4326'))) \
-    .withColumn('NDVI', rf_normalized_difference('NIR', 'red')) \
-    .where(rf_tile_sum('NDVI') > 10000) \
-    .select('longitude_latitude', 'red', 'NIR', 'NDVI')
+rf = (spark.read.raster(catalog, bands)
+    # Adding semantic names
+    .withColumnRenamed('B4', 'red').withColumnRenamed('B5', 'NIR')
+    # Adding tile center point for reference
+    .withColumn('longitude_latitude', st_reproject(st_centroid(rf_geometry('red')), rf_crs('red'), lit('EPSG:4326')))
+    # Compute NDVI
+    .withColumn('NDVI', rf_normalized_difference('NIR', 'red'))
+    # For the purposes of inspection, filter out rows where there's not much vegetation
+    .where(rf_tile_sum('NDVI') > 10000)
+    # Order output
+    .select('longitude_latitude', 'red', 'NIR', 'NDVI'))
 display(rf)
 ```
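
Usage note for the series: `rf_local_is_in` takes a tile column and an array column, and returns a tile holding 1 in each cell whose value appears in the array and 0 otherwise. A minimal Scala sketch follows; the DataFrame `df`, its `quality_band` tile column, and the QA codes 0 and 254 are all hypothetical placeholders, not part of the patches above.

```scala
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{array, lit}
import org.locationtech.rasterframes._

// Hypothetical example: `df` and its "quality_band" tile column are
// placeholders, as are the QA codes 0 and 254.
def flagPassingCells(df: DataFrame): DataFrame =
  df.withColumn(
    "qa_pass", // tile of 1 where the cell value is in the array, else 0
    rf_local_is_in(df("quality_band"), array(lit(0), lit(254))))
```

The resulting 0/1 tile composes with other local ops such as `rf_local_multiply`, e.g. to zero out cells that fail a quality test.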