Support for JSONConverter in sink connector #71

Merged: 15 commits, May 10, 2022
1 change: 1 addition & 0 deletions CHANGES.md
@@ -1,4 +1,5 @@
# UNRELEASED
- [BREAKING CHANGE] Converter support for sink connector has changed. See README.md for details.
- [BREAKING CHANGE] Configuration parameters have changed for url, database, authentication, and last change sequence. See README.md for details.
- [UPGRADED] Connector now supports all authentication types via the `cloudant.auth.type` configuration parameter. When using an authentication type of "iam", the API key is configured via the `cloudant.apikey` configuration parameter.
- [UPGRADED] Upgraded Gradle distribution from 4.5.1 to 7.4
22 changes: 14 additions & 8 deletions README.md
@@ -29,16 +29,22 @@ the [Kafka Connector documentation](http://docs.confluent.io/3.0.1/connect/userg

1. `bootstrap.servers`
2. If using a standalone worker `offset.storage.file.filename`.
3. The following configuration when using the Cloudant connector as either a source or a sink:

Parameter | Value
---:|:---
key.converter|org.apache.kafka.connect.json.JsonConverter
value.converter|org.apache.kafka.connect.json.JsonConverter
key.converter.schemas.enable|true
value.converter.schemas.enable|true
### Converter configuration

Assume these settings are in the worker configuration file (`connect-standalone.properties` or `connect-distributed.properties`). The Kafka distribution defaults are usually as follows:
```
key.converter=org.apache.kafka.connect.json.JsonConverter
value.converter=org.apache.kafka.connect.json.JsonConverter
key.converter.schemas.enable=true
value.converter.schemas.enable=true
```

#### Converter configuration: sink connector
Contributor Author:
The source connector converter needs covering when we do the PR for that work; my intention is that we support JsonConverter on both source and sink, which simplifies things (as mentioned above, it's the default anyway, so there's no need to explicitly set it in config).


For the sink connector, Kafka keys are currently ignored; the key converter settings are therefore not relevant.

For the sink connector, we assume that the values in Kafka are serialized JSON objects, so `JsonConverter` is supported. If your values contain a schema (`{"schema": {...}, "payload": {...}}`), set `value.converter.schemas.enable=true`; otherwise set `value.converter.schemas.enable=false`. Any other converter that converts the message values into `org.apache.kafka.connect.data.Struct` or `java.util.Map` types should also work. Note, however, that the subsequent serialization of `Map` or `Struct` values to JSON documents in the sink may not match expectations if a schema has not been provided.
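
As an illustration of the two value formats (a minimal sketch, not part of the connector; it uses only the standard `org.apache.kafka.connect.json.JsonConverter` API, with made-up topic and field names):

```
import org.apache.kafka.connect.data.SchemaAndValue;
import org.apache.kafka.connect.json.JsonConverter;

import java.nio.charset.StandardCharsets;
import java.util.Collections;

public class ConverterSketch {
    public static void main(String[] args) {
        // schemas.enable=false: values are plain JSON objects, no envelope
        JsonConverter schemaless = new JsonConverter();
        schemaless.configure(Collections.singletonMap("schemas.enable", "false"), false /* isKey */);
        byte[] plain = "{\"_id\":\"doc1\",\"count\":1}".getBytes(StandardCharsets.UTF_8);
        SchemaAndValue withoutSchema = schemaless.toConnectData("example-topic", plain);
        System.out.println(withoutSchema.value()); // a java.util.Map; schema() is null

        // schemas.enable=true: values must carry the {"schema": ..., "payload": ...} envelope
        JsonConverter enveloped = new JsonConverter();
        enveloped.configure(Collections.singletonMap("schemas.enable", "true"), false);
        byte[] envelope = ("{\"schema\":{\"type\":\"struct\","
                + "\"fields\":[{\"field\":\"_id\",\"type\":\"string\"}]},"
                + "\"payload\":{\"_id\":\"doc1\"}}").getBytes(StandardCharsets.UTF_8);
        SchemaAndValue withSchema = enveloped.toConnectData("example-topic", envelope);
        System.out.println(withSchema.value());   // an org.apache.kafka.connect.data.Struct
    }
}
```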

### Authentication

2 changes: 2 additions & 0 deletions build.gradle
@@ -29,6 +29,8 @@ dependencies {
testImplementation group: 'org.powermock', name: 'powermock-api-easymock', version: '1.6.4'
testImplementation group: 'org.easymock', name: 'easymock', version: '3.4'
testImplementation group: 'com.carrotsearch', name: 'junit-benchmarks', version: '0.7.2'
// for logging output when running tests
testRuntimeOnly group: 'org.slf4j', name: 'slf4j-simple', version: '1.7.36'
}

// Java versions
src/main/java/com/ibm/cloudant/kafka/common/utils/JavaCloudantUtil.java
@@ -15,30 +15,24 @@

import com.ibm.cloud.cloudant.internal.ServiceFactory;
import com.ibm.cloud.cloudant.v1.Cloudant;
import com.ibm.cloud.cloudant.v1.model.BulkDocs;
import com.ibm.cloud.cloudant.v1.model.Document;
import com.ibm.cloud.cloudant.v1.model.DocumentResult;
import com.ibm.cloud.cloudant.v1.model.PostBulkDocsOptions;
import com.ibm.cloud.cloudant.v1.model.PutDatabaseOptions;
import com.ibm.cloud.cloudant.v1.model.*;
import com.ibm.cloud.sdk.core.service.exception.ServiceResponseException;
import com.ibm.cloudant.kafka.common.CloudantConst;
import com.ibm.cloudant.kafka.common.InterfaceConst;
import com.ibm.cloudant.kafka.common.MessageKey;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors;

public class JavaCloudantUtil {

@@ -67,7 +61,7 @@ public class JavaCloudantUtil {
);
}

public static JSONArray batchWrite(Map<String, String> props, JSONArray data)
public static JSONArray batchWrite(Map<String, String> props, List<Map<String, Object>> data)
throws JSONException {
// wrap result to JSONArray
JSONArray result = new JSONArray();
@@ -76,13 +70,7 @@ public static JSONArray batchWrite(Map<String, String> props, JSONArray data)
// get client object
Cloudant service = getClientInstance(props);

List<Document> listOfDocs = new ArrayList<>();
for(int i=0; i < data.length(); i++){
Map<String, Object> docProperties = data.getJSONObject(i).toMap();
Document doc = new Document();
doc.setProperties(docProperties);
listOfDocs.add(doc);
}
List<Document> listOfDocs = data.stream()
        .map(d -> { Document doc = new Document(); doc.setProperties(d); return doc; })
        .collect(Collectors.toList());

// attempt to create database
createTargetDb(service, props.get(InterfaceConst.DB));
@@ -116,6 +104,7 @@ public static JSONArray batchWrite(Map<String, String> props, JSONArray data)
result.put(jsonResult);
}
} catch (Exception e) {
LOG.error("Exception caught in batchWrite()", e);
Contributor Author:
This may need to be revisited in another PR - the worrying thing is that we were just swallowing exceptions from the Cloudant client, which I had managed to trigger with a misconfigured test.

Member:
Agreed, there needs to be a separate look at error handling to conform to the behaviours of the built-in `errors.tolerance=all` and `none` flags (`all` implies silently ignoring bad messages, so I guess that's all we have right now!).

Contributor:
Should we open a ticket specifically for investigating and improving error handling?

Member:
I made a note in my error-handling epic. Strictly speaking, I think we should iterate the results and push each failed document/message to the DLQ (or whatever error handling is configured), but I'm OK with us improving that later.

if(e.getMessage().equals(String.format(ResourceBundleUtil.get(
MessageKey.CLOUDANT_LIMITATION)))){
// try to put items from jsonResult before exception occurred
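One possible shape for the per-record error handling discussed in the thread above (a sketch only, not part of this PR; it assumes a Kafka Connect runtime of 2.6 or later, where `SinkTaskContext.errantRecordReporter()` exists, plus a hypothetical map of per-record failures produced by the bulk write):

```
import org.apache.kafka.connect.errors.ConnectException;
import org.apache.kafka.connect.sink.ErrantRecordReporter;
import org.apache.kafka.connect.sink.SinkRecord;
import org.apache.kafka.connect.sink.SinkTaskContext;

import java.util.List;
import java.util.Map;

class ErrorReportingSketch {

    // Hypothetical helper: instead of swallowing a failed batch, route each
    // failed record to whatever error handling is configured
    // (errors.tolerance, dead letter queue, ...).
    static void reportFailures(SinkTaskContext context,
                               List<SinkRecord> batch,
                               Map<SinkRecord, Exception> failures) {
        ErrantRecordReporter reporter = context.errantRecordReporter(); // null if no error reporting is configured
        for (SinkRecord record : batch) {
            Exception error = failures.get(record);
            if (error == null) {
                continue; // this record was committed successfully
            }
            if (reporter != null) {
                reporter.report(record, error); // DLQ'd or ignored per errors.tolerance
            } else {
                throw new ConnectException(error);
            }
        }
    }
}
```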
93 changes: 20 additions & 73 deletions src/main/java/com/ibm/cloudant/kafka/connect/CloudantSinkTask.java
@@ -18,20 +18,17 @@
import com.ibm.cloudant.kafka.common.MessageKey;
import com.ibm.cloudant.kafka.common.utils.JavaCloudantUtil;
import com.ibm.cloudant.kafka.common.utils.ResourceBundleUtil;

import com.ibm.cloudant.kafka.schema.ConnectRecordMapper;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.config.ConfigException;
import org.apache.kafka.connect.errors.ConnectException;
import org.apache.kafka.connect.sink.SinkRecord;
import org.apache.kafka.connect.sink.SinkTask;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.json.JSONTokener;

import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

@@ -48,64 +45,33 @@ public class CloudantSinkTask extends SinkTask {
private int taskNumber;
public static String guid_schema = null;
private Boolean replication;
public static volatile JSONArray jsonArray = new JSONArray();

private List<Map<String, Object>> jsonArray = new LinkedList<>();

private static ConnectRecordMapper<SinkRecord> mapper = new ConnectRecordMapper<>();

@Override
public String version() {
return new CloudantSinkConnector().version();
}


//TODO: all sinkRecords in first Thread
@Override
public void put(Collection<SinkRecord> sinkRecords) {

LOG.info("Thread[" + Thread.currentThread().getId() + "].sinkRecords = " + sinkRecords.size());

for (SinkRecord record : sinkRecords) {
JSONObject jsonRecord;

JSONTokener tokener = new JSONTokener(record.value().toString());
jsonRecord = new JSONObject(tokener);

if (jsonRecord.has(CloudantConst.CLOUDANT_REV)) {
jsonRecord.remove(CloudantConst.CLOUDANT_REV);
}

if(jsonRecord.has(CloudantConst.CLOUDANT_DOC_ID)){
if(replication == false) {
//Add archive schema from SinkRecord when available
jsonRecord.put(InterfaceConst.KC_SCHEMA, record.valueSchema());

//Create object id from kafka
jsonRecord.put(CloudantConst.CLOUDANT_DOC_ID,
record.topic() + "_" +
record.kafkaPartition().toString() + "_" +
Long.toString(record.kafkaOffset()) + "_" +
jsonRecord.get(CloudantConst.CLOUDANT_DOC_ID));
}
//OPTION B: IF replication == true => Do Nothing => Create mirror from Cloudant object

//OPTION C (not implemented): generate new id with cloudant
/*else {
LOG.info(MessageKey.GUID_SCHEMA + ": " + guid_schema);
LOG.warn(CloudantConst.CLOUDANT_DOC_ID + "from source database will removed");

//remove Cloudant _id
jsonRecord.remove(CloudantConst.CLOUDANT_DOC_ID);
}*/
}
jsonArray.put(jsonRecord);

if ((jsonArray != null) && (jsonArray.length() >= batch_size)) {

flush(null);

}
}
}

sinkRecords.stream()
.map(mapper) // Convert ConnectRecord to Map
.sequential() // Avoid concurrent access to jsonArray
.forEach(recordValueAsMap -> {
recordValueAsMap.remove(CloudantConst.CLOUDANT_REV); // Remove the _rev
jsonArray.add(recordValueAsMap);
if (jsonArray.size() >= batch_size) {
flush(null);
}
});
}

@Override
public void stop() {
@@ -137,31 +103,12 @@ public void start(Map<String, String> props) {
@Override
public void flush(Map<TopicPartition, org.apache.kafka.clients.consumer.OffsetAndMetadata> offsets) {
LOG.debug("Flushing output stream for {" + config.getString(InterfaceConst.URL) + "}");

try {

if ((jsonArray != null) && (jsonArray.length() > 0)) {

JSONArray results = JavaCloudantUtil.batchWrite(config.originalsStrings(), jsonArray);
LOG.info("Committed " + jsonArray.length() + " documents to -> " + config.getString(InterfaceConst.URL));

// The results array has a record for every single document commit
// Processing this is expensive!
if (results != null) {
/*
for (int i = 0; i < results.length(); i++) {
JSONObject result = (JSONObject) results.get(i);
LOG.debug(result.toString());
}
*/
}
}

} catch (JSONException e) {
LOG.error(e.getMessage(), e);
JavaCloudantUtil.batchWrite(config.originalsStrings(), jsonArray);
LOG.info("Committed " + jsonArray.size() + " documents to -> " + config.getString(InterfaceConst.URL));
} finally {
// Release memory (regardless if documents got committed or not)
jsonArray = new JSONArray(); ;
jsonArray = new LinkedList<>();
}
}

103 changes: 103 additions & 0 deletions src/main/java/com/ibm/cloudant/kafka/schema/ConnectRecordMapper.java
@@ -0,0 +1,103 @@
package com.ibm.cloudant.kafka.schema;

import org.apache.kafka.connect.connector.ConnectRecord;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.Schema.Type;
import org.apache.kafka.connect.data.Struct;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashMap;
import java.util.Map;
import java.util.function.Function;

public class ConnectRecordMapper<R extends ConnectRecord<R>> implements Function<ConnectRecord<R>, Map<String, Object>> {

private static Logger LOG = LoggerFactory.getLogger(ConnectRecordMapper.class);

public Map<String, Object> apply(ConnectRecord<R> record) {
// we can convert from a struct or a map - assume a map when a value schema is not provided
Schema.Type schemaType = record.valueSchema() == null ? Schema.Type.MAP : record.valueSchema().type();
Map<String, Object> toReturn = new HashMap<>();
switch (schemaType) {
case MAP:
if (record.value() instanceof Map) {
return convertMap((Map) record.value(), toReturn);
} else {
throw new IllegalArgumentException(String.format("Type %s not supported with schema of type Map (or no schema)",
record.value().getClass()));
}
case STRUCT:
if (record.value() instanceof Struct) {
return convertStruct((Struct) record.value(), toReturn);
} else {
throw new IllegalArgumentException(String.format("Type %s not supported with schema of type Struct",
record.value().getClass()));
}
default:
throw new IllegalArgumentException(String.format("Schema type %s not supported", record.valueSchema().type()));
}
}

// convert struct to map by adding key/values to passed in map, and returning it
private Map<String, Object> convertStruct(Struct struct, Map<String, Object> outMap) {
Schema schema = struct.schema();

// iterate fields and add to map
for (Field f : schema.fields()) {
Object value = struct.get(f);
outMap.put(f.name(), getField(f.schema().type(), value));
}
return outMap;
}

// convert kafka map to map by adding key/values to passed in map, and returning it
private Map<String, Object> convertMap(Map inMap, Map<String, Object> outMap) {

for (Object k : inMap.keySet()) {
if (k instanceof String) {
Object v = inMap.get(k);
if (v instanceof Map) {
outMap.put((String)k, convertMap((Map)v, new HashMap<>()));
} else if (v instanceof Struct) {
outMap.put((String)k, convertStruct((Struct)v, new HashMap<>()));
} else {
// assume that JSON serialiser knows how to deal with it
outMap.put((String)k, v);
}
} else {
throw new IllegalArgumentException("unsupported type in map key " + k.getClass());
}
}
return outMap;
}

// get field value, recursing if necessary for struct types
private Object getField(Type type, Object value) {

switch (type) {
// primitive types: just return value (JSON serialiser will deal with conversion later)
case ARRAY:
case BOOLEAN:
case BYTES:
case FLOAT32:
case FLOAT64:
case INT16:
case INT32:
case INT64:
case INT8:
case STRING:
return value;
// map/struct cases: chain a new map onto this one, as the value, and recursively fill in its contents
case MAP:
return convertMap((Map)value, new HashMap<>());
case STRUCT:
return convertStruct((Struct)value, new HashMap<>());
default:
throw new IllegalArgumentException("unknown type " + type);
}

}

}
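
A usage sketch for the new mapper (illustrative only; the topic and field names are made up), showing a `SinkRecord` carrying a `Struct` value being converted into the `Map` that `batchWrite` now accepts:

```
import com.ibm.cloudant.kafka.schema.ConnectRecordMapper;
import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.SchemaBuilder;
import org.apache.kafka.connect.data.Struct;
import org.apache.kafka.connect.sink.SinkRecord;

import java.util.Map;

public class ConnectRecordMapperExample {
    public static void main(String[] args) {
        // A value schema with a nested struct, as JsonConverter would produce
        // for an enveloped message when schemas.enable=true.
        Schema addressSchema = SchemaBuilder.struct()
                .field("city", Schema.STRING_SCHEMA)
                .build();
        Schema valueSchema = SchemaBuilder.struct()
                .field("_id", Schema.STRING_SCHEMA)
                .field("age", Schema.INT32_SCHEMA)
                .field("address", addressSchema)
                .build();

        Struct value = new Struct(valueSchema)
                .put("_id", "example-doc")
                .put("age", 42)
                .put("address", new Struct(addressSchema).put("city", "Bristol"));

        SinkRecord record = new SinkRecord("example-topic", 0,
                Schema.STRING_SCHEMA, "ignored-key",
                valueSchema, value, 0L);

        // The mapper walks the Struct, recursing into nested structs and maps,
        // and returns a plain Map suitable for building a Cloudant Document.
        Map<String, Object> doc = new ConnectRecordMapper<SinkRecord>().apply(record);
        System.out.println(doc); // e.g. {_id=example-doc, age=42, address={city=Bristol}} (key order may vary)
    }
}
```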