Apache Beam IllegalArgumentException on Google Dataflow with message `Not expecting a splittable ParDoSingle: should have been overridden` - google-cloud-dataflow

I am trying to write a pipeline which periodically checks a Google Storage bucket for new .gz files which are actually compressed .csv files. Then it writes those records to a BigQuery table. The following code was working in batch mode before I added the .watchForNewFiles(...) and .withMethod(STREAMING_INSERTS) parts. I am expecting it to run in streaming mode with those changes. However, I am getting an exception for which I can't find anything related on the web. Here is my code:
public static void main(String[] args) {
    DataflowDfpOptions options = PipelineOptionsFactory.fromArgs(args)
            //.withValidation()
            .as(DataflowDfpOptions.class);
    Pipeline pipeline = Pipeline.create(options);
    Stopwatch sw = Stopwatch.createStarted();
    log.info("DFP data transfer from GS to BQ has started.");
    pipeline.apply("ReadFromStorage", TextIO.read()
            .from("gs://my-bucket/my-folder/*.gz")
            .withCompression(Compression.GZIP)
            .watchForNewFiles(
                    // Check for new files every 30 seconds
                    Duration.standardSeconds(30),
                    // Never stop checking for new files
                    Watch.Growth.never()
            )
    )
            .apply("TransformToTableRow", ParDo.of(new TableRowConverterFn()))
            .apply("WriteToBigQuery", BigQueryIO.writeTableRows()
                    .to(options.getTableId())
                    .withMethod(STREAMING_INSERTS)
                    .withCreateDisposition(CREATE_NEVER)
                    .withWriteDisposition(WRITE_APPEND)
                    .withSchema(TableSchema)); //todo: use withJsonSchema(String json) instead
    pipeline.run().waitUntilFinish();
    log.info("DFP data transfer from GS to BQ is finished in {} seconds.", sw.elapsed(TimeUnit.SECONDS));
}
/**
 * Creates a TableRow from a CSV line
 */
private static class TableRowConverterFn extends DoFn<String, TableRow> {
    @ProcessElement
    public void processElement(ProcessContext c) throws Exception {
        String[] split = c.element().split(",");
        // Ignore the header line.
        // Since this is going to be run in parallel, we can't guarantee that
        // the first line passed to this method will be the header.
        if (split[0].equals("Time")) {
            log.info("Skipped header");
            return;
        }
        TableRow row = new TableRow();
        for (int i = 0; i < split.length; i++) {
            TableFieldSchema col = TableSchema.getFields().get(i);
            // String is the most common type, so it is checked first as a small optimization.
            if (col.getType().equals("STRING")) {
                row.set(col.getName(), split[i]);
            } else if (col.getType().equals("INTEGER")) {
                row.set(col.getName(), Long.valueOf(split[i]));
            } else if (col.getType().equals("BOOLEAN")) {
                row.set(col.getName(), Boolean.valueOf(split[i]));
            } else if (col.getType().equals("FLOAT")) {
                row.set(col.getName(), Float.valueOf(split[i]));
            } else {
                // Simply try to write it as a String if the type is not handled above.
                // todo: Consider other BQ data types.
                row.set(col.getName(), split[i]);
            }
        }
        c.output(row);
    }
}
And the stack trace:
java.lang.IllegalArgumentException: Not expecting a splittable ParDoSingle: should have been overridden
at org.apache.beam.repackaged.beam_runners_google_cloud_dataflow_java.com.google.common.base.Preconditions.checkArgument(Preconditions.java:122)
at org.apache.beam.runners.dataflow.PrimitiveParDoSingleFactory$PayloadTranslator.payloadForParDoSingle(PrimitiveParDoSingleFactory.java:167)
at org.apache.beam.runners.dataflow.PrimitiveParDoSingleFactory$PayloadTranslator.translate(PrimitiveParDoSingleFactory.java:145)
at org.apache.beam.runners.core.construction.PTransformTranslation.toProto(PTransformTranslation.java:206)
at org.apache.beam.runners.core.construction.SdkComponents.registerPTransform(SdkComponents.java:86)
at org.apache.beam.runners.core.construction.PipelineTranslation$1.visitPrimitiveTransform(PipelineTranslation.java:87)
at org.apache.beam.sdk.runners.TransformHierarchy$Node.visit(TransformHierarchy.java:668)
at org.apache.beam.sdk.runners.TransformHierarchy$Node.visit(TransformHierarchy.java:660)
at org.apache.beam.sdk.runners.TransformHierarchy$Node.visit(TransformHierarchy.java:660)
at org.apache.beam.sdk.runners.TransformHierarchy$Node.visit(TransformHierarchy.java:660)
at org.apache.beam.sdk.runners.TransformHierarchy$Node.visit(TransformHierarchy.java:660)
at org.apache.beam.sdk.runners.TransformHierarchy$Node.visit(TransformHierarchy.java:660)
at org.apache.beam.sdk.runners.TransformHierarchy$Node.access$600(TransformHierarchy.java:311)
at org.apache.beam.sdk.runners.TransformHierarchy.visit(TransformHierarchy.java:245)
at org.apache.beam.sdk.Pipeline.traverseTopologically(Pipeline.java:458)
at org.apache.beam.runners.core.construction.PipelineTranslation.toProto(PipelineTranslation.java:59)
at org.apache.beam.runners.dataflow.DataflowPipelineTranslator.translate(DataflowPipelineTranslator.java:165)
at org.apache.beam.runners.dataflow.DataflowRunner.run(DataflowRunner.java:684)
at org.apache.beam.runners.dataflow.DataflowRunner.run(DataflowRunner.java:173)
at org.apache.beam.sdk.Pipeline.run(Pipeline.java:311)
at org.apache.beam.sdk.Pipeline.run(Pipeline.java:297)
at com.diply.data.App.main(App.java:66)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.codehaus.mojo.exec.ExecJavaMojo$1.run(ExecJavaMojo.java:282)
at java.lang.Thread.run(Thread.java:748)
Here is my command to publish the job on Dataflow:
clean compile exec:java -Dexec.mainClass=com.my.project.App "-Dexec.args=--runner=DataflowRunner --tempLocation=gs://my-bucket/tmp --tableId=Temp.TestTable --project=my-project --jobName=dataflow-dfp-streaming" -Pdataflow-runner
I use Apache Beam version 2.5.0. Here is the relevant section from my pom.xml:
<properties>
    <beam.version>2.5.0</beam.version>
    <bigquery.version>v2-rev374-1.23.0</bigquery.version>
    <google-clients.version>1.23.0</google-clients.version>
    ...
</properties>

Running the code with Dataflow 2.4.0 gives a more explicit error: java.lang.UnsupportedOperationException: DataflowRunner does not currently support splittable DoFn
However, this answer suggests that splittable DoFn has been supported since 2.2.0. This is indeed the case, and following this remark you need to add the --streaming option to your -Dexec.args to force the pipeline into streaming mode.
I tested it with the code I supplied in the comments, with both your pom and mine, and in both cases it 1. produces your error without --streaming and 2. runs fine with --streaming.
You might want to open a Beam issue on GitHub, since this behavior is not documented anywhere officially as far as I know.
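For reference, this is the same launch command from the question with only the --streaming flag added (a sketch; every other argument is kept exactly as it was):
clean compile exec:java -Dexec.mainClass=com.my.project.App "-Dexec.args=--runner=DataflowRunner --streaming --tempLocation=gs://my-bucket/tmp --tableId=Temp.TestTable --project=my-project --jobName=dataflow-dfp-streaming" -Pdataflow-runner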

Related

Using C# to send an Avro message to Azure Event Hub and then de-serialize it using Scala Structured Streaming in Databricks 7.2 / Scala 3.0

So I have been banging my head against this for the last couple of days. I am having trouble de-serializing an Avro file that we are generating and sending into Azure Event Hub. We are attempting to do this with Databricks Runtime 7.2 Structured Streaming, using the newer from_avro method described here to de-serialize the body of the event message.
import org.apache.spark.eventhubs._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.avro._
import org.apache.avro._
import org.apache.spark.sql.types._
import org.apache.spark.sql.avro.functions._
val connStr = "<EventHubConnectionstring>"
val customEventhubParameters =
EventHubsConf(connStr.toString())
.setMaxEventsPerTrigger(5)
//.setStartingPosition(EventPosition.fromStartOfStream)
val incomingStream = spark
.readStream
.format("eventhubs")
.options(customEventhubParameters.toMap)
.load()
.filter($"properties".getItem("TableName") === "Branches")
val avroSchema = s"""{"type":"record","name":"Branches","fields":[{"name":"_src_ChangeOperation","type":["null","string"]},{"name":"_src_CurrentTrackingId","type":["null","long"]},{"name":"_src_RecordExtractUTCTimestamp","type":"string"},{"name":"ID","type":["null","int"]},{"name":"BranchCode","type":["null","string"]},{"name":"BranchName","type":["null","string"]},{"name":"Address1","type":["null","string"]},{"name":"Address2","type":["null","string"]},{"name":"City","type":["null","string"]},{"name":"StateID","type":["null","int"]},{"name":"ZipCode","type":["null","string"]},{"name":"Telephone","type":["null","string"]},{"name":"Contact","type":["null","string"]},{"name":"Title","type":["null","string"]},{"name":"DOB","type":["null","string"]},{"name":"TimeZoneID","type":["null","int"]},{"name":"ObserveDaylightSaving","type":["null","boolean"]},{"name":"PaySummerTimeHour","type":["null","boolean"]},{"name":"PayWinterTimeHour","type":["null","boolean"]},{"name":"BillSummerTimeHour","type":["null","boolean"]},{"name":"BillWinterTimeHour","type":["null","boolean"]},{"name":"Deleted","type":["null","boolean"]},{"name":"LastUpdated","type":["null","string"]},{"name":"txJobID","type":["null","string"]},{"name":"SourceID","type":["null","string"]},{"name":"HP_UseHolPayHourMethod","type":["null","boolean"]},{"name":"HP_HourlyRatePercent","type":["null","float"]},{"name":"HP_RequiredWeeksOfEmployment","type":["null","float"]},{"name":"rgUseSystemSettings","type":["null","boolean"]},{"name":"rgDutySplitBy","type":["null","int"]},{"name":"rgBasePeriodDate","type":["null","string"]},{"name":"rgFirstDayOfWeek","type":["null","int"]},{"name":"rgDutyStartOfDayTime","type":["null","string"]},{"name":"rgHolidayStartOfDayTime","type":["null","string"]},{"name":"rgMinimumTimePeriod","type":["null","int"]},{"name":"rgLoadPublicTable","type":["null","boolean"]},{"name":"rgPOTPayPeriodID","type":["null","int"]},{"name":"rgPOT1","type":["null","string"]},{"name":"rgPOT2","type":["null","string"]},{"name":"Facsimile","type":["null","string"]},{"name":"CountryID","type":["null","int"]},{"name":"EmailAddress","type":["null","string"]},{"name":"ContractSecurityHistoricalWeeks","type":["null","int"]},{"name":"ContractSecurityFutureWeeks","type":["null","int"]},{"name":"TimeLinkTelephone1","type":["null","string"]},{"name":"TimeLinkTelephone2","type":["null","string"]},{"name":"TimeLinkTelephone3","type":["null","string"]},{"name":"TimeLinkTelephone4","type":["null","string"]},{"name":"TimeLinkTelephone5","type":["null","string"]},{"name":"AutoTakeMissedCalls","type":["null","boolean"]},{"name":"AutoTakeMissedCallsDuration","type":["null","string"]},{"name":"AutoTakeApplyDurationToCheckCalls","type":["null","boolean"]},{"name":"AutoTakeMissedCheckCalls","type":["null","boolean"]},{"name":"AutoTakeMissedCheckCallsDuration","type":["null","string"]},{"name":"DocumentLocation","type":["null","string"]},{"name":"DefaultPortalAccess","type":["null","boolean"]},{"name":"DefaultPortalSecurityRoleID","type":["null","int"]},{"name":"EmployeeTemplateID","type":["null","int"]},{"name":"SiteCardTemplateID","type":["null","int"]},{"name":"TSAllowancesHeaderID","type":["null","int"]},{"name":"TSMinimumWageHeaderID","type":["null","int"]},{"name":"TimeLinkClaimMade","type":["null","boolean"]},{"name":"TSAllowancePeriodBaseDate","type":["null","string"]},{"name":"TSAllowancePeriodID","type":["null","int"]},{"name":"TSMinimumWageCalcMethodID","type":["null","int"]},{"name":"FlexibleShiftsHeaderID","type":["null","int"]},{"name":"SchedulingUseSystemSettings","type":["null","boolean"]},{"name":"MinimumRestPeriod","type":["null","int"]},{"name":"TSMealBreakHeaderID","type":["null","int"]},{"name":"ServiceTracImportType","type":["null","int"]},{"name":"StandDownDiaryEventID","type":["null","int"]},{"name":"ScheduledDutyChangeMessageTemplateId","type":["null","int"]},{"name":"ScheduledDutyAddedMessageTemplateId","type":["null","int"]},{"name":"ScheduledDutyRemovedMessageTemplateId","type":["null","int"]},{"name":"NegativeMessageResponsesPermitted","type":["null","boolean"]},{"name":"PortalEventsStandardLocFirst","type":["null","boolean"]},{"name":"ReminderMessage","type":["null","boolean"]},{"name":"ReminderMessageDaysBefore","type":["null","int"]},{"name":"ReminderMessageTemplateId","type":["null","int"]},{"name":"ScheduledDutyChangeMessageAllowReply","type":["null","boolean"]},{"name":"ScheduledDutyAddedMessageAllowReply","type":["null","boolean"]},{"name":"PayAlertEscalationGroup","type":["null","int"]},{"name":"BudgetedPay","type":["null","int"]},{"name":"PayAlertVariance","type":["null","string"]},{"name":"BusinessUnitID","type":["null","int"]},{"name":"APH_Hours","type":["null","float"]},{"name":"APH_Period","type":["null","int"]},{"name":"APH_PeriodCount","type":["null","int"]},{"name":"AveragePeriodHoursRuleId","type":["null","int"]},{"name":"HolidayScheduleID","type":["null","int"]},{"name":"AutomationRuleProfileId","type":["null","int"]}]}"""
val decoded_df = incomingStream
.select(
from_avro($"body",avroSchema).alias("payload")
)
val query1 = (
decoded_df
.writeStream
.format("memory")
.queryName("read_hub")
.start()
)
I have verified that the file we are sending has a valid schema, that it has data in it, and that it is getting to the streaming job in the notebook before failing with the following stack trace, which states that the data is malformed. However, I am able to write the generated file to a .avro file and de-serialize it using the normal .read.format("avro") method just fine.
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:413)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2$(WriteToDataSourceV2Exec.scala:361)
at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.writeWithV2(WriteToDataSourceV2Exec.scala:322)
at org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec.run(WriteToDataSourceV2Exec.scala:329)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:39)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:39)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.executeCollect(V2CommandExec.scala:45)
at org.apache.spark.sql.execution.collect.Collector$.callExecuteCollect(Collector.scala:118)
at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:69)
at org.apache.spark.sql.execution.collect.Collector$.collect(Collector.scala:88)
at org.apache.spark.sql.execution.ResultCacheManager.getOrComputeResult(ResultCacheManager.scala:508)
at org.apache.spark.sql.execution.ResultCacheManager.getOrComputeResult(ResultCacheManager.scala:480)
at org.apache.spark.sql.execution.SparkPlan.executeCollectResult(SparkPlan.scala:396)
at org.apache.spark.sql.Dataset.collectResult(Dataset.scala:2986)
at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3692)
at org.apache.spark.sql.Dataset.$anonfun$collect$1(Dataset.scala:2953)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3684)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$5(SQLExecution.scala:116)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:248)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:101)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:835)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:77)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:198)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3682)
at org.apache.spark.sql.Dataset.collect(Dataset.scala:2953)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$16(MicroBatchExecution.scala:586)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$5(SQLExecution.scala:116)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:248)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:101)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:835)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:77)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:198)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runBatch$15(MicroBatchExecution.scala:581)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:276)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:274)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:71)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runBatch(MicroBatchExecution.scala:581)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$2(MicroBatchExecution.scala:231)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken(ProgressReporter.scala:276)
at org.apache.spark.sql.execution.streaming.ProgressReporter.reportTimeTaken$(ProgressReporter.scala:274)
at org.apache.spark.sql.execution.streaming.StreamExecution.reportTimeTaken(StreamExecution.scala:71)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.$anonfun$runActivatedStream$1(MicroBatchExecution.scala:199)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:57)
at org.apache.spark.sql.execution.streaming.MicroBatchExecution.runActivatedStream(MicroBatchExecution.scala:193)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:346)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:259)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 37.0 failed 4 times, most recent failure: Lost task 0.3 in stage 37.0 (TID 84, 10.139.64.5, executor 0): org.apache.spark.SparkException: Malformed records are detected in record parsing. Current parse Mode: FAILFAST. To process malformed records as null result, try setting the option 'mode' as 'PERMISSIVE'.
at org.apache.spark.sql.avro.AvroDataToCatalyst.nullSafeEval(AvroDataToCatalyst.scala:111)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:731)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.$anonfun$run$7(WriteToDataSourceV2Exec.scala:438)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1615)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:477)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:385)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:144)
at org.apache.spark.scheduler.Task.run(Task.scala:117)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$9(Executor.scala:657)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1581)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:660)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ArrayIndexOutOfBoundsException: -40
at org.apache.avro.io.parsing.Symbol$Alternative.getSymbol(Symbol.java:424)
at org.apache.avro.io.ResolvingDecoder.doAction(ResolvingDecoder.java:290)
at org.apache.avro.io.parsing.Parser.advance(Parser.java:88)
at org.apache.avro.io.ResolvingDecoder.readIndex(ResolvingDecoder.java:267)
at org.apache.avro.generic.GenericDatumReader.readWithoutConversion(GenericDatumReader.java:179)
at org.apache.avro.generic.GenericDatumReader.read(GenericDatumReader.java:153)
at org.apache.avro.generic.GenericDatumReader.readField(GenericDatumReader.java:232)
at org.apache.avro.generic.GenericDatumReader.readRecord(GenericDatumReader.java:222)
at org.apache.avro.generic.GenericDatumReader.readWithoutConversion(GenericDatumReader.java:175)
at org.apache.avro.generic.GenericDatumReader.read(GenericDatumReader.java:153)
at org.apache.avro.generic.GenericDatumReader.read(GenericDatumReader.java:145)
at org.apache.spark.sql.avro.AvroDataToCatalyst.nullSafeEval(AvroDataToCatalyst.scala:100)
... 16 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2478)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2427)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2426)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2426)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1131)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1131)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1131)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2678)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2625)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2613)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:917)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2313)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:382)
... 46 more
Caused by: org.apache.spark.SparkException: Malformed records are detected in record parsing. Current parse Mode: FAILFAST. To process malformed records as null result, try setting the option 'mode' as 'PERMISSIVE'.
at org.apache.spark.sql.avro.AvroDataToCatalyst.nullSafeEval(AvroDataToCatalyst.scala:111)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:731)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.$anonfun$run$7(WriteToDataSourceV2Exec.scala:438)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1615)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:477)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:385)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:144)
at org.apache.spark.scheduler.Task.run(Task.scala:117)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$9(Executor.scala:657)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1581)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:660)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ArrayIndexOutOfBoundsException: -40
at org.apache.avro.io.parsing.Symbol$Alternative.getSymbol(Symbol.java:424)
at org.apache.avro.io.ResolvingDecoder.doAction(ResolvingDecoder.java:290)
at org.apache.avro.io.parsing.Parser.advance(Parser.java:88)
at org.apache.avro.io.ResolvingDecoder.readIndex(ResolvingDecoder.java:267)
at org.apache.avro.generic.GenericDatumReader.readWithoutConversion(GenericDatumReader.java:179)
at org.apache.avro.generic.GenericDatumReader.read(GenericDatumReader.java:153)
at org.apache.avro.generic.GenericDatumReader.readField(GenericDatumReader.java:232)
at org.apache.avro.generic.GenericDatumReader.readRecord(GenericDatumReader.java:222)
at org.apache.avro.generic.GenericDatumReader.readWithoutConversion(GenericDatumReader.java:175)
at org.apache.avro.generic.GenericDatumReader.read(GenericDatumReader.java:153)
at org.apache.avro.generic.GenericDatumReader.read(GenericDatumReader.java:145)
at org.apache.spark.sql.avro.AvroDataToCatalyst.nullSafeEval(AvroDataToCatalyst.scala:100)
... 16 more
Tech
C# Azure Function v3 .net core generating Avro file using Avro 1.8.2
Avro file is serialized to byte array using Generic Writer not Specific Writer and sent to Azure Event Hub
Databricks Runtime 7.2/Scala 3.0
Databricks notebooks written in Scala
Databricks Structured Stream Notebook to de-serialize the Avro message and send to delta lake table
NOT using the following
Event Hub Capture
Kafka
Schema registry
OK, so I just figured out what the issue was. It was in how we were generating the Avro message before sending it to Event Hub. In our serialization method we were using var writer = new GenericDatumWriter<GenericRecord>(schema); and an IFileWriter<GenericRecord> to write to a memory stream, and then just getting the byte array of that stream, as seen below.
public byte[] Serialize(DataCapture data)
{
var schema = GenerateSchema(data.Schema);
var writer = new GenericDatumWriter<GenericRecord>(schema);
using(var ms = new MemoryStream())
{
using (IFileWriter<GenericRecord> fileWriter = DataFileWriter<GenericRecord>.OpenWriter(writer, ms))
{
foreach (var jsonString in data.Rows)
{
var record = new GenericRecord(schema);
var obj = JsonConvert.DeserializeObject<JObject>(jsonString);
foreach (var column in data.Schema.Columns)
{
switch (MapDataType(column.DataTypeName))
{
case AvroTypeEnum.Boolean:
record.Add(column.ColumnName, obj.GetValue(column.ColumnName).Value<bool?>());
break;
//Map all datatypes ect....removed to shorten example
default:
record.Add(column.ColumnName, obj.GetValue(column.ColumnName).Value<string>());
break;
}
}
fileWriter.Append(record);
}
}
return ms.ToArray();
}
}
What we actually should do is use var writer = new DefaultWriter(schema); and var encoder = new BinaryEncoder(ms); and then write the records with writer.Write(record, encoder); before returning the byte array of the stream.
public byte[] Serialize(DataCapture data)
{
var schema = GenerateSchema(data.Schema);
var writer = new DefaultWriter(schema);
using (var ms = new MemoryStream())
{
var encoder = new BinaryEncoder(ms);
foreach (var jsonString in data.Rows)
{
var record = new GenericRecord(schema);
var obj = JsonConvert.DeserializeObject<JObject>(jsonString);
foreach (var column in data.Schema.Columns)
{
switch (MapDataType(column.DataTypeName))
{
case AvroTypeEnum.Boolean:
record.Add(column.ColumnName, obj.GetValue(column.ColumnName).Value<bool?>());
break;
//Map all datatypes ect....removed to shorten example
default:
record.Add(column.ColumnName, obj.GetValue(column.ColumnName).Value<string>());
break;
}
}
writer.Write(record, encoder);
}
return ms.ToArray();
}
}
So the lesson learned is that not all Avro memory streams converted to byte[] are the same. The from_avro method will only de-serialize Avro data that has been binary encoded with the BinaryEncoder class, not data created with the IFileWriter. If there is something that I should be doing instead, please let me know, but this fixed my issue. Hopefully my pain will spare others the same.

Micronaut ReadTimeoutException

I have a Grails 4 application providing a REST API. One of the endpoints sometimes fails with the following exception:
io.micronaut.http.client.exceptions.ReadTimeoutException: Read Timeout
at io.micronaut.http.client.exceptions.ReadTimeoutException.<clinit>(ReadTimeoutException.java:26)
at io.micronaut.http.client.DefaultHttpClient$10.exceptionCaught(DefaultHttpClient.java:1917)
at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:297)
at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:276)
at io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:268)
at io.netty.channel.CombinedChannelDuplexHandler$DelegatingChannelHandlerContext.fireExceptionCaught(CombinedChannelDuplexHandler.java:426)
at io.netty.channel.ChannelHandlerAdapter.exceptionCaught(ChannelHandlerAdapter.java:92)
at io.netty.channel.CombinedChannelDuplexHandler$1.fireExceptionCaught(CombinedChannelDuplexHandler.java:147)
at io.netty.channel.ChannelInboundHandlerAdapter.exceptionCaught(ChannelInboundHandlerAdapter.java:143)
at io.netty.channel.CombinedChannelDuplexHandler.exceptionCaught(CombinedChannelDuplexHandler.java:233)
at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:297)
at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:276)
at io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:268)
at io.netty.handler.timeout.ReadTimeoutHandler.readTimedOut(ReadTimeoutHandler.java:98)
at io.netty.handler.timeout.ReadTimeoutHandler.channelIdle(ReadTimeoutHandler.java:90)
at io.netty.handler.timeout.IdleStateHandler$ReaderIdleTimeoutTask.run(IdleStateHandler.java:505)
at io.netty.handler.timeout.IdleStateHandler$AbstractIdleTask.run(IdleStateHandler.java:477)
at io.netty.util.concurrent.PromiseTask$RunnableAdapter.call(PromiseTask.java:38)
at io.netty.util.concurrent.ScheduledFutureTask.run(ScheduledFutureTask.java:127)
at io.netty.util.concurrent.AbstractEventExecutor.safeExecute(AbstractEventExecutor.java:163)
at io.netty.util.concurrent.SingleThreadEventExecutor.runAllTasks(SingleThreadEventExecutor.java:405)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:500)
at io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:906)
at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
at java.base/java.lang.Thread.run(Thread.java:834)
The endpoint uses the Micronaut HTTP client to call other systems. The remote system takes a very long time to respond, causing the ReadTimeoutException.
Here is the code calling the remote service:
class RemoteTaskService implements GrailsConfigurationAware {
String taskStepperUrl
// initializes fields from configuration
void setConfiguration(Config config) {
taskStepperUrl = config.getProperty('services.stepper')
}
private BlockingHttpClient getTaskClient() {
HttpClient.create(taskStepperUrl.toURL()).toBlocking()
}
List<Map> loadTasksByProject(long projectId) {
try {
retrieveRemoteList("/api/tasks?projectId=${projectId}")
} catch(HttpClientResponseException e) {
log.error("Loading tasks of project failed with status: ${e.status.code}: ${e.message}")
throw new NotFoundException("No tasks found for project ${projectId}")
}
}
private List<Map> retrieveRemoteList(String path) {
HttpRequest request = HttpRequest.GET(path)
HttpResponse<List> response = taskClient.exchange(request, List) as HttpResponse<List>
response.body()
}
}
I've tried resolving it using the following configuration in my application.yml:
micronaut:
  server:
    read-timeout: 30
and
micronaut.http.client.read-timeout: 30
...with no success. Despite my configuration, the timeout still occurs around 10s after calling the endpoint.
How can I change the read timeout duration for the http rest client?
micronaut.http.client.read-timeout takes a duration, so you should add a measuring unit to the value, like 30s, 30m or 30h.
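For example, the property from the question would then read:
micronaut.http.client.read-timeout: 30s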
It seems that the configuration values are not injected in the manually created http clients.
A solution is to configure the HttpClient at creation, setting the readTimeout duration:
private BlockingHttpClient getTaskClient() {
    HttpClientConfiguration configuration = new DefaultHttpClientConfiguration()
    configuration.readTimeout = Duration.ofSeconds(30)
    new DefaultHttpClient(taskStepperUrl.toURL(), configuration).toBlocking()
}
In my case I was streaming a file from a client as
@Get(value = "${service-path}", processes = APPLICATION_OCTET_STREAM)
Flowable<byte[]> fullImportStream();
so when I got this, my first impulse was to increase the read-timeout value. However, for streaming scenarios the property that applies is read-idle-timeout, as stated in the docs: https://docs.micronaut.io/latest/guide/configurationreference.html#io.micronaut.http.client.DefaultHttpClientConfiguration
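For reference, a sketch of that property in application.yml (the 5m value is only an illustrative assumption; pick whatever idle timeout your stream needs):
micronaut:
  http:
    client:
      read-idle-timeout: 5m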

Writing to Google Cloud Storage from PubSub using Cloud Dataflow using windowing

I am receiving messages in Dataflow via Pub/Sub in streaming mode (which is required for my use case).
Each message should be stored in its own file in GCS.
Since unbounded collections are not supported by TextIO.Write, I tried to divide the PCollection into windows which contain one element each, and write each window to Google Cloud Storage.
Here is my code:
public static void main(String[] args) {
DataflowPipelineOptions options = PipelineOptionsFactory.create()
.as(DataflowPipelineOptions.class);
options.setRunner(BlockingDataflowPipelineRunner.class);
options.setProject(PROJECT_ID);
options.setStagingLocation(STAGING_LOCATION);
options.setStreaming(true);
Pipeline pipeline = Pipeline.create(options);
PubsubIO.Read.Bound<String> readFromPubsub = PubsubIO.Read.named("ReadFromPubsub")
.subscription(SUBSCRIPTION);
PCollection<String> streamData = pipeline.apply(readFromPubsub);
PCollection<String> windowedMessage = streamData.apply(Window.<String>triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1))).discardingFiredPanes());
windowedMessage.apply(TextIO.Write.to("gs://pubsub-outputs/1"));
pipeline.run();
}
I still receive the same error I got before windowing:
The DataflowPipelineRunner in streaming mode does not support TextIO.Write.
What is the code for executing what I described above?
TextIO works with bounded PCollections; you could write into GCS with the Storage API instead.
You could do:
PipeOptions options = data.getPipeline().getOptions().as(PipeOptions.class);
data.apply(WithKeys.of(new SerializableFunction<String, String>() {
        public String apply(String s) { return "mykey"; } }))
    .apply(Window.<KV<String, String>>into(FixedWindows.of(Duration.standardMinutes(options.getTimeWrite()))))
    .apply(GroupByKey.create())
    .apply(Values.<Iterable<String>>create())
    .apply(ParDo.of(new StorageWrite(options)));
You create a window with a GroupByKey operation, and then you can write the iterable into Storage. The processElement of StorageWrite looks like this:
PipeOptions options = c.getPipelineOptions().as(PipeOptions.class);
String date = ISODateTimeFormat.date().print(c.window().maxTimestamp());
String isoDate = ISODateTimeFormat.dateTime().print(c.window().maxTimestamp());
String blobName = String.format("%s/%s/%s", options.getBucketRepository(), date, options.getFileOutName() + isoDate);
BlobId blobId = BlobId.of(options.getGCSBucket(), blobName);
WriteChannel writer = storage.writer(BlobInfo.builder(blobId).contentType("text/plain").build());
for (Iterator<String> it = c.element().iterator(); it.hasNext();) {
writer.write(ByteBuffer.wrap(it.next().getBytes()));
}
writer.close();
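For completeness, here is a minimal sketch of how the StorageWrite DoFn wrapping that processElement body might be declared, assuming the Dataflow 1.x SDK used in the question and a google-cloud-storage client. PipeOptions and its getters are the custom options interface assumed above, and the exact Storage factory method names vary between gcloud-java versions, so treat the client setup as an assumption:
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageOptions;
import java.io.IOException;

// Sketch only: writes each window's grouped lines to a single GCS object.
// RequiresWindowAccess is needed in the Dataflow 1.x SDK so that c.window() can be called.
class StorageWrite extends DoFn<Iterable<String>, Void> implements DoFn.RequiresWindowAccess {
    private final PipeOptions options;   // custom options interface assumed by this answer
    private transient Storage storage;   // GCS client; transient because it is not serializable

    StorageWrite(PipeOptions options) {
        this.options = options;
    }

    @Override
    public void startBundle(Context c) {
        // Create the Storage client once per bundle on the worker.
        // Older gcloud-java versions use StorageOptions.defaultInstance().service() instead.
        storage = StorageOptions.getDefaultInstance().getService();
    }

    @Override
    public void processElement(ProcessContext c) throws IOException {
        // ... the body shown above goes here ...
    }
}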

Jenkins Pipeline Execute Multiple FreeStyleProjects in Parallel [duplicate]

The script is not iterating through all the values of the 'modules' array.
class Module {
public String name = '';
public Boolean isCustom = false;
public Module(String name, Boolean custom){
this.name = name;
this.isCustom = custom;
}
}
//creates array from the ext_module env var
modules = [];
EXT_MODULE.split(',').each {
modules.add(new Module(it, false));
}
println modules;
modules.each {
println "MODULE NAME ::::: ${it.name}"
if(it.isCustom)
{
println "install custom";
} else {
println "install non custom";
}
};
This is the result of the run. The array shows 4 elements, but the code inside the .each block only executes once.
Running: Print Message
[Module@71f09325, Module@e1ddb41, Module@7069a674, Module@1f68f952]
Running: Print Message
MODULE NAME ::::: puppetlabs-ntp
Running: Print Message
install non custom
Running: End of Workflow
Finished: SUCCESS
The messages "Running: Print Message" and "Running: End of Workflow" indicate that you are using the new workflow plugin: https://wiki.jenkins-ci.org/display/JENKINS/Workflow+Plugin. This plugin currently has a bug causing at least some Groovy iterations involving a closure to be aborted after one iteration: https://issues.jenkins-ci.org/browse/JENKINS-26481
The workaround is to simply use an old-school for loop (code below).
Also, @NonCPS is another workaround.
There is an open issue for this matter; see here: https://issues.jenkins-ci.org/browse/JENKINS-26481
Update, Oct 24th, 2016
/**
 * Dumps environment variables to the log, using an old school for loop.
 */
import com.cloudbees.groovy.cps.NonCPS

def version = '1.0'

@NonCPS
def dumpEnvVars() {
    def str = "Dumping build environment variables...\n"
    for (Map.Entry<String, String> entry : currentBuild.build().environment) {
        str += " ${entry.key} = ${entry.value}\n"
    }
    echo str
}

return this;
As of yesterday, the new Pipeline plugin was delivered in version 2.0 and corrects this problem.
.each closures now work, but .collect still only iterates once.

can't run the automated project in testcomplete when it calls from jenkins

I can't run the automated project in TestComplete when it is called from Jenkins.
In our continuous integration setup, the project is automated using TestComplete and it is called through Jenkins with the help of a bat file. The script inside the bat file is:
"C:\Program Files\Automated QA\TestComplete 7\Bin\TestComplete.exe " "D:\Test Complete7 Projects\ProjectInput_AllSamples\ProjecInputs.pjs" /r /p:Samples /rt:Main "iexplore" /e
It will open TestComplete and Internet Explorer, but it is not filling in the data (the automation does not run).
It works perfectly when we call the bat file directly, without Jenkins. Is there any solution?
From your description it sounds like something in Windows is stopping your test application from working normally. It might be that the second user is the problem, but I can't confirm that, as I was not able to find any definite explanation of how this works in Windows XP. I am pretty sure that this won't work on a Windows Vista, 7, 8 or server machine though, because of the changes in architecture.
It sounds like the best solution is to make sure that your automated UI tests are started by an interactive user. When I was trying to add automated testing to our builds we used TestComplete 7 on a Windows XP SP2 virtual machine. In order to start our tests as an interactive user we:
Made a user log on when Windows started; this way there was always an interactive user, which means there was an actual desktop session with access to the keyboard / mouse. I seem to remember (but can't find any links at the moment) that without an interactive user there is no active desktop that can access the keyboard / mouse.
We wrote a little app that would start when the interactive user logged on. This app would look at a specific file and when that file changed / was created it would read the file and start the application. The code for this app looked somewhat like this:
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace ApplicationStarter
{
class Program
{
// The string used to indicate that the application should quit.
private const string ExitString = "exit";
// The path which is being watched for changes.
private static string s_LoadFilePath;
static void Main(string[] args)
{
try
{
{
Debug.Assert(
args != null,
"The arguments array should not be null.");
Debug.Assert(
args.Length == 1,
"There should only be one argument.");
}
s_LoadFilePath = args[0];
{
Console.WriteLine(
string.Format(
CultureInfo.InvariantCulture,
"Watching: {0}",
s_LoadFilePath));
}
if (File.Exists(s_LoadFilePath))
{
RunApplication(s_LoadFilePath);
}
using (var watcher = new FileSystemWatcher())
{
watcher.IncludeSubdirectories = false;
watcher.NotifyFilter =
NotifyFilters.LastAccess
| NotifyFilters.LastWrite
| NotifyFilters.FileName
| NotifyFilters.DirectoryName;
watcher.Path = Path.GetDirectoryName(s_LoadFilePath);
watcher.Filter = Path.GetFileName(s_LoadFilePath);
try
{
watcher.Created += OnConfigFileCreate;
watcher.EnableRaisingEvents = true;
// Now just sit here and wait until hell freezes over
// or until the user tells us that it has
string line = string.Empty;
while (!string.Equals(line, ExitString, StringComparison.OrdinalIgnoreCase))
{
line = Console.ReadLine();
}
}
finally
{
watcher.Created -= OnConfigFileCreate;
}
}
}
catch (Exception e)
{
Console.WriteLine(e);
}
}
private static void RunApplication(string configFilePath)
{
var appPath = string.Empty;
var arguments = string.Empty;
using (var reader = new StreamReader(configFilePath, Encoding.UTF8))
{
appPath = reader.ReadLine();
arguments = reader.ReadLine();
}
// Run the application
StartProcess(appPath, arguments);
}
private static void StartProcess(string path, string arguments)
{
var startInfo = new ProcessStartInfo();
{
startInfo.FileName = path;
startInfo.Arguments = arguments;
startInfo.ErrorDialog = false;
startInfo.UseShellExecute = true;
startInfo.RedirectStandardOutput = false;
startInfo.RedirectStandardError = false;
}
Console.WriteLine(
string.Format(
CultureInfo.InvariantCulture,
"{0} Starting process {1}",
DateTime.Now,
path));
using (var exec = new Process())
{
exec.StartInfo = startInfo;
exec.Start();
}
}
private static void OnConfigFileCreate(
object sender,
FileSystemEventArgs e)
{
Console.WriteLine(
string.Format(
CultureInfo.InvariantCulture,
"{0} File change event ({1}) for: {2}",
DateTime.Now,
e.ChangeType,
e.FullPath));
// See that the file is there. If so then start the app
if (File.Exists(e.FullPath) &&
string.Equals(s_LoadFilePath, e.FullPath, StringComparison.OrdinalIgnoreCase))
{
// Wait for a bit so that the file is no
// longer locked by other processes
Thread.Sleep(500);
// Now run the application
RunApplication(e.FullPath);
}
}
}
}
This app expects the file to have 2 lines, the first with the app you want to start and the second with the arguments, so in your case something like this:
C:\Program Files\Automated QA\TestComplete 7\Bin\TestComplete.exe
"D:\Test Complete7 Projects\ProjectInput_AllSamples\ProjecInputs.pjs" /r /p:Samples /rt:Main "iexplore" /e
You should be able to generate this file from Jenkins in a build step.
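For example, a sketch of an "Execute Windows batch command" build step that writes such a two-line file (D:\watched\start.txt is a hypothetical path; it should be whatever file the watcher app was started with):
echo C:\Program Files\Automated QA\TestComplete 7\Bin\TestComplete.exe> D:\watched\start.txt
echo "D:\Test Complete7 Projects\ProjectInput_AllSamples\ProjecInputs.pjs" /r /p:Samples /rt:Main "iexplore" /e>> D:\watched\start.txt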
Finally, you may need to watch the TestComplete process for exit so that you can grab the results at the end, but I'll leave that as an exercise to the reader.
If you are running Jenkins (either master or slave) as a windows service, ensure it is running as a user and not as Local System.
We also do the same as Gentlesea recommends: we run TestExecute on our Jenkins slaves and keep the TestComplete licenses for the people designing the TestComplete scripts.
