Populating nested records in Avro using a GenericRecord

Suppose I’ve got the following schema:
{
  "name" : "Profile",
  "type" : "record",
  "fields" : [
    { "name" : "firstName", "type" : "string" },
    { "name" : "address", "type" : {
        "type" : "record",
        "name" : "AddressUSRecord",
        "fields" : [
          { "name" : "address1", "type" : "string" },
          { "name" : "address2", "type" : "string" },
          { "name" : "city", "type" : "string" },
          { "name" : "state", "type" : "string" },
          { "name" : "zip", "type" : "int" },
          { "name" : "zip4", "type" : "int" }
        ]
      }
    }
  ]
}
I’m using a GenericRecord to represent each Profile that gets created. To add a firstName, it’s easy to do the following:
Schema sch = Schema.parse(schemaFile);
DataFileWriter<GenericRecord> fw =
    new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>())
        .create(sch, new File(outFile));
GenericRecord r = new GenericData.Record(sch);
r.put("firstName", "John");
fw.append(r);
But how would I set the city, for example? How do I represent the key as a string that the r.put method can understand?
Thanks

For the schema above:
GenericRecord t = new GenericData.Record(sch.getField("address").schema());
t.put("city","beijing");
r.put("address",t);
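Putting it together, a rough end-to-end sketch (assuming the schema above is saved as profile.avsc; the file names and sample values are placeholders) could look like this:

import java.io.File;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class ProfileWriter {
    public static void main(String[] args) throws Exception {
        Schema sch = new Schema.Parser().parse(new File("profile.avsc"));

        // Build the nested AddressUSRecord from the "address" field's schema.
        GenericRecord address = new GenericData.Record(sch.getField("address").schema());
        address.put("address1", "1 Main St");
        address.put("address2", "");
        address.put("city", "Beijing");
        address.put("state", "NA");
        address.put("zip", 10000);
        address.put("zip4", 0);

        // Build the outer Profile record and attach the nested record.
        GenericRecord r = new GenericData.Record(sch);
        r.put("firstName", "John");
        r.put("address", address);

        try (DataFileWriter<GenericRecord> fw =
                 new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>(sch))) {
            fw.create(sch, new File("profiles.avro"));
            fw.append(r);
        }
    }
}

Since every field of AddressUSRecord is a required string or int, all of them have to be set before the record is appended.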

Related

Apache Avro Union type

I'm using the Avro 1.11.0 library to write data into Avro files with Python 3.7. I have some doubts about Avro's union type. Please find the two schemas below.
{
  "name" : "name",
  "type" : ["null", "string"],
  "columnName" : "name"
}
{
  "name" : "name",
  "type" : ["string", "null"],
  "columnName" : "name"
}
The first schema declares the union as "type" : ["null", "string"] and the second declares it as "type" : ["string", "null"].
So is there any difference between these two schemas?
The only difference is that the specification states that if you want to use a default value, it should correspond to the first type in the union.
For example, these would be valid:
{
  "name" : "name",
  "type" : ["null", "string"],
  "columnName" : "name",
  "default" : null
}
{
  "name" : "name",
  "type" : ["string", "null"],
  "columnName" : "name",
  "default" : "foo"
}
But these would not:
{
  "name" : "name",
  "type" : ["null", "string"],
  "columnName" : "name",
  "default" : "foo"
}
{
  "name" : "name",
  "type" : ["string", "null"],
  "columnName" : "name",
  "default" : null
}
Since a union that includes null tends to mean something like an optional field, most people would put null as the first option in the union so that they can set the default value to null.
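To illustrate with the Java API (a rough sketch; the record name Row and its single field are made up), a GenericRecordBuilder fills an unset field from its declared default, which is only accepted when the default corresponds to the first branch of the union, as described above:

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;

public class UnionDefaultDemo {
    public static void main(String[] args) {
        // "null" is the first branch of the union, so "default": null is valid.
        String json = "{\"type\":\"record\",\"name\":\"Row\",\"fields\":["
            + "{\"name\":\"name\",\"type\":[\"null\",\"string\"],\"default\":null}]}";
        Schema schema = new Schema.Parser().parse(json);

        // The unset field is filled in from its default value.
        GenericRecord defaulted = new GenericRecordBuilder(schema).build();
        System.out.println(defaulted);   // {"name": null}

        // An explicit value still works as usual.
        GenericRecord explicit = new GenericRecordBuilder(schema).set("name", "foo").build();
        System.out.println(explicit);    // {"name": "foo"}
    }
}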

Avro schema for record type with empty object

I am trying to create an Avro schema for the JSON below:
{
  "id" : "TEST",
  "status" : "status",
  "timestamp" : "2019-01-01T00:00:22-03:00",
  "comment" : "add comments or replace it with adSummary data",
  "error" : {
    "code" : "ER1212132",
    "msg" : "error message"
  }
}
The error object is optional; it could be:
"error" : {}
Below is the Avro schema without a default value:
{
  "type" : "record",
  "name" : "Order",
  "fields" : [ {
    "name" : "id",
    "type" : "string"
  }, {
    "name" : "status",
    "type" : "string"
  }, {
    "name" : "timestamp",
    "type" : "string"
  }, {
    "name" : "comment",
    "type" : [ "null", "string" ],
    "default" : null
  }, {
    "name" : "error",
    "type" : {
      "type" : "record",
      "name" : "error",
      "fields" : [ {
        "name" : "code",
        "type" : "string"
      }, {
        "name" : "msg",
        "type" : "string"
      } ]
    }
  } ]
}
How can I add a default value of {} for the error field?
{
  "type" : "record",
  "name" : "Order",
  "fields" : [ {
    "name" : "id",
    "type" : "string"
  }, {
    "name" : "status",
    "type" : "string"
  }, {
    "name" : "timestamp",
    "type" : "string"
  }, {
    "name" : "comment",
    "type" : [ "null", "string" ],
    "default" : null
  }, {
    "name" : "error",
    "type" : [ {
      "type" : "record",
      "name" : "EmptyError",
      "fields" : [ ]
    }, {
      "type" : "record",
      "name" : "Error",
      "fields" : [ {
        "name" : "code",
        "type" : "string"
      }, {
        "name" : "msg",
        "type" : "string"
      } ]
    } ],
    "default" : { }
  } ]
}
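A rough Java sketch of how the {} default fills in an omitted error field (the file name order.avsc is a placeholder, and EmptyError/Error are names chosen here because record branches of a union must be named):

import java.io.File;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;

public class OrderDefaultDemo {
    public static void main(String[] args) throws Exception {
        Schema schema = new Schema.Parser().parse(new File("order.avsc"));

        // "comment" and "error" are not set, so the builder uses their defaults:
        // null for "comment" and an empty EmptyError record for "error".
        GenericRecord order = new GenericRecordBuilder(schema)
            .set("id", "TEST")
            .set("status", "status")
            .set("timestamp", "2019-01-01T00:00:22-03:00")
            .build();

        System.out.println(order.get("error"));  // an empty record: {}
    }
}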

Hive table to view Avro records streamed using Flume: getting "Block size invalid or too large for this implementation: -40"

I am creating a Hive SerDe external table to view the Twitter records that are being streamed using Flume.
My property file:
# Naming the components on the current agent.
TwitterAgent.sources = Twitter
TwitterAgent.channels = MemChannel
TwitterAgent.sinks = HDFS
# Describing/Configuring the source
TwitterAgent.sources.Twitter.type = org.apache.flume.source.twitter.TwitterSource
TwitterAgent.sources.Twitter.consumerKey = xxx
TwitterAgent.sources.Twitter.consumerSecret = xxx
TwitterAgent.sources.Twitter.accessToken = xxx
TwitterAgent.sources.Twitter.accessTokenSecret = xxx
TwitterAgent.sources.Twitter.keywords = kafka
# Describing/Configuring the sink
TwitterAgent.sinks.HDFS.type = hdfs
TwitterAgent.sinks.HDFS.hdfs.path = hdfs://xxx:8000/topics/flumedata
TwitterAgent.sinks.HDFS.hdfs.fileType = DataStream
TwitterAgent.sinks.HDFS.hdfs.writeFormat = Text
TwitterAgent.sinks.HDFS.hdfs.batchSize = 10000
TwitterAgent.sinks.HDFS.hdfs.rollSize = 0
TwitterAgent.sinks.HDFS.hdfs.rollCount = 100000
TwitterAgent.sinks.hdfs.serializer=Text
# Describing/Configuring the channel
TwitterAgent.channels.MemChannel.type = memory
TwitterAgent.channels.MemChannel.capacity = 100000
TwitterAgent.channels.MemChannel.transactionCapacity = 1000
TwitterAgent.channels.MemChannel.byteCapacity = 6912212
# Binding the source and sink to the channel
TwitterAgent.sources.Twitter.channels = MemChannel
TwitterAgent.sinks.HDFS.channel = MemChannel
Query to create a Hive external table:
CREATE EXTERNAL TABLE twitter_tweets
COMMENT "just drop the schema right into the HQL"
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
TBLPROPERTIES (
'avro.schema.literal'='{
  "type" : "record",
  "name" : "Doc",
  "doc" : "adoc",
  "fields" : [ {
    "name" : "id",
    "type" : "string"
  }, {
    "name" : "user_friends_count",
    "type" : [ "int", "null" ]
  }, {
    "name" : "user_location",
    "type" : [ "string", "null" ]
  }, {
    "name" : "user_description",
    "type" : [ "string", "null" ]
  }, {
    "name" : "user_statuses_count",
    "type" : [ "int", "null" ]
  }, {
    "name" : "user_followers_count",
    "type" : [ "int", "null" ]
  }, {
    "name" : "user_name",
    "type" : [ "string", "null" ]
  }, {
    "name" : "user_screen_name",
    "type" : [ "string", "null" ]
  }, {
    "name" : "created_at",
    "type" : [ "string", "null" ]
  }, {
    "name" : "text",
    "type" : [ "string", "null" ]
  }, {
    "name" : "retweet_count",
    "type" : [ "long", "null" ]
  }, {
    "name" : "retweeted",
    "type" : [ "boolean", "null" ]
  }, {
    "name" : "in_reply_to_user_id",
    "type" : [ "long", "null" ]
  }, {
    "name" : "source",
    "type" : [ "string", "null" ]
  }, {
    "name" : "in_reply_to_status_id",
    "type" : [ "long", "null" ]
  }, {
    "name" : "media_url_https",
    "type" : [ "string", "null" ]
  }, {
    "name" : "expanded_url",
    "type" : [ "string", "null" ]
  } ]
}');
LOAD DATA INPATH '/topics/flumedata/FlumeData.*' OVERWRITE INTO TABLE twitter_tweets;
After creating the table, when I run select * from twitter_tweets; it does not return any data and throws an error:
org.apache.hive.service.cli.HiveSQLException: java.io.IOException: org.apache.avro.AvroRuntimeException: java.io.IOException: Block size invalid or too large for this implementation: -40
Where did I go wrong? I don't know why I am getting this block size issue. Can anyone guide me?

Apache NiFi not recognizing decimal type in ConvertJSONToAvro processor

I have a ConvertJSONToAvro processor in NiFi 1.4 and am having difficulty getting the proper decimal data type in the Avro. The data is extracted with Avro logical types (decimals as bytes) by an ExecuteSQL processor, converted from Avro to JSON with ConvertAvroToJSON, and then converted back with ConvertJSONToAvro before being written to HDFS with PutParquet.
My schema is:
{
  "type" : "record",
  "name" : "schema",
  "fields" : [ {
    "name" : "entryDate",
    "type" : [ "null", {
      "type" : "long",
      "logicalType" : "timestamp-micros"
    } ],
    "default" : null
  }, {
    "name" : "points",
    "type" : [ "null", {
      "type" : "bytes",
      "logicalType" : "decimal",
      "precision" : 18,
      "scale" : 6
    } ],
    "default" : null
  } ]
}
My JSON:
{
  "entryDate" : "2018-01-26T13:48:22.087",
  "points" : 6.000000
}
I get an error for the Avro saying:
Cannont convert field points: Cannot resolve union : {"bytes": "+|Ð" not in ["null", {"type":"bytes","logicalType":"decimal","precision":18,"scale":6}]"
Is there some type of workaround for this?
Currently you cannot mix the null type with logical types due to a bug in Avro. Check this still-unresolved issue:
https://issues.apache.org/jira/browse/AVRO-1891
Also, the default value cannot be null. This should work for you:
{
  "type" : "record",
  "name" : "schema",
  "fields" : [ {
    "name" : "entryDate",
    "type" : {
      "type" : "long",
      "logicalType" : "timestamp-micros"
    },
    "default" : 0
  }, {
    "name" : "points",
    "type" : {
      "type" : "bytes",
      "logicalType" : "decimal",
      "precision" : 18,
      "scale" : 6
    },
    "default" : ""
  } ]
}
For anyone interested, I was able to set the decimal with a default value of null (for cases when the field is null or missing), currently using NiFi 1.14.0:
{
  "name" : "value",
  "type" : [
    "null",
    {
      "type" : "bytes",
      "logicalType" : "decimal",
      "precision" : 8,
      "scale" : 4
    }
  ],
  "default" : null
}
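For background on why the value appears as unreadable characters in the error message: Avro's decimal logical type stores the unscaled value as big-endian two's-complement bytes. A rough sketch with the Java Avro library (the precision and scale match the question's points field; everything else is illustrative):

import java.math.BigDecimal;
import java.nio.ByteBuffer;

import org.apache.avro.Conversions;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;

public class DecimalBytesDemo {
    public static void main(String[] args) {
        // A bytes schema carrying the decimal(18, 6) logical type.
        Schema decimalSchema = LogicalTypes.decimal(18, 6)
            .addToSchema(Schema.create(Schema.Type.BYTES));

        Conversions.DecimalConversion conversion = new Conversions.DecimalConversion();

        // 6.000000 with scale 6 -> unscaled value 6000000, encoded as raw bytes.
        ByteBuffer encoded = conversion.toBytes(
            new BigDecimal("6.000000"), decimalSchema, decimalSchema.getLogicalType());

        // Decoding restores the original BigDecimal.
        BigDecimal decoded = conversion.fromBytes(
            encoded.duplicate(), decimalSchema, decimalSchema.getLogicalType());
        System.out.println(decoded);  // 6.000000
    }
}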

How to define type for a specific field in ElasticSearch for Rails

I am struggling with elasticsearch-rails.
I have the following mapping:
{
  "listings" : {
    "mappings" : {
      "listing" : {
        "properties" : {
          "address" : {
            "type" : "string"
          },
          "authorized" : {
            "type" : "boolean"
          },
          "categories" : {
            "properties" : {
              "created_at" : {
                "type" : "date",
                "format" : "dateOptionalTime"
              },
              "id" : {
                "type" : "long"
              },
              "name" : {
                "type" : "string"
              },
              "parent_id" : {
                "type" : "long"
              },
              "updated_at" : {
                "type" : "date",
                "format" : "dateOptionalTime"
              },
              "url_name" : {
                "type" : "string"
              }
            }
          },
          "cid" : {
            "type" : "string"
          },
          "city" : {
            "type" : "string"
          },
          "country" : {
            "type" : "string"
          },
          "created_at" : {
            "type" : "date",
            "format" : "dateOptionalTime"
          },
          "featured" : {
            "type" : "boolean"
          },
          "geojson" : {
            "type" : "string"
          },
          "id" : {
            "type" : "long"
          },
          "latitude" : {
            "type" : "string"
          },
          "longitude" : {
            "type" : "string"
          },
          "name" : {
            "type" : "string"
          },
          "phone" : {
            "type" : "string"
          },
          "postal" : {
            "type" : "string"
          },
          "province" : {
            "type" : "string"
          },
          "thumbnail_filename" : {
            "type" : "string"
          },
          "updated_at" : {
            "type" : "date",
            "format" : "dateOptionalTime"
          },
          "url" : {
            "type" : "string"
          }
        }
      }
    }
  }
}
I would like to change the type for the geojson field from string to geo_point so I can use the geo_shape query on it.
I tried this in my model:
settings index: { number_of_shards: 1 } do
  mappings dynamic: 'false' do
    indexes :geojson, type: 'geo_shape'
  end
end
with peculiar results. When I queried the mapping with $ curl 'localhost:9200/_all/_mapping?pretty', the geojson field still shows as type: string.
Within a Rails console, if I do Listing.mappings.to_hash, it seems to show that the geojson field is of type geo_shape.
And yet when running this query:
Listing.search(query: { fuzzy_like_this: { fields: [:name], like_text: "gap" } }, query: { fuzzy_like_this_field: { city: { like_text: "San Francisco" } } }, query: { geo_shape: { geojson: { shape: { type: :envelope, coordinates: [[37, -122],[38,-123]] } } } }); response.results.total; response.results.map { |r| puts "#{r._score} | #{r.name}, #{r.city} (lat: #{r.latitude}, lon: #{r.longitude})" }
ES complains that the geojson field is not of type geo_shape.
What am I missing? How do I tell ES that I want the geojson field to be of type geo_shape and not string?
The issue was that I didn't delete and recreate the mapping.
In the Rails console, I ran Model.__elasticsearch__.delete_index!, then Model.__elasticsearch__.create_index!, followed by Model.import.
