I need to scrape a website that allows robot access. Below is the content of its robots.txt file.
User-agent: *
Disallow:
Sitemap:https://www.sample.com/sitemap-index.xml
But when I try to fetch the website's content using Nokogiri, the request is detected and blocked.
# Fetch the search page (presumably through open-uri, which lets `open` accept a URL)
# and parse the response body into a Nokogiri HTML document.
# NOTE(review): VERIFY_NONE switches off TLS certificate verification for this request.
Nokogiri::HTML(open('https://www.sample.com/search?q=test', :ssl_verify_mode => OpenSSL::SSL::VERIFY_NONE))
Here is the output:
> (Document:0x3fda40e7cf70 {
name = "document",
children = [
#(DTD:0x3fda40e9591c { name = "html" }),
#(Element:0x3fda40e8c95c {
name = "html",
attributes = [ #(Attr:0x3fda4071a598 { name = "style", value = "height:100%" })],
children = [
#(Element:0x3fda3fefa28c {
name = "head",
children = [
#(Element:0x3fda401a3088 {
name = "meta",
attributes = [ #(Attr:0x3fda40ebd7a0 { name = "name", value = "ROBOTS" }), #(Attr:0x3fda40ebd778 { name = "content", value = "NOINDEX, NOFOLLOW" })]
}),
#(Element:0x3fda4074faf4 {
name = "meta",
attributes = [ #(Attr:0x3fda3ff0beec { name = "name", value = "format-detection" }), #(Attr:0x3fda3ff0bed8 { name = "content", value = "telephone=no" })]
}),
#(Element:0x3fda401ca700 {
name = "meta",
attributes = [ #(Attr:0x3fda401c2050 { name = "name", value = "viewport" }), #(Attr:0x3fda401c217c { name = "content", value = "initial-scale=1.0" })]
}),
#(Element:0x3fda4079a284 {
name = "meta",
attributes = [ #(Attr:0x3fda4078bfb8 { name = "http-equiv", value = "X-UA-Compatible" }), #(Attr:0x3fda4078bf04 { name = "content", value = "IE=edge,chrome=1" })]
})]
}),
#(Element:0x3fda407e2e6c {
name = "body",
attributes = [ #(Attr:0x3fda430205f0 { name = "style", value = "margin:0px;height:100%" })],
children = [
#(Element:0x3fda4072e2a0 {
name = "iframe",
attributes = [
#(Attr:0x3fda3ff45214 {
name = "src",
value = "/_Incapsula_Resource?SWUDNSAI=28&xinfo=5-66719320-0%200NNN%20RT%281543054979096%20247%29%20q%280%20-1%20-1%20-1%29%20r%280%20-1%29%20B12%284%2c315%2c0%29%20U2&incident_id=245000650118470008-256430953704260629&edet=12&cinfo=04000000"
}),
#(Attr:0x3fda3ff451d8 { name = "frameborder", value = "0" }),
#(Attr:0x3fda3ff451b0 { name = "width", value = "100%" }),
#(Attr:0x3fda3ff45188 { name = "height", value = "100%" }),
#(Attr:0x3fda3ff45174 { name = "marginheight", value = "0px" }),
#(Attr:0x3fda3ff4514c { name = "marginwidth", value = "0px" })],
children = [ #(Text "Request unsuccessful. Incapsula incident ID: 245000650118470008-256430953704260629")]
})]
})]
})]
})
How can I achieve this web scraping?
Related
My requirement is to create a dynamic resource Confluent Schema. Below is the schema.tf file.
Basically, I need to pass a list of objects so that a different schema is created for each entry by supplying its name and file attributes. What changes are needed to the highlighted "schema" file parameter below so that it can be driven by the for_each block?
# Registers one AVRO schema per entry of var.subject_name_avro in the Schema Registry cluster.
resource "confluent_schema" "sample_avro_schema" {
schema_registry_cluster {
id = confluent_schema_registry_cluster.essentials.id
}
rest_endpoint = confluent_schema_registry_cluster.essentials.rest_endpoint
# One instance per subject name; with toset(), each.key is the subject string itself.
for_each = toset(var.subject_name_avro)
subject_name = each.key
format = "AVRO"
# Every instance reads the same hard-coded schema file.
# (The ** markers are the question's highlighting, not HCL syntax.)
**schema = file("modules/confluent_kafka_cluster_dedicated/schemas/sample_schema_avro.avsc")**
credentials {
key = confluent_api_key.env-manager-schema-registry-api-key.id
secret = confluent_api_key.env-manager-schema-registry-api-key.secret
}
}
Variable declaration as below: variable.tf file
# Subject names for the AVRO schemas, declared as a flat list of strings
# with a single default entry.
variable "subject_name_avro" {
description = "AVRO Schema Name"
type = list(string)
default = ["avro-topic-value"]
}
And I am running this execution using .tfvars file:
subject_name_avro = ["avro-topic-1-value"]
My requirement is to include below changes in .tfvars file. Kindly suggest what resource and variable level changes to be done to include schema file parameter dynamically.
# Desired input shape: one object per schema, pairing a subject name with its schema file.
# NOTE(review): .tfvars files accept literal values only, so the file() calls below
# are not valid in this position.
subject_name_avro = [
{
subject_name_avro = "avro-topic-1-value"
schema = file("modules/confluent_kafka_cluster_dedicated/schemas/sample_schema_avro1.avsc")
},
{
subject_name_avro = "avro-topic-2-value"
schema = file("modules/confluent_kafka_cluster_dedicated/schemas/sample_schema_avro2.avsc")
},
]
Sample file content "sample_schema_avro.avsc"
{
"type": "record",
"namespace": "io.confluent.developer.avro",
"name": "Purchase",
"fields": [
{
"name": "item",
"type": "string"
},
{
"name": "amount",
"type": "double"
},
{
"name": "customer_id",
"type": "string"
}
]
}
You can't call file() in a .tfvars file; variable values there must be literals. In your case, pass only the path:
# Each entry pairs a subject name with the path of its schema file.
# The path is a plain string here; it is meant to be passed to file() inside the resource.
subject_name_avro = [
{
subject_name_avro = "avro-topic-1-value"
schema = "./modules/confluent_kafka_cluster_dedicated/schemas/sample_schema_avro1.avsc"
},
{
subject_name_avro = "avro-topic-2-value"
schema = "./modules/confluent_kafka_cluster_dedicated/schemas/sample_schema_avro2.avsc"
},
]
To iterate over this you can use count or for_each. With for_each it would be:
# Registers one AVRO schema per entry of var.subject_name_avro.
#
# The variable must be declared as
#   list(object({ subject_name_avro = string, schema = string }))
# because every entry carries both the subject name and the path of its schema file.
resource "confluent_schema" "sample_avro_schema" {
  # Key the instances by subject name instead of by list index. This keeps the
  # same instance addresses that toset() on a list of subject names produces,
  # and adding, removing or reordering entries does not disturb the other
  # instances. Subject names must be unique.
  for_each = { for s in var.subject_name_avro : s.subject_name_avro => s }

  schema_registry_cluster {
    id = confluent_schema_registry_cluster.essentials.id
  }
  rest_endpoint = confluent_schema_registry_cluster.essentials.rest_endpoint

  subject_name = each.value.subject_name_avro
  format       = "AVRO"
  # Load the schema text from the per-entry path given in the .tfvars file.
  schema = file(each.value.schema)

  credentials {
    key    = confluent_api_key.env-manager-schema-registry-api-key.id
    secret = confluent_api_key.env-manager-schema-registry-api-key.secret
  }
}
''' <summary>
''' Builds a paragraph containing a FORMCHECKBOX form field named "Check1",
''' a bookmark that is also named "Check1", and a trailing run with the
''' text "My check box".
''' </summary>
''' <returns>The assembled <see cref="Paragraph"/>.</returns>
Public Shared Function GenerateParagraph() As Paragraph
    ' Form-field definition carried by the field's "begin" character.
    Dim checkBoxSettings As New CheckBox(
        New AutomaticallySizeFormField(),
        New DefaultCheckboxFormFieldState() With {.Val = BooleanValues.Zero})
    Dim fieldData As New FormFieldData(
        New FormFieldName() With {.Val = "Check1"},
        New Enabled(),
        New CalculateOnExit() With {.Val = BooleanValues.Zero},
        checkBoxSettings)

    ' Children of the paragraph, declared in document order.
    Dim beginRun As New Run(New FieldChar(fieldData) With {.FieldCharType = FieldCharValues.Begin})
    Dim bookmarkOpen As New BookmarkStart() With {.Name = "Check1", .Id = 0}
    Dim codeRun As New Run(New FieldCode(" FORMCHECKBOX ") With {.Space = "preserve"})
    Dim endRun As New Run(New FieldChar() With {.FieldCharType = FieldCharValues.[End]})
    Dim bookmarkClose As New BookmarkEnd() With {.Id = 0}
    Dim labelRun As New Run(New Text("My check box"))

    Dim result As New Paragraph(beginRun, bookmarkOpen, codeRun, endRun, bookmarkClose, labelRun) With {
        .RsidParagraphAddition = "00784880",
        .RsidRunAdditionDefault = "00B77989"
    }
    Return result
End Function
I want to add table 'mm' to table 'fmenu.pages', but it doesn't work at all. Error: attempt to index a nil value (field 'main2'). It is raised by the last line. Code:
-- Menu state; the pages table is keyed by page name (currently only "main").
local fmenu = {
selected_button = 0,
menu = {
font = 1,
},
pages = {
["main"] = {
name = "name",
id = 1,
btns = {
{name = "name I", id = 1}
}
}
}
}
-- A second page, wrapped in a table that is keyed by its page name ("main2").
local mm = {
["main2"] = {
name = "name2",
id = 2,
btns = {
{name = "name I", id = 1},
{name = "name II", id = 2}
}
}
}
-- table.insert appends mm itself at the next integer index of fmenu.pages (1 here),
-- so the page ends up at fmenu.pages[1]["main2"] and fmenu.pages["main2"] stays nil.
table.insert(fmenu.pages, mm)
-- Indexing the nil fmenu.pages["main2"] is what raises the reported error.
print(fmenu.pages["main2"].name)
How about using table.merge from lua-stdlib?
-- Load lua-stdlib's table module under its own name so the built-in
-- `table` library is not shadowed.
local std_table = require("std.table")

-- Menu state; pages are keyed by page name.
local fmenu = {
  selected_button = 0,
  menu = { font = 1 },
  pages = {
    main = {
      name = "name",
      id = 1,
      btns = {
        { name = "name I", id = 1 },
      },
    },
  },
}

-- Additional page(s) to add, keyed by page name like fmenu.pages.
local mm = {
  main2 = {
    name = "name2",
    id = 2,
    btns = {
      { name = "name I", id = 1 },
      { name = "name II", id = 2 },
    },
  },
}

-- Merge the fields of mm into fmenu.pages, then read the merged page back by name.
std_table.merge(fmenu.pages, mm)
print(fmenu.pages["main2"].name)
I have searched the forums and stack overflow for it and still can't understand how to do this. I don't know how to parse the results for a specific album in the results. I want to get the photos for the profile pictures album. So far I got:
// Requests the current user's albums from the Graph API ("me/albums/", asking for the
// "name" and "photos" fields) and prints every key/value pair of each album dictionary.
// On failure it prints the error's localized description instead.
func retrieveFBProfileAlbum(){
print("RetrievingFBProfileAlbum")
FBSDKGraphRequest(graphPath: "me/albums/", parameters: ["fields": "name, photos"]).startWithCompletionHandler({ (connection, result, error) -> Void in
if (error == nil){
// let results = result as! NSDictionary
// print(result)
// The "data" entry is treated as an array with one element per album.
if let array: AnyObject = result["data"] {
for resultDict in array as! [AnyObject] {
// Elements that are not [String : AnyObject] dictionaries are skipped.
if let resultDict = resultDict as? [String : AnyObject] {
// Dumps every field of the album; nothing is filtered by album name here.
for (name, value) in resultDict{
print("NAME A: \(name), VALUE A \(value)")
}
}
}
}
}else{
print(error.localizedDescription)
}
})
}
I got the results as id, name, photos, id, name, photos. Not sure what to do with this :( I can print only the names of the albums but I don't know how to get the pics of Profile Pictures album:/
Please help as Ive spent many hours on this and still can't get it.
Here's the console output of the print("NAME A: ...") calls:
RetrievingFBProfileAlbum
NAME A: id, VALUE A 357463944266729
NAME A: photos, VALUE A {
data = (
{
"created_time" = "2016-05-04T11:04:02+0000";
id = 1192004844145964;
name = "Yess!!:) Our New Baby!:)100% Electric !";
},
{
"created_time" = "2016-04-29T12:58:29+0000";
id = 1189130857766696;
name = "Coming on Wed!!:)";
},
{
"created_time" = "2016-04-09T16:55:35+0000";
id = 1176951032318012;
name = "Celebrating the good times!:)\nSwietujemy dobre czasy!:)";
},
{
"created_time" = "2016-02-27T10:11:35+0000";
id = 1144990872180695;
name = "The card on the tv said: 'TV in working order. Help yourself' lol Found close to Canonbury station on the Pavement lol\nKartka na telewizorze mowi: 'Dzialajacy Telewizor Wez sobie' Znalezione niedaleko Canonbury Station w Londynie lol!";
},
{
"created_time" = "2016-02-20T21:44:56+0000";
id = 1141193462560436;
name = "Amazing gig in this place:)\nWspanialy koncert w tym miejscu:)";
},
{
"created_time" = "2016-02-20T15:00:47+0000";
id = 1141030565910059;
name = "Pretty rainy in Oxford today;)";
},
{
"created_time" = "2016-02-04T12:26:26+0000";
id = 1132017403478042;
},
{
"created_time" = "2016-02-04T12:15:08+0000";
id = 1132014160145033;
},
{
"created_time" = "2015-12-18T21:22:28+0000";
id = 1105697299443386;
},
{
"created_time" = "2015-12-18T21:13:21+0000";
id = 1105694492777000;
},
{
"created_time" = "2015-12-17T16:07:03+0000";
id = 1105120696167713;
name = "Oxford Circus! Lol!";
},
{
"created_time" = "2015-12-14T10:39:47+0000";
id = 1103646716315111;
name = "On the way to see our new house:)\nW drodze do naszego nowego domku:)";
},
{
"created_time" = "2015-12-12T16:08:24+0000";
id = 1102853349727781;
name = "I wonder what are they queuing for lol";
},
{
"created_time" = "2015-12-12T16:08:24+0000";
id = 1102853346394448;
name = "I wonder what are they queuing for lol";
},
{
"created_time" = "2015-12-05T13:38:44+0000";
id = 1099659323380517;
name = "Had a nice recording session today";
},
{
"created_time" = "2015-11-25T08:36:30+0000";
id = 1095105427169240;
name = "Morning Charm of London;)";
},
{
"created_time" = "2015-11-24T10:55:52+0000";
id = 1094653970547719;
name = "Working on a bigger app project for a client!:)";
},
{
"created_time" = "2015-11-21T19:03:42+0000";
id = 1093483963998053;
name = "Marta's Birthday Party!";
},
{
"created_time" = "2015-11-19T12:34:39+0000";
id = 1092463290766787;
},
{
"created_time" = "2015-11-19T12:07:40+0000";
id = 1092455410767575;
},
{
"created_time" = "2015-11-13T10:20:02+0000";
id = 1089819014364548;
name = "This picture was made by Jeremy Pollard";
},
{
"created_time" = "2015-11-12T22:59:17+0000";
id = 1089636391049477;
},
{
"created_time" = "2015-11-12T22:59:17+0000";
id = 1089636377716145;
},
{
"created_time" = "2015-11-12T22:59:17+0000";
id = 1089636341049482;
},
{
"created_time" = "2015-11-12T22:59:17+0000";
id = 1089636331049483;
}
);
paging = {
cursors = {
after = MTA4OTYzNjMzMTA0OTQ4MwZDZD;
before = MTE5MjAwNDg0NDE0NTk2NAZDZD;
};
next = "https://graph.facebook.com/v2.5/357463944266729/photos?access_token=EAARr4VZBUZB1sBAGkjT94xtdTntZBMlYfSGyeGdX7AV8UiejafZB3j3BUtcizUci4H7exNYhK8knZATvJlGqEK8ItBt7nA6avvNQmLp3FdLcxwK5as9kHvy10mmBDdP1m74LbLq07PfZBdZB7ZBYskogHthmjFSMlJ2XjAcWyGKh2fAQ2wgwmKVj59NlxjJMJmLLjFv6PPshfQZDZD&limit=25&after=MTA4OTYzNjMzMTA0OTQ4MwZDZD";
};
}
NAME A: name, VALUE A Mobile Uploads
NAME A: id, VALUE A 105303352816124
NAME A: photos, VALUE A {
data = (
{
"created_time" = "2016-03-17T13:03:22+0000";
id = 1157099090969873;
},
{
"created_time" = "2015-11-18T15:00:58+0000";
id = 1092093134137136;
},
{
"created_time" = "2015-11-17T16:03:07+0000";
id = 1091665917513191;
name = "Working on the app design for our newest productivity app! #productivity #wellbeing #mindfulness #ios #apps";
},
{
"created_time" = "2015-11-17T08:19:03+0000";
id = 1091515567528226;
name = "Morning Pages app coming soon!!";
},
{
"created_time" = "2015-11-10T21:23:09+0000";
id = 1088809341132182;
name = "Morning Pages app coming soon!!";
},
{
"created_time" = "2015-11-10T15:23:04+0000";
id = 1088690757810707;
name = "Working on the app design for our newest productivity app! #productivity #wellbeing #mindfulness #ios #apps";
},
{
"created_time" = "2015-11-09T15:22:10+0000";
id = 1088254854520964;
},
{
"created_time" = "2015-11-09T09:00:22+0000";
id = 1088140484532401;
name = "Code Imagination is working on a new app! #ios #morningpages #appdevelopment #appdesign";
},
{
"created_time" = "2015-11-08T19:20:02+0000";
id = 1087913804555069;
name = "Morning Ritual will clear your mind. Simple and already forgotten technique of well-being.";
},
{
"created_time" = "2015-11-07T16:03:03+0000";
id = 1087375167942266;
name = "My iPhone app is coming soon out! :)";
},
{
"created_time" = "2015-11-06T17:15:14+0000";
id = 1086951271317989;
},
{
"created_time" = "2015-11-05T18:01:33+0000";
id = 1086518038027979;
name = "Morning Ritual will clear your mind. Simple and already forgotten technique of well-being.";
},
{
"created_time" = "2015-11-05T17:15:12+0000";
id = 1086503548029428;
},
{
"created_time" = "2015-11-04T10:00:43+0000";
id = 1085954754750974;
},
{
"created_time" = "2015-11-03T18:00:11+0000";
id = 1085715074774942;
name = "Code Imagination is working on a therapeutic app! Coming Soon!";
},
{
"created_time" = "2015-11-03T13:10:04+0000";
id = 1085617851451331;
name = "Code Imagination is working on a new app! #ios #morningpages #appdevelopment #appdesign";
},
{
"created_time" = "2015-11-02T16:03:12+0000";
id = 1085240851489031;
name = ":)";
},
{
"created_time" = "2015-10-31T13:06:03+0000";
id = 1084283338251449;
name = "Morning Pages app coming soon!!";
},
{
"created_time" = "2015-10-31T08:19:08+0000";
id = 1084204508259332;
name = "Working on the app design for our newest productivity app! #productivity #wellbeing #mindfulness #ios #apps";
},
{
"created_time" = "2015-10-30T09:52:10+0000";
id = 1083819734964476;
name = "Working on a new app project!! :)";
},
{
"created_time" = "2015-10-22T16:43:44+0000";
id = 1080595525286897;
},
{
"created_time" = "2015-10-22T16:43:44+0000";
id = 1080595035286946;
},
{
"created_time" = "2015-08-22T05:57:16+0000";
id = 1050046911675092;
name = "Dziekuje wszystkim za zyczenia; )\nThx All for the Birthday Wishes :)";
},
{
"created_time" = "2015-05-20T10:12:04+0000";
id = 999296236750160;
name = "Little Bell in Japanese. Available soon! Meanwhile check the english version here: http://buff.ly/1GoTGIH";
},
{
"created_time" = "2015-05-19T14:18:40+0000";
id = 999010216778762;
name = "Little bell the Japanese Version available soon;))\nLittle Bell wersja Japonska dostepna wkrotce;)";
}
);
paging = {
cursors = {
after = OTk5MDEwMjE2Nzc4NzYy;
before = MTE1NzA5OTA5MDk2OTg3MwZDZD;
};
next = "https://graph.facebook.com/v2.5/105303352816124/photos?access_token=EAARr4VZBUZB1sBAGkjT94xtdTntZBMlYfSGyeGdX7AV8UiejafZB3j3BUtcizUci4H7exNYhK8knZATvJlGqEK8ItBt7nA6avvNQmLp3FdLcxwK5as9kHvy10mmBDdP1m74LbLq07PfZBdZB7ZBYskogHthmjFSMlJ2XjAcWyGKh2fAQ2wgwmKVj59NlxjJMJmLLjFv6PPshfQZDZD&limit=25&after=OTk5MDEwMjE2Nzc4NzYy";
};
}
I have pasted just the beginning of the results as it's quite long. I appreciate any help.
I have a variable in jQuery. First I add some values to it.
Now I want to add some more values to the existing variable.
Here is my code. If anyone knows how to do this, please help me.
// Base event fields, read from the form inputs.
var eventToAdd = {
title: $("#txtSubject").val(),
description: $("#addEventDesc").val(),
start: $("#txtStartdate").val(),
end: $("#txtEnddate").val()
};
// Hourly recurrence selected?
var ChkBox = document.getElementById("rbtnHourly");
if (ChkBox.checked == true) {
// NOTE: this assignment replaces the whole object, so the title/description/start/end
// values set above are discarded. (The ** markers are the question's highlighting,
// not JavaScript.)
**eventToAdd = {**
FREQ: "Hourly",
INTERVAL: $("#updown").val(),
BYSETPOS: "",
BYDAY: "",
BYMONTH: "",
BYMONTHDAY: ""
};
}
// Daily recurrence selected? (ChkBox is re-declared and now refers to the daily radio button.)
var ChkBox = document.getElementById("rbtnDaily");
if (ChkBox.checked == true) {
var rbtnEveryday = document.getElementById("rbtnEveryday");
if (rbtnEveryday.checked == true) {
// Same issue as above: eventToAdd is replaced rather than extended.
**eventToAdd = {**
FREQ: "Daily",
INTERVAL: $("#TextBox1").val(),
BYSETPOS: "",
BYDAY: "MO,TU,WE,TH,FR,SA,SU",
BYMONTH: "",
BYMONTHDAY: ""
};
}
else {
// This branch stores the rule in a separate variable, eventToAdd1,
// and leaves eventToAdd unchanged.
**var eventToAdd1 = {**
FREQ: "Daily",
INTERVAL: 1,
BYSETPOS: "",
BYDAY: "MO,TU,WE,TH,FR",
BYMONTH: "",
BYMONTHDAY: ""
};
};
}
I'm not entirely sure what you are asking but take a look at jQuery.extend()
http://api.jquery.com/jQuery.extend/
This enables you to extend an object with other values
You add values to an existing object by assigning properties to it, like this:
// Add the hourly recurrence fields to the existing eventToAdd object.
// Object.assign copies each listed property onto eventToAdd, which has the same
// effect as assigning them one by one; properties already on the object are kept.
if (ChkBox.checked) {
    Object.assign(eventToAdd, {
        FREQ: "Hourly",
        INTERVAL: $("#updown").val(),
        BYSETPOS: "",
        BYDAY: "",
        BYMONTH: "",
        BYMONTHDAY: ""
    });
}