cft

Web scraping Google Trends with Node.js

A step-by-step tutorial on creating a Google Trends web scraper in Node.js.


user

Mikhail Zub

a year ago | 29 min read

Full code

If you don't need an explanation, have a look at the full code example in the online IDE

// puppeteer-extra is a drop-in replacement for puppeteer that supports plugins.
const puppeteer = require("puppeteer-extra");

// Stealth plugin hides common headless/webdriver fingerprints from websites.
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

puppeteer.use(StealthPlugin());

const searchQueries = ["Mercedes"]; // what we want to search (for interestOverTime, interestByRegion, relatedQueries, relatedTopics)

// const searchQueries = ["Mercedes", "BMW", "Audi"]; // what we want to search (for interestOverTime, comparedByRegion, interestByRegion, relatedQueries)

// Google Trends "explore" URL; queries are comma-joined and URI-encoded.
const URL = `https://trends.google.com/trends/explore?q=${encodeURI(searchQueries.join(","))}&hl=en`;

/**
 * Scrapes Google Trends "explore" results for the configured `searchQueries`
 * by loading the page in Chromium and intercepting the widget XHR responses.
 *
 * @returns {Promise<object>} { interestOverTime, comparedByRegion,
 *   interestByRegion, relatedQueries, relatedTopics }
 */
async function getGoogleTrendsResults() {
  const browser = await puppeteer.launch({
    headless: false, // visible browser window
    args: ["--no-sandbox", "--disable-setuid-sandbox"], // needed to launch in the online IDE
  });

  const page = await browser.newPage();

  // Allow up to 1 minute for navigation on slow connections (default is 30 s).
  await page.setDefaultNavigationTimeout(60000);

  await page.goto(URL);
  await page.waitForTimeout(5000);
  // Google may block the first direct request; a reload works around it.
  await page.reload();

  const interestOverTime = {};
  const comparedByRegion = [];
  const interestByRegion = [];
  const relatedQueries = [];
  const relatedTopics = {};

  // Extracts URI-encoded `"value":"..."` entries from a widget request URL.
  const valuePattern = /%22value%22:%22(?<value>[^%]+)/gm; //https://regex101.com/r/PNcP1u/1

  // Widget responses start with an anti-JSON-hijacking prefix (`)]}',` + newline);
  // strip the first 6 chars, parse, and return the `default` payload.
  const parseWidgetData = (responseData) => JSON.parse(responseData.slice(6))?.default;

  // Maps one geoMapData element to per-query values (multi-query widgets).
  const mapGeoValues = (dataEl) => ({
    geo: dataEl.geoCode,
    location: dataEl.geoName,
    maxValueIndex: dataEl.maxValueIndex,
    values: searchQueries.map((queryEl, i) => ({
      query: queryEl,
      value: dataEl.formattedValue[i],
      extractedValue: dataEl.value[i],
    })),
  });

  // Maps one rankedKeyword element of a related-topics widget.
  const mapTopic = (dataEl) => ({
    topic: {
      title: dataEl.topic.title,
      type: dataEl.topic.type,
    },
    value: dataEl.formattedValue,
    extractedValue: dataEl.value,
    link: "https://trends.google.com" + dataEl.link,
  });

  // Maps one rankedKeyword element of a related-queries widget.
  const mapRelatedQuery = (dataEl) => ({
    query: dataEl.query,
    value: dataEl.formattedValue,
    extractedValue: dataEl.value,
    link: "https://trends.google.com" + dataEl.link,
  });

  page.on("response", async (response) => {
    // Only the widget endpoints return "application/..." content types.
    if (!response.headers()["content-type"]?.includes("application/")) return;

    const responseData = await response.text();
    const responseURL = response.url(); // url() is synchronous — no await needed

    if (responseURL.includes("widgetdata/multiline?")) {
      // Interest-over-time widget.
      const parsedData = parseWidgetData(responseData);

      interestOverTime.timelineData = parsedData?.timelineData?.map((dataEl) => ({
        date: decodeURI(dataEl.formattedTime),
        values: searchQueries.map((queryEl, i) => ({
          query: queryEl,
          value: dataEl.formattedValue[i],
          extractedValue: dataEl.value[i],
        })),
      }));

      interestOverTime.averages = parsedData.averages.map((dataEl, i) => ({
        query: searchQueries[i],
        value: dataEl,
      }));
      return;
    }

    // All remaining widgets encode their query value(s) in the request URL.
    const values = [...responseURL.matchAll(valuePattern)].map(({ groups }) => groups.value);

    if (responseURL.includes("widgetdata/comparedgeo?")) {
      if (values.length > 1) {
        // Multi-query comparison by region.
        const parsedData = parseWidgetData(responseData);
        comparedByRegion.push(...parsedData.geoMapData.map(mapGeoValues));
      } else {
        // Single-query interest by region; match the widget to its query.
        // `const` here fixes an implicit global in the original.
        for (const query of searchQueries) {
          if (values[0] === query) {
            const parsedData = parseWidgetData(responseData);
            interestByRegion.push({
              query,
              data: parsedData.geoMapData.map((dataEl) => ({
                geo: dataEl.geoCode,
                location: dataEl.geoName,
                maxValueIndex: dataEl.maxValueIndex,
                value: dataEl.formattedValue[0],
                extractedValue: dataEl.value[0],
              })),
            });
          }
        }
      }
    } else if (responseURL.includes("widgetdata/relatedsearches?")) {
      for (const query of searchQueries) {
        if (values[0] !== query) continue;

        if (responseURL.includes("%22keywordType%22:%22ENTITY%22")) {
          // Related-topics widget (ENTITY keyword type).
          const parsedData = parseWidgetData(responseData);
          relatedTopics.top = parsedData.rankedList[0].rankedKeyword.map(mapTopic);
          relatedTopics.rising = parsedData.rankedList[1].rankedKeyword.map(mapTopic);
        } else {
          // Related-queries widget.
          const parsedData = parseWidgetData(responseData);
          relatedQueries.push({
            searchQuery: query,
            top: parsedData.rankedList[0].rankedKeyword.map(mapRelatedQuery),
            rising: parsedData.rankedList[1].rankedKeyword.map(mapRelatedQuery),
          });
        }
      }
    }
  });

  // Wait for the widget responses to finish before closing the browser.
  await page.waitForTimeout(10000);
  await browser.close();

  return { interestOverTime, comparedByRegion, interestByRegion, relatedQueries, relatedTopics };
}

getGoogleTrendsResults().then((result) => console.dir(result, { depth: null }));

Preparation

First, we need to create a Node.js* project and add npm packages puppeteer, puppeteer-extra and puppeteer-extra-plugin-stealth to control Chromium (or Chrome, or Firefox, but now we work only with Chromium which is used by default) over the DevTools Protocol in headless or non-headless mode.

To do this, in the directory with our project, open the command line and enter npm init -y, and then npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth.

*If you don't have Node.js installed, you can download it from nodejs.org and follow the installation documentation.

📌Note: also, you can use puppeteer without any extensions, but I strongly recommend using it with puppeteer-extra and puppeteer-extra-plugin-stealth to prevent the website from detecting that you are using headless Chromium or a web driver. You can check it on the Chrome headless tests website. The screenshot below shows you the difference.

Process

In this case, we need to intercept all responses when loading the page and get all the required data. The Gif below illustrates that approach:

Code explanation

Declare puppeteer to control Chromium browser from puppeteer-extra library and StealthPlugin to prevent website detection that you are using web driver from puppeteer-extra-plugin-stealth library:

const puppeteer = require("puppeteer-extra");

const StealthPlugin = require("puppeteer-extra-plugin-stealth");

Next, we "say" to puppeteer use StealthPlugin, write search queries and the search URL. I write two different search queries to show you all available results (some of them are available only for a single search query, but some are available only for multiple search queries):

const searchQueries = ["Mercedes"];

// const searchQueries = ["Mercedes", "BMW", "Audi"];

const URL = `https://trends.google.com/trends/explore?q=${encodeURI(searchQueries.join(","))}&hl=en`;

Next, write a function to control the browser, and get information:

async function getGoogleTrendsResults() {

...

}

In this function first we need to define browser using puppeteer.launch({options}) method with current options, such as headless: false and args: ["--no-sandbox", "--disable-setuid-sandbox"].

These options mean that we launch the browser in visible (non-headless) mode and pass an array of arguments which we use to allow the launch of the browser process in the online IDE. And then we open a new page:

const browser = await puppeteer.launch({

headless: false,

args: ["--no-sandbox", "--disable-setuid-sandbox"],

});

const page = await browser.newPage();

Next, we change the default (30 sec) navigation timeout to 60000 ms (1 min) for slow internet connections with the .setDefaultNavigationTimeout() method, go to the URL with the .goto() method and use the .waitForTimeout() method to wait 5 seconds:

await page.setDefaultNavigationTimeout(60000);

await page.goto(URL);

await page.waitForTimeout(5000);

Because google possibly blocks direct search requests we need to reload the page:

await page.reload();

Then, we define arrays and objects with the results and write valuePattern to find the correct responses from all of them:

const interestOverTime = {};

const comparedByRegion = [];

const interestByRegion = [];

const relatedQueries = [];

const relatedTopics = {};

const valuePattern = /%22value%22:%22(?<value>[^%]+)/gm;

Next, we write the response intercept function:

page.on("response", async (response) => {

...

})

In this function, first we check that the response header content-type contains the "application/" part. If it's true, we define responseData with all the data that comes from the current response and responseURL with its URL:

if (response.headers()["content-type"]?.includes("application/")) {

const responseData = await response.text();

const responseURL = await response.url();

...

}

Next, we need to separate response with interestOverTime data from all other:

if (responseURL.includes("widgetdata/multiline?")) {

...

} else {

...

}

If the response contains interestOverTime data, we need to get the correct data to work with it. To do this we remove the first six chars (the `)]}',` prefix and the newline `\n`) from responseData (.slice(6) method), parse the received JSON string to a JS object (JSON.parse() method) and get the data from the default key:

const parsedData = JSON.parse(responseData.slice(6))?.default;

Next, we build the timelineData array from our parsed response data with objects containing date and values keys and the averages array with objects containing query and value keys (to get the correct date we need to decode the URI encoded string):

interestOverTime.timelineData = parsedData?.timelineData?.map((dataEl) => ({

date: decodeURI(dataEl.formattedTime),

values: searchQueries.map((queryEl, i) => ({

query: queryEl,

value: dataEl.formattedValue[i],

extractedValue: dataEl.value[i],

})),

}));

interestOverTime.averages = parsedData.averages.map((dataEl, i) => ({

query: searchQueries[i],

value: dataEl,

}));

Next, to get all other data we need to get values from responseURL using the matchAll method and the spread syntax([...]):

const values = [...responseURL.matchAll(valuePattern)].map(({ groups }) => groups.value);

Next, to get comparedByRegion data we need a response in which values array length is more than one:

if (values.length > 1) {

const parsedData = JSON.parse(responseData.slice(6))?.default;

comparedByRegion.push(

...parsedData.geoMapData.map((dataEl) => ({

geo: dataEl.geoCode,

location: dataEl.geoName,

maxValueIndex: dataEl.maxValueIndex,

values: searchQueries.map((queryEl, i) => ({

query: queryEl,

value: dataEl.formattedValue[i],

extractedValue: dataEl.value[i],

})),

}))

);

}

Otherwise, we can get interestByRegion data. To do this, we need to iterate searchQueries array with the for...of loop. In the loop, we compare the single value values with query. If it's equal to each other, we can fill the interestByRegion array by pushing results from parsedData:

else {

for (query of searchQueries) {

if (values[0] === query) {

const parsedData = JSON.parse(responseData.slice(6))?.default;

interestByRegion.push({

query,

data: parsedData.geoMapData.map((dataEl) => ({

geo: dataEl.geoCode,

location: dataEl.geoName,

maxValueIndex: dataEl.maxValueIndex,

value: dataEl.formattedValue[0],

extractedValue: dataEl.value[0],

})),

});

}

}

}

Next, to get relatedTopics and relatedQueries data we need a response with the "widgetdata/relatedsearches?" part in the responseURL. If it's true, we need to iterate over searchQueries and compare the single value from values with query again:

for (query of searchQueries) {

if (values[0] === query) {

...

}

}

Next, we check if responseURL contains "%22keywordType%22:%22ENTITY%22" part, we can get relatedTopics data:

if (responseURL.includes("%22keywordType%22:%22ENTITY%22")) {

const parsedData = JSON.parse(responseData.slice(6))?.default;

relatedTopics.top = parsedData.rankedList[0].rankedKeyword.map((dataEl) => ({

topic: {

title: dataEl.topic.title,

type: dataEl.topic.type,

},

value: dataEl.formattedValue,

extractedValue: dataEl.value,

link: "https://trends.google.com" + dataEl.link,

}));

relatedTopics.rising = parsedData.rankedList[1].rankedKeyword.map((dataEl) => ({

topic: {

title: dataEl.topic.title,

type: dataEl.topic.type,

},

value: dataEl.formattedValue,

extractedValue: dataEl.value,

link: "https://trends.google.com" + dataEl.link,

}));

}

Otherwise, we can get relatedQueries data:

else {

const parsedData = JSON.parse(responseData.slice(6))?.default;

relatedQueries.push({

searchQuery: query,

top: parsedData.rankedList[0].rankedKeyword.map((dataEl) => ({

query: dataEl.query,

value: dataEl.formattedValue,

extractedValue: dataEl.value,

link: "https://trends.google.com" + dataEl.link,

})),

rising: parsedData.rankedList[1].rankedKeyword.map((dataEl) => ({

query: dataEl.query,

value: dataEl.formattedValue,

extractedValue: dataEl.value,

link: "https://trends.google.com" + dataEl.link,

})),

});

}

And finally, we wait 10 seconds (wait for all responses to be finished) and close the browser and return the received data:

await page.waitForTimeout(10000);

await browser.close();

return { interestOverTime, comparedByRegion, interestByRegion, relatedQueries, relatedTopics };

Now we can launch our parser:

$ node YOUR_FILE_NAME # YOUR_FILE_NAME is the name of your .js file

Output

📌Note: I've combined the results of different runs of our parser into one for convenience.

{

"interestOverTime":{

"timelineData":[

{

"date":"Aug 29 – Sep 4, 2021",

"values":[

{

"query":"Mercedes",

"value":"74",

"extractedValue":74

},

{

"query":"BMW",

"value":"100",

"extractedValue":100

},

{

"query":"Audi",

"value":"60",

"extractedValue":60

}

]

},

... and other dates

],

"averages":[

{

"query":"Mercedes",

"value":65

},

{

"query":"BMW",

"value":89

},

{

"query":"Audi",

"value":55

}

]

},

"comparedByRegion":[

{

"geo":"LT",

"location":"Lithuania",

"maxValueIndex":1,

"values":[

{

"query":"Mercedes",

"value":"13%",

"extractedValue":13

},

{

"query":"BMW",

"value":"52%",

"extractedValue":52

},

{

"query":"Audi",

"value":"35%",

"extractedValue":35

}

]

},

... and other regions

],

"interestByRegion": [

{

"query":"Mercedes",

"data":[

{

"geo":"GR",

"location":"Greece",

"maxValueIndex":0,

"value":"36",

"extractedValue":36

},

... and other regions

]

},

{

"query":"BMW",

"data":[

{

"geo":"GR",

"location":"Greece",

"maxValueIndex":0,

"value":"37",

"extractedValue":37

},

... and other regions

]

},

{

"query":"Audi",

"data":[

{

"geo":"XK",

"location":"Kosovo",

"maxValueIndex":0,

"value":"100",

"extractedValue":100

},

... and other regions

]

}

],

"relatedQueries":[

{

"searchQuery":"Audi",

"top":[

{

"query":"a4 audi",

"value":"100",

"extractedValue":100,

"link":"https://trends.google.com/trends/explore?q=a4+audi&date=today+12-m"

},

... and other queries

],

"rising":[

{

"query":"wheel of fortune contestant loses audi",

"value":"+2,800%",

"extractedValue":2800,

"link":"https://trends.google.com/trends/explore?q=wheel+of+fortune+contestant+loses+audi&date=today+12-m"

},

... and other queries

]

},

{

"searchQuery":"BMW",

"top":[

{

"query":"bmw price",

"value":"100",

"extractedValue":100,

"link":"https://trends.google.com/trends/explore?q=bmw+price&date=today+12-m"

},

... and other queries

],

"rising":[

{

"query":"bmw x6 2022",

"value":"+1,450%",

"extractedValue":1450,

"link":"https://trends.google.com/trends/explore?q=bmw+x6+2022&date=today+12-m"

},

... and other queries

]

},

{

"searchQuery":"Mercedes",

"top":[

{

"query":"mercedes benz",

"value":"100",

"extractedValue":100,

"link":"https://trends.google.com/trends/explore?q=mercedes+benz&date=today+12-m"

},

... and other queries

],

"rising":[

{

"query":"mercedes f1 2022",

"value":"+2,250%",

"extractedValue":2250,

"link":"https://trends.google.com/trends/explore?q=mercedes+f1+2022&date=today+12-m"

},

... and other queries

]

}

],

"relatedTopics":{

"top":[

{

"topic":{

"title":"Mercedes-Benz",

"type":"Luxury vehicles company"

},

"value":"100",

"extractedValue":100,

"link":"https://trends.google.com/trends/explore?q=/m/052mx&date=today+12-m"

},

...and other topics

],

"rising":[

{

"topic":{

"title":"Mercedes-Benz EQB",

"type":"SUV"

},

"value":"+700%",

"extractedValue":700,

"link":"https://trends.google.com/trends/explore?q=/g/11h__y1vw4&date=today+12-m"

},

...and other topics

]

}

}

Using Google Trends API

The difference is that you don't need to use browser automation to scrape results, and write the parser from scratch and maintain it, which saves a lot of time.

There's also a chance that the request might be blocked at some point by Google. Instead, you just need to iterate over the structured JSON and get the data you want.

First, we need to install google-search-results-nodejs. To do this you need to enter in your console: npm i google-search-results-nodejs

Here's the full code example, if you don't need an explanation:

// Load environment variables (API_KEY) from a local .env file.
require("dotenv").config();

const SerpApi = require("google-search-results-nodejs");

const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your API key from serpapi.com

const searchQueries = "Mercedes"; // what we want to search (for interestOverTime, interestByRegion, relatedQueries, relatedTopics)

// const searchQueries = "Mercedes, BMW, Audi"; // what we want to search (for interestOverTime, comparedByRegion)

// data_type values available for multiple comma-separated queries.
const dataTypesMultiple = ["TIMESERIES", "GEO_MAP"];

// data_type values available for a single query.
const dataTypesSingle = ["TIMESERIES", "GEO_MAP_0", "RELATED_TOPICS", "RELATED_QUERIES"];

// Base request parameters; data_type is set per request in getResults().
const params = {

engine: "google_trends", // search engine

q: searchQueries, // search query

};

// Wraps the callback-based SerpApi search in a Promise so it can be awaited.
const getJson = () => new Promise((resolve) => search.json(params, resolve));

/**
 * Fetches Google Trends data through SerpApi, choosing the set of
 * data_type requests based on whether one or several comma-separated
 * queries were provided.
 *
 * @returns {Promise<object>} trendsResults keyed by result category
 */
const getResults = async () => {
  const trendsResults = {};

  if (searchQueries.split(",").length > 1) {
    // Multiple queries: only timeseries and region comparison are available.
    // `const` here fixes an implicit global `type` in the original.
    for (const type of dataTypesMultiple) {
      params.data_type = type;
      const searchResult = await getJson();
      if (type === "TIMESERIES") trendsResults.interestOverTime = searchResult.interest_over_time;
      else if (type === "GEO_MAP") trendsResults.comparedByRegion = searchResult.compared_breakdown_by_region;
    }
  } else {
    // Single query: all four data types are available.
    for (const type of dataTypesSingle) {
      params.data_type = type;
      const searchResult = await getJson();
      if (type === "TIMESERIES") trendsResults.interestOverTime = searchResult.interest_over_time;
      else if (type === "GEO_MAP_0") trendsResults.interestByRegion = searchResult.interest_by_region;
      else if (type === "RELATED_TOPICS") trendsResults.relatedTopics = searchResult.related_topics;
      else if (type === "RELATED_QUERIES") trendsResults.relatedQueries = searchResult.related_queries;
    }
  }

  return trendsResults;
};

getResults().then((result) => console.dir(result, { depth: null }));

Code explanation

First, we need to declare SerpApi from google-search-results-nodejs library and define new search instance with your API key from SerpApi:

const SerpApi = require("google-search-results-nodejs");

const search = new SerpApi.GoogleSearch(API_KEY);

Next, we write two different search queries to show you all available results (some of them are available only for a single search query, but some are available only for multiple search queries) and the necessary parameters for making a request:

const searchQueries = "Mercedes";

// const searchQueries = "Mercedes, BMW, Audi";

const dataTypesMultiple = ["TIMESERIES", "GEO_MAP"];

const dataTypesSingle = ["TIMESERIES", "GEO_MAP_0", "RELATED_TOPICS", "RELATED_QUERIES"];

const params = {

engine: "google_trends",

q: searchQueries,

};

Next, we wrap the search method from the SerpApi library in a promise to further work with the search results:

const getJson = () => {

return new Promise((resolve) => {

search.json(params, resolve);

});

};

And finally, we declare the function getResults that gets data from the page and returns it:

const getResults = async () => {

...

};

In this function first, we declare an object trendsResults with results data:

const trendsResults = {};

Next, we need to define if searchQueries has one or more values. If it has more than one, we can get interestOverTime and comparedByRegion data. To do this, we need to iterate over dataTypesMultiple array (with the for...of loop) and set data_type value in the params object. Then we just receive searchResult and select the necessary data:

if (searchQueries.split(",").length > 1) {

for (type of dataTypesMultiple) {

params.data_type = type;

const searchResult = await getJson();

if (type === "TIMESERIES") trendsResults.interestOverTime = searchResult.interest_over_time;

else if (type === "GEO_MAP") trendsResults.comparedByRegion = searchResult.compared_breakdown_by_region;

}

}

Otherwise, we iterate over dataTypesSingle array and get all data from the single query search request:

else {

for (type of dataTypesSingle) {

params.data_type = type;

const searchResult = await getJson();

if (type === "TIMESERIES") trendsResults.interestOverTime = searchResult.interest_over_time;

else if (type === "GEO_MAP_0") trendsResults.interestByRegion = searchResult.interest_by_region;

else if (type === "RELATED_TOPICS") trendsResults.relatedTopics = searchResult.related_topics;

else if (type === "RELATED_QUERIES") trendsResults.relatedQueries = searchResult.related_queries;

}

}

After, we run the getResults function and print all the received information in the console with the console.dir method, which allows you to use an object with the necessary parameters to change default output options. Watch Node.js documentation for more info:

getResults().then((result) => console.dir(result, { depth: null }));

Output

📌Note: I've combined the results of different runs of our parser into one for convenience.

{

"interestOverTime":{

"timeline_data":[

{

"date":"Aug 29 – Sep 4, 2021",

"values":[

{

"query":"Mercedes",

"value":"74",

"extracted_value":74

},

{

"query":"BMW",

"value":"100",

"extracted_value":100

},

{

"query":"Audi",

"value":"60",

"extracted_value":60

}

]

},

... and other dates

],

"averages":[

{

"query":"Mercedes",

"value":66

},

{

"query":"BMW",

"value":90

},

{

"query":"Audi",

"value":55

}

]

},

"comparedByRegion":[

{

"geo":"LT",

"location":"Lithuania",

"max_value_index":1,

"values":[

{

"query":"Mercedes",

"value":"13%",

"extracted_value":13

},

{

"query":"BMW",

"value":"52%",

"extracted_value":52

},

{

"query":"Audi",

"value":"35%",

"extracted_value":35

}

]

},

...and other regions

],

"interestByRegion":[

{

"geo":"AL",

"location":"Albania",

"max_value_index":0,

"value":"100",

"extracted_value":100

},

...and other regions

],

"relatedTopics":{

"rising":[

{

"topic":{

"value":"/g/11h__y1vw4",

"title":"Mercedes-Benz EQB",

"type":"SUV"

},

"value":"+700%",

"extracted_value":700,

"link":"https://trends.google.com/trends/explore?q=/g/11h__y1vw4&date=today+12-m",

"serpapi_link":"https://serpapi.com/search.json?data_type=RELATED_TOPICS&date=today+12-m&engine=google_trends&q=%2Fg%2F11h__y1vw4&tz=420"

},

... and other topics

],

"top":[

{

"topic":{

"value":"/m/052mx",

"title":"Mercedes-Benz",

"type":"Luxury vehicles company"

},

"value":"100",

"extracted_value":100,

"link":"https://trends.google.com/trends/explore?q=/m/052mx&date=today+12-m",

"serpapi_link":"https://serpapi.com/search.json?data_type=RELATED_TOPICS&date=today+12-m&engine=google_trends&q=%2Fm%2F052mx&tz=420"

},

... and other topics

]

},

"relatedQueries":{

"rising":[

{

"query":"mercedes eqxx",

"value":"+2,450%",

"extracted_value":2450,

"link":"https://trends.google.com/trends/explore?q=mercedes+eqxx&date=today+12-m",

"serpapi_link":"https://serpapi.com/search.json?data_type=RELATED_QUERIES&date=today+12-m&engine=google_trends&q=mercedes+eqxx&tz=420"

},

... and other queries

],

"top":[

{

"query":"mercedes benz",

"value":"100",

"extracted_value":100,

"link":"https://trends.google.com/trends/explore?q=mercedes+benz&date=today+12-m",

"serpapi_link":"https://serpapi.com/search.json?data_type=RELATED_QUERIES&date=today+12-m&engine=google_trends&q=mercedes+benz&tz=420"

},

... and other queries

]

}

}

Links

If you want to see some projects made with SerpApi, please write me a message.

Upvote


user
Created by

Mikhail Zub


people
Post

Upvote

Downvote

Comment

Bookmark

Share


Related Articles