This is a program that scrapes the data out of a tshirt website and then writes the product info to a CSV file.
There are 3 scrape functions and 1 write function.
Right now, I am having an absolute nightmare trying to get my head around how to implement promises here without any 3rd party libraries or packages. Is this possible with just the native features of ES6?
Due to the async nature of the requests, I need each function and its requests to finish completely before the next one is called. This is so I can use the variables such as urlSet in the next function.
How can I do this simply without rewriting my whole code?
I should mention that each of these functions work on an individual basis, they've all been tested several times.
Does each function become an individual promise?
Code is below, thank you:
//TASK: Create a command line application that goes to an ecommerce site to get the latest prices.
    //Save the scraped data in a spreadsheet (CSV format).
'use strict';
//Modules being used:
var cheerio = require('cheerio');
var json2csv = require('json2csv');
var request = require('request');
var moment = require('moment');
var fs = require('fs');
//harcoded url
var url = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = new Set();
var remainder;
var tshirtArray = [];
// First scrape loads front page of shirts4mike and finds the first product pages/menus
function firstScrape(){
    request(url, function(error, response, html) {
        if(!error && response.statusCode == 200){
            var $ = cheerio.load(html);
        //iterate over links with 'shirt'
            $('a[href*=shirt]').each(function(){
                var a = $(this).attr('href');
                //create new link
                var scrapeLink = url + a;
                //for each new link, go in and find out if there is a submit button. 
                //If there, add it to the set
                request(scrapeLink, function(error,response, html){
                    if(!error && response.statusCode == 200) {
                        var $ = cheerio.load(html);
                        //if page has a submit it must be a product page
                        if($('[type=submit]').length !== 0){
                            //add page to set
                            urlSet.add(scrapeLink);
                        } else if(remainder == undefined) {
                            //if not a product page, add it to remainder so it another scrape can be performed.
                            remainder = scrapeLink;                         
                        }
                    }
                });
            });     
        }
    });
}
//Scrape next level of menus to find remaning product pages to add to urlSet
function secondScrape() {
    request(remainder, function(error, response, html) {
        if(!error && response.statusCode == 200){
            var $ = cheerio.load(html);
            $('a[href*=shirt]').each(function(){
                var a = $(this).attr('href');
                //create new link
                var scrapeLink = url + a;
                request(scrapeLink, function(error,response, html){
                    if(!error && response.statusCode == 200){
                        var $ = cheerio.load(html);
                        //collect remaining product pages and add to set
                        if($('[type=submit]').length !== 0){
                            urlSet.add(scrapeLink);
                        }
                    }
                });
            });     
        }
    });
}
//call lastScraper so we can grab data from the set (product pages)
function lastScraper(){
    //scrape set, product pages
    for(var item of urlSet){
        var url = item;
        request(url, function(error, response, html){
            if(!error && response.statusCode == 200){
                var $ = cheerio.load(html);
                //grab data and store as variables
                var price = $('.price').text();
                var imgURL = $('.shirt-picture').find('img').attr('src');
                var title = $('body').find('.shirt-details > h1').text().slice(4);
                var tshirtObject = {};
                //add values into tshirt object
                tshirtObject.Title = title;
                tshirtObject.Price = price;
                tshirtObject.ImageURL = imgURL;
                tshirtObject.URL = url;
                tshirtObject.Date = moment().format('MMMM Do YYYY, h:mm:ss a');
                //add the object into the array of tshirts
                tshirtArray.push(tshirtObject);
            }
        });
    }
}
//Convert array of tshirt objects and write to CSV file
function convertJson2Csv(){
    //The scraper should generate a folder called `data` if it doesn’t exist.
    var dir ='./data';
    if(!fs.existsSync(dir)){
        fs.mkdirSync(dir);
    }
    var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Date'];
    //convert tshirt data into CSV and pass in fields
    var csv = json2csv({ data: tshirtArray, fields: fields });
    //Name of file will be the date
    var fileDate = moment().format('MM-DD-YY');
    var fileName = dir + '/' + fileDate + '.csv';
    //Write file
    fs.writeFile(fileName, csv, {overwrite: true}, function(err) {
        console.log('file saved');
        if (err) throw err;
    });
}
Answer
If you want to chain those functions with promises, then they have to return promises.
If you want to chain them with async module, then they have to take callbacks as arguments.
Right now they neither return a promise (or anything else), nor do they take callbacks (or anything else) as arguments. If the function doesn't take a callback and doesn't return anything then all you can do is call it and that's it. You will not be notified of any result.
Example
Callbacks
If you have 3 functions that take callbacks:
function fun1(cb) {
  setTimeout(() => {
    cb(null, "fun1");
  }, 1000);
}
function fun2(cb) {
  setTimeout(() => {
    cb(null, "fun2");
  }, 3000);
}
function fun3(cb) {
  setTimeout(() => {
    cb(null, "fun3");
  }, 100);
}
Then you can know when they finish:
fun3((err, value) => {
  console.log('fun3 finished:', value);
});
And you can easily wait for one before you start the other:
fun1((err1, val1) => {
  fun2((err2, val2) => {
    console.log("fun1 + fun2:", val1, val2);
  });
});
Promises
If your functions return promises:
function fun1() {
  return new Promise((res, rej) => {
    setTimeout(() => {
      res("fun1");
    }, 1000);
  });
}
function fun2() {
  return new Promise((res, rej) => {
    setTimeout(() => {
      res("fun2");
    }, 3000);
  });
}
function fun3() {
  return new Promise((res, rej) => {
    setTimeout(() => {
      res("fun3");
    }, 100);
  });
}
Then you can also know when they finish:
fun3().then(value => {
  console.log('fun3 finished:', value);
});
You can also easily nest the calls:
fun1().then(val1 => {
  fun2().then(val2 => {
    console.log("fun1 + fun2:", val1, val2);
  });
});
Or:
fun1()
.then(val1 => fun2())
.then(val2 => fun3())
.then(val3 => console.log('All 3 finished in series'));
etc.
To be able to do much more with both style, see documentation for:
 
No comments:
Post a Comment