User Tools

Site Tools


web_browsers:chrome:screen_scraper

Web Browsers - Chrome - Screen Scraper

Composed of two parts.

  • Parser
  • SaverApp

Load the two extension files into Chrome

To load these into chrome, you will need to go to the settings menu, choose more tools and then extensions.

  • Tick the “Developer Mode” box that you see, this will allow you to load unpacked extensions.
  • Click on the button “Load unpacked extension” and navigate to the directory where the Parser files exist.
  • Click on the button “Load unpacked extension” again and navigate to the directory where the SaverApp files exist.

Once this has been completed, you will see a new icon appear as shown below

This will be used to start the page parsing.

Firstly though, you will need to launch the App, this is done by clicking on the “Launch” link as shown below

You need to configure the two parts so that they can communicate with each other; this is done using the ID that is given to each of them (the App and the Extension).

You should now see a new window appear

You can click in the “my Application ID” box and copy the ID from there (drag the mouse over it whilst holding the left button down so that it is highlighted in blue, then press CTRL+C to copy it).

You can now click the Speech mark button

To launch the Data Parser Extension

Click in the Destination ID box and paste the ID that you copied above and then click “Save ID” (you should only have to do this once per logon profile you use I would think). It should look as below

Do the same to copy the “Parser ID” and you will need to paste that ID into the Saver application box as shown below

The 2 parts are now configured to talk to each other.

Load up a storQM page, you should now be able to process them.

To do this, you need to ensure that you have started the AG Data Saver Application (AGDS) by clicking the “Launch” link.

Before you can process a page sequence, you need to select the file that the results will be saved into; this is a limitation of the browsers and has to be done as a manual event. To do this, click on the “Select Save File” button in AGDS window. You can now choose a file, it defaults to output.csv, but you can change this to anything you want.

Now, Load up a storQM page as you would normally, when you have the first results page, you can then click the

Icon which opens the “Data Parser”. At this point you can click on “Start Process”, and it should save the page and then navigate through the subsequent pages until it has completed.

You should see this in the “Data Parser”

And this in the AGDS

The chosen output file should now contain all the data from those pages in CSV format.


Parser

index.html

index.html
<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <link href="main.css" rel="stylesheet">
</head>

<body>
  <!-- Popup UI of the Data Parser extension; all behaviour is wired up in index.js. -->
  <h3>Data Parser</h3>
  <!-- Read-only: index.js fills this with chrome.runtime.id. -->
  <label>Parser ID is <input id="appid" type="text" readonly></label>
  <div>
    <label for="sendId">Destination ID</label>
    <!-- <input> is a void element, so it takes no end tag. -->
    <input id="sendId" type="text">

    <button id="saveID">Save ID</button>
    <button id="process">Start Process</button>
  </div>
  <!-- index.js appends its progress messages to this element. -->
  <div id="log"></div>

  <script src="index.js"></script>
</body>

</html>

main.css

main.css
/* Fixed width for every text input. */
input[type="text"] {
  width: 240px;
}

/* Log panel: fixed-height box that scrolls vertically and clips horizontally. */
#log {
  height: 300px;
  margin-top: 10px;
  padding: 10px 20px;
  border: 1px solid black;
  background-color: #e2e2fa;
  overflow-x: hidden;
  overflow-y: scroll;
}

index.js

index.js
/**
 * Popup script of the Data Parser extension.
 *
 * Shows this extension's ID, stores the ID of the destination (saver) app,
 * and drives the scrape loop: ask the content script in the active tab for
 * the table rows ('report_back'), forward them to the destination with
 * myAction "save", then ask the content script to click the next button
 * ('process_next'). Each top-level page load while a run is active scrapes
 * the following page.
 *
 * Exposes appendLog on the object passed as `context`.
 */
(function(context){
  var logField = document.getElementById("log");
  var sendId = document.getElementById("sendId");
  var save = document.getElementById("saveID");
  var process = document.getElementById("process");

  // null while idle; otherwise the 1-based index of the page being processed.
  var pageIndex = null;

  document.getElementById("appid").value = chrome.runtime.id;
  console.log('Starting');

  /** Appends one line to the on-screen log. */
  function appendLog(message) {
    logField.innerText += "\n" + message;
  }

  /**
   * Looks up the active tab of the current window and hands it to tabCallback.
   * Logs and does nothing when no such tab exists.
   */
  function doInCurrentTab(tabCallback) {
    chrome.tabs.query(
      { currentWindow: true, active: true },
      function (tabArray) {
        var activeTab = tabArray && tabArray[0];
        if (!activeTab) {
          appendLog("No active tab found.");
          return;
        }
        tabCallback(activeTab);
      }
    );
  }

  /** Asks the content script for the table data of the page numbered `index`. */
  function requestPage(index) {
    doInCurrentTab(function (tab) {
      chrome.tabs.sendMessage(tab.id, {text: 'report_back', pageIndex: index}, doStuffWithDom);
    });
  }

  /** Asks the content script to click the next-page button. */
  function requestNextPage() {
    doInCurrentTab(function (tab) {
      chrome.tabs.sendMessage(tab.id, {text: 'process_next'}, dealWithNextButton);
    });
  }

  chrome.storage.local.get('remoteappid', function (result) {
    console.log(result);
    // Nothing is stored until "Save ID" has been clicked once; show an empty
    // box in that case instead of the text "undefined".
    var remoteAppID = (result && result.remoteappid) || "";
    console.log('In Loop' + remoteAppID);
    sendId.value = remoteAppID;
  });

  save.addEventListener('click', function() {
    var remoteID = sendId.value;
    appendLog("Saved");
    chrome.storage.local.set({'remoteappid': remoteID});
  });

  process.addEventListener('click', function() {
    pageIndex = 1;
    requestPage(pageIndex);
  });

  /**
   * Receives the content script's reply to 'report_back' and forwards the
   * rows to the destination app.
   *
   * @param {{domContent: Array, pageIndex: number}|undefined} res
   *   undefined when no content script answered in the tab.
   */
  function doStuffWithDom(res) {
    if (!res) {
      var pageError = chrome.runtime.lastError;
      appendLog("Page Error : " + (pageError ? pageError.message : "no response from page"));
      pageIndex = null; // stop the run so later navigations are not scraped
      return;
    }

    chrome.runtime.sendMessage(
      sendId.value,
      {myCustomMessage: JSON.stringify(res.domContent), myAction: "save", myIndex: res.pageIndex},
      function(response) {
        if (!response) {
          // A successful save is acknowledged by a separate external message
          // (handled by the onMessageExternal listener below), so no direct
          // response is expected here. Reading lastError marks it as checked.
          if (chrome.runtime.lastError) {
            console.log("No direct response : " + chrome.runtime.lastError.message);
          }
          return;
        }
        appendLog("Remote Message : " + response.result);
        if (response.Success == "false") {
          console.log("Remote Error : " + response.result);
          appendLog("Remote Error : " + response.result);
          pageIndex = null;
        } else {
          requestNextPage();
          console.log(response.result);
          appendLog(response.result);
        }
      }
    );
  }

  // Result messages sent back by the destination app once a save has finished.
  chrome.runtime.onMessageExternal.addListener(
    function(request, sender, sendResponse) {
      if (!request) {
        return;
      }
      appendLog("Remote Message : " + request.myResultAction);
      if (request.result == "false") {
        console.log("Remote Error : " + request.myResultAction);
        appendLog("Remote Error : " + request.myResultAction);
        pageIndex = null;
      } else {
        requestNextPage();
        console.log(request.result);
        appendLog(request.result);
      }
    }
  );

  /**
   * Receives the content script's reply to 'process_next'.
   * success === false means there was no next button, which ends the run.
   */
  function dealWithNextButton(res) {
    if (!res) {
      var nextError = chrome.runtime.lastError;
      appendLog("Page Error : " + (nextError ? nextError.message : "no response from page"));
      return;
    }
    if (res.success === false) {
      appendLog("Finished.");
      pageIndex = null;
    } else {
      appendLog("Still Processing.");
    }
  }

  chrome.webNavigation.onDOMContentLoaded.addListener(function (details) {
    // Only a top-level document load (frameId 0) counts as a new page;
    // sub-frame loads would otherwise bump the index and re-scrape the page.
    if (pageIndex === null || details.frameId !== 0) {
      return;
    }
    pageIndex += 1;
    requestPage(pageIndex);
  });

  context.appendLog = appendLog;
})(window)

content.js

content.js
  /**
   * Content-script message handler.
   *
   * 'report_back'  : collects the text of every <td> in the rows of the table
   *                  named "table1" and replies with
   *                  {domContent, tableContent, pageIndex}. Each row is an
   *                  object keyed by cell index plus an 'id' holding the
   *                  1-based row position. Rows without <td> cells are skipped.
   * 'process_next' : clicks the element named "nextButton" when present and
   *                  replies with {success: true|false}.
   */
  chrome.runtime.onMessage.addListener(function (msg, sender, sendResponse) {
    console.log('got here');

    if (msg.text === 'report_back') {
      var result = [];
      // No direct DOM lookup of table1 here: the selector simply matches
      // nothing when the table is absent, so a response is always sent.
      $('table[name=table1]>tbody>tr').each(function (id) {
        var cells = $(this).find('td');
        if (cells.length === 0) {
          return; // no data cells in this row; continue with the next one
        }
        var row = {'id': id + 1};
        cells.each(function (index) {
          row[index] = $(this).text();
        });
        result.push(row);
      });

      sendResponse({'domContent': result, 'tableContent': result, 'pageIndex': msg.pageIndex});
    }

    if (msg.text === 'process_next') {
      var nextButton = $('[name="nextButton"]');
      var hasNext = nextButton.length > 0;
      if (hasNext) {
        nextButton.trigger("click");
      }
      sendResponse({'success': hasNext});
    }
  });

eventPage.js

eventPage.js
var blacklistedIds = ["none"];

/**
 * Logs to the event page's console.
 * appendLog is not available here: eventPage.js is the only script loaded
 * into the background page.
 */
function logEvent(message) {
  console.log(message);
}

/**
 * Acknowledges result messages sent by the saver application.
 * Replies {"result": "Ok, got your message"} to an 'Ok Saved :' action and an
 * "I don't understand" result to anything else.
 */
chrome.runtime.onMessageExternal.addListener(
  function(request, sender, sendResponse) {
    var summary = request.myResultAction + ' ' + request.myResultIndex;
    logEvent("MSG RCV : " + summary);
    if (request.myResultAction == 'Ok Saved :') {
      logEvent(summary);
      sendResponse({"result":"Ok, got your message"});
    } else {
      sendResponse({"result":"Ops, I don't understand this message :" + request.myResultAction});
    }
  }
);

manifest.json

manifest.json
{
  "name": "AG Data Parser Extension",
  "version": "1.1",
  "description": "Extension to parse pages and send them to the save sink.",
  "browser_action": {
    "default_title": "Send message to other apps",
    "default_icon": "icon_16.png",
    "default_popup": "index.html"
  },
  "background": {
    "scripts": ["eventPage.js"],
    "persistent": false
  },
  "content_scripts": [{
    "matches": ["<all_urls>"],
    "js": ["content.js","jquery.js"],
    "run_at": "document_end"
  }],
  "permissions": [ "activeTab", "notifications","storage","webNavigation"],
  "manifest_version": 2
}

Save Sink Events

index.html

index.html
<!DOCTYPE html>

<html>
<head>
  <meta charset="utf-8">
  <link href="main.css" rel="stylesheet">
</head>

<body>
  <!-- Window of the Data Saver application; element ids match the lookups in its index.js. -->
  <h3>Data Saver</h3>
  <!-- Read-only: index.js fills this with chrome.runtime.id. -->
  <label>my Application ID <input id="appid" type="text" readonly></label>
  <div>
    <label for="sendId">Destination ID</label>
    <input id="sendId" type="text">
    <button id="saveId">Save ID</button>
    <button id="selectSave">Select Save File</button>
    <button id="clearLog">Clear Log</button>
  </div>

  <div id="log"></div>

  <script src="index.js"></script>
</body>
</html>

main.css

main.css
/* Fixed width for every text input. */
input[type="text"] {
  width: 240px;
}

/* Log panel: fixed-height box that scrolls vertically and clips horizontally. */
#log {
  height: 300px;
  margin-top: 10px;
  padding: 10px 20px;
  border: 1px solid black;
  background-color: #e2e2fa;
  overflow-x: hidden;
  overflow-y: scroll;
}

index.js

index.js
/**
 * Window script of the Data Saver application.
 *
 * Shows this app's ID, stores the ID of the parser extension, lets the user
 * pick an output file, and handles external {myAction: "save"} messages by
 * writing the received rows to that file as CSV. The outcome of each save is
 * sent back to the parser as a separate message (see sendReply).
 *
 * Exposes appendLog on the object passed as `context`.
 */
(function(context){
  var logField = document.getElementById("log");
  var sendId = document.getElementById("sendId");
  var selectSave = document.getElementById("selectSave");
  var clearLog = document.getElementById("clearLog");
  // The markup shown alongside this script spells the button id "saveID".
  var saveId = document.getElementById("saveId") || document.getElementById("saveID");

  // File entry chosen via "Select Save File"; null until one is picked.
  var chosenFileEntry = null;

  document.getElementById("appid").value = chrome.runtime.id;

  /** Appends one line to the on-screen log. */
  function appendLog(message) {
    logField.innerText += "\n" + message;
  }

  /** Logs a file-system failure to the console and the on-screen log. */
  function errorHandler(obj) {
    console.log(obj);
    appendLog("Something Went Wrong.");
  }

  /**
   * Attaches a click handler. A missing element is reported instead of
   * throwing, so the message listener below is still registered.
   */
  function bindClick(element, name, handler) {
    if (!element) {
      console.log("Element not found : " + name);
      return;
    }
    element.addEventListener('click', handler);
  }

  /** Quotes a value for CSV only when it contains a comma, quote or line break. */
  function csvField(value) {
    var text = String(value);
    if (/[",\r\n]/.test(text)) {
      return '"' + text.replace(/"/g, '""') + '"';
    }
    return text;
  }

  /** Converts an array of row objects into CSV text, one CRLF-terminated line per row. */
  function toCsv(rows) {
    var csv = '';
    for (var i = 0; i < rows.length; i++) {
      var fields = [];
      for (var key in rows[i]) {
        fields.push(csvField(rows[i][key]));
      }
      // join() leaves no trailing comma on the line.
      csv += fields.join(',') + '\r\n';
    }
    return csv;
  }

  /**
   * Writes text to fileEntry.
   *
   * @param {FileEntry} fileEntry  writable entry to write to
   * @param {string}    text       text to write
   * @param {boolean}   replace    true: empty the file first; false: append
   * @param {function}  onDone     called after the text has been written
   * @param {function}  onError    called with the error on failure
   */
  function writeText(fileEntry, text, replace, onDone, onError) {
    fileEntry.createWriter(function(writer) {
      var failed = false;

      writer.onerror = function(e) {
        failed = true;
        onError(e);
      };

      function startWrite() {
        writer.onwriteend = function() {
          // writeend also fires after an error; only report success once.
          if (!failed) {
            onDone();
          }
        };
        writer.write(new Blob([text], {type: 'text/plain'}));
      }

      if (replace) {
        // A writer runs one operation at a time, so the write has to wait
        // for the truncate to finish.
        writer.onwriteend = function() {
          if (!failed) {
            startWrite();
          }
        };
        writer.truncate(0);
      } else {
        writer.seek(writer.length);
        startWrite();
      }
    }, onError);
  }

  /** Sends the outcome of a save back to the parser extension. */
  function sendReply(myResult, myResultAction, pageIndex) {
    chrome.runtime.sendMessage(
      sendId.value,
      {result: myResult, myResultAction: myResultAction, myResultIndex: pageIndex},
      function(response) {
        if (chrome.runtime.lastError) {
          console.log("reply error: " + chrome.runtime.lastError.message);
        }
        console.log("response: " + JSON.stringify(response));
        appendLog("response: " + JSON.stringify(response));
      }
    );
  }

  /**
   * Stores one page of CSV in the chosen file and reports the outcome.
   * Page index 1 replaces the file's contents; later pages are appended.
   */
  function savePage(csv, pageIndex) {
    function fail(err) {
      errorHandler(err);
      sendReply("false", "Save Failed", pageIndex);
    }

    var isFirstPage = pageIndex == 1;
    if (isFirstPage) {
      appendLog("Clearing Contents");
    }
    appendLog("Saving Index : " + pageIndex);

    chrome.fileSystem.getWritableEntry(chosenFileEntry, function(writableFileEntry) {
      if (!writableFileEntry) {
        fail(chrome.runtime.lastError);
        return;
      }
      writeText(writableFileEntry, csv, isFirstPage, function() {
        appendLog("Save Complete - Sending Message");
        sendReply("true", "Ok Saved :", pageIndex);
      }, fail);
    });
  }

  chrome.storage.local.get('remoteappid', function (result) {
    // Nothing is stored until "Save ID" has been clicked once; show an empty
    // box in that case instead of the text "undefined".
    sendId.value = (result && result.remoteappid) || "";
  });

  bindClick(saveId, "saveId", function() {
    var remoteID = sendId.value;
    appendLog("Saved");
    chrome.storage.local.set({'remoteappid': remoteID});
  });

  bindClick(clearLog, "clearLog", function() {
    logField.innerText = '';
  });

  bindClick(selectSave, "selectSave", function() {
    chrome.fileSystem.chooseEntry(
      {type: 'saveFile', suggestedName: 'output.csv'},
      function(writableFileEntry) {
        if (!writableFileEntry) {
          // No entry was returned; keep whatever file was chosen before.
          console.log(chrome.runtime.lastError);
          return;
        }
        chosenFileEntry = writableFileEntry;
        // Replace the file's contents with a short placeholder; the save of
        // page index 1 empties the file again before writing.
        writeText(writableFileEntry, '0123456789', true, function() {}, errorHandler);
      });
  });

  chrome.runtime.onMessageExternal.addListener(
    function(request, sender, sendResponse) {
      if (!request || !request.myAction) {
        sendResponse({"result":"Ops, I don't understand this message :" + JSON.stringify(request)});
        return;
      }

      appendLog("Action from "+sender.id+": "+request.myAction);

      if (request.myAction != 'save') {
        return;
      }

      if (chosenFileEntry == null) {
        sendResponse({"result":"No Save File Configured","Success":"false"});
        return;
      }

      var rows;
      try {
        // The parser sends the rows as a JSON string; an already-parsed
        // array is accepted too.
        rows = typeof request.myCustomMessage === 'string'
          ? JSON.parse(request.myCustomMessage)
          : request.myCustomMessage;
      } catch (e) {
        rows = null;
      }
      if (!Array.isArray(rows)) {
        sendResponse({"result":"Invalid Data","Success":"false"});
        return;
      }

      var CSV = toCsv(rows);
      appendLog("Saving CSV : ");
      savePage(CSV, request.myIndex);
    });

  context.appendLog = appendLog;
})(window)

manifest.json

manifest.json
{
  "manifest_version": 2,
  "name": "AG Data Saver Application",
  "description": "Application to manage the save sink events from the parser.",
  "version": "1.1",
  "minimum_chrome_version": "23",
  "icons": {
    "16": "icon_16.png"
  },
  "app": {
    "background": {
      "scripts": ["main.js"]
    }
  },
  "permissions": [{"fileSystem": ["write", "retainEntries", "directory"]},"storage"]
}

web_browsers/chrome/screen_scraper.txt · Last modified: 2020/07/15 10:30 by 127.0.0.1

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki