HTML <head> parsing (with libxml) example code by Lars Nilsson.
This commit is contained in:
parent
16b5dc710f
commit
883343ba63
@ -10,7 +10,7 @@ EXTRA_DIST = README curlgtk.c sepheaders.c simple.c postit2.c \
|
||||
post-callback.c multi-app.c multi-double.c multi-single.c \
|
||||
multi-post.c fopen.c simplepost.c makefile.dj curlx.c https.c \
|
||||
multi-debugcallback.c fileupload.c getinfo.c ftp3rdparty.c debug.c \
|
||||
anyauthput.c
|
||||
anyauthput.c htmltitle.cc
|
||||
|
||||
all:
|
||||
@echo "done"
|
||||
|
297
docs/examples/htmltitle.cc
Normal file
297
docs/examples/htmltitle.cc
Normal file
@ -0,0 +1,297 @@
|
||||
/*****************************************************************************
|
||||
* _ _ ____ _
|
||||
* Project ___| | | | _ \| |
|
||||
* / __| | | | |_) | |
|
||||
* | (__| |_| | _ <| |___
|
||||
* \___|\___/|_| \_\_____|
|
||||
*
|
||||
* $Id$
|
||||
*/
|
||||
|
||||
// Get a web page, parse it with libxml.
|
||||
//
|
||||
// Written by Lars Nilsson
|
||||
//
|
||||
// GNU C++ compile command line suggestion (edit paths accordingly):
|
||||
//
|
||||
// g++ -Wall -I/opt/curl/include -I/opt/libxml/include/libxml2 htmltitle.cc \
|
||||
// -o htmltitle -L/opt/curl/lib -L/opt/libxml/lib -lcurl -lxml2
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <string>
|
||||
#include <curl/curl.h>
|
||||
#include <libxml/HTMLparser.h>
|
||||
|
||||
//
// Case-insensitive string equality test
//
// COMPARE(a, b) evaluates to true when the two C strings are equal once
// case is ignored. Under MSVC (_MSC_VER defined) the comparison is done
// with stricmp; otherwise strcasecmp is used.
//

#if defined(_MSC_VER)
#define COMPARE(a, b) (stricmp((a), (b)) == 0)
#else
#define COMPARE(a, b) (strcasecmp((a), (b)) == 0)
#endif
|
||||
|
||||
//
// libxml callback context structure
//
// One instance is handed to the SAX callbacks as their user data pointer.
// addTitle is true while the parser is inside a TITLE element, and title
// collects the text reported during that time.
//

struct Context
{
  bool addTitle;
  std::string title;

  // Start outside of any TITLE element, with an empty title.
  Context()
    : addTitle(false),
      title()
  {
  }
};
|
||||
|
||||
//
|
||||
// libcurl variables for error strings and returned data
|
||||
|
||||
static char errorBuffer[CURL_ERROR_SIZE];
|
||||
static std::string buffer;
|
||||
|
||||
//
// libcurl write callback function
//
// Appends the size * nmemb bytes found at data to the string passed as
// writerData and returns the number of bytes taken care of. When no
// destination string was supplied, 0 is returned; a return value that
// differs from the amount handed over is what libcurl treats as a write
// error.
//
// The return type is size_t to match the write callback prototype libcurl
// documents, so the byte count is never narrowed to int.
//

static size_t writer(char *data, size_t size, size_t nmemb,
                     std::string *writerData)
{
  if (writerData == NULL)
    return 0;

  const size_t total = size * nmemb;

  writerData->append(data, total);

  return total;
}
|
||||
|
||||
//
|
||||
// libcurl connection initialization
|
||||
//
|
||||
|
||||
static bool init(CURL *&conn, char *url)
|
||||
{
|
||||
CURLcode code;
|
||||
|
||||
conn = curl_easy_init();
|
||||
|
||||
if (conn == NULL)
|
||||
{
|
||||
fprintf(stderr, "Failed to create CURL connection\n");
|
||||
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
|
||||
if (code != CURLE_OK)
|
||||
{
|
||||
fprintf(stderr, "Failed to set error buffer [%d]\n", code);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
code = curl_easy_setopt(conn, CURLOPT_URL, url);
|
||||
if (code != CURLE_OK)
|
||||
{
|
||||
fprintf(stderr, "Failed to set URL [%s]\n", errorBuffer);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1);
|
||||
if (code != CURLE_OK)
|
||||
{
|
||||
fprintf(stderr, "Failed to set redirect option [%s]\n", errorBuffer);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
|
||||
if (code != CURLE_OK)
|
||||
{
|
||||
fprintf(stderr, "Failed to set writer [%s]\n", errorBuffer);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
|
||||
if (code != CURLE_OK)
|
||||
{
|
||||
fprintf(stderr, "Failed to set write data [%s]\n", errorBuffer);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
//
|
||||
// libxml start element callback function
|
||||
//
|
||||
|
||||
static void startElement(void *voidContext,
|
||||
const xmlChar *name,
|
||||
const xmlChar **attributes)
|
||||
{
|
||||
Context *context = (Context *)voidContext;
|
||||
|
||||
if (COMPARE((char *)name, "TITLE"))
|
||||
{
|
||||
context->title = "";
|
||||
context->addTitle = true;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// libxml end element callback function
|
||||
//
|
||||
|
||||
static void endElement(void *voidContext,
|
||||
const xmlChar *name)
|
||||
{
|
||||
Context *context = (Context *)voidContext;
|
||||
|
||||
if (COMPARE((char *)name, "TITLE"))
|
||||
context->addTitle = false;
|
||||
}
|
||||
|
||||
//
|
||||
// Text handling helper function
|
||||
//
|
||||
|
||||
static void handleCharacters(Context *context,
|
||||
const xmlChar *chars,
|
||||
int length)
|
||||
{
|
||||
if (context->addTitle)
|
||||
context->title.append((char *)chars, length);
|
||||
}
|
||||
|
||||
//
|
||||
// libxml PCDATA callback function
|
||||
//
|
||||
|
||||
static void characters(void *voidContext,
|
||||
const xmlChar *chars,
|
||||
int length)
|
||||
{
|
||||
Context *context = (Context *)voidContext;
|
||||
|
||||
handleCharacters(context, chars, length);
|
||||
}
|
||||
|
||||
//
|
||||
// libxml CDATA callback function
|
||||
//
|
||||
|
||||
static void cdata(void *voidContext,
|
||||
const xmlChar *chars,
|
||||
int length)
|
||||
{
|
||||
Context *context = (Context *)voidContext;
|
||||
|
||||
handleCharacters(context, chars, length);
|
||||
}
|
||||
|
||||
//
|
||||
// libxml SAX callback structure
|
||||
//
|
||||
|
||||
static htmlSAXHandler saxHandler =
|
||||
{
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
startElement,
|
||||
endElement,
|
||||
NULL,
|
||||
characters,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
cdata,
|
||||
NULL
|
||||
};
|
||||
|
||||
//
|
||||
// Parse given (assumed to be) HTML text and return the title
|
||||
//
|
||||
|
||||
static void parseHtml(const std::string &html,
|
||||
std::string &title)
|
||||
{
|
||||
htmlParserCtxtPtr ctxt;
|
||||
Context context;
|
||||
|
||||
ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "",
|
||||
XML_CHAR_ENCODING_NONE);
|
||||
|
||||
htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
|
||||
htmlParseChunk(ctxt, "", 0, 1);
|
||||
|
||||
htmlFreeParserCtxt(ctxt);
|
||||
|
||||
title = context.title;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
CURL *conn = NULL;
|
||||
CURLcode code;
|
||||
std::string title;
|
||||
|
||||
// Ensure one argument is given
|
||||
|
||||
if (argc != 2)
|
||||
{
|
||||
fprintf(stderr, "Usage: %s <url>\n", argv[0]);
|
||||
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Initialize CURL connection
|
||||
|
||||
if (!init(conn, argv[1]))
|
||||
{
|
||||
fprintf(stderr, "Connection initializion failed\n");
|
||||
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Retrieve content for the URL
|
||||
|
||||
code = curl_easy_perform(conn);
|
||||
if (code != CURLE_OK)
|
||||
{
|
||||
fprintf(stderr, "Failed to get '%s' [%s]\n", argv[1], errorBuffer);
|
||||
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Parse the (assumed) HTML code
|
||||
|
||||
parseHtml(buffer, title);
|
||||
|
||||
// Display the extracted title
|
||||
|
||||
printf("Title: %s\n", title.c_str());
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user