119 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			119 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*****************************************************************************
 | |
|  *                                  _   _ ____  _
 | |
|  *  Project                     ___| | | |  _ \| |
 | |
|  *                             / __| | | | |_) | |
 | |
|  *                            | (__| |_| |  _ <| |___
 | |
|  *                             \___|\___/|_| \_\_____|
 | |
|  *
 | |
|  * $Id$
 | |
|  *
 | |
|  * Download a document and use libtidy to parse the HTML.
 | |
|  * Written by Jeff Pohlmeyer
 | |
|  *
 | |
|  * LibTidy => http://tidy.sourceforge.net
 | |
|  *
 | |
|  * gcc -Wall -I/usr/local/include tidycurl.c -lcurl -ltidy -o tidycurl
 | |
|  *
 | |
|  */
 | |
| 
 | |
| #include <stdio.h>
 | |
| #include <tidy/tidy.h>
 | |
| #include <tidy/buffio.h>
 | |
| #include <curl/curl.h>
 | |
| 
 | |
| /* curl write callback, to fill tidy's input buffer...  */
 | |
| uint write_cb(char *in, uint size, uint nmemb, TidyBuffer *out)
 | |
| {
 | |
|   uint r;
 | |
|   r = size * nmemb;
 | |
|   tidyBufAppend( out, in, r );
 | |
|   return(r);
 | |
| }
 | |
| 
 | |
| /* Traverse the document tree */
 | |
| void dumpNode(TidyDoc doc, TidyNode tnod, int indent )
 | |
| {
 | |
|   TidyNode child;
 | |
|   for ( child = tidyGetChild(tnod); child; child = tidyGetNext(child) )
 | |
|   {
 | |
|     ctmbstr name = tidyNodeGetName( child );
 | |
|     if ( name )
 | |
|     {
 | |
|       /* if it has a name, then it's an HTML tag ... */
 | |
|       TidyAttr attr;
 | |
|       printf( "%*.*s%s ", indent, indent, "<", name);
 | |
|       /* walk the attribute list */
 | |
|       for ( attr=tidyAttrFirst(child); attr; attr=tidyAttrNext(attr) ) {
 | |
|         printf(tidyAttrName(attr));
 | |
|         tidyAttrValue(attr)?printf("=\"%s\" ",
 | |
|                                    tidyAttrValue(attr)):printf(" ");
 | |
|       }
 | |
|       printf( ">\n");
 | |
|     }
 | |
|     else {
 | |
|       /* if it doesn't have a name, then it's probably text, cdata, etc... */
 | |
|       TidyBuffer buf;
 | |
|       tidyBufInit(&buf);
 | |
|       tidyNodeGetText(doc, child, &buf);
 | |
|       printf("%*.*s\n", indent, indent, buf.bp?(char *)buf.bp:"");
 | |
|       tidyBufFree(&buf);
 | |
|     }
 | |
|     dumpNode( doc, child, indent + 4 ); /* recursive */
 | |
|   }
 | |
| }
 | |
| 
 | |
| 
 | |
| int main(int argc, char **argv )
 | |
| {
 | |
|   CURL *curl;
 | |
|   char curl_errbuf[CURL_ERROR_SIZE];
 | |
|   TidyDoc tdoc;
 | |
|   TidyBuffer docbuf = {0};
 | |
|   TidyBuffer tidy_errbuf = {0};
 | |
|   int err;
 | |
|   if ( argc == 2) {
 | |
|     curl = curl_easy_init();
 | |
|     curl_easy_setopt(curl, CURLOPT_URL, argv[1]);
 | |
|     curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_errbuf);
 | |
|     curl_easy_setopt(curl, CURLOPT_NOPROGRESS, no);
 | |
|     curl_easy_setopt(curl, CURLOPT_VERBOSE, yes);
 | |
|     curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
 | |
| 
 | |
|     tdoc = tidyCreate();
 | |
|     tidyOptSetBool(tdoc, TidyForceOutput, yes); /* try harder */
 | |
|     tidyOptSetInt(tdoc, TidyWrapLen, 4096);
 | |
|     tidySetErrorBuffer( tdoc, &tidy_errbuf );
 | |
|     tidyBufInit(&docbuf);
 | |
| 
 | |
|     curl_easy_setopt(curl, CURLOPT_WRITEDATA, &docbuf);
 | |
|     err=curl_easy_perform(curl);
 | |
|     if ( !err ) {
 | |
|       err = tidyParseBuffer(tdoc, &docbuf); /* parse the input */
 | |
|       if ( err >= 0 ) {
 | |
|         err = tidyCleanAndRepair(tdoc); /* fix any problems */
 | |
|         if ( err >= 0 ) {
 | |
|           err = tidyRunDiagnostics(tdoc); /* load tidy error buffer */
 | |
|           if ( err >= 0 ) {
 | |
|             dumpNode( tdoc, tidyGetRoot(tdoc), 0 ); /* walk the tree */
 | |
|             fprintf(stderr, "%s\n", tidy_errbuf.bp); /* show errors */
 | |
|           }
 | |
|         }
 | |
|       }
 | |
|     }
 | |
|     else
 | |
|       fprintf(stderr, "%s\n", curl_errbuf);
 | |
| 
 | |
|     /* clean-up */
 | |
|     curl_easy_cleanup(curl);
 | |
|     tidyBufFree(&docbuf);
 | |
|     tidyBufFree(&tidy_errbuf);
 | |
|     tidyRelease(tdoc);
 | |
|     return(err);
 | |
| 
 | |
|   }
 | |
|   else
 | |
|     printf( "usage: %s <url>\n", argv[0] );
 | |
| 
 | |
|   return(0);
 | |
| }
 | 
