User:Hendrik Brummermann/XHTMLDumper.java

//

// This work is licensed under CC-BY
// (Creative Commons License - Attribution 2.0).
// see: http://creativecommons.org/licenses/by/2.0/

// You need the program "tidy" in your system's search path.

/*
 * $Log: XHTMLDumper.java,v $
 * Revision 1.8  2005/01/08 12:01:30  nhb
 * Fixing invocation of wget
 *
 * Revision 1.7  2004/12/11 18:08:28  nhb
 * Store output of tidy into a file instead of reading it directly from stdout.
 * Do not depend on node.toString() dumping the whole xml tree.
 *
 * Revision 1.6  2004/09/28 19:50:46  nhb
 * Bugfix: Doppeltes head-Element beseitigt und im inline Stylesheet die linke Spalte auf 0 gesetzt
 *
 * Revision 1.5  2004/09/28 19:09:55  nhb
 * - Skriptbasiertes Herunterladen von allen Links auf einer Seite.
 * - Der Head-Bereich wird ausgetauscht.
 * - Bild-URLs werden entsprechend umgeschrieben.
 * - Ausgabe eines wget-Skripts fuer Bilder
 * - keine Umkodierung von UTF-8 nach ISO-8859-1 mehr.
 *
 * Revision 1.4  2004/08/29 18:12:50  nhb
 * Neue Klasse: Book
 *
 * Revision 1.3  2004/08/28 21:27:26  nhb
 * *** empty log message ***
 *
 * Revision 1.2  2004/08/28 08:16:09  nhb
 * Refactoring
 *
 * Revision 1.1  2004/08/23 22:16:56  nhb
 * initial checkin
 *
 */
package nhb.wikipedia;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.StringWriter;
import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.xpath.XPathAPI;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.traversal.NodeIterator;
import org.xml.sax.SAXException;

/**
 * Fetches a collection of wiki articles and stores them on disk.
 * It can rewrite links and remove the MediaWiki navigation.
 * Several articles can be combined into one file.
 *
 * @author Hendrik Brummermann <[email protected]>
 * @link http://creativecommons.org/licenses/by/2.0/
 */
public class XHTMLDumper {

    // --> --> --> --> --> --> --> --> --> --> --> --> --> --> -->
    /** Scheme, host and port of the wiki server; prepended to article paths and to the printed wget targets. */
    private static final String URL_PREFIX = "http://localhost:10080"; //"http://wiki";
    /** Server path in front of an article title; also stripped from the start of rewritten hrefs. */
    private static final String WIKI_PATH = "/mediawiki/index.php/";
    /** Prefix of img src values that are rewritten to point into {@link #IMAGE_FOLDER}. */
    private static final String UPLOAD_PATH = "/mediawiki/images";
    /** Local directory into which the generated .html files are written. */
    private static final String TARGET = "/tmp/wiki";
    /** Folder name (relative to {@link #TARGET}) used for rewritten image links and the printed "cd" command. */
    private static final String IMAGE_FOLDER = "wiki_files";
    /** Separator placed between the article title and the original anchor name when ids are unified. */
    private static final String ID_SEP = "_____";
    // <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <-- <--


    /**
     * Creates a dumper. An instance is only needed as the enclosing
     * object of the inner classes (Article, Book, Cover).
     */
    public XHTMLDumper() {
    }


    public class Article {
        // Variablen
        // private final Namespace NS_HTML = Namespace.getNamespace("http://www.w3.org/1999/xhtml");
        private String title = null;
        private String url = null;
        private Element root = null;
        private Element content = null;
        private boolean unifyIDs = false;
        private boolean convertShortTags = true;
        private boolean fetchPageRequisites = true;
        private boolean rewriteLocalURLs = true;
        private Set pageRequisites = new HashSet();
        private String head = "<head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\r\n"
    + "<link rel=\"shortcut icon\" href=\"wiki_files/favicon.ico\" />\r\n"
    + "<style type=\"text/css\" media=\"screen,projection\">/*<![CDATA[*/ @import \"wiki_files/main.css\"; /*]]>*/#content {margin: 0}</style>\r\n"
    + "<link rel=\"stylesheet\" type=\"text/css\" media=\"print\" href=\"wiki_files/commonPrint.css\" />\r\n"
    + "<script type=\"text/javascript\" src=\"wiki_files/wikibits.js\"> </script>\r\n"
    + "<title>HISLSF - Dokumentation</title>\r\n"
    + "</head>";
        // <title>DB-Interface-Admin - His</title>

        public Article(String title) {
            this.title = title;
            url = URL_PREFIX + WIKI_PATH + title;
            fetchAsXHTML();
        }

        public void process() {
            extractContent();
            unifyIDsAndConvertShortTags(root);
        }

        /**
         * Stores the file to disk.
         *
         * @throws IOException bei einem E/A-Fehler
         * @throws ParserConfigurationException Konfigurationsfehler des XML-Parsers
         * @throws SAXException bei einem XML-Fehler
         */
        public void saveToDisk() throws SAXException, IOException, ParserConfigurationException {
            process();
            replaceHead();
//            fetchPageRequisites();
            String filename = TARGET + "/" + title.replace(' ', '_').replace('/', '-') + ".html";
            OutputStream of = new FileOutputStream(filename);
            of.write(XMLUtils.dumpXML(root).getBytes("UTF-8"));
            of.close();
        }

        /**
         * replaces the head-element <!-- preserving the title-element.-->
         *
         * @throws IOException bei einem E/A-Fehler
         * @throws ParserConfigurationException Konfigurationsfehler des XML-Parsers
         * @throws SAXException bei einem XML-Fehler
         */
        private void replaceHead() throws SAXException, IOException, ParserConfigurationException {
            Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new ByteArrayInputStream(head.getBytes()));
            Node oldHead = null;
            for (int i = 0; i < root.getChildNodes().getLength(); i++) {
                oldHead = root.getChildNodes().item(i);
                if ((oldHead instanceof Element) && ((Element) oldHead).getNodeName().equalsIgnoreCase("head")) {
                    break;
                }
            }
            Node newHead = doc.getDocumentElement();
            newHead = root.getOwnerDocument().importNode(newHead, true);
            root.insertBefore(newHead, oldHead);
            root.removeChild(oldHead);
        }

        /**
         * Downloads a HTML-document, converts it into xhtml using tidy
         * and parses it into an xml object tree.
         */
        private void fetchAsXHTML() {
            try {
                // fetch
                String file = NetUtil.fetchDocumentAsFile(url);

                // run tidy
                //"tidy -asxhtml -utf8 $1 >$1.html 2> /dev/null"
//              Process process = Runtime.getRuntime().exec("tidy -q -asxhtml -utf8 " + file);
                String outFile = File.createTempFile("xhtml", ".html").getAbsolutePath();
                Process process = Runtime.getRuntime().exec("tidy -q -asxhtml -utf8 -o " + outFile + " "+ file);
/*                System.out.println("sleeping");
                Thread.sleep(5000);
                System.out.println("sleeped");*/
                process.waitFor();
                //Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(process.getInputStream());
                Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(new FileInputStream(outFile));
                root = doc.getDocumentElement();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        /**
         * Extracts the content (i. e. strips the navigation).
         */
        private void extractContent() {
            try {
                content = (Element) XPathAPI.selectSingleNode(root, "//div[@id='content']");

                // Einige Elemente loeschen
//                XMLUtils.removeChildren(root, "//self::node()[@id='contentSub' or @id='siteSub' or @id='toc' or @class='printfooter' or @id='catlinks' or @class='editsection']");
                XMLUtils.removeChildren(root, "//self::node()[@id='column-one' or @id='footer' or @id='contentSub' or @id='siteSub' or @class='printfooter' or @id='catlinks' or @class='editsection']");

            } catch (TransformerException e) {
                e.printStackTrace();
            }
        }

/*
name: top
id: contentTop
id: bodyContent
id: contentSub
*/

        private void unifyIDsAndConvertShortTags(Element element) {
            NodeList nodes = element.getChildNodes();
            for (int i = 0; i < nodes.getLength(); i++) {
                Node node = nodes.item(i);
                if (node instanceof Element) {
                    Element e = (Element) node;
                    String nodeName = node.getNodeName();

                    // unify IDs
                    if (unifyIDs && "a".equals(nodeName)) {
                        String val = e.getAttribute("name");
                        if (!val.equals("")) {
                            e.setAttribute("name", title + ID_SEP + val);
                            e.setAttribute("id", title + ID_SEP + val);
                        }
                        val = e.getAttribute("href");
                        if ((val.length() > 1) && val.charAt(0) == '#') {
                            e.setAttribute("href", title + ID_SEP + val.substring(1));
                            System.out.println("#" + title + ID_SEP + val.substring(1));
                        }
                    }

                    // convert short tags
                    if (convertShortTags && ("a".equals(nodeName) || "div".equals(nodeName)) && (e.getFirstChild() == null)) {
                        e.appendChild(e.getOwnerDocument().createTextNode(""));
                    }

                    // convert links to other pages
                    if (rewriteLocalURLs && "a".equals(nodeName)) {
                        String val = e.getAttribute("href");
                        if (val.startsWith(WIKI_PATH)) {
                            e.setAttribute("href", val.substring(WIKI_PATH.length()));
                        }
                    }

                    // collection image urls and rewrite img-src links.
                    if (fetchPageRequisites && "img".equals(nodeName)) {
                        String url = e.getAttribute("src");
                        pageRequisites.add(url);
                        if (url.startsWith(UPLOAD_PATH)) {
                            url = IMAGE_FOLDER + url.substring(UPLOAD_PATH.length() + 5);
                            e.setAttribute("src", url);
                        }
                    }

                    // go to the next level
                    unifyIDsAndConvertShortTags((Element) node);
                }
            }
        }

        /**
         * Returns a set of page requisites (like images)
         *
         * @return Set
         */
        public Set getPageRequisites() {
            return pageRequisites;
        }

        /**
         * Return the xml object.
         *
         * @return Element
         */
        public Element getXML() {
            if (content != null) {
                return content;
            } else {
                return root;
            }
        }
    }


    public class Book {
        private Set pages = new HashSet();
        private Set pageRequisites = new HashSet();

        /**
         * creates a new book
         *
         * @param name page containing a list of links
         * @throws IOException
         */
        public Book(String name) throws IOException {
            // fetch wiki text
            BufferedReader br = NetUtil.fetchDocumentAsBufferedReader(URL_PREFIX + WIKI_PATH + name + "?action=raw");
            fetchLinkList(br);
            br.close();
        }

        /**
         * Fetches all pages of this book
         * @throws IOException
         * @throws ParserConfigurationException
         * @throws SAXException
         */
        public void fetchBook() throws SAXException, IOException, ParserConfigurationException {
            Iterator itr = pages.iterator();
            while (itr.hasNext()) {
                String page = (String) itr.next();
                System.out.println("fetching " + page + "...");
                Article article = new Article(page);
                article.saveToDisk();
                pageRequisites.addAll(article.getPageRequisites());
            }
            fetchPageRequisites();
        }

        private void fetchPageRequisites() {
            System.out.println("cd " + TARGET + "/" + IMAGE_FOLDER);
            Iterator itr = pageRequisites.iterator();
            while (itr.hasNext()) {
                System.out.println("wget -N " + URL_PREFIX + itr.next());
            }
        }


        private void fetchLinkList(BufferedReader br) throws IOException {
            String line = br.readLine();
            while (line != null) {
                int pos = line.indexOf("[[");
                while (pos > -1) {
                    line = line.substring(pos + 2);
                    int posEnd = line.indexOf("]]");
                    if (posEnd == -1) { // is the link closed?
                        break;
                    }
                    String link = line.substring(0, posEnd);
                    pos = link.indexOf("|");
                    if (pos > -1) {
                        link = link.substring(0, pos);
                    }
                    link = link.trim();
                    String page = link;
                    if (page.length() == 0) {
                        continue;
                    }
                    page = page.replace(' ', '_');
                    pages.add(page);

                    // find next link
                    line = line.substring(posEnd + 2);
                    pos = line.indexOf("[[");
                }
                line = br.readLine();
            }
        }
    }


    /**
     * A page frame built from an article: all children of the div with
     * id "content" are removed, as well as the divs with the ids
     * "column-one" and "footer".
     */
    public class Cover {
        private Element root = null;

        /**
         * Fetches the named page and strips its content and navigation.
         *
         * @param name title of the wiki page used as frame
         */
        public Cover(String name) {
            Article cover = new Article(name);
            root = cover.getXML();
            XMLUtils.removeChildren(root, "//div[@id='content']/*");
            XMLUtils.removeChildren(root, "//div[@id='column-one' or @id='footer']");
            // The former lookup of the content div has been dropped: its
            // result was never used.
        }

        /**
         * Returns the stripped document tree.
         *
         * @return Element
         */
        public Element getXML() {
            return root;
        }
    }


    public static class XMLUtils {
        /** hide constructor */
        private XMLUtils() { }

        /**
         * Dumps an XML-tree into a String
         *
         * @param node xml-node
         * @return String
         */
        public static String dumpXML(Node node) {
            try {
                // Message-ID: <[email protected]> From: "Billy Ng"
                DOMSource source = new DOMSource(node);
                TransformerFactory tfFactory = TransformerFactory.newInstance();
                Transformer transformer = tfFactory.newTransformer();
                StringWriter sw = new StringWriter();
                StreamResult result = new StreamResult(sw);
                transformer.transform(source, result);
                return sw.toString();
            } catch (TransformerConfigurationException e) {
                e.printStackTrace();
            } catch (TransformerException e) {
                e.printStackTrace();
            }
            return "";
        }

        public static void removeChildren(Element parent, String xpath) {
            try {
                NodeIterator itr = XPathAPI.selectNodeIterator(parent, xpath);
                Node node = itr.nextNode();
                Set set = new HashSet();
                while (node != null) {
                    set.add(node);
                    node = itr.nextNode();
                }
                Iterator itr2 = set.iterator();
                while (itr2.hasNext()) {
                    node = (Node) itr2.next();
                    node.getParentNode().removeChild(node);
                }
            } catch (TransformerException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Utility class for network access.
     */
    public static final class NetUtil {
        private static final int BUFFER_SIZE = 10240;

        /** Hide constructor */
        private NetUtil() {
        }

        /**
         * Returns a BufferedReader on the target of the URL. The data is
         * decoded as UTF-8 instead of the platform default charset.
         *
         * @param urlString URL
         * @return BufferedReader; the caller has to close it
         * @throws IOException on an I/O error
         */
        public static BufferedReader fetchDocumentAsBufferedReader(String urlString) throws IOException {
            URL url = new URL(urlString);
            InputStream is = url.openStream();
            return new BufferedReader(new InputStreamReader(is, "UTF-8"));
        }

        /**
         * Downloads a document from the net and stores it in a local
         * temporary file, which is deleted when the virtual machine exits.
         *
         * @param urlString URL
         * @return absolute name of the temporary file
         * @throws IOException on an error
         */
        public static String fetchDocumentAsFile(String urlString) throws IOException {
            URL url = new URL(urlString);
            InputStream is = new BufferedInputStream(url.openStream());
            try {
                File file = File.createTempFile("dump", ".html");
                file.deleteOnExit();
                String tempFile = file.getAbsolutePath();
                OutputStream os = new BufferedOutputStream(new FileOutputStream(tempFile));
                try {
                    byte[] buffer = new byte[BUFFER_SIZE];
                    // read() blocks until data is available and returns -1
                    // at the end of the stream, so no polling is required
                    int count = is.read(buffer, 0, BUFFER_SIZE);
                    while (count != -1) {
                        os.write(buffer, 0, count);
                        count = is.read(buffer, 0, BUFFER_SIZE);
                    }
                } finally {
                    os.close();
                }
                return tempFile;
            } finally {
                is.close();
            }
        }

    }

    /**
     * Main entry point: fetches the book whose link list is the wiki page
     * named by the first command line argument. Without arguments a usage
     * message is printed and the program exits with status 1.
     *
     * @param args command line arguments
     * @throws Exception if something unexpected happened
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Aufruf: nhb.wikipedia.XHTMLDumper title-of-link-list");
            System.err.println("    title-of-link-list is the title of the page containing a list of links.");
            System.exit(1);
        }

        String linkListTitle = args[0];
        Book book = new XHTMLDumper().new Book(linkListTitle);
        book.fetchBook();

        System.out.println("Fertig.");
    }
}

//

Content Disclaimer

Informasi ini disarikan dari Wikipedia dan disajikan kembali untuk tujuan edukasi. Konten tersedia di bawah lisensi CC BY-SA 3.0. Kami tidak bertanggung jawab atas ketidakakuratan data yang bersumber dari kontribusi publik tersebut.

  1. The information displayed on this website is sourced in part or in whole from Wikipedia and has been adapted for re-presentation here. We strive to provide accurate and relevant information; however:
  2. There is no guarantee of absolute accuracy. Wikipedia is an open, collaborative project that can be edited by anyone, so information is subject to change.
  3. It is not intended to constitute professional advice. The content displayed is for informational and educational purposes only. For important decisions (e.g., medical, legal, or financial), please consult a professional.
  4. Content copyright. Wikipedia is licensed under the Creative Commons Attribution-ShareAlike License (CC BY-SA). This means that content may be reused with appropriate attribution and shared under a similar license.
  5. Responsible use. Any risk arising from the use of information from this website is entirely the responsibility of the user.