Archive for May, 2010|Monthly archive page

Shuffle XML Elements with XPath and VTD-XML

This is a simple app that shuffles elements in an XML file. It uses XPath to address individual elements, then re-arranges and re-combines the resulting fragments. Each fragment is identified by its offset and length, both of which are obtained by calling VTDNav’s getElementFragment().

Why not use SAX or StAX?

Simply speaking, the lack of XPath support makes it very tedious, almost impossible, to re-arrange XML element fragments.

Why not use DOM?

Aside from performance and memory usage, the redundant, wasteful de-serialization/serialization contributes nothing but overhead to the task.

The Code

Input XML (old.xml):

   <root>
     <a> text </a>
     <b> text </b>
     <c> text </c>
     <a> text </a>
     <b> text </b>
     <c> text </c>
     <a> text </a>
     <b> text </b>
     <c> text </c>
   </root>

Output XML (new.xml):

   <root>
     <a> text </a>
     <a> text </a>
     <a> text </a>
     <b> text </b>
     <b> text </b>
     <b> text </b>
     <c> text </c>
     <c> text </c>
     <c> text </c>
   </root>

Java Code:

import com.ximpleware.*;
import java.io.*;
/**
 * Regroups the children of {@code /root} in old.xml so that every {@code a}
 * element comes first, followed by every {@code b} and then every {@code c},
 * and writes the regrouped document to new.xml.
 *
 * <p>Element fragments are copied as raw bytes straight out of the parsed
 * document buffer; no element is re-serialized.
 */
public class shuffle {
    /** XPath expressions, in the order their matches appear in the output. */
    private static final String[] GROUP_PATHS = {"/root/a", "/root/b", "/root/c"};

    /**
     * Entry point.
     *
     * @param args unused
     * @throws Exception if parsing, XPath evaluation or file output fails
     */
    public static void main(String[] args) throws Exception {
        VTDGen vg = new VTDGen();
        // Nothing is written unless parseFile reports success.
        if (!vg.parseFile("old.xml", false)) {
            return;
        }
        VTDNav vn = vg.getNav();
        byte[] ba = vn.getXML().getBytes();

        FileOutputStream fos = new FileOutputStream("new.xml");
        try {
            fos.write("<root>".getBytes());
            for (String path : GROUP_PATHS) {
                writeMatches(vn, path, ba, fos);
            }
            fos.write('\n');
            fos.write("</root>".getBytes());
        } finally {
            // Always release the file handle, even when evaluation or a write fails.
            fos.close();
        }
    }

    /**
     * Writes every element selected by {@code xpath} to {@code out}, each
     * preceded by a newline.
     *
     * @param vn    navigator over the parsed document
     * @param xpath XPath expression selecting the elements to copy
     * @param doc   raw bytes of the parsed document
     * @param out   destination stream; not closed by this method
     * @throws Exception if the XPath cannot be compiled or evaluated, or a write fails
     */
    private static void writeMatches(VTDNav vn, String xpath, byte[] doc, OutputStream out)
            throws Exception {
        AutoPilot ap = new AutoPilot();
        ap.selectXPath(xpath);
        ap.bind(vn);
        while (ap.evalXPath() != -1) {
            // getElementFragment packs two ints into one long:
            // the low 32 bits are used as the offset, the high 32 bits as the length.
            long fragment = vn.getElementFragment();
            int offset = (int) fragment;
            int len = (int) (fragment >> 32);
            out.write('\n');
            out.write(doc, offset, len);
        }
    }
}

C# code:

using System;
using com.ximpleware;

namespace shuffle
{
    /// <summary>
    /// Regroups the children of /root in old.xml so that every a element comes
    /// first, then every b, then every c, and writes the result to new.xml.
    /// Element fragments are copied as raw bytes from the parsed document buffer.
    /// </summary>
    public class shuffle
    {
        /// <summary>XPath expressions, in the order their matches appear in the output.</summary>
        private static readonly String[] GroupPaths = { "/root/a", "/root/b", "/root/c" };

        /// <summary>Entry point.</summary>
        /// <param name="args">Unused.</param>
        public static void Main(String[] args)
        {
            VTDGen vg = new VTDGen();
            // Nothing is written unless parseFile reports success.
            if (!vg.parseFile("old.xml", false))
            {
                return;
            }
            VTDNav vn = vg.getNav();

            // Fully qualified so the snippet does not depend on extra using directives.
            System.Text.Encoding eg = System.Text.Encoding.GetEncoding("utf-8");
            byte[] openTag = eg.GetBytes("<root>");
            byte[] closeTag = eg.GetBytes("</root>");
            byte[] newline = eg.GetBytes("\n");
            byte[] ba = vn.getXML().getBytes();

            // FileMode.Create truncates an existing new.xml, so no stale bytes
            // survive past the closing tag; the using block guarantees disposal.
            using (System.IO.FileStream fos =
                       new System.IO.FileStream("new.xml", System.IO.FileMode.Create))
            {
                fos.Write(openTag, 0, openTag.Length);
                foreach (String path in GroupPaths)
                {
                    WriteMatches(vn, path, ba, newline, fos);
                }
                fos.Write(newline, 0, newline.Length);
                fos.Write(closeTag, 0, closeTag.Length);
            }
        }

        /// <summary>
        /// Writes every element selected by <paramref name="xpath"/> to
        /// <paramref name="output"/>, each preceded by a newline.
        /// </summary>
        /// <param name="vn">Navigator over the parsed document.</param>
        /// <param name="xpath">XPath expression selecting the elements to copy.</param>
        /// <param name="doc">Raw bytes of the parsed document.</param>
        /// <param name="newline">Encoded newline written before each fragment.</param>
        /// <param name="output">Destination stream; not disposed by this method.</param>
        private static void WriteMatches(VTDNav vn, String xpath, byte[] doc,
                                         byte[] newline, System.IO.Stream output)
        {
            AutoPilot ap = new AutoPilot();
            ap.selectXPath(xpath);
            ap.bind(vn);
            while (ap.evalXPath() != -1)
            {
                // getElementFragment packs two ints into one long:
                // the low 32 bits are used as the offset, the high 32 bits as the length.
                long fragment = vn.getElementFragment();
                int offset = (int)fragment;
                int len = (int)(fragment >> 32);
                output.Write(newline, 0, newline.Length);
                output.Write(doc, offset, len);
            }
        }
    }
}

Process Huge XML Documents with Extended VTD-XML

If you have XML files that are larger than 2 GB and don’t want to lose the benefit of full XPath support, you will be surprised at how handy and fast extended VTD-XML can be.

Available since version 2.3 as part of the full VTD-XML distribution, extended VTD-XML raises the maximum document size to 256 GB; a 64-bit JVM is required to reach that limit.

Extended VTD-XML can process XML in either the standard in-memory mode (like standard VTD-XML) or the memory-mapped mode, which allows partial document loading.

The code examples below show how to use extended VTD-XML to process an XML document in memory-mapped mode and in in-memory mode:

Memory Mapped Mode

import com.ximpleware.extended.*;
public class mem_mapped_read {

    /**
     * Long form: load test.xml into a memory-mapped buffer by hand, hand the
     * buffer to the parser, parse, then print the text data.
     */
    public static void first_read() throws Exception {
        XMLMemMappedBuffer buffer = new XMLMemMappedBuffer();
        buffer.readFile("test.xml");

        VTDGenHuge parser = new VTDGenHuge();
        parser.setDoc(buffer);
        parser.parse(true);

        printText(parser.getNav());
    }

    /**
     * Short form: let parseFile load and parse test.xml in memory-mapped mode
     * in a single call; prints the text data only when parseFile returns true.
     */
    public static void second_read() throws Exception {
        VTDGenHuge parser = new VTDGenHuge();
        boolean parsed = parser.parseFile("test.xml", true, VTDGenHuge.MEM_MAPPED);
        if (!parsed) {
            return;
        }
        printText(parser.getNav());
    }

    /** Prints the string form of the index returned by getText() on the given navigator. */
    private static void printText(VTDNavHuge nav) throws Exception {
        int textIndex = nav.getText();
        System.out.println("text data ===>" + nav.toString(textIndex));
    }

    /** Runs the long form first, then the short form. */
    public static void main(String[] s) throws Exception {
        first_read();
        second_read();
    }
}

In Memory Mode

/**
 * This is a demonstration of how to use the extended VTD parser
 * to process large XML file. 
 *
 */
import com.ximpleware.extended.*;
public class in_mem_read {

    /**
     * Long form: load test.xml into an in-memory buffer by hand, hand the
     * buffer to the parser, parse, then print the text data.
     */
    public static void first_read() throws Exception {
        XMLBuffer buffer = new XMLBuffer();
        buffer.readFile("test.xml");

        VTDGenHuge parser = new VTDGenHuge();
        parser.setDoc(buffer);
        parser.parse(true);

        printText(parser.getNav());
    }

    /**
     * Short form: let parseFile load and parse test.xml in in-memory mode in a
     * single call; prints the text data only when parseFile returns true.
     */
    public static void second_read() throws Exception {
        VTDGenHuge parser = new VTDGenHuge();
        boolean parsed = parser.parseFile("test.xml", true, VTDGenHuge.IN_MEMORY);
        if (!parsed) {
            return;
        }
        printText(parser.getNav());
    }

    /** Prints the string form of the index returned by getText() on the given navigator. */
    private static void printText(VTDNavHuge nav) throws Exception {
        int textIndex = nav.getText();
        System.out.println("text data ===>" + nav.toString(textIndex));
    }

    /** Runs the long form first, then the short form. */
    public static void main(String[] s) throws Exception {
        first_read();
        second_read();
    }
}

Namespace Compensation with VTD-XML

When you extract an element fragment by calling VTDNav’s getElementFragment(), the namespace declarations that are in scope for that element are not carried along. If you want to extract an element fragment together with its in-scope namespaces, call getElementFragmentNs() instead. Below are examples of calling getElementFragmentNs() in Java, C#, and C.

The Java Version:

// Insert a ns-compensated fragment into an XML doc

/**
 * Takes a namespace-compensated copy of the first element matched by
 * {@code /*}{@code /*}{@code /*} in soap2.xml and inserts that copy after the
 * second and third elements matched by the same path, writing the result to
 * new_soap.xml.
 *
 * <p>The VTD-XML types used here live in {@code com.ximpleware}; the other
 * Java examples on this page import that package with
 * {@code import com.ximpleware.*;}.
 */
public class FragmentTest {
    /**
     * Entry point.
     *
     * @param s unused
     * @throws Exception if parsing, XPath evaluation, modification or output fails
     */
    public static void main(String[] s) throws Exception{
        // instantiate VTDGen and XMLModifier
        VTDGen vg = new VTDGen();
        XMLModifier xm = new XMLModifier();
        AutoPilot ap = new AutoPilot();
        AutoPilot ap2 = new AutoPilot();
        // ap selects the insertion points: matches number 2 and 3 of /*/*/*
        ap.selectXPath("(/*/*/*)[position()>1 and position()<4]");
        // ap2 is only used to position the cursor on the first match
        ap2.selectXPath("/*/*/*");
        if (vg.parseFile("soap2.xml",true)){
            VTDNav vn = vg.getNav();
            xm.bind(vn);
            ap2.bind(vn);
            ap.bind(vn);
            // Move the cursor to the first match and capture it with its namespaces.
            ap2.evalXPath();
            ElementFragmentNs ef = vn.getElementFragmentNs();
            while (ap.evalXPath() != -1) {
                xm.insertAfterElement(ef);
            }
            // Output by file name so no stream is opened here and left unclosed.
            xm.output("new_soap.xml");
        }
    }
}

The C# version

using System;
using com.ximpleware;

namespace FragmentTest
{
    /// <summary>
    /// Captures a namespace-compensated copy of the first element matched by
    /// /*/*/* in soap2.xml and inserts it after the second and third matches,
    /// saving the modified document as new_soap.xml.
    /// </summary>
    public class FragmentTest
    {
        public static void Main(String[] args)
        {
            VTDGen generator = new VTDGen();
            XMLModifier modifier = new XMLModifier();

            // Selects the insertion points: matches number 2 and 3.
            AutoPilot insertionPoints = new AutoPilot();
            insertionPoints.selectXPath("(/*/*/*)[position()>1 and position()<4]");

            // Used once, to place the cursor on the first match.
            AutoPilot firstMatch = new AutoPilot();
            firstMatch.selectXPath("/*/*/*");

            bool parsed = generator.parseFile("soap2.xml", true);
            if (!parsed)
            {
                return;
            }

            VTDNav nav = generator.getNav();
            modifier.bind(nav);
            firstMatch.bind(nav);
            insertionPoints.bind(nav);

            firstMatch.evalXPath();
            ElementFragmentNs fragment = nav.getElementFragmentNs();

            for (int hit = insertionPoints.evalXPath(); hit != -1; hit = insertionPoints.evalXPath())
            {
                modifier.insertAfterElement(fragment);
            }
            modifier.output("new_soap.xml");
        }
    }
}

The C version

#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <io.h>
#include "xpath1.h"
#include "helper.h"
#include "vtdGen.h"
#include "vtdNav.h"
#include "autoPilot.h"
#include "XMLModifier.h"
#include "nodeRecorder.h"
#include "bookMark.h"

/* Global exception context; presumably consumed by the Try/Catch macros
 * used in main() -- confirm against the library's exception header. */
struct exception_context the_exception_context[1];

/*
 * Captures a namespace-compensated copy of the first element matched by
 * the path in ap2 from soap2.xml, inserts it after each element matched by
 * the path in ap, and writes the result to new3.xml.
 *
 * Returns 0 on success and 1 if an exception was caught.
 */
int main(void){
	exception e;
	int status = 0; /* only changed inside Catch, i.e. after the jump */
	Try{
		VTDGen *vg = NULL; /* This is the VTDGen that parses XML */
		VTDNav *vn = NULL; /* This is the VTDNav that navigates the VTD records */
		AutoPilot *ap = NULL, *ap2=NULL;
		XMLModifier *xm = NULL;
		ElementFragmentNs *ef = NULL;
		int i= -1;

		vg = createVTDGen();
		ap = createAutoPilot2();
		ap2 = createAutoPilot2();
		xm = createXMLModifier();
		/* ap selects the insertion points: matches number 2 and 3 */
		selectXPath(ap,L"(/*/*/*)[position()>1 and position()<4]");
		/* ap2 is only used to position the cursor on the first match */
		selectXPath(ap2,L"/*/*/*");
		if (parseFile(vg,TRUE,"soap2.xml")){
			vn = getNav(vg);
			bind(ap,vn);
			bind(ap2,vn);
			bind4XMLModifier(xm,vn);
			evalXPath(ap2);
			ef = getElementFragmentNs(vn);
			
			while( (i=evalXPath(ap))!=-1){
				insertAfterElement4(xm,ef);
				printf(" index %d \n",i);
			}
			output2(xm,"new3.xml");
			/* The document buffer is released separately from the navigator. */
			free(vn->XMLDoc);
			freeVTDNav(vn);
		}
		/* ef is still NULL when parsing failed, so only free it when set. */
		if (ef != NULL){
			freeElementFragmentNs(ef);
		}
		freeXMLModifier(xm);
		freeAutoPilot(ap);
		freeAutoPilot(ap2);
		freeVTDGen(vg);
		
	}Catch(e){
		printf("exception !!!!!!!!!!! \n");
		status = 1;
	}
	return status;
}

Using VTD-XML to Replace Element Names

This code example shows how to replace element names in an XML document using XPath and XMLModifier in VTD-XML. The key is to use XPath to move the cursor to each element and then call XMLModifier’s updateElementName() at the cursor node.

The Java version is below:

/*
 * Change all elements to lalalala
 */
import com.ximpleware.*;

public class changeElementName {

    /**
     * Parses a small in-line document, renames every element selected by
     * {@code //*} to "lalalala", and writes the result to lala.xml.
     */
    public static void main(String[] args) throws Exception{
        String source = "<aaaa> <bbbbb> <ccccc> </ccccc> <ccccc/> <ccccc></ccccc> </bbbbb> </aaaa>";

        VTDGen generator = new VTDGen();
        generator.setDoc(source.getBytes());
        generator.parse(false);
        VTDNav nav = generator.getNav();

        XMLModifier modifier = new XMLModifier(nav);
        AutoPilot everyElement = new AutoPilot(nav);
        everyElement.selectXPath("//*");

        // Each successful evalXPath() leaves the cursor on the next match;
        // the rename applies to whatever element the cursor is on.
        for (int hit = everyElement.evalXPath(); hit != -1; hit = everyElement.evalXPath()) {
            modifier.updateElementName("lalalala");
        }
        modifier.output("lala.xml");
    }
}

The C# version is here:

using System;

using com.ximpleware;
using System.IO;
namespace FragmentTest
{
    /// <summary>
    /// Parses a small in-line document, renames every element selected by
    /// //* to "lalalala", and writes the result to lala.xml.
    /// </summary>
    public class FragmentTest
    {
        /// <summary>Entry point.</summary>
        /// <param name="args">Unused.</param>
        public static void Main(String[] args)
        {

            String xml = "<aaaa> <bbbbb> <ccccc> </ccccc> <ccccc/> <ccccc></ccccc> </bbbbb> </aaaa>";
            // Fully qualified: System.Text is not among this example's using directives.
            System.Text.Encoding eg = System.Text.Encoding.GetEncoding("utf-8");
            VTDGen vg = new VTDGen();
            vg.setDoc(eg.GetBytes(xml));
            vg.parse(false);
            VTDNav vn = vg.getNav();
            AutoPilot ap = new AutoPilot(vn);
            ap.selectXPath("//*");
            XMLModifier xm = new XMLModifier(vn);
            while (ap.evalXPath() != -1)
            {
                // The new name must be a legal XML element name; a value such as
                // "d:/lalalala" contains '/' and would yield malformed output.
                xm.updateElementName("lalalala");
            }
            xm.output("lala.xml");
        }
    }
}