diff --git a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java
index 80b2fe15d07..776fdd40242 100644
--- a/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java
+++ b/sandbox/contributions/webcrawler-LARM/src/de/lanlab/larm/fetcher/URLMessage.java
@@ -59,6 +59,7 @@ import java.io.*;
import de.lanlab.larm.util.URLUtils;
import de.lanlab.larm.net.URLNormalizer;
import de.lanlab.larm.net.HostManager;
+import de.lanlab.larm.net.*;
/**
* represents a URL which is passed around in the messageHandler
@@ -74,6 +75,11 @@ public class URLMessage implements Message, Serializable
*/
protected URL url;
+ /**
+ * docID or 0 (used with repository)
+ */
+ long docId;
+
/**
* Description of the Field
*/
@@ -85,39 +91,72 @@ public class URLMessage implements Message, Serializable
protected URL referer;
/**
- * externalized referer URL, to prevent multiple calls to url.toExternalForm()
+ * externalized referer URL, to prevent multiple calls to
+ * url.toExternalForm()
*/
protected volatile String refererString;
/**
- * externalized referer URL, to prevent multiple calls to url.toExternalForm()
+ * externalized referer URL, to prevent multiple calls to
+ * url.toExternalForm()
*/
protected volatile String refererNormalizedString;
/**
* normalized URL, as defined by {@link de.lanlab.larm.net.URLNormalizer}
- * (lower case, index.* removed, all characters except alphanumeric ones escaped)
+ * (lower case, index.* removed, all characters except alphanumeric ones
+ * escaped)
*/
protected String normalizedURLString;
+ /**
+ * ANCHOR: an ordinary link like <a href="..."> (or AREA or IMG)
+ * FRAME: a <FRAME src="..."> tag
+ * REDIRECT: the link between two pages after a 301/302/307 result code
+ */
+ byte linkType;
- boolean isFrame;
+ public final static byte LINKTYPE_ANCHOR=0;
+ public final static byte LINKTYPE_FRAME=1;
+ public final static byte LINKTYPE_REDIRECT=2;
+ protected final static String LINKTYPE_STRING[] = { "A/IMG/AREA", "FRAME", "Redirect" };
+
+ public int getLinkType()
+ {
+ return linkType;
+ }
+
+ public String getLinkTypeString()
+ {
+ return LINKTYPE_STRING[linkType];
+ }
/**
* anchor text, as in <a href="...">Anchor</a>
*/
protected String anchor;
+ public void setDocId(long docId)
+ {
+ this.docId = docId;
+ }
+
+ public long getDocId()
+ {
+ return docId;
+ }
+
/**
* Constructor for the URLMessage object
*
- * @param url Description of the Parameter
- * @param referer Description of the Parameter
- * @param isFrame Description of the Parameter
- * @param anchor Description of the Parameter
+ * @param url Description of the Parameter
+ * @param referer Description of the Parameter
+ * @param isFrame Description of the Parameter
+ * @param anchor Description of the Parameter
+ * @param hostManager Description of the Parameter
*/
- public URLMessage(URL url, URL referer, boolean isFrame, String anchor, HostManager hostManager)
+ public URLMessage(URL url, URL referer, byte linkType, String anchor, HostResolver hostResolver)
{
//super();
this.url = url;
@@ -125,19 +164,57 @@ public class URLMessage implements Message, Serializable
this.referer = referer;
this.refererString = referer != null ? URLUtils.toExternalFormNoRef(referer) : null;
- this.refererNormalizedString = referer != null ? URLUtils.toExternalFormNoRef(URLNormalizer.normalize(referer, hostManager)) : null;
- this.isFrame = isFrame;
+ this.refererNormalizedString = referer != null ? URLUtils.toExternalFormNoRef(URLNormalizer.normalize(referer, hostResolver)) : null;
+ this.linkType = linkType;
this.anchor = anchor != null ? anchor : "";
- this.normalizedURLString = URLUtils.toExternalFormNoRef(URLNormalizer.normalize(url, hostManager));
+ this.normalizedURLString = url != null ? URLUtils.toExternalFormNoRef(URLNormalizer.normalize(url, hostResolver)) : null;
//this.normalizedURLString = URLNormalizer.
//System.out.println("" + refererString + " -> " + urlString);
+ this.docId = 0;
}
+ public URLMessage(URL url, String normalizedURL, URL referer, String normalizedReferer, byte linkType, String anchor)
+ {
+ //super();
+ this.url = url;
+ this.urlString = url != null ? URLUtils.toExternalFormNoRef(url) : null;
+
+ this.referer = referer;
+ this.refererString = referer != null ? URLUtils.toExternalFormNoRef(referer) : null;
+ this.refererNormalizedString = normalizedReferer;
+ this.linkType = linkType;
+ this.anchor = anchor != null ? anchor : "";
+ this.normalizedURLString = normalizedURL;
+ //this.normalizedURLString = URLNormalizer.
+ //System.out.println("" + refererString + " -> " + urlString);
+ this.docId = 0;
+ }
+
+ public URLMessage(URLMessage other)
+ {
+ this.url = other.url;
+ this.urlString = other.urlString;
+ this.referer = other.referer;
+ this.refererString = other.refererString;
+ this.refererNormalizedString = other.refererNormalizedString;
+ this.linkType = other.linkType;
+ this.anchor = other.anchor;
+ this.normalizedURLString = other.normalizedURLString;
+ this.docId = other.docId;
+ }
+
+ /**
+ * Gets the normalizedURLString attribute of the URLMessage object
+ *
+ * @return The normalizedURLString value
+
+ */
public String getNormalizedURLString()
{
return this.normalizedURLString;
}
+
/**
* Gets the url attribute of the URLMessage object
*
@@ -193,6 +270,17 @@ public class URLMessage implements Message, Serializable
}
+ /**
+ * Gets the normalizedRefererString attribute of the URLMessage object
+ *
+ * @return The normalizedRefererString value
+ */
+ public String getNormalizedRefererString()
+ {
+ return this.refererNormalizedString;
+ }
+
+
/**
* Gets the anchor attribute of the URLMessage object
*
@@ -226,10 +314,12 @@ public class URLMessage implements Message, Serializable
{
out.writeObject(url);
out.writeObject(referer);
- out.writeBoolean(isFrame);
- out.writeUTF(anchor);
- out.writeUTF(refererNormalizedString);
- out.writeUTF(normalizedURLString);
+ out.writeByte(linkType);
+ out.writeUTF(anchor != null ? anchor : "");
+ out.writeUTF(refererNormalizedString != null ? refererNormalizedString : "");
+ out.writeUTF(normalizedURLString != null ? normalizedURLString : "");
+ out.write((int)((docId >> 32) & 0xffffffff) );
+ out.write((int)(docId & 0xffffffff));
}
@@ -247,11 +337,13 @@ public class URLMessage implements Message, Serializable
url = (URL) in.readObject();
referer = (URL) in.readObject();
urlString = url.toExternalForm();
- refererString = referer.toExternalForm();
- isFrame = in.readBoolean();
+ refererString = referer != null ? referer.toExternalForm() : "";
+ linkType = in.readByte();
anchor = in.readUTF();
refererNormalizedString = in.readUTF();
normalizedURLString = in.readUTF();
+ docId = in.read() << 32;
+ docId |= in.read();
}
@@ -262,7 +354,7 @@ public class URLMessage implements Message, Serializable
*/
public String getInfo()
{
- return (referer != null ? refererString : "") + "\t" + urlString + "\t" + this.getNormalizedURLString() + "\t" + (isFrame ? "1" : "0") + "\t" + anchor;
+ return (referer != null ? refererString : "") + "\t" + urlString + "\t" + this.getNormalizedURLString() + "\t" + linkType + "\t" + anchor;
}
}