| 1 |
|
|
| 2 |
|
|
| 3 |
|
|
| 4 |
|
|
| 5 |
|
|
| 6 |
|
|
| 7 |
|
|
| 8 |
|
|
| 9 |
|
|
| 10 |
|
|
| 11 |
|
|
| 12 |
|
package ca.spaz.cron.datasource.www; |
| 13 |
|
|
| 14 |
|
import java.io.*; |
| 15 |
|
import java.net.URL; |
| 16 |
|
import java.util.*; |
| 17 |
|
import java.util.regex.*; |
| 18 |
|
|
| 19 |
|
import org.apache.log4j.Logger; |
| 20 |
|
import org.htmlparser.util.Translate; |
| 21 |
|
|
| 22 |
|
import ca.spaz.cron.database.FoodGroup; |
| 23 |
|
|
| 24 |
|
|
| 25 |
|
|
| 26 |
|
|
| 27 |
|
|
| 28 |
|
|
| 29 |
|
public class Foodcatter { |
| 30 |
0 |
private static final Pattern CK_COM_GROUP = Pattern |
| 31 |
|
.compile("<option value=\"(\\d+)\">([^<]+)</option>"); |
| 32 |
|
|
| 33 |
0 |
private static final Pattern CK_COM_FN_END = Pattern.compile("</select>"); |
| 34 |
|
|
| 35 |
0 |
private static final Pattern CK_COM_FN_START = Pattern |
| 36 |
|
.compile("<select name=\"filter\""); |
| 37 |
|
|
| 38 |
|
private static final String CK_COM_URL = "http://www.calorieking.com/foods/"; |
| 39 |
|
|
| 40 |
|
private static final String ND_COM_URL = "http://www.nutritiondata.com/color/nutrition.js"; |
| 41 |
|
|
| 42 |
|
|
| 43 |
|
|
| 44 |
|
|
| 45 |
0 |
private static final Logger logger = Logger.getLogger(Foodcatter.class); |
| 46 |
|
|
| 47 |
0 |
private static Foodcatter instance = null; |
| 48 |
|
|
| 49 |
|
private URL url; |
| 50 |
|
|
| 51 |
0 |
private static final Pattern ND_COM_FN_START = Pattern |
| 52 |
|
.compile("function +foodcat"); |
| 53 |
|
|
| 54 |
0 |
private static final Pattern ND_COM_FN_END = Pattern.compile("}"); |
| 55 |
|
|
| 56 |
0 |
private static final Pattern ND_COM_GROUP = CK_COM_GROUP; |
| 57 |
|
|
| 58 |
|
public static final Foodcatter getInstance() { |
| 59 |
0 |
if (null == instance) { |
| 60 |
0 |
instance = new Foodcatter(); |
| 61 |
|
} |
| 62 |
0 |
return instance; |
| 63 |
|
} |
| 64 |
|
|
| 65 |
0 |
private Foodcatter() { |
| 66 |
0 |
} |
| 67 |
|
|
| 68 |
|
|
| 69 |
|
|
| 70 |
|
|
| 71 |
|
|
| 72 |
|
|
| 73 |
|
|
| 74 |
|
|
| 75 |
|
|
| 76 |
|
public List getFoodGroups(URL sourceUrl, Pattern startGroups, |
| 77 |
|
Pattern endGroups, Pattern groupID, int keyGroup, class="keyword">int nameGroup) |
| 78 |
|
throws IOException { |
| 79 |
0 |
InputStream ins = sourceUrl.openStream(); |
| 80 |
0 |
BufferedReader br = new BufferedReader(class="keyword">new InputStreamReader(ins)); |
| 81 |
0 |
String s = br.readLine(); |
| 82 |
0 |
StringBuffer sb = new StringBuffer(); |
| 83 |
0 |
while (s != null) { |
| 84 |
0 |
sb.append(s); |
| 85 |
0 |
s = br.readLine(); |
| 86 |
0 |
} |
| 87 |
|
|
| 88 |
0 |
List ret = new ArrayList(); |
| 89 |
|
|
| 90 |
0 |
Matcher matchFunction = startGroups.matcher(sb.toString()); |
| 91 |
0 |
Matcher matchGroup = groupID.matcher(sb.toString()); |
| 92 |
0 |
Matcher matchEnd = endGroups.matcher(sb.toString()); |
| 93 |
0 |
if (matchFunction.find()) { |
| 94 |
0 |
if (logger.isDebugEnabled()) { |
| 95 |
0 |
logger.debug("getFoodGroups() - Found function start"); |
| 96 |
|
} |
| 97 |
0 |
int sidx = matchFunction.end(); |
| 98 |
0 |
int eidx = -1; |
| 99 |
0 |
if (matchEnd.find(sidx)) { |
| 100 |
0 |
eidx = matchEnd.start(); |
| 101 |
|
} |
| 102 |
0 |
if (eidx < 0) { |
| 103 |
0 |
logger.error("getFoodGroups() - No end to function", null); |
| 104 |
0 |
ret = Collections.EMPTY_LIST; |
| 105 |
0 |
return ret; |
| 106 |
|
} |
| 107 |
0 |
while (sidx < eidx && sidx > 0) { |
| 108 |
0 |
if (matchGroup.find(sidx)) { |
| 109 |
0 |
sidx = matchGroup.end(); |
| 110 |
0 |
WWWFoodGroup fg = new WWWFoodGroup(matchGroup.group(keyGroup), |
| 111 |
|
Translate.decode(matchGroup.group(nameGroup))); |
| 112 |
0 |
ret.add(fg); |
| 113 |
0 |
} else { |
| 114 |
0 |
sidx = -1; |
| 115 |
|
} |
| 116 |
0 |
} |
| 117 |
|
} |
| 118 |
0 |
return ret; |
| 119 |
|
} |
| 120 |
|
|
| 121 |
|
public static void main(String[] args) throws IOException { |
| 122 |
0 |
List lis = getInstance().getFoodGroups(new URL(ND_COM_URL), |
| 123 |
|
ND_COM_FN_START, ND_COM_FN_END, ND_COM_GROUP, 1, 2); |
| 124 |
0 |
List lis2 = getInstance().getFoodGroups(new URL(CK_COM_URL), |
| 125 |
|
CK_COM_FN_START, CK_COM_FN_END, CK_COM_GROUP, 1, 2); |
| 126 |
0 |
for (Iterator iter = lis.iterator(); iter.hasNext();) { |
| 127 |
0 |
FoodGroup g = (FoodGroup) iter.next(); |
| 128 |
0 |
System.out.println(g); |
| 129 |
0 |
} |
| 130 |
0 |
for (Iterator iter = lis2.iterator(); iter.hasNext();) { |
| 131 |
0 |
FoodGroup g = (FoodGroup) iter.next(); |
| 132 |
0 |
System.out.println(g); |
| 133 |
0 |
} |
| 134 |
0 |
} |
| 135 |
|
|
| 136 |
|
} |